### Spam Classifier

In [43]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score

### label and message are columns separated by a tab space. Spam/ham is the label

In [44]:
messages = pd.read_csv('/home/shrinidhi/NLTK-learning/smsspamcollection/SMSSpamCollection',
                       sep='\t', names=['label', 'message'])


In [45]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [46]:
def stemming():
    # Stemming
    ps = PorterStemmer()
    corpus = []
    for i in range(0, len(messages)):
        review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
        review = review.lower()
        review = review.split()
        review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
        review = ' '.join(review)
        corpus.append(review)
stemming()

In [47]:
# creating the bag of words model
cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(corpus).toarray()

In [48]:
# making ham/spam represented in terms of 0/1
y = pd.get_dummies(messages['label'])
y = y.iloc[:, 1].values

In [49]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [50]:
# training model using Naive Bayes classifier
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [51]:
y_pred = spam_detect_model.predict(X_test)

In [52]:
y_pred

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

In [53]:
# Confusion matrix
confusion_m = confusion_matrix(y_test, y_pred)

In [54]:
confusion_m

array([[946,   9],
       [  8, 152]])

In [55]:
accuracy = accuracy_score(y_test, y_pred)

In [56]:
accuracy

0.9847533632286996

In [57]:
def lemmatization():
    # Lemmatization
    lm = WordNetLemmatizer()
    corpus = []
    for i in range(0, len(messages)):
        review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
        review = review.lower()
        review = review.split()
        review = [lm.lemmatize(word) for word in review if not word in stopwords.words('english')]
        review = ' '.join(review)
        corpus.append(review)
lemmatization()

In [58]:
# creating the bag of words model
cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(corpus).toarray()

In [59]:
# making ham/spam represented in terms of 0/1
y = pd.get_dummies(messages['label'])
y = y.iloc[:, 1].values

In [60]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [73]:
# training model using Naive Bayes classifier
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [74]:
y_pred = spam_detect_model.predict(X_test)
y_pred

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

In [63]:
# Confusion matrix
confusion_m = confusion_matrix(y_test, y_pred)
confusion_m

array([[946,   9],
       [  8, 152]])

In [64]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9847533632286996