In [62]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier


In [31]:
df = pd.read_csv('spam.csv', encoding='latin-1')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [32]:
df = df.loc[:,'v1':'v2']

In [33]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [34]:
df = df[['v2','v1']]


In [35]:
df['v1'] = df['v1'].map({'ham': 0, 'spam':1})

In [36]:
df.columns = ['sms','spam']

In [37]:
df.head()

Unnamed: 0,sms,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [38]:
df.spam.value_counts(normalize=True)

0    0.865937
1    0.134063
Name: spam, dtype: float64

In [39]:
df.spam.value_counts()

0    4825
1     747
Name: spam, dtype: int64

In [40]:
len(df)

5572

In [41]:
df.isnull().any()

sms     False
spam    False
dtype: bool

In [None]:
#lowering casing the words

In [84]:
df['sms'] = df.sms.str.lower()

In [85]:
df.sms.head()

0    go until jurong point, crazy.. available only ...
1                        ok lar... joking wif u oni...
2    free entry in 2 a wkly comp to win fa cup fina...
3    u dun say so early hor... u c already then say...
4    nah i don't think he goes to usf, he lives aro...
Name: sms, dtype: object

In [86]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df.sms)
y = df.spam

In [94]:
cross_val_score(MultinomialNB(), X, y, cv=10).mean()

0.9811554104142411

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.30)

In [88]:
clf = MultinomialNB()
clf.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [89]:
print (clf.score(X_test,y_test))
print (clf.score(X_train, y_train))

0.9802631578947368
0.9938461538461538


In [90]:
y_pred = clf.predict(X_test)
print (classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.99      0.99      0.99      1464
          1       0.91      0.93      0.92       208

avg / total       0.98      0.98      0.98      1672



In [91]:
print (confusion_matrix(y_test, y_pred))

[[1446   18]
 [  15  193]]


In [108]:
alphas = [0.1, 1, 5, 10, 50]
min_dfs = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2]

best_alpha = None
best_min_df = None
max_score = -np.inf

for alpha in alphas:
    for min_df in min_dfs:
        vectorizer = CountVectorizer(min_df=min_df)
        X = vectorizer.fit_transform(df.sms)
        y = df.spam
        X_train, X_test, y_train, y_test = train_test_split(X,y)
        
        clf = MultinomialNB(alpha=alpha)
        cv_score = cross_val_score(clf,X_train, y_train, cv=10).mean()
        
        if cv_score > max_score:
            max_score = cv_score
            best_alpha, best_min_df = alpha, min_df

In [109]:
print ('alpha: {}'.format(best_alpha))
print ('best min df: {}'.format(best_min_df))

alpha: 0.1
best min df: 0.001


In [133]:
vec = CountVectorizer(min_df=0.001)
X1 = vectorizer.fit_transform(df.sms)
y1 = df.spam

X_train, X_test, y_train, y_test = train_test_split(X1, y1)

clf = MultinomialNB(alpha=0.1)
cv_score = cross_val_score(clf, X_train, y_train, cv=10).mean()

clf.fit(X_train,y_train)
print(clf.score(X_train,y_train))
y_pred = clf.predict(X_test)

print(clf.score(X_test, y_test))

print (cv_score)

print (confusion_matrix(y_test, y_pred))
print (classification_report(y_test, y_pred))

0.973917205072984
0.9712849964106246
0.9715224100310026
[[1189   20]
 [  20  164]]
             precision    recall  f1-score   support

          0       0.98      0.98      0.98      1209
          1       0.89      0.89      0.89       184

avg / total       0.97      0.97      0.97      1393



In [131]:
new_test = vectorizer.transform(["Go until jurong point, crazy.. Available only ... "])
clf.predict(new_test)

array([0])

In [132]:
clf.predict_proba(new_test)

array([[0.85450254, 0.14549746]])