In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv('spam.csv',encoding = "ISO-8859-1")

In [3]:
df = pd.DataFrame(data)

In [4]:
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [5]:
df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis = 1, inplace = True)

In [6]:
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
X_train, X_test, y_train, y_test = train_test_split(df['v2'],df['v1'], test_size=0.2, random_state=0)

In [8]:
vec = CountVectorizer()
X_train_vec = vec.fit_transform(X_train)

In [9]:
clf = MultinomialNB()
clf.fit(X_train_vec, y_train)

MultinomialNB()

In [10]:
X_test_vec = vec.transform(X_test)

In [11]:
y_pred = clf.predict(X_test_vec)
accuracy_score(y_test, y_pred)

0.9874439461883409

In [12]:
acc_lst = []
for i in range(0,5):
    X_train, X_test, y_train, y_test = train_test_split(df['v2'],df['v1'], test_size=0.2)
    vec = CountVectorizer()
    X_train_vec = vec.fit_transform(X_train)
    clf = MultinomialNB()
    clf.fit(X_train_vec, y_train)
    X_test_vec = vec.transform(X_test)
    y_pred = clf.predict(X_test_vec)
    acc_lst.append(accuracy_score(y_test, y_pred))
print(acc_lst)

[0.9883408071748879, 0.9910313901345291, 0.9865470852017937, 0.9829596412556054, 0.9883408071748879]


In [13]:
final_acc = sum(acc_lst)/len(acc_lst)

In [14]:
final_acc

0.9874439461883406

In [16]:
vec = CountVectorizer()

In [17]:
clf = MultinomialNB()

In [23]:
pipe = Pipeline(steps=[('vec', vec), ('classification', clf)])

In [24]:
param_grid = {
    'classification__alpha': [1, 2, 3, 4, 5]
}

In [25]:
search = GridSearchCV(pipe, param_grid, n_jobs=-1)
search.fit(df['v2'],df['v1'])

GridSearchCV(estimator=Pipeline(steps=[('vec', CountVectorizer()),
                                       ('classification', MultinomialNB())]),
             n_jobs=-1, param_grid={'classification__alpha': [1, 2, 3, 4, 5]})

In [26]:
search.best_estimator_

Pipeline(steps=[('vec', CountVectorizer()),
                ('classification', MultinomialNB(alpha=1))])

In [27]:
results = search.cv_results_
result_df = pd.DataFrame(results)
result_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classification__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.38914,0.054631,0.068903,0.008187,1,{'classification__alpha': 1},0.986547,0.986547,0.98474,0.982944,0.985637,0.985283,0.001348,1
1,0.346652,0.016571,0.071419,0.008639,2,{'classification__alpha': 2},0.987444,0.981166,0.978456,0.981149,0.982944,0.982232,0.002975,2
2,0.309671,0.029361,0.082096,0.015503,3,{'classification__alpha': 3},0.986547,0.977578,0.973968,0.977558,0.981149,0.97936,0.004251,3
3,0.371669,0.032636,0.07339,0.010645,4,{'classification__alpha': 4},0.979372,0.973991,0.970377,0.975763,0.978456,0.975592,0.003234,4
4,0.31836,0.025695,0.065176,0.015414,5,{'classification__alpha': 5},0.973991,0.96861,0.968582,0.972172,0.976661,0.972003,0.003127,5
