In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [2]:
df = pd.read_csv("C:\Codes\Practice\ML\SMS Spam prediction\SMSSpamCollection", sep='\t', header=None, names=['label', 'msg'])
df.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])
df.head()

Unnamed: 0,label,msg
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(df['msg'])
y = df['label']

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y , random_state=104,  test_size=0.25,  shuffle=True, stratify=y) 

In [6]:
nb_model = MultinomialNB()
nb_model.fit(x_train, y_train)

y_pred_nb = nb_model.predict(x_test)

print('Navie Bayes Accurancy : ', accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

Navie Bayes Accurancy :  0.9583632447954056
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      1206
           1       1.00      0.69      0.82       187

    accuracy                           0.96      1393
   macro avg       0.98      0.84      0.90      1393
weighted avg       0.96      0.96      0.96      1393



In [7]:
lr_model = LogisticRegression()
lr_model.fit(x_train, y_train)

y_pred_lr = lr_model.predict(x_test)

print('Logistic Regression Accurancy : ', accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

Logistic Regression Accurancy :  0.95908111988514
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1206
           1       0.99      0.70      0.82       187

    accuracy                           0.96      1393
   macro avg       0.97      0.85      0.90      1393
weighted avg       0.96      0.96      0.96      1393



In [8]:
nb_cv_score = cross_val_score(nb_model, x, y, cv=5)
lr_cv_score = cross_val_score(lr_model, x, y, cv=5)

print('Naive Bayes CV Accuracy : ', nb_cv_score.mean())
print('Logistic Regression CV Accuracy : ', lr_cv_score.mean())

Naive Bayes CV Accuracy :  0.9589013855455635
Logistic Regression CV Accuracy :  0.9610548180112872


In [9]:
param_grid_nb = {'alpha': [0.1, 1.0, 10.0]}
grid_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=5)
grid_nb.fit(x_train, y_train)

print('Best Naive Bayes alpha : ', grid_nb.best_params_)
best_nb_model = grid_nb.best_estimator_
y_pred_best_nb = best_nb_model.predict(x_test)
print('Tuned Naive Bayes Accuracy : ', accuracy_score(y_test, y_pred_best_nb))

Best Naive Bayes alpha :  {'alpha': 0.1}
Tuned Naive Bayes Accuracy :  0.9849246231155779


In [10]:
param_grid_lr = {'C': [0.1, 1.0, 10.0]}
grid_lr = GridSearchCV(LogisticRegression(), param_grid_lr, cv=5)
grid_lr.fit(x_train, y_train)

print('Best Logistic Regression C : ', grid_lr.best_params_)
best_lr_model = grid_lr.best_estimator_
y_pred_best_lr = best_lr_model.predict(x_test)
print('Tuned Naive Bayes Accuracy : ', accuracy_score(y_test, y_pred_best_lr))

Best Logistic Regression C :  {'C': 10.0}
Tuned Naive Bayes Accuracy :  0.9741564967695621
