### Spam Classifier
In this notebook I uses a different classifiers to classify messages as spam or not spam

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.cluster import KMeans

In [2]:
data = pd.read_csv("spam.csv",encoding='latin-1')

In [3]:
data.head()

Unnamed: 0,class,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
x = data['SMS']

In [5]:
y = data['class']

In [6]:
X = pd.DataFrame(x)

In [7]:
y = pd.DataFrame(y)

In [8]:
vectorize = TfidfVectorizer()

In [9]:
response = vectorize.fit_transform(x.copy())

In [10]:
print(response)

  (0, 3550)	0.1481298737377147
  (0, 8030)	0.22998520738984352
  (0, 4350)	0.3264252905795869
  (0, 5920)	0.2553151503985779
  (0, 2327)	0.25279391746019725
  (0, 1303)	0.24415547176756056
  (0, 5537)	0.15618023117358304
  (0, 4087)	0.10720385321563428
  (0, 1751)	0.2757654045621182
  (0, 3634)	0.1803175103691124
  (0, 8489)	0.22080132794235655
  (0, 4476)	0.2757654045621182
  (0, 1749)	0.3116082237740733
  (0, 2048)	0.2757654045621182
  (0, 7645)	0.15566431601878158
  (0, 3594)	0.15318864840197105
  (0, 1069)	0.3264252905795869
  (0, 8267)	0.18238655630689804
  (1, 5504)	0.27211951321382544
  (1, 4512)	0.4082988561907181
  (1, 4318)	0.5236458071582338
  (1, 8392)	0.4316010362639011
  (1, 5533)	0.5465881710238072
  (2, 4087)	0.07917128722158312
  (2, 3358)	0.11301399735581102
  :	:
  (5570, 4218)	0.12246610191126918
  (5570, 8313)	0.18723687600522523
  (5570, 1084)	0.11225268140936363
  (5570, 4615)	0.1596552981734164
  (5570, 7039)	0.18426763178390446
  (5570, 3308)	0.1217217261863451

In [11]:
s = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=7)

In [12]:
s.get_n_splits(response,y)
y = y.values.reshape(5572)

In [13]:
for train_index, test_index in s.split(response, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = response[train_index], response[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [ 127 3513 1862 ... 3899 5436 1588] TEST: [  48 3800 2122 ... 1768 2445 4229]
TRAIN: [4408 4690 2151 ...  888 3872 3489] TEST: [ 420 3985 4404 ... 2771   39 3145]
TRAIN: [3558 4085 1547 ... 3728 3605 1650] TEST: [ 592  804 3755 ... 2947 4215 1584]


### SVC

In [14]:
model = SVC(gamma=1)

In [15]:
model.fit(x_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [16]:
pred = model.predict(x_test)

In [17]:
A=accuracy_score(pred,y_test)*100

In [18]:
print("Accuracy Score:")
print(A)
print("Classification Report:\n")
print(classification_report(y_test,pred))
print("Confusion Matrix:\n")
print(confusion_matrix(y_test,pred))

Accuracy Score:
98.14593301435407
Classification Report:

             precision    recall  f1-score   support

        ham       0.98      1.00      0.99      1448
       spam       0.99      0.87      0.93       224

avg / total       0.98      0.98      0.98      1672

Confusion Matrix:

[[1446    2]
 [  29  195]]


### GussianNB

In [19]:
model2 = GaussianNB()

In [20]:
model2.fit(x_train.todense(),y_train)

GaussianNB(priors=None)

In [21]:
pred2 = model2.predict(x_test.todense())

In [22]:
B = accuracy_score(pred2,y_test)*100

In [23]:
print("Accuracy Score:")
print(B)
print("Classification Report:\n")
print(classification_report(y_test,pred2))
print("Confusion Matrix:\n")
print(confusion_matrix(y_test,pred2))

Accuracy Score:
89.29425837320574
Classification Report:

             precision    recall  f1-score   support

        ham       0.98      0.89      0.94      1448
       spam       0.56      0.90      0.69       224

avg / total       0.93      0.89      0.90      1672

Confusion Matrix:

[[1292  156]
 [  23  201]]


### MultinomialNB

In [24]:
model3 = MultinomialNB(alpha=0.1)

In [25]:
model3.fit(x_train.todense(),y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [26]:
pred3 = model3.predict(x_test.todense())

In [27]:
C = accuracy_score(pred3,y_test)*100

In [28]:
print("Accuracy Score:")
print(C)
print("Classification Report:\n")
print(classification_report(y_test,pred3))
print("Confusion Matrix:\n")
print(confusion_matrix(y_test,pred3))

Accuracy Score:
98.6244019138756
Classification Report:

             precision    recall  f1-score   support

        ham       0.99      0.99      0.99      1448
       spam       0.95      0.94      0.95       224

avg / total       0.99      0.99      0.99      1672

Confusion Matrix:

[[1438   10]
 [  13  211]]


### KNN

In [29]:
model4 = KNeighborsClassifier(algorithm='brute',n_neighbors=35)

In [30]:
model4.fit(x_train.todense(),y_train)

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=35, p=2,
           weights='uniform')

In [31]:
pred4 = model4.predict(x_test)

In [32]:
D = accuracy_score(pred4,y_test)*100

In [33]:
print("Accuracy Score:")
print(D)
print("Classification Report:\n")
print(classification_report(y_test,pred4))
print("Confusion Matrix:\n")
print(confusion_matrix(y_test,pred4))

Accuracy Score:
95.75358851674642
Classification Report:

             precision    recall  f1-score   support

        ham       0.95      1.00      0.98      1448
       spam       0.99      0.69      0.81       224

avg / total       0.96      0.96      0.95      1672

Confusion Matrix:

[[1446    2]
 [  69  155]]


### Decision Tree

In [34]:
model5 = DecisionTreeClassifier()

In [35]:
model5.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [36]:
pred5 = model5.predict(x_test)

In [37]:
E = accuracy_score(y_test,pred5)*100

In [38]:
print("Accuracy Score:")
print(E)
print("Classification Report:\n")
print(classification_report(y_test,pred5))
print("Confusion Matrix:\n")
print(confusion_matrix(y_test,pred5))

Accuracy Score:
97.30861244019138
Classification Report:

             precision    recall  f1-score   support

        ham       0.99      0.98      0.98      1448
       spam       0.89      0.91      0.90       224

avg / total       0.97      0.97      0.97      1672

Confusion Matrix:

[[1424   24]
 [  21  203]]


### Random forest

In [39]:
model6 =RandomForestClassifier()

In [40]:
model6.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [41]:
pred6 = model6.predict(x_test)

In [42]:
F = accuracy_score(y_test,pred6)*100

In [43]:

print("Accuracy Score:")
print(F)
print("Classification Report:\n")
print(classification_report(y_test,pred6))
print("Confusion Matrix:\n")
print(confusion_matrix(y_test,pred6))


Accuracy Score:
97.188995215311
Classification Report:

             precision    recall  f1-score   support

        ham       0.97      1.00      0.98      1448
       spam       0.99      0.79      0.88       224

avg / total       0.97      0.97      0.97      1672

Confusion Matrix:

[[1447    1]
 [  46  178]]


### Logistic Regression

In [44]:
model7 = LogisticRegression()

In [45]:
model7.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [46]:
pred7 = model7.predict(x_test)

In [47]:
G = accuracy_score(y_test,pred6)*100

In [48]:
print("Accuracy Score:")
print(G)
print("Classification Report:\n")
print(classification_report(y_test,pred6))
print("Confusion Matrix:\n")
print(confusion_matrix(y_test,pred6))

Accuracy Score:
97.188995215311
Classification Report:

             precision    recall  f1-score   support

        ham       0.97      1.00      0.98      1448
       spam       0.99      0.79      0.88       224

avg / total       0.97      0.97      0.97      1672

Confusion Matrix:

[[1447    1]
 [  46  178]]


### Voting Classifier

In [49]:
estimater = []
LR = LogisticRegression()
estimater.append(("Logistic",LR))
KM = KMeans()
estimater.append(("KMeans",KM))
SV = SVC(kernel="rbf")
estimater.append(("SVC",SV))
GN = GaussianNB()
estimater.append(("Gaussian",GN))
KN =  KNeighborsClassifier()
estimater.append(("KNN",KN))

In [50]:
Voting = VotingClassifier(estimater)

In [58]:
Voting.fit(x_train.todense(),y_train)

VotingClassifier(estimators=[('Logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('KMeans...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [52]:
pre_Voting = Voting.predict(x_test.todense())

  if diff:


In [53]:
H = accuracy_score(y_test,pre_Voting)*100

In [54]:
print("Accuracy Score:")
print(H)
print("Classification Report:\n")
print(classification_report(y_test,pre_Voting))
print("Confusion Matrix:\n")
print(confusion_matrix(y_test,pre_Voting))

Accuracy Score:
90.84928229665071
Classification Report:

             precision    recall  f1-score   support

        ham       0.90      1.00      0.95      1448
       spam       1.00      0.32      0.48       224

avg / total       0.92      0.91      0.89      1672

Confusion Matrix:

[[1448    0]
 [ 153   71]]


### Comparision

In [55]:
Models= ['SVC','GussianNB','MultinomialNB','KNN','Decision Tree','Random forest','Logistic Regression','Voting Classifier']
res =[A,B,C,D,E,F,G,H]

In [56]:
results = pd.DataFrame({"Model":Models,"Accuracy":res},index=[1,2,3,4,5,6,7,8])

In [57]:
results

Unnamed: 0,Accuracy,Model
1,98.145933,SVC
2,89.294258,GussianNB
3,98.624402,MultinomialNB
4,95.753589,KNN
5,97.308612,Decision Tree
6,97.188995,Random forest
7,97.188995,Logistic Regression
8,90.849282,Voting Classifier


 From the above table we conclude that  MultinomialNB gives the better accuracy