In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import csv
from textblob import TextBlob
import pandas as pd
import sklearn

import numpy as np
from sklearn import feature_extraction
from sklearn import naive_bayes , model_selection , metrics , svm

from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings("ignore") 

from sklearn.linear_model import LogisticRegression


In [2]:
# Load and label the data: the file is tab-separated and the two columns are
# named explicitly ('names' = ham/spam label, 'sms' = message text).
data = pd.read_csv(
    "/home/pratiksha/data/SMSSpamCollection",
    delimiter='\t',
    names=['names', 'sms'],
)
data.head()

Unnamed: 0,names,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Convert the messages to count vectors: learn the vocabulary dictionary
# (English stop words removed) and build the term-document matrix in one call.
# NOTE(review): the vocabulary is learned from every message, including the ones
# that later land in the test split — confirm this is intended.
f = feature_extraction.text.CountVectorizer(stop_words='english')
X = f.fit_transform(data['sms'])
X.shape  # size of the term-document matrix: (messages, vocabulary terms)

(5572, 8444)

In [4]:
# Encode the string labels as integers (spam -> 1, ham -> 0).
# The dtype guard keeps this cell safe to re-run: mapping an already-encoded
# column a second time would turn every label into NaN, because 0 and 1 are
# not keys of the mapping.
if not pd.api.types.is_numeric_dtype(data['names']):
    data['names'] = data['names'].map({'spam': 1, 'ham': 0})

# Split the dataset into a train set (80%) and a test set (20%); the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, data['names'], test_size=.2, random_state=42
)
print([np.shape(X_train), np.shape(X_test)])

[(4457, 8444), (1115, 8444)]


NAIVE BAYES CLASSIFIER: It is a classification technique based on Bayes’ Theorem with an assumption of independence among predictors. Here we use word frequencies to decide whether an SMS is spam or ham.

In [5]:
# Sweep the smoothing parameter alpha of Multinomial Naive Bayes and record,
# for each value, train/test accuracy plus test recall and test precision.
list_alpha = np.arange(1/100000, 20, 0.11)
score_train = np.zeros(len(list_alpha))
score_test = np.zeros(len(list_alpha))
recall_test = np.zeros(len(list_alpha))
precision_test = np.zeros(len(list_alpha))

for idx, alpha in enumerate(list_alpha):
    bayes = naive_bayes.MultinomialNB(alpha=alpha)
    bayes.fit(X_train, y_train)

    # Predict the test set once and reuse it for every test metric
    # (a classifier's .score() is the accuracy of its own predictions).
    test_pred = bayes.predict(X_test)

    score_train[idx] = bayes.score(X_train, y_train)
    score_test[idx] = metrics.accuracy_score(y_test, test_pred)
    recall_test[idx] = metrics.recall_score(y_test, test_pred)
    precision_test[idx] = metrics.precision_score(y_test, test_pred)

In [6]:
# Gather the per-alpha metrics into one table, one row per alpha value.
# np.c_ stacks the 1-D arrays as columns of a regular 2-D ndarray, which
# DataFrame accepts directly (no np.matrix wrapper needed).
matrix = np.c_[list_alpha, score_train, score_test, recall_test, precision_test]
models = pd.DataFrame(
    data=matrix,
    columns=['alpha', 'Train accuracy', 'Test Accuracy', 'Test Recall', 'Test Precision'],
)
models.head(10)



Unnamed: 0,alpha,Train accuracy,Test Accuracy,Test Recall,Test Precision
0,1e-05,0.997308,0.979372,0.919463,0.925676
1,0.11001,0.996635,0.984753,0.966443,0.923077
2,0.22001,0.99641,0.981166,0.966443,0.9
3,0.33001,0.995961,0.979372,0.966443,0.888889
4,0.44001,0.995513,0.980269,0.966443,0.89441
5,0.55001,0.99484,0.980269,0.959732,0.899371
6,0.66001,0.99484,0.981166,0.959732,0.905063
7,0.77001,0.99484,0.982063,0.959732,0.910828
8,0.88001,0.994615,0.982063,0.959732,0.910828
9,0.99001,0.993942,0.98296,0.959732,0.916667


In [7]:
# Index of the row with the highest test precision, then display that row.
# `models` uses the default 0..n-1 index, so the label returned by idxmax()
# is also the row position expected by iloc.
best_index = models['Test Precision'].idxmax()
models.iloc[best_index]



alpha             5.720010
Train accuracy    0.986762
Test Accuracy     0.990135
Test Recall       0.926174
Test Precision    1.000000
Name: 52, dtype: float64

For alpha = 5.72001, the test precision is 1.

In [8]:
# Show the first five alpha settings that reach a test precision of exactly 1.
has_perfect_precision = models['Test Precision'] == 1
models[has_perfect_precision].head()

Unnamed: 0,alpha,Train accuracy,Test Accuracy,Test Recall,Test Precision
52,5.72001,0.986762,0.990135,0.926174,1.0
53,5.83001,0.985865,0.990135,0.926174,1.0
54,5.94001,0.985865,0.990135,0.926174,1.0
55,6.05001,0.985865,0.990135,0.926174,1.0
56,6.16001,0.985641,0.990135,0.926174,1.0


In [9]:
# Among the settings with test precision 1, pick the one with the highest
# test accuracy, refit Naive Bayes with that alpha, and show its metrics row.
perfect_precision_accuracy = models.loc[models['Test Precision'] == 1, 'Test Accuracy']
best_index = perfect_precision_accuracy.idxmax()

best_alpha = list_alpha[best_index]
bayes = naive_bayes.MultinomialNB(alpha=best_alpha)
bayes.fit(X_train, y_train)

models.iloc[best_index]



alpha             5.720010
Train accuracy    0.986762
Test Accuracy     0.990135
Test Recall       0.926174
Test Precision    1.000000
Name: 52, dtype: float64

In [10]:
# Confusion matrix of the selected Naive Bayes model on the test set,
# labelled for display (labels were encoded as ham = 0, spam = 1).
m_confusion_test = metrics.confusion_matrix(y_test, bayes.predict(X_test))
pd.DataFrame(
    data=m_confusion_test,
    index=['Actual ham', 'Actual spam'],
    columns=['Predicted ham', 'Predicted spam'],
)



Unnamed: 0,Predicted ham,Predicted spam
Actual ham,966,0
Actual spam,11,138


SUPPORT VECTOR CLASSIFIER: Each SMS is represented as a feature vector, and the hyperplane that best separates the vectors into ham and spam is fitted for several values of the penalty parameter C of the error term. From these, a value of C for which the test precision is 1 is chosen.

In [11]:
# Sweep the penalty parameter C of the support vector classifier and record,
# for each value, train/test accuracy plus test recall and test precision.
list_C = np.arange(500, 2000, 100)
score_train = np.zeros(len(list_C))
score_test = np.zeros(len(list_C))
recall_test = np.zeros(len(list_C))
precision_test = np.zeros(len(list_C))

for idx, C in enumerate(list_C):
    svc = svm.SVC(C=C)
    svc.fit(X_train, y_train)

    # Predict the test set once and reuse it for every test metric
    # (a classifier's .score() is the accuracy of its own predictions).
    test_pred = svc.predict(X_test)

    score_train[idx] = svc.score(X_train, y_train)
    score_test[idx] = metrics.accuracy_score(y_test, test_pred)
    recall_test[idx] = metrics.recall_score(y_test, test_pred)
    precision_test[idx] = metrics.precision_score(y_test, test_pred)



In [12]:
# Gather the per-C metrics into one table, one row per value of C.
# The first column holds the SVC penalty parameter, so it is labelled 'C'
# (it was previously labelled 'alpha', a leftover from the Naive Bayes table).
# np.c_ already yields a regular 2-D float ndarray; no np.matrix wrapper needed.
matrix = np.c_[list_C, score_train, score_test, recall_test, precision_test]
models = pd.DataFrame(
    data=matrix,
    columns=['C', 'Train accuracy', 'Test Accuracy', 'Test Recall', 'Test Precision'],
)
models.head(10)



Unnamed: 0,alpha,Train accuracy,Test Accuracy,Test Recall,Test Precision
0,500.0,0.994391,0.986547,0.899329,1.0
1,600.0,0.99484,0.987444,0.90604,1.0
2,700.0,0.99641,0.987444,0.90604,1.0
3,800.0,0.997532,0.987444,0.90604,1.0
4,900.0,0.997756,0.987444,0.90604,1.0
5,1000.0,0.998429,0.987444,0.90604,1.0
6,1100.0,0.998654,0.987444,0.90604,1.0
7,1200.0,0.998654,0.986547,0.899329,1.0
8,1300.0,0.998878,0.986547,0.899329,1.0
9,1400.0,0.999551,0.986547,0.899329,1.0


In [13]:
# Index of the SVC setting with the highest test precision, then display it.
# `models` uses the default 0..n-1 index, so the label returned by idxmax()
# is also the row position expected by iloc.
best_index = models['Test Precision'].idxmax()
models.iloc[best_index]

alpha             500.000000
Train accuracy      0.994391
Test Accuracy       0.986547
Test Recall         0.899329
Test Precision      1.000000
Name: 0, dtype: float64

In [14]:
# Show the first five SVC settings that reach a test precision of exactly 1.
has_perfect_precision = models['Test Precision'] == 1
models[has_perfect_precision].head()



Unnamed: 0,alpha,Train accuracy,Test Accuracy,Test Recall,Test Precision
0,500.0,0.994391,0.986547,0.899329,1.0
1,600.0,0.99484,0.987444,0.90604,1.0
2,700.0,0.99641,0.987444,0.90604,1.0
3,800.0,0.997532,0.987444,0.90604,1.0
4,900.0,0.997756,0.987444,0.90604,1.0


In [15]:
# Among the settings with test precision 1, pick the one with the highest
# test accuracy, refit the SVC with that C, and show its metrics row.
perfect_precision_accuracy = models.loc[models['Test Precision'] == 1, 'Test Accuracy']
best_index = perfect_precision_accuracy.idxmax()

best_C = list_C[best_index]
svc = svm.SVC(C=best_C)
svc.fit(X_train, y_train)

models.iloc[best_index]



alpha             600.000000
Train accuracy      0.994840
Test Accuracy       0.987444
Test Recall         0.906040
Test Precision      1.000000
Name: 1, dtype: float64

In [16]:
# Confusion matrix of the selected SVC model on the test set,
# labelled for display (labels were encoded as ham = 0, spam = 1).
m_confusion_test = metrics.confusion_matrix(y_test, svc.predict(X_test))
pd.DataFrame(
    data=m_confusion_test,
    index=['Actual ham', 'Actual spam'],
    columns=['Predicted ham', 'Predicted spam'],
)


Unnamed: 0,Predicted ham,Predicted spam
Actual ham,966,0
Actual spam,14,135


LOGISTIC REGRESSION: It measures the relationship between variables by estimating probabilities using a logistic function. The probabilities are transformed into binary values to make a prediction, and the logistic function used in this case is the sigmoid function.

In [18]:
# Fit an L1-penalised logistic regression (liblinear solver) and summarise
# its train/test accuracy, test recall and test precision in a one-row table.
Spam_model = LogisticRegression(solver='liblinear', penalty='l1')
Spam_model.fit(X_train, y_train)

# Predict the test set once and reuse it for both recall and precision.
test_pred = Spam_model.predict(X_test)

score_train = Spam_model.score(X_train, y_train)
score_test = Spam_model.score(X_test, y_test)
recall_test = metrics.recall_score(y_test, test_pred)
precision_test = metrics.precision_score(y_test, test_pred)

# A plain 1x4 ndarray replaces the former np.matrix(np.c_[...]) wrapper.
matrix = np.array([[score_train, score_test, recall_test, precision_test]])
models = pd.DataFrame(
    data=matrix,
    columns=['train accuracy', 'test accuracy', 'test recall', 'test precision'],
)
models.head()

Unnamed: 0,train accuracy,test accuracy,test recall,test precision
0,0.990128,0.984753,0.892617,0.992537
