# Import all the required packages:

In [143]:
import os
os.getcwd()
import pandas as pd

In [144]:
from sklearn import svm

In [145]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

## Use pandas module to read a dataframe:

In [146]:
df=pd.read_csv("spam.csv")  # Dataset has 1,002 SMS ham messages and 322 spam messages 
print(df.head()) #to print  few starting lines of the dataframe

  Label                                          EmailText
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


## To get some statitical data about the dataframe: 

In [147]:
print(df.describe())

       Label               EmailText
count   5572                    5572
unique     2                    5169
top      ham  Sorry, I'll call later
freq    4825                      30


In [148]:
df.keys()      #column index

Index(['Label', 'EmailText'], dtype='object')

In [149]:
df.shape #To find the number of rows and columns of the dataframe

(5572, 2)

In [150]:
df.columns  #Get the column names in the data set.

Index(['Label', 'EmailText'], dtype='object')

In [151]:
df.drop_duplicates(inplace = True)   ##Checking for duplicates and removing them
df.shape  #Show the new shape (number of rows & columns)

(5169, 2)

In [152]:
import sys
print(sys.executable)

c:\python38-32\python.exe


In [153]:
df.columns

Index(['Label', 'EmailText'], dtype='object')

## According to general rule of thumb we can split a dataset into 60-training set20-crossvalidation set , 20- test set.
## But here the classification on with 80-Training set , 20-Test set

In [154]:
x=df["EmailText"]
y=df["Label"]
x_tr,y_tr=x[0:4457] ,y[0:4457] #initializing first 4457 rows to traing set
x_test,y_test=x[4457:],y[4457:]  #initializing the remaining rows to test set according to rule


In [155]:
x_tr.shape,y_tr.shape  #dimensions of training set

((4457,), (4457,))

In [156]:
x_test.shape,y_test.shape         #dimensions of test  set

((712,), (712,))

In [157]:
df.isnull().sum()  #Show the number of missing (NAN, NaN, na) data for each column

Label        0
EmailText    0
dtype: int64

## Extracting features

### The machine learning models require statistical data rather these are strings.So as to represent these strings in numbers usinf feature extraction.

In [158]:
C= CountVectorizer()  #to count the number of occurances of a particular word in the email text

In [159]:
features=C.fit_transform(x_tr)   #features are extracted

In [160]:
features.shape

(4457, 8028)

In [161]:
from sklearn.naive_bayes import MultinomialNB
#Multinomial Naive Bayes classifier which is suitable for classification 
#with discrete features (e.g., word counts for text classification)
classifier = MultinomialNB()
#Evaluate the model on the training data set
classifier.fit(features, y_tr)
classifier.predict(features)
print(classifier.predict(features)) #Print the predictions

['ham' 'ham' 'spam' ... 'ham' 'ham' 'ham']


## Building model:

In [138]:
tunedparam={'kernel':['linear','rbf'],'gamma':[1e-3,1e-4],'C':[1,10,100,1000]}#to find better parameters of our model

In [141]:
ml=GridSearchCV(svm.SVC(),tunedparam)
t=ml.fit(features,y_tr)  #fitting data using svm model
print(ml.best_params_) #printing best parametes of our model

{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}


## Test accuracy:

In [162]:
features_test=C.transform(x_test)
print(ml.score(features_test,y_test))

0.9845505617977528


# Linear logistic vs SVM:

In [25]:
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


In [26]:
msg= pd.read_csv("spam.csv",encoding = "latin")
msg.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
message_data = msg.rename(columns = {'Label':'Spam/Not_Spam','EmailText':'message'})

In [28]:
message_data.describe()

Unnamed: 0,Spam/Not_Spam,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [29]:
copy = message_data['message'].copy()

In [30]:
def preprocess(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = [word for word in text.split() if word.isalpha()]
    return " ".join(text)

In [31]:
copy = copy.apply(preprocess)

In [32]:
copy


0       Go until jurong point crazy Available only in ...
1                                 Ok lar Joking wif u oni
2       Free entry in a wkly comp to win FA Cup final ...
3             U dun say so early hor U c already then say
4       Nah I dont think he goes to usf he lives aroun...
                              ...                        
5567    This is the time we have tried contact u U hav...
5568                    Will b going to esplanade fr home
5569    Pity was in mood for that Soany other suggestions
5570    The guy did some bitching but I acted like id ...
5571                            Rofl Its true to its name
Name: message, Length: 5572, dtype: object

In [33]:
vect = TfidfVectorizer()

In [34]:
message = vect.fit_transform(copy)
message


<5572x8170 sparse matrix of type '<class 'numpy.float64'>'
	with 69318 stored elements in Compressed Sparse Row format>

In [121]:
x_train,x_test,y_train,y_test= train_test_split(message,message_data['Spam/Not_Spam'], test_size=0.3, random_state=20)

In [122]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Spam= LogisticRegression()
Spam.fit(x_train, y_train)
pred = Spam.predict(x_test)
accuracy_score(y_test,pred)


0.9563397129186603

In [123]:
from sklearn import metrics
confusion = metrics.confusion_matrix(y_test, pred)
print(confusion) #A confusion matrix is a table that is often used to describe 
                #the performance of a classification model (or "classifier")
                #on a set of test data for which the true values are known. 


[[1437    2]
 [  71  162]]


In [124]:
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]

In [125]:
print((TP + TN) / float(TP + TN + FP + FN))
print(metrics.accuracy_score(y_test, pred))  #Accuracy is the number of correctly predicted data points 
                                            #out of all the data points.

0.9563397129186603
0.9563397129186603


In [128]:
precision = TP / float(TP + FP)

print(precision)   # fraction of relevant instances among the retrieved instances
print(metrics.precision_score(y_test, pred,average="binary",pos_label="spam"))

0.9878048780487805
0.9878048780487805


In [127]:
from sklearn.metrics import recall_score
recall = TP / float(FN + TP)
print(recall)
print(metrics.recall_score(y_test, pred,average="binary",pos_label="spam"))

0.6952789699570815
0.6952789699570815


In [135]:
from sklearn.metrics import f1_score
F = (2*precision*recall)/(precision+recall)
print(F)
print(metrics.f1_score(y_test, pred,average="binary",pos_label="spam"))


0.8161209068010076
0.8161209068010076
