# GMAIL SPAM DETECTION
### -SHREYA SHUKLA


In [102]:
# import libraries
import numpy as np
import pandas as pd
import nltk 
from nltk.corpus import stopwords


In [103]:
#load the data
df=pd.read_csv(r"C:\Users\Hp\Downloads\SPAM-210331-134237.csv")

In [104]:
#print the first 5 rows of the data
df.head(5)

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [105]:
#get number of rows and columns in the data
df.shape

(116, 2)

In [106]:
df['spam']=df['type'].map({'spam':1,'ham':0}).astype(int)

In [107]:
df.head(5)

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [108]:
t=len(df['type'])
print("No of rows in review column:",t)
t=len(df['text'])
print("No of rows in Liked column:",t)

No of rows in review column: 116
No of rows in Liked column: 116


In [109]:
#check for duplicates and remove them
df.drop_duplicates(inplace=True)

In [110]:
#show the new shape
df.shape

(115, 3)

In [111]:
#Check for missing data(Data Cleaning)
df.isnull().sum()

type    0
text    0
spam    0
dtype: int64

In [112]:
#TOKENIZATION
df['text'][1]


'Ok lar... Joking wif u oni...'

In [113]:
def tokenizer(text):
    return text.split()

In [114]:
df['text']=df['text'].apply(tokenizer)

In [115]:
df['text'][1]

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

In [116]:
#STEMMING
from nltk.stem.snowball import SnowballStemmer
porter=SnowballStemmer('english',ignore_stopwords=False)

In [117]:
def stem_it(text):
    return[porter.stem(word) for word in text]

In [118]:
df['text']=df['text'].apply(stem_it)

In [119]:
df['text'][1]

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

In [120]:
#LEMMATIZATION
df['text'][115]

['wa,', 'ur', 'openin', 'sentenc', 'veri', 'for']

In [121]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [122]:
def lemmit_it(text):
    return[lemmatizer.lemmatize(word,pos="a") for word in text]

In [123]:
df['text']=df['text'].apply(lemmit_it)

In [124]:
df['text'][115]

['wa,', 'ur', 'openin', 'sentenc', 'veri', 'for']

In [125]:
from nltk.corpus import stopwords
stop_words=stopwords.words("english")

In [128]:
def process_text(text):
    review=[word for word in text if not word in stop_words]
    return review
    

In [130]:
df['text']=df['text'].apply(process_text)

In [133]:
df['text'][115]

'w,urpennenencverfr'

In [134]:
df['text']=df['text'].apply(''.join)

In [135]:
df.head()

Unnamed: 0,type,text,spam
0,ham,"gunljurngpn,crz..vlnlnbugngrewrllebuffe...cneh...",0
1,ham,klr...jkewfun...,0
2,spam,freeenrn2wklcpwnfcupfnlk212005.exf87121receven...,1
3,ham,uunerlhr...uclrehen...,0
4,ham,"nhn'hnkhegeuf,helverunherehugh",0


In [137]:
#VECTORIZATIOM
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
y=df.spam.values
x=tfidf.fit_transform(df['text'])

In [138]:
from sklearn.feature_extraction.text import CountVectorizer
message=CountVectorizer(analyzer=process_text).fit_transform(df['text'])

In [139]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=1,shuffle=False)

In [140]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(message,df['spam'],test_size=0.2,random_state=1)

In [141]:
message.shape

(115, 52)

### CLASSIFICATION - NAIVE BAYES


In [143]:
from sklearn.naive_bayes import MultinomialNB
classifier=MultinomialNB().fit(X_train, y_train)

In [144]:
print(classifier.predict(X_train))

[0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0]


In [145]:
print(y_train.values)

[1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0]


In [146]:
#evaluate the model on training dataset
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
pred=classifier.predict(X_train)


In [147]:
print(classification_report(y_train,pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        81
           1       1.00      0.73      0.84        11

    accuracy                           0.97        92
   macro avg       0.98      0.86      0.91        92
weighted avg       0.97      0.97      0.97        92



In [148]:
print(confusion_matrix(y_train,pred))

[[81  0]
 [ 3  8]]


In [167]:
print("Accuracy using Naive Bayes algorithm is",accuracy_score(y_train,pred)*100)

Accuracy using Naive Bayes algorithm is 96.73913043478261


### CLASSIFICATION -LOGISTIC REGRESSION


In [162]:
from sklearn.linear_model import LogisticRegression as LR
classifier1=LR().fit(X_train,y_train)
y_pred=classifier1.predict(X_test)
print(y_pred)

[0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 1 0 0 1]


In [165]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_pred,y_test)*100
print("Accuracy using logistic regression is=", accuracy)

Accuracy using logistic regression is= 95.65217391304348


### CLASSIFICATION -SVC


In [189]:
from sklearn.svm import SVC 

classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)



SVC(random_state=0)

In [190]:
print("Accuracy using SVC is", classifier.score(X_test,y_test)*100)

Accuracy using SVC is 100.0


### CLASSIFICATION- KNN

In [191]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train,y_train)
# Predicting results using Test data set
pred = knn.predict(X_test)
from sklearn.metrics import accuracy_score
acc=accuracy_score(pred,y_test)*100
print("Accuracy using KNN model=",acc)

Accuracy using KNN model= 91.30434782608695


### The following analysis can be made from the above code for Gmail Spam Detection:
#### Support Vector Machine is the best suited model for Gmail Spam Detection as it gives 100% accuracy
#### The accuracy for other algorithms is as follows: Naive Bayes Algorithm - 96.73%, Logistic Regression - 95.65% , K-Nearest Neighbours - 91.31%