In [1]:
#Data Handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'whitegrid' style to subsequent plots.
sns.set_style(style='whitegrid')

# For Text processing 
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources needed below: 'punkt' backs word_tokenize and
# 'stopwords' backs stopwords.words('english').
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# ML
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
# from sklearn.model_selection import GridSearchCV


#Accuracy Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the e-mail corpus and discard the 'Unnamed: 0' column from the CSV.
df = pd.read_csv('spam_ham_dataset.csv').drop(columns=['Unnamed: 0'])
# Rename the three remaining columns by position.
df.columns = ['label', 'text', 'class']
df.head()

Unnamed: 0,label,text,class
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [3]:
# Number of rows per value of `class` in the full dataset.
class_counts = df['class'].value_counts()
class_counts

0    3672
1    1499
Name: class, dtype: int64

In [4]:
# Balance the two classes by down-sampling to the size of the smaller one.
# The label/class pairs shown by df.head() establish that class 0 is ham and
# class 1 is spam, so the frames are named accordingly.
RANDOM_STATE = 11  # fixed seed: the sample and the shuffle are repeatable across runs

ham_df = df.loc[df['class'] == 0]
spam_df = df.loc[df['class'] == 1]

# Derive the per-class size from the data instead of hard-coding a row count.
n_per_class = min(len(ham_df), len(spam_df))
ham_df = ham_df.sample(n=n_per_class, random_state=RANDOM_STATE)
spam_df = spam_df.sample(n=n_per_class, random_state=RANDOM_STATE)

# Stack both classes, shuffle the rows, and rebuild a clean 0..n-1 index.
data = (
    pd.concat([ham_df, spam_df])
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)

# NOTE: this binds a second name to the SAME DataFrame (no copy is made), so any
# column later assigned on `data` is also visible through `data_TF`.
data_TF = data

In [5]:
# Preview the first five rows of the balanced, shuffled frame.
data.head(n=5)

Unnamed: 0,label,text,class
0,ham,Subject: inactivations\r\ncheryl johnson\r\n08...,0
1,ham,Subject: re : cornhusker contact information\r...,0
2,spam,Subject: my testimonial about skuper viakgra l...,1
3,spam,Subject: seagate 20 / 40 gb dat tape drive ( i...,1
4,spam,Subject: find out where to buy drugs cheap\r\n...,1


In [6]:
# Stop-word removal
stop_words = set(stopwords.words('english'))


def _drop_stop_words(text):
    """Tokenize `text` and re-join, with single spaces, the tokens not in `stop_words`."""
    kept_tokens = []
    for token in word_tokenize(text):
        if token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)


# Overwrites the 'text' column of `data`. `data_TF` was bound to this same
# DataFrame without copying, so it sees the cleaned text as well.
data['text'] = data['text'].apply(_drop_stop_words)


In [7]:
# Features are the message text; the target is the numeric class column.
X = data['text']
y = data['class']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=11,
)

In [8]:
# Report the shape of each partition, one per line.
train_line = f"Train Data Shape: {X_train.shape}"
test_line = f"Test Data Shape: {X_test.shape}"
print(train_line, test_line, sep="\n")

Train Data Shape: (2398,)
Test Data Shape: (600,)


In [9]:
# Two text encoders: raw term counts and TF-IDF weights.
# TFVect is only instantiated in this cell; it is not fitted here.
CVect, TFVect = CountVectorizer(), TfidfVectorizer()

# Fit the count vectorizer on the training split only, then encode the test
# split with that same fitted vectorizer.
X_Train_Vector_CV = CVect.fit_transform(X_train)
X_Test_Vector_CV = CVect.transform(X_test)



In [10]:
# Densify the encoded training split (via .toarray()) before fitting GaussianNB on it.
X_Train_array_CV = X_Train_Vector_CV.toarray()

gnb = GaussianNB()
gnb.fit(X=X_Train_array_CV, y=y_train)

GaussianNB()

In [11]:
# Dense version of the encoded test split, then the model's score on it.
X_Test_array_CV = X_Test_Vector_CV.toarray()
gnb.score(X=X_Test_array_CV, y=y_test)

0.945

In [18]:
# F1 with average=None, evaluated on the count-encoded test split.
# NOTE(review): `gnb` is refit on TF-IDF features in a later cell, and this cell's
# recorded execution count (18) is after that refit. Run it before the refit so
# the prediction comes from the count-based model.
y_pred_cv = gnb.predict(X_Test_array_CV)
f1_score(y_true=y_test, y_pred=y_pred_cv, average=None)

array([0.95145631, 0.94845361])

In [13]:
# Logistic regression with default settings, fitted on the vectorizer output
# as-is (no .toarray() conversion).
lr = LogisticRegression()
lr.fit(X=X_Train_Vector_CV, y=y_train)

LogisticRegression()

In [14]:
# Accuracy of the logistic-regression model on the held-out split
# (a classifier's .score() is the accuracy of its predictions).
accuracy_score(y_test, lr.predict(X_Test_Vector_CV))

0.9733333333333334

In [15]:
# Sanity check: classify one hand-written promotional e-mail.
# NOTE(review): this uses `gnb` and `CVect`, so it must run while `gnb` still
# holds the model fitted on count features (it is refit on TF-IDF later).
test_mail = ["""Ever wonder what it would be like to play with a REAL LIVE DEALER from the comfort of your home or even while you wait in line at the doctor\'s office?

Now you can play anywhere! Sign up and start playing like a pro today!

PLAY  NOW
If you prefer not to receive further communication please Unsubscribe Here

 
If you do not wish to receive this type of Newsletter, please unsubscribe by clicking this Unsubscribe link

"""]

# Apply the same tokenize / drop-stop-words / re-join step that the training
# text went through, so the model sees input prepared the same way at inference.
test_mail_clean = [
    ' '.join(token for token in word_tokenize(mail) if token not in stop_words)
    for mail in test_mail
]

# Encode with the already-fitted count vectorizer and densify for GaussianNB.
test_mail_vec = CVect.transform(test_mail_clean).toarray()

gnb.predict(test_mail_vec)[0]



1

In [16]:
# Repeat the GaussianNB experiment with TF-IDF features, using the same split
# parameters (test_size=0.20, random_state=11) as the count-based run.
# `data_TF` was bound to the same DataFrame as `data` without copying, so its
# 'text' column is the stop-word-filtered text.
X_tf = data_TF['text']
y_tf = data_TF['class']
X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(
    X_tf, y_tf, test_size=0.20, random_state=11
)

# Fit TF-IDF on the training split only; encode the test split with it.
tfidf_train_encoded = TFVect.fit_transform(X_train_tf)
X_Test_Vector_TF = TFVect.transform(X_test_tf)

# NOTE: this refits the existing `gnb` estimator, so after this cell `gnb` no
# longer represents the model trained on count features.
X_Train_Vector_TF = tfidf_train_encoded.toarray()
gnb.fit(X_Train_Vector_TF, y_train_tf)

GaussianNB()

In [17]:
# Build the dense TF-IDF test matrix straight from the fitted vectorizer.
# Re-deriving it here, instead of calling .toarray() on X_Test_Vector_TF in place,
# keeps this cell re-runnable: the in-place form fails on a second run because the
# name then already holds a NumPy array, which has no .toarray() method.
X_Test_Vector_TF = TFVect.transform(X_test_tf).toarray()
gnb.score(X_Test_Vector_TF, y_test_tf)

0.93