In [91]:
import pandas as pd
import numbers as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [92]:
# Load the labelled e-mail corpus and discard the 'Unnamed: 0' column
# in a single chained expression.
df = pd.read_csv('Email_Dataset.csv').drop(columns=['Unnamed: 0'])
df

Unnamed: 0,CATEGORY,MESSAGE
0,Spam,\n\nThe Internet's Online Pharmacy\n\n\n\nViag...
1,Spam,------=_NextPart_000_00B0_35C58D0E.D7267B06\n\...
2,Spam,"<html>\n\n\n\n<head>\n\n<meta http-equiv=""Cont..."
3,Spam,------=_NextPart_000_00E4_86E61E0A.B5488E11\n\...
4,Spam,BARRISTER ADEWALE COKER CHAMBERS\n\nLegal Prac...
...,...,...
3995,Not Spam,-----BEGIN PGP SIGNED MESSAGE-----\n\nHash: SH...
3996,Not Spam,"On Thursday 25 July 2002 06:16 am, Kylus wrote..."
3997,Not Spam,Update of /cvsroot/spamassassin/spamassassin/w...
3998,Not Spam,"On Thu, 2002-08-15 at 10:53, Erik Williamson w..."


In [93]:
# Turn the textual CATEGORY labels into integer codes.
encoder = LabelEncoder()
category_codes = encoder.fit_transform(df['CATEGORY'])
# Resulting codes: 1 -> spam, 0 -> not spam.
df['CATEGORY'] = category_codes
df

Unnamed: 0,CATEGORY,MESSAGE
0,1,\n\nThe Internet's Online Pharmacy\n\n\n\nViag...
1,1,------=_NextPart_000_00B0_35C58D0E.D7267B06\n\...
2,1,"<html>\n\n\n\n<head>\n\n<meta http-equiv=""Cont..."
3,1,------=_NextPart_000_00E4_86E61E0A.B5488E11\n\...
4,1,BARRISTER ADEWALE COKER CHAMBERS\n\nLegal Prac...
...,...,...
3995,0,-----BEGIN PGP SIGNED MESSAGE-----\n\nHash: SH...
3996,0,"On Thursday 25 July 2002 06:16 am, Kylus wrote..."
3997,0,Update of /cvsroot/spamassassin/spamassassin/w...
3998,0,"On Thu, 2002-08-15 at 10:53, Erik Williamson w..."


In [94]:
# Features are the raw message text; the target is the encoded category.
x, y = df['MESSAGE'], df['CATEGORY']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=5, test_size=0.2)

In [95]:
# Make sure both label vectors are plain integers before training.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

# TF-IDF vectorizer: lower-cases text and drops English stop words.
convert = TfidfVectorizer(lowercase=True, stop_words='english', min_df=1)

# Fit the vocabulary on the training messages only, then reuse the fitted
# vectorizer to transform the held-out messages.
x_train_features = convert.fit_transform(x_train)
x_test_features = convert.transform(x_test)

In [96]:
# Train a logistic-regression classifier on the TF-IDF features.
# `fit` returns the estimator itself, so the call can be chained.
model = LogisticRegression().fit(x_train_features, y_train)
model

In [97]:
# Predict labels for the held-out split and measure how many are correct.
y_predict = model.predict(x_test_features)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
# The value is the same either way for accuracy, but keeping the documented
# order avoids a real bug if this line is ever adapted to an asymmetric
# metric such as precision or recall.
accuracy = accuracy_score(y_test, y_predict)
accuracy

0.96875

### TEST DATASET


In [99]:
# Score the unlabelled test e-mails with the already-fitted vectorizer
# and classifier, then attach human-readable labels.
df_test = pd.read_csv("Email Test Data.csv")
x_features = convert.transform(df_test['MESSAGE'])
y_predict = model.predict(x_features)

# Integer class code -> label written to the output column.
label_names = {1: 'SPAM', 0: 'NOT SPAM'}
df_test['CATEGORY'] = pd.Series(y_predict, index=df_test.index).replace(label_names)
df_test

Unnamed: 0.1,Unnamed: 0,MESSAGE,CATEGORY
0,0,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",SPAM
1,1,ATTENTION: This is a MUST for ALL Computer Use...,NOT SPAM
2,2,This is a multi-part message in MIME format.\n...,SPAM
3,3,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,SPAM
4,4,This is the bottom line. If you can GIVE AWAY...,SPAM
...,...,...,...
995,995,"<HTML><BODY BGCOLOR=3D""#FFFFFF"">\n\n<table wid...",SPAM
996,996,Long time no chat!\n\n\n\nHow have you been? I...,SPAM
997,997,"\n\n\n\nChina's rapid economic growth, as rank...",NOT SPAM
998,998,"<html>\n\n\n\n<body>\n\n\n\n<font size=""2"" PTS...",SPAM


In [None]:
# Persist the labelled test set; index=False keeps the row index out of the file.
df_test.to_csv(path_or_buf='Modified_Email_Test_Data.csv', index=False)