In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [9]:
# Path of the zipped CSV holding the email corpus; pandas infers the zip
# compression from the file extension.
EMAILS_CSV_PATH = '/content/emails.csv.zip'
raw_mail_data = pd.read_csv(EMAILS_CSV_PATH)

In [10]:
# Print a truncated plain-text preview of the loaded frame (leading and
# trailing rows, followed by its overall dimensions).
print(raw_mail_data)

                                                   text  spam
0     Subject: naturally irresistible your corporate...     1
1     Subject: the stock trading gunslinger  fanny i...     1
2     Subject: unbelievable new homes made easy  im ...     1
3     Subject: 4 color printing special  request add...     1
4     Subject: do not have money , get software cds ...     1
...                                                 ...   ...
5723  Subject: re : research and development charges...     0
5724  Subject: re : receipts from visit  jim ,  than...     0
5725  Subject: re : enron case study update  wow ! a...     0
5726  Subject: re : interest  david ,  please , call...     0
5727  Subject: news : aurora 5 . 2 update  aurora ve...     0

[5728 rows x 2 columns]


In [11]:
# Build a copy of the raw frame in which every missing cell is replaced by an
# empty string; cells that already hold a value are kept as they are.
mail_data = raw_mail_data.where(raw_mail_data.notna(), other='')

In [13]:
# Peek at the first rows of the null-filled frame.
mail_data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [14]:
# (rows, columns) of the null-filled frame.
mail_data.shape

(5728, 2)

LABEL ENCODING


In [17]:
# Normalise the 'spam' labels to integers 0 / 1.
#
# The column can arrive either as integers or as the strings '0' / '1',
# depending on how the CSV was parsed. Comparing against the string literals
# only covers the second case and is a silent no-op on an integer column, so
# coerce the whole column to numeric instead and fail loudly on anything that
# is not a valid 0/1 label (including blanks left by the null-filling step).
spam_labels = pd.to_numeric(mail_data['spam'], errors='coerce')
if not spam_labels.isin([0, 1]).all():
    raise ValueError("'spam' column must contain only 0/1 labels")
mail_data['spam'] = spam_labels.astype('int64')

In [18]:
# Split the frame into model input (email text) and target (spam label).
X, Y = mail_data['text'], mail_data['spam']

In [19]:
# Preview the email text column that is used as model input.
print(X)

0       Subject: naturally irresistible your corporate...
1       Subject: the stock trading gunslinger  fanny i...
2       Subject: unbelievable new homes made easy  im ...
3       Subject: 4 color printing special  request add...
4       Subject: do not have money , get software cds ...
                              ...                        
5723    Subject: re : research and development charges...
5724    Subject: re : receipts from visit  jim ,  than...
5725    Subject: re : enron case study update  wow ! a...
5726    Subject: re : interest  david ,  please , call...
5727    Subject: news : aurora 5 . 2 update  aurora ve...
Name: text, Length: 5728, dtype: object


In [20]:
# Preview the label column that is used as the prediction target.
print(Y)

0       1
1       1
2       1
3       1
4       1
       ..
5723    0
5724    0
5725    0
5726    0
5727    0
Name: spam, Length: 5728, dtype: int64


In [21]:
# Hold out 20% of the emails for evaluation; the fixed random_state keeps the
# partition the same from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [22]:
# Sanity-check the split sizes: the full text column first, then the training
# and test partitions.
for split in (X, X_train, X_test):
    print(split.shape)

(5728,)
(4582,)
(1146,)


In [24]:
# Turn the raw email text into TF-IDF feature vectors that the logistic
# regression can consume. English stop words are dropped and text is lowercased.
feature_extraction = TfidfVectorizer(lowercase=True, stop_words='english', min_df=1)

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Make sure both label vectors are integer-typed.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [25]:
# Preview the raw training texts; rows keep their index labels from the full
# dataset.
print(X_train)

5700    Subject: re : exploration data as the root of ...
5105    Subject: gas model  sorry so much time has pas...
5485    Subject: livelink access  - - - - - - - - - - ...
5192    Subject: re : video conference scheduling  hel...
2651    Subject: re : pending approval for ibuyit requ...
                              ...                        
789     Subject: cigarettes wholesale ! hywwzzlzd  $ 1...
968     Subject: i think you might be interested  hell...
1667    Subject: re : summer work . .  jinbaek ,  this...
3321    Subject: re : book for lacima course attendees...
1688    Subject: re : argentina modelling  michael ,  ...
Name: text, Length: 4582, dtype: object


In [26]:
# Print the TF-IDF training matrix; the output lists (row, column) positions
# together with their weights.
print(X_train_features)

  (0, 1631)	0.11948158504424804
  (0, 2175)	0.1139366811829867
  (0, 2270)	0.09287045505205638
  (0, 7353)	0.0655788157361805
  (0, 15304)	0.0663995869621661
  (0, 28666)	0.06431166772110039
  (0, 28707)	0.06016733162188114
  (0, 12928)	0.06467963832229102
  (0, 21474)	0.05636789397124751
  (0, 21089)	0.03528233862778253
  (0, 33086)	0.043228163741262066
  (0, 14600)	0.044761414388067036
  (0, 31687)	0.05286353166660429
  (0, 8779)	0.15915971536413323
  (0, 8929)	0.2046990616246818
  (0, 31574)	0.08608147431672733
  (0, 29719)	0.09538081914466627
  (0, 15873)	0.06034727389686767
  (0, 24009)	0.059727569756054304
  (0, 25478)	0.100523433523679
  (0, 3685)	0.1139366811829867
  (0, 30510)	0.07784647543051056
  (0, 15183)	0.07546212040150725
  (0, 32265)	0.044560090228177425
  (0, 6690)	0.06742389347978599
  :	:
  (4581, 429)	0.07532196662652381
  (4581, 18624)	0.062223221403916826
  (4581, 32353)	0.10192506274469543
  (4581, 4987)	0.11815520710746134
  (4581, 16993)	0.10310140246650003
  

In [27]:
# Logistic regression classifier, constructed with no arguments so that the
# library's default hyperparameters apply.
model = LogisticRegression()

In [28]:
# Fit the classifier on the TF-IDF training features and their integer labels.
model.fit(X_train_features, Y_train)

In [29]:
# Score the fitted model on the data it was trained on.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [31]:
# Report the fraction of training emails that were classified correctly.
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9965080750763858


In [32]:
# Score the fitted model on the held-out test split.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [33]:
# Report the fraction of held-out test emails that were classified correctly.
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9834205933682374


In [39]:
# One sample message to classify. The three adjacent string literals below are
# concatenated by Python into a single string, so the list has one element.
input_mail = ["l players into confident competitors .  go to winningstate . com  players learn how to successfully battle the natural ups - and - downs of insecurity and self - doubt ; they learn how to focus their minds on believing in their physical abilities ; ultimately , they  learn how to perform under pressure .  ? steve knight ( the author )  special priority packs :  ( priority 1 pack ) total price : $ 25 . 90  ( priority 3 pack ) total price : $ 59 . 90  " 'the best confidence book we have ever read ! this is a must  read if you want to be a cut above the average player . '"  high school sports news "]

# Vectorize the message with the already-fitted TF-IDF vectorizer.
input_data_features = feature_extraction.transform(input_mail)

# Classify it and show the raw predicted label.
prediction = model.predict(input_data_features)
print(prediction)

# Label 0 is reported as ham; any other label is reported as spam.
verdict = 'Ham mail' if prediction[0] == 0 else 'Spam mail'
print(verdict)

[1]
Spam mail
