Importing the dependencies

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import  LogisticRegression

Data Collection and Pre-Processing

In [8]:
# Load the combined e-mail dataset (columns: label, text) into a DataFrame.
data_path = "/content/email_combined_data.csv"
mail_df = pd.read_csv(data_path)

In [9]:
# Preview the first five rows to sanity-check the load.
mail_df.head(n=5)

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [10]:
# Plain-text dump of the frame; pandas truncates it to the first and last rows.
print(mail_df)

       label                                               text
0          1  ounce feather bowl hummingbird opec moment ala...
1          1  wulvob get your medircations online qnb ikud v...
2          0   computer connection from cnn com wednesday es...
3          1  university degree obtain a prosperous future m...
4          0  thanks for all your answers guys i know i shou...
...      ...                                                ...
27265      1  support little writeoff response unionfiled si...
27266      1  although replescapenumberca wescapenumbertches...
27267      1  get it before the rush special situation alert...
27268      1  brake when returning bucket home with harmony ...
27269      1  thirty and as big and black as a villain in a ...

[27270 rows x 2 columns]


In [11]:
# Count missing values per column before using the text as model input.
mail_df.isna().sum()

Unnamed: 0,0
label,0
text,0


In [12]:
# Check the number of rows and columns in the DataFrame, as a (rows, columns) tuple.
mail_df.shape

(27270, 2)

Label Encoding

1 --> Spam Mail

0 --> Ham Mail

In [13]:
# Class-balance check: count how many messages carry each label.
mail_df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,14391
0,12879


In [14]:
# Separate the message text (model input) from the label (prediction target).
X, Y = mail_df["text"], mail_df["label"]

In [15]:
# Show the message texts that will be used as model input.
print(X)

0        ounce feather bowl hummingbird opec moment ala...
1        wulvob get your medircations online qnb ikud v...
2         computer connection from cnn com wednesday es...
3        university degree obtain a prosperous future m...
4        thanks for all your answers guys i know i shou...
                               ...                        
27265    support little writeoff response unionfiled si...
27266    although replescapenumberca wescapenumbertches...
27267    get it before the rush special situation alert...
27268    brake when returning bucket home with harmony ...
27269    thirty and as big and black as a villain in a ...
Name: text, Length: 27270, dtype: object


In [16]:
# Show the labels (1 = spam, 0 = ham) that will be used as the target.
print(Y)

0        1
1        1
2        0
3        1
4        0
        ..
27265    1
27266    1
27267    1
27268    1
27269    1
Name: label, Length: 27270, dtype: int64


In [17]:
# Split the data into training and test sets: 20% is held out for testing,
# and the fixed random_state keeps the split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [18]:
# Compare the sizes of the full, training, and test sets to confirm the split.
print(X.shape,X_train.shape,X_test.shape)

(27270,) (21816,) (5454,)


In [19]:
# Inspect the training texts; the index shows the rows were shuffled by the split.
print(X_train)

23532    ultimately the original thing  with no money ...
20026    we agree\n" eileen ponton " on 05 / 02 / 2001 ...
15533    keep your immune system strong\nhttp : / / cro...
16577     computer connection from cnn com wednesday es...
4119     ben finney writes stephen gran writes escapenu...
                               ...                        
15288    hot alert special situation alert tmxo trimax ...
26243    well i'm not sure i may be explaining this bad...
11513    from : the managing director\neuropean prize a...
1688     dear valued member with this special pharmaceu...
5994     hi all i want to get all the services installe...
Name: text, Length: 21816, dtype: object


Feature Extraction

In [20]:
# Turn the raw text into TF-IDF feature vectors that the Logistic Regression
# model can consume. English stop words are dropped and text is lower-cased.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Learn the vocabulary and IDF weights from the training texts only, then reuse
# that fitted vectorizer on the test texts so no test information leaks in.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Make sure both label series are integer typed.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [21]:
# Raw training texts again (same output as the earlier print of X_train);
# the vectorizer does not modify X_train itself.
print(X_train)

23532    ultimately the original thing  with no money ...
20026    we agree\n" eileen ponton " on 05 / 02 / 2001 ...
15533    keep your immune system strong\nhttp : / / cro...
16577     computer connection from cnn com wednesday es...
4119     ben finney writes stephen gran writes escapenu...
                               ...                        
15288    hot alert special situation alert tmxo trimax ...
26243    well i'm not sure i may be explaining this bad...
11513    from : the managing director\neuropean prize a...
1688     dear valued member with this special pharmaceu...
5994     hi all i want to get all the services installe...
Name: text, Length: 21816, dtype: object


In [22]:
# Sparse TF-IDF matrix: each printed entry is (row, vocabulary index) followed by its weight.
print(X_train_features)

  (0, 127236)	0.136180428188365
  (0, 90362)	0.15500839292297516
  (0, 122779)	0.17098594179835733
  (0, 82056)	0.07405943497800513
  (0, 125430)	0.15662505955377493
  (0, 60777)	0.10058617220788325
  (0, 123378)	0.05532971007264406
  (0, 49209)	0.17975256799718095
  (0, 90371)	0.17102546785830128
  (0, 124499)	0.12828126386453179
  (0, 128100)	0.11071792482267136
  (0, 118257)	0.20917649192299942
  (0, 14738)	0.07283735879428094
  (0, 93055)	0.08966916973333001
  (0, 14115)	0.10203233612826752
  (0, 69031)	0.058653136196405053
  (0, 93736)	0.07138879939607615
  (0, 109397)	0.08149500180331717
  (0, 98766)	0.1681884643235368
  (0, 74150)	0.05556828131684126
  (0, 119812)	0.1594242811166173
  (0, 8481)	0.1331545876505329
  (0, 20094)	0.14807366387870258
  (0, 42490)	0.07589108550558671
  (0, 120600)	0.1079217379693566
  :	:
  (21814, 65816)	0.15180045291530747
  (21814, 21416)	0.18007537394919945
  (21814, 90180)	0.15221744730302533
  (21814, 111423)	0.18047963518140658
  (21814, 51992)

Logistic Regression

In [23]:
# Create the classifier with scikit-learn's default hyperparameters.
model=LogisticRegression()

In [24]:
# Train the logistic regression model on the TF-IDF features of the training texts.

model.fit(X_train_features,Y_train)

Evaluating the trained model

In [25]:
# Accuracy on the data the model was fitted on; compare it with the test
# accuracy to judge how well the model generalises.
prediction_training_data = model.predict(X_train_features)
accuracy__training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_training_data,
)

In [26]:
# Report the training-set accuracy as a fraction between 0 and 1.
print(accuracy__training_data)

0.9883571690502384


In [27]:
# Accuracy on the held-out test set, which the model has not seen during training.
prediction_test_data = model.predict(X_test_features)
accuracy_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_test_data,
)

In [28]:
# Report the test-set accuracy as a fraction between 0 and 1.
print(accuracy_test_data)

0.9794646131279795


In [29]:
# Precision, recall and F1 on the test set. With the default settings these
# score the positive class, which is label 1 (spam) here.
precision_test_data = precision_score(y_true=Y_test, y_pred=prediction_test_data)
recall_test_data = recall_score(y_true=Y_test, y_pred=prediction_test_data)
f1_test_data = f1_score(y_true=Y_test, y_pred=prediction_test_data)

# Confusion matrix: rows are the true labels, columns are the predicted labels.
conf_matrix_test_data = confusion_matrix(
    y_true=Y_test,
    y_pred=prediction_test_data,
)

In [30]:
# Print the test-set metrics, followed by the confusion matrix on its own lines.
print(f"Precision: {precision_test_data}")
print(f"Recall: {recall_test_data}")
print(f"F1-Score: {f1_test_data}")
print(f"Confusion Matrix:\n{conf_matrix_test_data}")

Precision: 0.9699222710375127
Recall: 0.9920497753197373
F1-Score: 0.9808612440191388
Confusion Matrix:
[[2472   89]
 [  23 2870]]


In [31]:
# Classify a single e-mail message typed in by the user.
input_mail = input("Enter your email message: ")

if not input_mail.strip():
    # An empty message is vectorised to an all-zero row, so the model would
    # answer from its intercept alone. Say so instead of printing a
    # meaningless label.
    print("No message entered; nothing to classify.")
else:
    # The vectorizer expects an iterable of documents, so the single string is
    # wrapped in a one-element list.
    input_data_features = feature_extraction.transform([input_mail])

    # predict returns an array holding one label per input document.
    prediction = model.predict(input_data_features)
    print(prediction)

    # Label encoding used by the dataset: 1 --> spam, 0 --> ham.
    if prediction[0] == 1:
        print("This is Spam Mail")
    elif prediction[0] == 0:
        print("This is Ham Mail")

Enter your email message: Subject: do not have money , get software cds from here !  software compatibility . . . . ain ' t it great ?  grow old along with me the best is yet to be .  all tradgedies are finish ' d by death . all comedies are ended by
[1]
This is Spam Mail
