In [18]:
# importing dependencies

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [19]:
# Read the labelled SMS/e-mail corpus into a DataFrame.
# NOTE(review): the path looks like a Google Drive mount inside Colab -- it must
# be mounted (or the path adjusted) before this cell can run.
DATA_PATH = '/content/drive/MyDrive/ML Project Datasets/mail_data.csv'
raw_df = pd.read_csv(DATA_PATH)

In [20]:
# Preview the frame as loaded; pandas renders a truncated head/tail view.
raw_df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [21]:
# Replace null values with empty strings (raw_df itself is left untouched).

not_missing = raw_df.notnull()
filled_df = raw_df.where(not_missing, '')

In [22]:
# Sanity check: first rows of the frame after blank-filling nulls.
filled_df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [23]:
# (rows, columns) of the cleaned frame.
filled_df.shape

(5572, 2)

In [24]:
# Class balance: how many messages carry each label.
filled_df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [25]:
# Encode the labels numerically: spam -> 0, ham -> 1.
# The column is updated in place; the labels are cast to int later, before training.

for label, code in (('spam', 0), ('ham', 1)):
  filled_df.loc[filled_df['Category'] == label, 'Category'] = code

In [26]:
# Separating features (message text) and targets (encoded label)

X, y = filled_df['Message'], filled_df['Category']

In [27]:
# Feature column: the raw message texts.
X

Unnamed: 0,Message
0,"Go until jurong point, crazy.. Available only ..."
1,Ok lar... Joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...
3,U dun say so early hor... U c already then say...
4,"Nah I don't think he goes to usf, he lives aro..."
...,...
5567,This is the 2nd time we have tried 2 contact u...
5568,Will ü b going to esplanade fr home?
5569,"Pity, * was in mood for that. So...any other s..."
5570,The guy did some bitching but I acted like i'd...


In [28]:
# Target column: encoded labels (0 = spam, 1 = ham).
y

Unnamed: 0,Category
0,1
1,1
2,0
3,1
4,1
...,...
5567,0
5568,1
5569,1
5570,1


In [29]:
# Hold out 20% of the messages for testing. Stratifying on y keeps the
# spam/ham ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [30]:
# Convert the message text into TF-IDF feature vectors and make the labels integer-typed.
# NOTE: X_train / X_test are overwritten with feature matrices below, so the
# train/test split cell has to be re-run before this cell can be run again.

y_train, y_test = y_train.astype('int'), y_test.astype('int')

f_ext = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is just projected onto them.
X_train = f_ext.fit_transform(X_train)
X_test = f_ext.transform(X_test)

In [32]:
# Inspect the sparse TF-IDF training matrix: (row, column) coordinates with their weights.
print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34895 stored elements and shape (4457, 7496)>
  Coords	Values
  (0, 4768)	0.2885879313347367
  (0, 7438)	0.2996693624522654
  (0, 2262)	0.49316930861935127
  (0, 3764)	0.22046319970004669
  (0, 2823)	0.5172500796081709
  (0, 7289)	0.5172500796081709
  (1, 3317)	0.3290434493347565
  (1, 4972)	0.49481520325330874
  (1, 1558)	0.42364007209989546
  (1, 6517)	0.49481520325330874
  (1, 4136)	0.4717788963273523
  (2, 3103)	0.17628376831968728
  (2, 841)	0.26799944639874834
  (2, 4099)	0.186263215205624
  (2, 3086)	0.27449720225122765
  (2, 2136)	0.180851695270251
  (2, 3398)	0.20665621299033204
  (2, 4269)	0.2543939099135892
  (2, 3118)	0.18009671431232455
  (2, 3935)	0.3671145612703168
  (2, 3722)	0.24768901862403342
  (2, 6641)	0.20096909705626312
  (2, 1430)	0.28509060215711635
  (2, 5837)	0.1845655907506494
  (2, 4943)	0.33789703751914013
  :	:
  (4454, 841)	0.21705430485365426
  (4454, 3514)	0.17954863693268575
  (4454, 7163)	

In [33]:
# Fit a logistic-regression classifier (default hyperparameters) on the
# TF-IDF training features and their integer labels.

model = LogisticRegression()

model.fit(X=X_train, y=y_train)


In [34]:
# Evaluating on the training data.
# accuracy_score expects (y_true, y_pred): the ground-truth labels go first.
# Accuracy itself is symmetric, but keeping the documented order avoids wrong
# results if this line is later adapted to precision/recall/F1.

train_predictions = model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
train_accuracy

0.9672425398249944

In [35]:
# Evaluating on the held-out test data.
# accuracy_score expects (y_true, y_pred): the ground-truth labels go first.
# Keep the class imbalance in mind when reading this number: 4825 of the 5572
# messages are ham, so always predicting "ham" already scores roughly 87%.

test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
test_accuracy

0.9704035874439462

In [45]:
# build a predictor function with the trained model

def spam_mail_predictor(email, vectorizer=None, classifier=None):
  """Classify a message as 'spam' or 'ham'.

  Args:
    email: The message text, given either as a single string or as an
      iterable of strings. When several messages are given, only the first
      one determines the returned label.
    vectorizer: Object with a ``transform(documents)`` method. Defaults to the
      notebook-level ``f_ext``.
    classifier: Object with a ``predict(features)`` method. Defaults to the
      notebook-level ``model``.

  Returns:
    'spam' if the first prediction equals 0, otherwise 'ham'.
  """
  # The vectorizer works on an iterable of documents, so a bare string is
  # wrapped into a one-element list instead of being passed through as-is.
  documents = [email] if isinstance(email, str) else email

  # Globals are resolved at call time, so existing one-argument calls behave as before.
  if vectorizer is None:
    vectorizer = f_ext
  if classifier is None:
    classifier = model

  features = vectorizer.transform(documents)
  predictions = classifier.predict(features)

  # Label encoding used throughout the notebook: 0 = spam, 1 = ham.
  return 'spam' if predictions[0] == 0 else 'ham'

# Try the predictor on a single promotional message, passed as a one-element list.
sample_emails = ["Hey Srikara, I wanted to check in and ensure you’re all set with your Dashboard. Now, here’s the exciting part, Topmate isn’t just about finding experts… it’s about finding the right people who’ve walked the path you’re on. Meet some of our amazing mentors: 1. Prerna Sharma - Cracked the finance world and now helps people break into top firms"]
spam_mail_predictor(sample_emails)

'ham'