<a href="https://colab.research.google.com/github/sarjil77/spam-mail-using-ML/blob/main/spam_mail_prediction_corrected.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the labelled messages into a DataFrame; later cells rely on its
# 'Category' and 'Message' columns.
# NOTE(review): the absolute '/content/...' path looks like a Google Colab
# working directory - adjust it when running the notebook elsewhere.
mail_data = pd.read_csv('/content/mail_data.csv')

In [None]:
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Swap every missing value for an empty string so downstream text handling
# never encounters NaN; cells that are already populated are kept unchanged.
mail_data = mail_data.where(mail_data.notna(), other='')

In [None]:
mail_data.shape

(5572, 2)

In [None]:
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Label encoding: spam -> 0, ham -> 1.
# The values are written in place, so the column keeps its original dtype
# until the labels are cast to integers later in the notebook.
for label, code in (('spam', 0), ('ham', 1)):
    mail_data.loc[mail_data['Category'] == label, 'Category'] = code

In [None]:
mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Separate the target (encoded 'Category' labels) from the features
# (every remaining column, kept as a DataFrame).
Y = mail_data['Category']
X = mail_data.drop('Category', axis='columns')

In [None]:
print(X)

                                                Message
0     Go until jurong point, crazy.. Available only ...
1                         Ok lar... Joking wif u oni...
2     Free entry in 2 a wkly comp to win FA Cup fina...
3     U dun say so early hor... U c already then say...
4     Nah I don't think he goes to usf, he lives aro...
...                                                 ...
5567  This is the 2nd time we have tried 2 contact u...
5568               Will ü b going to esplanade fr home?
5569  Pity, * was in mood for that. So...any other s...
5570  The guy did some bitching but I acted like i'd...
5571                         Rofl. Its true to its name

[5572 rows x 1 columns]


In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=12,
)

In [None]:
X_train.shape

(4457, 1)

In [None]:
X_test


Unnamed: 0,Message
3134,So no messages. Had food?
5037,You won't believe it but it's true. It's Incre...
785,Dont think so. It turns off like randomlly wit...
730,Dunno y u ask me.
355,&lt;#&gt; ISH MINUTES WAS 5 MINUTES AGO. WTF.
...,...
3156,Ok...
1552,In e msg jus now. U said thanks for gift.
860,Did he just say somebody is named tampa
5071,5p 4 alfie Moon's Children in need song on ur ...


In [None]:
# Turn the first column of the training frame into a plain Python list.
# X_train is rebound from a DataFrame to a list here, so this cell cannot be
# re-run without first re-running the train/test split.
X_train = X_train.iloc[:, 0].to_list()


In [None]:
# Turn the first column of the test frame into a plain Python list.
# X_test is rebound from a DataFrame to a list here, so this cell cannot be
# re-run without first re-running the train/test split.
X_test = X_test.iloc[:, 0].to_list()

In [None]:
print(type(X_test))

<class 'list'>


In [None]:
# Bag-of-words feature extractor: text is lowercased, English stop words are
# dropped, and a term only needs to appear in one document to be kept.
vectorizer = CountVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

In [None]:
print(type(vectorizer))

<class 'sklearn.feature_extraction.text.CountVectorizer'>


In [None]:
# Learn the vocabulary from the training messages only, then reuse that same
# fitted vocabulary to encode the test messages; the vectorizer is never fit
# on the held-out data.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
X_test_vectorized.data

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
print(X_test_vectorized)

  (0, 2808)	1
  (0, 4274)	1
  (1, 906)	1
  (1, 1242)	1
  (1, 1325)	1
  (1, 3888)	1
  (1, 4300)	1
  (1, 5476)	1
  (1, 6551)	1
  (1, 6747)	1
  (1, 6749)	1
  (1, 6784)	1
  (1, 6791)	1
  (1, 7233)	1
  (2, 542)	1
  (2, 2312)	1
  (2, 3942)	1
  (2, 6552)	1
  (2, 6774)	1
  (3, 1040)	1
  (3, 2392)	1
  (4, 848)	1
  (4, 3124)	1
  (4, 3593)	1
  (4, 4073)	1
  :	:
  (1113, 1703)	1
  (1113, 4107)	1
  (1113, 4355)	1
  (1113, 4398)	1
  (1113, 4527)	1
  (1113, 4605)	1
  (1113, 5064)	1
  (1113, 5070)	1
  (1113, 5209)	1
  (1113, 6051)	1
  (1113, 6476)	1
  (1113, 6659)	1
  (1113, 6784)	1
  (1113, 6879)	2
  (1113, 7382)	1
  (1114, 2484)	1
  (1114, 3156)	1
  (1114, 3352)	1
  (1114, 3367)	1
  (1114, 3771)	1
  (1114, 3902)	1
  (1114, 3942)	1
  (1114, 4574)	1
  (1114, 6388)	1
  (1114, 6552)	1


In [None]:
print(Y_train)

1562    1
3362    1
3686    1
2457    1
353     1
       ..
3916    1
1283    1
3714    1
3325    1
1414    0
Name: Category, Length: 4457, dtype: object


Training the Logistic Regression Model

In [None]:
# Logistic regression classifier created with scikit-learn's default settings
# (no arguments are passed).
model = LogisticRegression()

In [None]:
# The encoded labels are still stored with dtype 'object' (see the printed
# Y_train above), so cast both label series to integers before training.
Y_train, Y_test = (labels.astype(int) for labels in (Y_train, Y_test))

In [None]:
# Train the classifier on the vectorized training messages and their labels.
model.fit(X_train_vectorized, Y_train)

In [None]:
# Accuracy on the data the model was trained on.
# Despite its name, X_pred holds the predicted labels for the training set.
X_pred = model.predict(X_train_vectorized)
training_data_accuracy = accuracy_score(y_true=Y_train, y_pred=X_pred)
training_data_accuracy

0.9950639443571909

In [None]:
# Accuracy on the held-out test data: the model only predicts here, nothing
# is fitted on the test set.
X_test_pred = model.predict(X_test_vectorized)
testing_data_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_pred)
testing_data_accuracy

0.9757847533632287

Building a predictive system

In [None]:
input_mail = ["Your account has been credited with 500 FREE Text Messages. To activate, just txt the word: CREDIT to No: 80488 T&Cs www.80488.biz"]

# Encode the raw text with the vocabulary learned from the training messages.
input_data_features = vectorizer.transform(input_mail)

# Predict the label and show the raw prediction array.
prediction = model.predict(input_data_features)
print(prediction)

# Label 0 was assigned to spam during label encoding; anything else is ham.
print('Mail is spam mail' if prediction[0] == 0 else 'Mail is ham mail')

[0]
Mail is spam mail


Saving the trained model

In [None]:
import pickle

In [None]:
# Persist the trained classifier to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling raises; the previous bare
# open() call never closed it explicitly.
# NOTE: only `model` is saved. The fitted `vectorizer` is also needed to turn
# raw text into features, so it must be persisted as well before the model
# can be used from a fresh session.
filename = "spam_mail_detect.sav"
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Load the saved model back from disk, closing the file handle as soon as the
# object has been read.
# NOTE: pickle.load can execute arbitrary code - only unpickle files that you
# created yourself or otherwise trust.
with open('spam_mail_detect.sav', 'rb') as model_file:
    loaded_mail = pickle.load(model_file)

In [None]:
# Prediction system using the model reloaded from disk.
input_mail = ["No problem. How are you doing?"]

# Encode the raw text with the vocabulary learned from the training messages.
input_data_features = vectorizer.transform(input_mail)

# Predict with the unpickled model (`loaded_mail`) rather than the in-memory
# `model`, so this cell actually verifies that the save/load round trip works.
prediction = loaded_mail.predict(input_data_features)

# Label 0 was assigned to spam during label encoding; anything else is ham.
if prediction[0] == 0:
    print('Mail is spam mail')
else:
    print('Mail is ham mail')

Mail is ham mail
