Importing The Dependencies

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

Data Collection & Pre-Processing

In [2]:
# Loading the data from csv file to pandas DataFrame.
# The CSV location can be overridden by setting the MAIL_DATA_CSV environment
# variable, so the notebook is not tied to one machine's folder layout; when the
# variable is unset the original location is used unchanged.
DATA_PATH = os.environ.get(
    "MAIL_DATA_CSV",
    r"C:\Users\sanjo\OneDrive\Desktop\Email_Spam_Detection\mail_data.csv",
)

raw_mail_data = pd.read_csv(DATA_PATH)

In [3]:
# Plain-text preview of the raw DataFrame exactly as loaded from the CSV
print(raw_mail_data)

      Category                                            Message
0         spam  Congratulations! You've been selected for a lu...
1         spam  URGENT: Your account has been compromised. Cli...
2         spam  You've won a free iPhone! Claim your prize by ...
3         spam  Act now and receive a 50% discount on all purc...
4         spam  Important notice: Your subscription will expir...
...        ...                                                ...
10872     spam  Hey little one! Exciting news! Mama and baby a...
10873     spam  Amazing DATA deals on your Pulse Plan today! D...
10874     spam  Special offer just for you! Get 1GB @15 bob va...
10875     spam  NEW ARRIVAL - JUNE 23RD  Dresses @ 300; Kondel...
10876     spam  Coureen, did you know that saving on Timiza in...

[10877 rows x 2 columns]


In [4]:
# Swap every missing (null) cell for an empty string; cells that already
# hold a value are kept as they are.
not_missing = raw_mail_data.notna()
mail_data = raw_mail_data.where(not_missing, '')

In [5]:
# Preview the first five rows of the raw (as-loaded) DataFrame
raw_mail_data.head(n=5)

Unnamed: 0,Category,Message
0,spam,Congratulations! You've been selected for a lu...
1,spam,URGENT: Your account has been compromised. Cli...
2,spam,You've won a free iPhone! Claim your prize by ...
3,spam,Act now and receive a 50% discount on all purc...
4,spam,Important notice: Your subscription will expir...


In [6]:
# Checking the number of rows and columns in the dataframe
mail_data.shape

(10877, 2)

Label Encoding

In [7]:
# Encode the text labels numerically in place: spam -> 0, ham -> 1
for label, code in (('spam', 0), ('ham', 1)):
    mail_data.loc[mail_data['Category'] == label, 'Category'] = code

spam = 0
ham = 1

In [8]:
# Separating the data into the message texts (X) and their integer labels (Y)
X = mail_data['Message']
Y = mail_data['Category'].astype(int)

In [9]:
# Inspect the message texts that will be used as model input
print(X)

0        Congratulations! You've been selected for a lu...
1        URGENT: Your account has been compromised. Cli...
2        You've won a free iPhone! Claim your prize by ...
3        Act now and receive a 50% discount on all purc...
4        Important notice: Your subscription will expir...
                               ...                        
10872    Hey little one! Exciting news! Mama and baby a...
10873    Amazing DATA deals on your Pulse Plan today! D...
10874    Special offer just for you! Get 1GB @15 bob va...
10875    NEW ARRIVAL - JUNE 23RD  Dresses @ 300; Kondel...
10876    Coureen, did you know that saving on Timiza in...
Name: Message, Length: 10877, dtype: object


In [10]:
# Inspect the integer-encoded labels (0 = spam, 1 = ham)
print(Y)

0        0
1        0
2        0
3        0
4        0
        ..
10872    0
10873    0
10874    0
10875    0
10876    0
Name: Category, Length: 10877, dtype: int64


In [11]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [12]:
# Report the size of the full set, then the training split, then the test split
for split in (X, X_train, X_test):
    print(split.shape)

(10877,)
(8701,)
(2176,)


Feature Extraction

In [13]:
# Transform the text into TF-IDF feature vectors that can be used as input to the classifiers.
# The vectorizer is fitted on the training messages only and then reused,
# unchanged, to transform the test messages.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Make sure both label vectors are integer typed
Y_train, Y_test = Y_train.astype(int), Y_test.astype(int)

In [14]:
# Inspect the training split of the message texts
print(X_train)

9273     I love to cuddle! I want to hold you in my str...
8473     And you! Will expect you whenever you text! Ho...
10509                  It means u could not keep ur words.
8182                           Tell me pa. How is pain de.
10751                                                  Yup
                               ...                        
6400              Cancel cheyyamo?and get some money back?
9160     Can you plz tell me the ans. BSLVYL sent via f...
9859     Hi ....My engagement has been fixd on  &lt;#&g...
1688     we have vicodin and anything else save over 50...
5994      He also knows about lunch menu only da. . I know
Name: Message, Length: 8701, dtype: object


In [15]:
# Inspect the sparse TF-IDF matrix produced from the training messages
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 306099 stored elements and shape (8701, 48140)>
  Coords	Values
  (0, 28329)	0.27575611879046075
  (0, 14458)	0.5340283994699149
  (0, 46053)	0.22279442593162305
  (0, 23333)	0.3654038959063763
  (0, 41560)	0.36901209462496903
  (0, 7168)	0.4927743232586784
  (0, 37775)	0.2768003043656332
  (1, 18955)	0.4482421204017406
  (1, 42868)	0.3686693952998129
  (1, 23489)	0.3725369632674052
  (1, 21653)	0.45596555564287156
  (1, 43515)	0.5625597759650162
  (2, 29443)	0.6134564731122731
  (2, 44867)	0.4804707499050286
  (2, 46884)	0.626752753549875
  (3, 42694)	0.4930526525905112
  (3, 33087)	0.6442025301080576
  (3, 33137)	0.5847240220615515
  (4, 47785)	1.0
  (5, 28155)	0.44496552952801227
  (5, 14923)	0.39258592754061145
  (5, 43271)	0.22022125339893314
  (5, 14746)	0.41540815963659894
  (5, 19580)	0.589769431168729
  (5, 23384)	0.28105630316109964
  :	:
  (8699, 16524)	0.13579435776223067
  (8699, 13038)	0.127059432864547
  (8699

Training the Model

Candidate Classification Models

In [16]:
# Candidate classifiers to compare, keyed by the display name used in the
# results table. Random Forest and Gradient Boosting are randomised learners,
# so they are given a fixed seed; without it their scores change on every
# Restart & Run All and the comparison is not reproducible.
models = {
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': MultinomialNB(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(random_state=3),
    'Gradient Boosting': GradientBoostingClassifier(random_state=3),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

In [17]:
# Collects one metrics record (dict) per model; appended to by the training loop
results = []

In [18]:
# Training the models with the training data

In [19]:
# Fit every candidate model on the training features and score it on the
# held-out test features.
# The list is rebuilt at the top of this cell so that re-running the cell
# replaces the previous records instead of appending duplicate rows.
results = []

for name, model in models.items():
    model.fit(X_train_features, Y_train)
    predictions = model.predict(X_test_features)

    accuracy = accuracy_score(Y_test, predictions)
    # pos_label=1 is scikit-learn's default, spelled out here because label 1
    # is "ham" in this notebook: precision and recall below describe how well
    # ham is recognised, not spam.
    precision = precision_score(Y_test, predictions, pos_label=1)
    recall = recall_score(Y_test, predictions, pos_label=1)
    # Rows are true labels and columns are predicted labels, in sorted label
    # order (0 = spam first, then 1 = ham).
    conf_matrix = confusion_matrix(Y_test, predictions)

    # Scores are stored as percentages rounded to two decimals
    results.append({
        'Model': name,
        'Accuracy': round(accuracy * 100, 2),
        'Precision': round(precision * 100, 2),
        'Recall': round(recall * 100, 2),
        'Confusion Matrix': conf_matrix
    })

In [20]:
# Turn the per-model metric records into a table with one row per model
results_df = pd.DataFrame(results)

In [21]:
# Show the comparison table with the most accurate model first
ranked_results = results_df.sort_values(by='Accuracy', ascending=False)
print(ranked_results)

                    Model  Accuracy  Precision  Recall  \
2  Support Vector Machine     95.40      94.43   99.88   
3           Random Forest     94.85      94.09   99.52   
0     Logistic Regression     93.70      92.67   99.64   
4       Gradient Boosting     90.21      89.19   99.22   
1             Naive Bayes     89.29      87.75   99.94   
5     K-Nearest Neighbors     75.00      84.26   82.74   

            Confusion Matrix  
2     [[415, 98], [2, 1661]]  
3    [[409, 104], [8, 1655]]  
0    [[382, 131], [6, 1657]]  
4   [[313, 200], [13, 1650]]  
1    [[281, 232], [1, 1662]]  
5  [[256, 257], [287, 1376]]  


Evaluate the Trained Model


In [22]:
# Train a fresh default-parameter SVC on the training features to serve as the
# final model; fit() returns the estimator itself, so it can be bound directly.
best_model = SVC().fit(X_train_features, Y_train)
best_model  # display the fitted estimator and its parameters

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [23]:
# Persist the trained classifier and the fitted TF-IDF vectorizer.
# The files are opened in `with` blocks so each handle is flushed and closed
# as soon as its dump finishes, rather than being left open.
with open('spam_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(feature_extraction, vectorizer_file)

In [24]:
# Predict labels for the held-out test features with the final model and
# measure the fraction that match the true test labels.
testing_data_prediction = best_model.predict(X_test_features)
testing_accuracy = accuracy_score(
    y_true=Y_test,
    y_pred=testing_data_prediction,
)