In [16]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [17]:
# The path is relative, so it is resolved against the kernel's working directory.
DATA_PATH = 'Spam Email raw text for NLP.csv'
df = pd.read_csv(DATA_PATH)

In [18]:
# Peek at the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6


In [19]:
# Swap every missing cell for an empty string; non-null cells are kept as they are.
mail_data = df.where(df.notna(), '')

In [20]:
# Same five-row preview, now on the null-filled copy.
mail_data.head(n=5)

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6


In [21]:
# Check the number of rows and columns in the dataframe;
# .shape is a (n_rows, n_columns) tuple.
mail_data.shape

(5796, 3)

In [22]:
# Split the frame into model inputs (the MESSAGE column)
# and targets (the CATEGORY column).
X, Y = mail_data['MESSAGE'], mail_data['CATEGORY']

# Echo both series as a tuple so the cell displays them.
(X, Y)

(0       Dear Homeowner,\n\n \n\nInterest Rates are at ...
 1       ATTENTION: This is a MUST for ALL Computer Use...
 2       This is a multi-part message in MIME format.\n...
 3       IMPORTANT INFORMATION:\n\n\n\nThe new domain n...
 4       This is the bottom line.  If you can GIVE AWAY...
                               ...                        
 5791    I'm one of the 30,000 but it's not working ver...
 5792    Damien Morton quoted:\n\n>W3C approves HTML 4 ...
 5793    On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...
 5794    Once upon a time, Manfred wrote :\n\n\n\n> I w...
 5795    If you run Pick, and then use the "New FTOC" b...
 Name: MESSAGE, Length: 5796, dtype: object,
 0       1
 1       1
 2       1
 3       1
 4       1
        ..
 5791    0
 5792    0
 5793    0
 5794    0
 5795    0
 Name: CATEGORY, Length: 5796, dtype: int64)

In [23]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

Feature Extraction

In [25]:
# Make sure both label vectors are integer-typed before they reach the classifier.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

# Turn the raw text into TF-IDF feature vectors usable as input to the
# Logistic regression.
feature_extraction = TfidfVectorizer(stop_words='english', lowercase=True, min_df=1)

# Fit on the training split only, then apply that same fitted vectorizer
# to the test split.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

Training the Model

Logistic Regression

In [26]:
# Logistic-regression classifier created with no arguments, i.e. library defaults.
model = LogisticRegression()

In [27]:
# Train the classifier on the TF-IDF training matrix and its integer labels.
model.fit(X=X_train_features, y=Y_train)

In [29]:
# Score the fitted logistic regression on the same rows it was trained on.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9851164797238999


In [31]:
# Score the fitted logistic regression on the held-out test rows.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.971551724137931


In [37]:
# Confusion matrix and per-class report for the logistic-regression test predictions.
# In the matrix, rows are the true labels and columns are the predicted labels.
# confusion_matrix / classification_report come from the notebook's import cell.
print('Confusion Matrix:\n', confusion_matrix(Y_test, prediction_on_test_data))
print('Classification Report:\n', classification_report(Y_test, prediction_on_test_data))

Confusion Matrix:
 [[779   5]
 [ 27 349]]
Calssification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98       784
           1       0.99      0.93      0.96       376

    accuracy                           0.97      1160
   macro avg       0.98      0.96      0.97      1160
weighted avg       0.97      0.97      0.97      1160



Random Forest Classifier

In [38]:
# A random forest is a randomized estimator, so pin its seed; without it the
# scores reported for this model change on every run. The seed matches the one
# used for the train/test split.
# RandomForestClassifier comes from the notebook's import cell.
rf_model = RandomForestClassifier(random_state=3)
rf_model.fit(X_train_features, Y_train)


In [39]:
# Score the fitted random forest on the same rows it was trained on.
rf_prediction_on_training_data = rf_model.predict(X_train_features)
rf_accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=rf_prediction_on_training_data,
)

print('Accuracy on training data : ', rf_accuracy_on_training_data)

Accuracy on training data :  1.0


In [40]:
# Score the fitted random forest on the held-out test rows.
rf_prediction_on_test_data = rf_model.predict(X_test_features)
rf_accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=rf_prediction_on_test_data,
)

print('Accuracy on test data : ', rf_accuracy_on_test_data)

Accuracy on test data :  0.9724137931034482


In [41]:
# Confusion matrix and per-class report for the random-forest test predictions.
# In the matrix, rows are the true labels and columns are the predicted labels.
# confusion_matrix / classification_report come from the notebook's import cell.
print('Confusion Matrix:\n', confusion_matrix(Y_test, rf_prediction_on_test_data))
print('Classification Report:\n', classification_report(Y_test, rf_prediction_on_test_data))


Confusion Matrix:
 [[778   6]
 [ 26 350]]
Calssification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98       784
           1       0.98      0.93      0.96       376

    accuracy                           0.97      1160
   macro avg       0.98      0.96      0.97      1160
weighted avg       0.97      0.97      0.97      1160

