In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

Loading and Observing the Dataset

In [2]:
df = pd.read_csv('spam_ham_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [3]:
df.shape

(5171, 4)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


No Missing Values


In [5]:
df.isna().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

**Exploratory Data Analysis**

In [6]:
df['label_num'].value_counts()

label_num
0    3672
1    1499
Name: count, dtype: int64

Checking the Length of email and it's relation

In [7]:
from nltk import word_tokenize

Function that tokenizes each and every email into words and returns it's length

In [8]:
def count_words(text):
    words = word_tokenize(text)
    return len(words)

Applying the function to df['text'] and storing the count in another column

In [9]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ela\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
df['count']=df['text'].apply(count_words)

In [11]:
df['count']

0        68
1        24
2       551
3        49
4        71
       ... 
5166    156
5167    306
5168     79
5169    112
5170    200
Name: count, Length: 5171, dtype: int64

In [12]:
df.groupby('label_num')['count'].mean()

label_num
0    226.239107
1    236.387592
Name: count, dtype: float64

# Text Prepreocessing

**Function to Process the text data and 1. Remove Punctuation 2.Stop Words 3.Stemming**

In [13]:
import string
from nltk.corpus import stopwords


In [14]:
def process_text(text):
    no_punc = [char for char in text if char not in string.punctuation]
    no_punc = ''.join(no_punc)


    return ' '.join([word for word in no_punc.split() if word.lower() not in stopwords.words('english')])

In [15]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ela\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
df['text']=df['text'].apply(process_text)

In [17]:
df['text']

0       Subject enron methanol meter 988291 follow not...
1       Subject hpl nom january 9 2001 see attached fi...
2       Subject neon retreat ho ho ho around wonderful...
3       Subject photoshop windows office cheap main tr...
4       Subject indian springs deal book teco pvr reve...
                              ...                        
5166    Subject put 10 ft transport volumes decreased ...
5167    Subject 3 4 2000 following noms hpl take extra...
5168    Subject calpine daily gas nomination julie men...
5169    Subject industrial worksheets august 2000 acti...
5170    Subject important online banking alert dear va...
Name: text, Length: 5171, dtype: object

**After cleaning the text. We will now carry out the process of Stemming to reduce infected words to their root**

In [18]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [19]:
def stemming(text):
    return ''.join([stemmer.stem(word) for word in text])

In [20]:
df['text']=df['text'].apply(stemming)

In [21]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num,count
0,605,ham,subject enron methanol meter 988291 follow not...,0,68
1,2349,ham,subject hpl nom january 9 2001 see attached fi...,0,24
2,3624,ham,subject neon retreat ho ho ho around wonderful...,0,551
3,4685,spam,subject photoshop windows office cheap main tr...,1,49
4,2030,ham,subject indian springs deal book teco pvr reve...,0,71


**Now we will use Count Vectorizer to convert string data into Bag of Words ie Known Vocabulary**

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import joblib
import pandas as pd

# Assuming df is your DataFrame with 'text' and 'label_num' columns

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer()
message_tfidf = vectorizer.fit_transform(df['text'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(message_tfidf, df['label_num'], test_size=0.20)

# Train the Naive Bayes model
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Make predictions on the test set
y_pred = nb.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

# # Save the trained model using joblib
# joblib.dump(nb, 'naive_bayes_model.pkl')

              precision    recall  f1-score   support

           0       0.90      1.00      0.95       741
           1       1.00      0.72      0.84       294

    accuracy                           0.92      1035
   macro avg       0.95      0.86      0.89      1035
weighted avg       0.93      0.92      0.92      1035

