In [1]:
import numpy as np ## scientific computation
import pandas as pd ## loading dataset file
import matplotlib.pyplot as plt ## Visulization
import nltk  ## Preprocessing our text
from nltk.corpus import stopwords ## removing all the stop words
from nltk.stem.porter import PorterStemmer ## stemming of words

In [7]:
df = pd.read_csv("spam_ham_dataset.csv")

In [10]:
data=df.copy()

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [11]:
print(data.shape)  ### Return the shape of data 
print(data.ndim)   ### Return the n dimensions of data
print(data.size)   ### Return the size of data 
print(data.isna().sum())  ### Returns the sum fo all na values
print(data.info())  ### Give concise summary of a DataFrame
print(df.head())  ## top 5 rows of the dataframe
print(df.tail()) ## bottom 5 rows of the dataframe

(5171, 4)
2
20684
Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB
None
   Unnamed: 0 label                                               text  \
0         605   ham  Subject: enron methanol ; meter # : 988291\r\n...   
1        2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...   
2        3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3        4685  spam  Subject: photoshop , windows , office . cheap ...   
4        2030   ham  Subject: re : indian springs\r\nthis deal is t...   

   label_num  
0          0  
1          0  
2          0

# Cleaning The Text

- Line 1: we are importing re library, which is used to perform regex in python.
- Line 2: Define an empty corpus list, that can be used to store all the text after cleaning.
- Line 3: initializing the var length with the length of the data frame.
- Line 4: running a loop from 0 to the length of our data frame.
- Line 5: Removing all characters except the lower alphabet, bigger alphabets, and digits.
- Line 6: Converting the text to lower.
- Line7: Splitting the text by spaces.
- Line 8: creating an object of porter stemmer.
- Line9: Initializing all the stopword in English dictionary to var stopword
- Line 10: Running a loop in the length of the sentence and then for each word in the sentence checking it in stopword and if it does not find in stopword then apply Stemming on to the text and add it to the list.
- Line 11: Just concatenating all the words to make a sentence
- Line 12: appending the sentence to the corpus list
- Line 13: Printing the corpus list.

In [12]:
import re
corpus = []
length = len(df)
for i in range(0,length):
    text = re.sub("[^a-zA-Z0-9]"," ",df["text"][i])
    text = text.lower()
    text = text.split()
    pe = PorterStemmer()
    stopword = stopwords.words("english")
    text = [pe.stem(word) for word in text if not word in set(stopword)]
    text = " ".join(text)
    corpus.append(text)
print(corpus)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Count Vectoriser
In Cleaning Process the next step is to convert the list of the sentence(corpus) into vectors so that we can feed this data into our machine learning model. for converting the text into vectors we are going to use a bag of words which is going to convert the text into binary form.

- Line 1: We are importing the CountVectorizer from sklearn.
- Line 2: Creating an object for the count vectorizer with max features as 35000, means we are only fetching the top 35000 columns.
- Line 3: Using CV we are fitting are corpus and also transforming it into vectors.

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=35000)
X = cv.fit_transform(corpus).toarray()

In [18]:
y=df['label_num']

# Modeling and Training

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
##train size 80% and test size 20%

# Creating a model using MultinomialNaiveBayes and fitting it

In [21]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()

##### fitting

In [22]:
model.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Prediction

In [23]:
y_pred=model.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

## Generating metics and Model accuracy 

In [24]:
from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
score = accuracy_score(y_test,y_pred)
print(cm)
print(score*100)

[[716  16]
 [ 17 286]] 96.81159420289856


## Pickling both model and count vectoriser
##### here count vectoriser is pickled to make future messages to get count vectorisesd

In [25]:
import pickle
pickle.dump(model, open("spam.pkl", "wb"))

In [26]:
import pickle ## importing pickle used for dumping models
pickle.dump(cv, open('cv.pkl', 'wb')) ## saving to into cv.pkl file