# Email Spam Classification 

In [92]:
## first import all the libraries
import numpy as np
import pandas as pd
import os

# Resolve the project root. If the kernel was started inside the "notebook"
# folder, step up one level so that "data/" and "models/" resolve.
# Only a trailing "notebook" directory is stripped: a plain substring replace
# would also mangle paths that merely contain "\notebook" (e.g. "\notebooks")
# and would never match on POSIX path separators.
current_dir = os.getcwd()
if os.path.basename(current_dir) == 'notebook':
    current_dir = os.path.dirname(current_dir)
workin_path = current_dir.replace("\\", "/")  # normalise to forward slashes
file_path = workin_path + "/data/spam.csv"

# read the csv file and preview the first rows
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


#### ***Data Wrangling***

In [77]:
# Remove the extra "Unnamed" columns that came along with the CSV;
# only v1 and v2 are used from here on.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [78]:
# Give the two remaining columns descriptive names: v1 -> label, v2 -> message
new_names = {'v1': 'label', 'v2': 'message'}
df = df.rename(columns=new_names)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [79]:
# Count the missing values in each column
df.isnull().sum()

label      0
message    0
dtype: int64

In [80]:
# Count fully duplicated rows; the first occurrence of each row is not counted
df.duplicated(keep='first').sum()

403

In [81]:
# Drop all duplicate rows, then reset the index so it is contiguous again
# (0..n-1) after the removal.
df = df.drop_duplicates().reset_index(drop=True)

# Check the shape of the de-duplicated data. This must be the last expression
# of the cell, otherwise the notebook does not display it.
df.shape

In [82]:
# Separate the data into two parts: the input (message text) and the output.
# The output is encoded as 1 for 'spam' and 0 for any other label.
X = df['message']
y = df['label'].map(lambda label: int(label == 'spam'))

## Split data into training and testing sets

In [83]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

for caption, part in (("Train data shape:", X_train), ("Test data shape:", X_test)):
    print(caption, part.shape)

Train data shape: (3876,)
Test data shape: (1293,)


## Data Pre-Processing and Cleaning

In [84]:
# import the necessary packages
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def check_and_download_stopwords():
    """Make sure the NLTK 'stopwords' and 'wordnet' corpora are available.

    Both corpora are looked up in order; as soon as one lookup raises
    LookupError, both corpora are downloaded.
    """
    corpus_names = ('stopwords', 'wordnet')
    try:
        for corpus_name in corpus_names:
            nltk.data.find('corpora/' + corpus_name)
    except LookupError:
        # At least one corpus is missing locally: fetch both of them
        for corpus_name in corpus_names:
            nltk.download(corpus_name)

class Preprocess:
    """Text cleaner used as the first step of the classification pipeline.

    Each message is stripped of non-letter characters, lower-cased, split into
    words, filtered against the English stop-word list and stemmed with a
    Porter stemmer.
    """

    def __init__(self) -> None:
        # Make sure the NLTK corpora are present before they are used
        check_and_download_stopwords()
        self.stemmer = PorterStemmer()

    def pre_processing(self, messages):
        """Clean an iterable of message strings.

        Parameters
        ----------
        messages : iterable of str
            Raw message texts.

        Returns
        -------
        list of str
            One cleaned, space-joined string per input message, in input order.
        """
        # Build the stop-word set once per call. Evaluating
        # stopwords.words('english') inside the comprehension re-fetched the
        # list for every single token and searched it linearly.
        stop_words = set(stopwords.words('english'))
        stem = self.stemmer.stem

        corpus = []
        for document in messages:
            # Keep letters only, then normalise case and tokenise on whitespace
            tokens = re.sub('[^a-zA-Z]', ' ', document).lower().split()
            # Stop words are filtered before stemming, on the raw lower-cased token
            kept = [stem(word) for word in tokens if word not in stop_words]
            corpus.append(' '.join(kept))
        return corpus

## Model Creation

In [85]:
## here i will pick the random forest model
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline


preprocess = Preprocess()

# Step 1: Wrap the custom cleaning method with FunctionTransformer so it can be
# used as a pipeline step. validate=False passes the raw text through untouched.
preprocessing_transformer = FunctionTransformer(preprocess.pre_processing, validate=False)


# Step 2: Create the pipeline: clean text -> TF-IDF features -> random forest.
# NOTE: the step is named 'word2vec' but it holds a TfidfVectorizer; the name is
# kept so that existing references to the step keep working.
pipeline = Pipeline([
    ('preprocessing', preprocessing_transformer),
    ('word2vec', TfidfVectorizer()),
    # Seed the forest so that re-running the notebook reproduces the same model
    # and metrics (same seed as the train/test split).
    ('classifier', RandomForestClassifier(random_state=42))
])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shubham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Shubham\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [86]:
# Train every pipeline step on the training split
pipeline.fit(X=X_train, y=y_train)

In [87]:
# Test the trained model: predict labels for the held-out messages
y_pred = pipeline.predict(X=X_test)

In [88]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Share of test messages whose predicted label matches the true label
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score:-", score)

# Confusion matrix of true labels against predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("\nConfusion Matrix:-\n", cm)

Accuracy Score:- 0.9767981438515081

Confusion Matrix:-
 [[1106    1]
 [  29  157]]


## Let's build the model for further prediction

In [89]:
import joblib

# Save the fitted pipeline to <project root>/models/model.pkl.
# The target folder is created when missing, and the file is opened in a
# context manager so the handle is always closed (a bare open() leaked it).
# NOTE(review): the pipeline references Preprocess.pre_processing, which is
# defined in this notebook; presumably the same class must be importable
# wherever the pickle is loaded - confirm before deploying.
model_dir = workin_path + '/models'
os.makedirs(model_dir, exist_ok=True)
with open(model_dir + '/model.pkl', 'wb') as model_file:
    joblib.dump(pipeline, model_file)