<a href="https://colab.research.google.com/github/priyariyyer/AIML_Projects/blob/main/HamSpamClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Understanding Problem and Objective

> Goal: build a classifier that labels each email as ham (important) or spam (unimportant).





In [None]:
# Emails arriving in a mailbox are either ham (important) or spam (unimportant).
# The user has to read and categorize them manually as ham or spam, which is time-consuming and error-prone.
# The objective of this project is to create a classifier that labels emails as ham or spam. Sample email text is provided along with its classification.

## Data Gathering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the labelled messages from the CSV, decoding the file as Latin-1.
# NOTE(review): some previewed text contains sequences such as 'å£' in front of
# amounts — possibly a mis-decoded currency symbol; confirm the file's real encoding.
df = pd.read_csv('Hamspam.csv', encoding='latin1')
df.head()

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or å£10,000..."
4,spam,okmail: Dear Dave this is your final notice to...


In [None]:
df.shape

(5559, 2)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5559 entries, 0 to 5558
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    5559 non-null   object
 1   text    5559 non-null   object
dtypes: object(2)
memory usage: 87.0+ KB


In [None]:
### Intuitive data modifications
# There are no data modifications required in this case.

In [None]:
### Data Quality Checks

In [None]:
df['type'].unique()

array(['ham', 'spam'], dtype=object)

In [None]:
# Counts rows whose whole text is exactly one of these single symbols.
# NOTE(review): isin() tests equality with a list entry, not containment, so
# messages that merely contain these characters are not counted — confirm intent.
df['text'].isin(['@','#','$','%','/','^','&','*','!','~']).value_counts()

Unnamed: 0_level_0,count
text,Unnamed: 1_level_1
False,5559


In [None]:
df['type'].value_counts()

Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
ham,4812
spam,747


## Data Preparation

In [None]:
# Text Cleaning
!pip install nltk



In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:
# Fetch the stop-word corpus read by stopwords.words('english') during text cleaning.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Tokenizer data for word_tokenize, which the text-cleaning step calls.
# NOTE(review): both 'punkt' and 'punkt_tab' are fetched — presumably to cover
# different NLTK versions; confirm which one the installed version needs.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import string
import re

#create function to cleanse text
# Resources that are expensive to build and identical for every message;
# filled on the first preprocess_text call and reused afterwards.
_PREPROCESS_CACHE = {}

# Translation table that deletes every character in string.punctuation.
# Unlike an unescaped regex character class built from the same string, this
# also removes the backslash (in a class, "\]" is read as an escaped "]").
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def _preprocess_resources():
    """Return the (stop-word set, stemmer) pair, creating it on first use."""
    if not _PREPROCESS_CACHE:
        # A frozenset gives constant-time membership tests; the raw corpus is a list.
        _PREPROCESS_CACHE["stop_words"] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE["stemmer"] = PorterStemmer()
    return _PREPROCESS_CACHE["stop_words"], _PREPROCESS_CACHE["stemmer"]


def preprocess_text(text):
    """Normalize one message into a space-separated string of stemmed tokens.

    Steps, in order: lower-case and trim the text, delete every character in
    ``string.punctuation``, tokenize with ``word_tokenize``, drop English
    stop words, and Porter-stem what remains.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (empty if nothing remains).
    """
    stop_words, stemmer = _preprocess_resources()

    text = text.lower().strip()  # Normalize
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation
    words = word_tokenize(text)  # Tokenize words

    # NOTE(review): stop words are matched after punctuation removal, so a
    # contraction such as "don't" reaches this filter as "dont"; confirm
    # whether such tokens should be dropped as well.
    cleaned_words = [stemmer.stem(word) for word in words if word not in stop_words]
    return " ".join(cleaned_words)  # return as a string


In [None]:
# Cleanse email text
df['cleaned_text'] = df['text'].apply(preprocess_text)
# Preview a few rows rather than rendering the whole frame.
df.head()

Unnamed: 0,type,text,cleaned_text
0,ham,Hope you are having a good week. Just checking in,hope good week check
1,ham,K..give back my thanks.,kgive back thank
2,ham,Am also doing in cbe only. But have to pay.,also cbe pay
3,spam,"complimentary 4 STAR Ibiza Holiday or å£10,000...",complimentari 4 star ibiza holiday å£10000 cas...
4,spam,okmail: Dear Dave this is your final notice to...,okmail dear dave final notic collect 4 tenerif...
...,...,...,...
5554,ham,You are a great role model. You are giving so ...,great role model give much realli wish day mir...
5555,ham,"Awesome, I remember the last time we got someb...",awesom rememb last time got somebodi high firs...
5556,spam,"If you don't, your prize will go to another cu...",dont prize go anoth custom tc wwwtcbiz 18 150p...
5557,spam,"SMS. ac JSco: Energy is high, but u may not kn...",sm ac jsco energi high u may know 2channel 2da...


In [None]:
### Feature Extraction

In [None]:
# Text vectorization: TF-IDF turns each cleaned message into a numeric feature row.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

# NOTE(review): the vectorizer is fit on every row here, before any train/test
# split, so this matrix reflects vocabulary and IDF weights from the full dataset.
# .toarray() converts the result into a dense array (one column per vocabulary term).
x = tfidf.fit_transform(df['cleaned_text']).toarray()
y = df['type']  # target labels ('ham' / 'spam')

In [None]:
x.shape, y.shape

((5559, 8075), (5559,))

In [None]:
# Train & Test Data Preparation
from sklearn.model_selection import train_test_split

# Split the cleaned text itself rather than a TF-IDF matrix computed from all
# rows: fitting the vectorizer on the full dataset leaks test-set vocabulary
# and IDF statistics into the training features.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_text'], y, test_size=0.30, random_state=42
)

# Learn vocabulary and IDF weights from the training rows only, then reuse
# that fitted vectorizer to encode the held-out rows.
x_train = tfidf.fit_transform(text_train).toarray()
x_test = tfidf.transform(text_test).toarray()
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((3891, 8075), (1668, 8075), (3891,), (1668,))

## Model Creation

In [None]:
## Classification using Logistic Regression Model
from sklearn.linear_model import LogisticRegression

# fit() hands back the trained estimator, so construction and training chain.
model_lr = LogisticRegression().fit(x_train, y_train)

In [None]:
y_pred = model_lr.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
# Most messages are ham, so overall accuracy can look strong even when spam is
# missed; the per-class precision/recall report is printed alongside it.
print('Accuracy Score:\n', accuracy_score(y_test, y_pred))
print('Classification Score:\n', classification_report(y_test, y_pred))

Accuracy Score:
 0.9496402877697842
Classification Score:
               precision    recall  f1-score   support

         ham       0.95      1.00      0.97      1457
        spam       0.96      0.63      0.76       211

    accuracy                           0.95      1668
   macro avg       0.96      0.81      0.87      1668
weighted avg       0.95      0.95      0.94      1668



In [None]:
## Classification using Naive Bayes Algorithm
from sklearn.naive_bayes import GaussianNB

# fit() hands back the trained estimator, so construction and training chain.
model_nb = GaussianNB().fit(x_train, y_train)

In [None]:
y_pred = model_nb.predict(x_test)

In [None]:
print('Accuracy score:\n', accuracy_score(y_test, y_pred))
print('Classification report:\n', classification_report(y_test, y_pred))

Accuracy score:
 0.8878896882494005
Classification report:
               precision    recall  f1-score   support

         ham       0.98      0.89      0.93      1457
        spam       0.53      0.90      0.67       211

    accuracy                           0.89      1668
   macro avg       0.76      0.89      0.80      1668
weighted avg       0.93      0.89      0.90      1668



In [None]:
## Classification using Multinomial Naive Bayes Algorithm
from sklearn.naive_bayes import MultinomialNB

# fit() hands back the trained estimator, so construction and training chain.
model_mnb = MultinomialNB().fit(x_train, y_train)

In [None]:
y_pred = model_mnb.predict(x_test)

In [None]:
print('Accuracy Score:\n', accuracy_score(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))

Accuracy Score:
 0.959832134292566
Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1457
        spam       1.00      0.68      0.81       211

    accuracy                           0.96      1668
   macro avg       0.98      0.84      0.89      1668
weighted avg       0.96      0.96      0.96      1668



In [None]:
## Classification using Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and feature subsets at random; fixing
# the seed (same value as the train/test split) makes the reported scores
# reproducible across runs.
model_rf = RandomForestClassifier(random_state=42)
model_rf = model_rf.fit(x_train, y_train)

In [None]:
y_pred = model_rf.predict(x_test)

In [None]:
print('Accuracy Score:\n', accuracy_score(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))

Accuracy Score:
 0.9730215827338129
Classification Report:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1457
        spam       1.00      0.79      0.88       211

    accuracy                           0.97      1668
   macro avg       0.99      0.89      0.93      1668
weighted avg       0.97      0.97      0.97      1668



In [None]:
## Classification using