# *Classification of Spam Emails*

In [1]:
import pandas as pd

In [2]:
# Load the labelled e-mail corpus from the working directory.
data = pd.read_csv(filepath_or_buffer='emails.txt')

In [3]:
# Peek at the first five rows to confirm the expected columns were loaded.
data.head(n=5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [4]:
# Class balance: number of rows per value of the `spam` label.
data.loc[:, 'spam'].value_counts()

0    4360
1    1368
Name: spam, dtype: int64

## *Text Cleaning*

In [5]:
import nltk
import re
from nltk.tokenize import word_tokenize

In [6]:
# Shared text-cleaning resources used by the cleaning helpers below.
# NOTE: the NLTK "stopwords" corpus must already be available locally for this to load.
ps = nltk.PorterStemmer()
stopwords = nltk.corpus.stopwords.words("english")

In [7]:
def clean_txt_new(sent):
    """Normalise one raw e-mail text into a space-separated string of stems.

    Steps: trim surrounding whitespace, drop a leading literal ``Subject``
    header word, collapse whitespace runs, remove punctuation, lower-case,
    tokenise, drop stop words and tokens of two characters or fewer, and
    Porter-stem what remains.

    Parameters
    ----------
    sent : str
        Raw message text, optionally starting with ``Subject``.

    Returns
    -------
    str
        The cleaned, stemmed tokens joined by single spaces.
    """
    # Stripping white spaces before and after the text
    sent = sent.strip()
    # Removing the literal "Subject" prefix only. str.lstrip('Subject') would
    # instead strip ANY leading run of the characters S/u/b/j/e/c/t, eating the
    # start of messages that do not carry the header (e.g. "but ..." -> " ...").
    prefix = 'Subject'
    if sent.startswith(prefix):
        sent = sent[len(prefix):]
    # Replacing multiple spaces with a single space
    collapsed = re.sub(r"\s+", " ", sent)
    # Replacing non-alphanumeric, non-space characters with nothing
    no_punct = re.sub(r"[^\w\s]", "", collapsed)
    # Normalize case and remove shorter tokens
    tokens = word_tokenize(no_punct.lower())
    stem_tokens = [
        ps.stem(word)
        for word in tokens
        if word not in stopwords and len(word) > 2
    ]
    # Joining all to form a single string which will be returned from the UDF
    return " ".join(stem_tokens)

In [8]:
# Clean every message once and keep the result alongside the raw text.
data['text_cleaned'] = data['text'].apply(clean_txt_new)
data.head()

Unnamed: 0,text,spam,text_cleaned
0,Subject: naturally irresistible your corporate...,1,natur irresist corpor ident realli hard recoll...
1,Subject: the stock trading gunslinger fanny i...,1,stock trade gunsling fanni merril muzo colza a...
2,Subject: unbelievable new homes made easy im ...,1,unbeliev new home made easi want show homeown ...
3,Subject: 4 color printing special request add...,1,color print special request addit inform click...
4,"Subject: do not have money , get software cds ...",1,money get softwar cd softwar compat great grow...


In [9]:
# Features are the cleaned texts; the target is the 0/1 spam label.
x, y = data['text_cleaned'].values, data['spam'].values

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn the vocabulary (capped at 3500 terms) and IDF weights on the training
# texts only, then reuse the fitted vectorizer for the held-out texts.
tfidf = TfidfVectorizer(max_features=3500)
x_tfidf_train, x_tfidf_test = tfidf.fit_transform(x_train), tfidf.transform(x_test)

In [12]:
# Densify both sparse TF-IDF matrices into DataFrames (one column per term index).
x_tfidf_train_df, x_tfidf_test_df = (
    pd.DataFrame(matrix.toarray()) for matrix in (x_tfidf_train, x_tfidf_test)
)
x_tfidf_train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3490,3491,3492,3493,3494,3495,3496,3497,3498,3499
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4577,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4580,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_curve, auc
def model_training(x_train,y_train,x_test, y_test,model):
    """Fit ``model`` on the training split and score it on the test split.

    The model is fitted in place via ``model.fit`` and then asked to predict
    ``x_test``. Returns a 3-tuple ``(accuracy, f1, confusion_matrix)`` computed
    from ``y_test`` against those predictions.
    """
    model.fit(x_train, y_train)
    predicted = model.predict(x_test)
    # Evaluate the three metrics in a fixed order so callers can unpack them.
    scorers = (accuracy_score, f1_score, confusion_matrix)
    return tuple(scorer(y_test, predicted) for scorer in scorers)

In [14]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes classifier on the TF-IDF features and report
# accuracy and F1 (as percentages) followed by the confusion matrix.
mnb = MultinomialNB()
a, f, cm = model_training(
    x_train=x_tfidf_train_df,
    y_train=y_train,
    x_test=x_tfidf_test_df,
    y_test=y_test,
    model=mnb,
)
print(round(a * 100, 2), round(f * 100, 2), sep="\n")
print(cm)

97.91
95.29
[[879   6]
 [ 18 243]]


## *Testing*

In [46]:
# Hand-written messages used to spot-check the trained classifier.
df = pd.DataFrame({
    'Text': [
        "Grab the offer this Christmas!! Flat 40% off sale on all men's accessories and clothing.Hurry!! Offer valid till 27th December",
        'Hey Ranjan!! offer offer!! just for you grab it up',
        "Hi! Please call when you are free. It's urgent",
        "Grab the offer!! Flat 50% sale on all men's product",
        'Hello!! Offer offer offer!! just for you grab it',
        'Hi! Please call when free',
        "but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits",
    ],
})
df

Unnamed: 0,Text
0,Grab the offer this Christmas!! Flat 40% off s...
1,Hey Ranjan!! offer offer!! just for you grab i...
2,Hi! Please call when you are free. It's urgent
3,Grab the offer!! Flat 50% sale on all men's pr...
4,Hello!! Offer offer offer!! just for you grab it
5,Hi! Please call when free
6,but we do promise that your marketing efforts ...


In [47]:
import string
def clean_text_nostem(text, stop_words=None):
    """Clean a message without stemming and return the kept tokens as one string.

    Parameters
    ----------
    text : str
        Message text. Case is not changed here; callers lower-case beforehand
        if they want case-insensitive stop-word matching.
    stop_words : collection of str, optional
        Words to drop. When omitted, the notebook-level ``stopwords`` list is
        used, which matches the original behaviour.

    Returns
    -------
    str
        Tokens longer than two characters that are not stop words, joined by
        single spaces (empty string when nothing survives).
    """
    if stop_words is None:
        stop_words = stopwords
    # Stripping spaces before and after the text
    text = text.strip(" ")
    # Replacing - with space so hyphenated words split into separate tokens
    text = text.replace("-", " ")
    # Replacing multiple whitespace characters with a single space
    text = re.sub(r"\s+", " ", text)
    # Removing punctuation characters
    text = "".join(char for char in text if char not in string.punctuation)
    # Creating tokens
    tokens = re.split(r"\W+", text)
    # Removing stopwords and tokens of two characters or fewer
    kept = [word for word in tokens if word not in stop_words and len(word) > 2]
    # Joining the surviving tokens back into a single string
    return " ".join(kept)

In [48]:
# Clean the test messages with the SAME function used to build the training
# corpus. `tfidf` learned its vocabulary from the stemmed output of
# clean_txt_new, so text cleaned without stemming leaves inflected words out of
# vocabulary and silently drops them from the feature vector.
# NOTE(review): predictions may differ from a run that used the non-stemming cleaner.
df['clean_text'] = df['Text'].apply(clean_txt_new)
# Reuse the fitted vectorizer (transform only, no refit) and densify for display.
inp = tfidf.transform(df['clean_text'])
inp_df = pd.DataFrame(inp.toarray())
inp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3490,3491,3492,3493,3494,3495,3496,3497,3498,3499
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# Predicted labels for the hand-written messages (1 = spam, 0 = not spam).
a = mnb.predict(X=inp_df)
a

array([1, 1, 0, 1, 1, 0, 0], dtype=int64)

In [50]:
# Walk the messages and their predicted labels together. Pairing by position
# with zip avoids the manual counter and the label-based df['Text'][c] lookup,
# which only works while df keeps its default 0..n-1 index.
for message, label in zip(df['Text'], a):
    if label == 1:
        # Spam verdicts echo the offending message first.
        print(message)
        print("It is a Spam Message")
    else:
        # Non-spam verdicts are reported without echoing the message text.
        print("Not a Spam Message")

Grab the offer this Christmas!! Flat 40% off sale on all men's accessories and clothing.Hurry!! Offer valid till 27th December
It is a Spam Message
Hey Ranjan!! offer offer!! just for you grab it up
It is a Spam Message
Not a Spam Message
Grab the offer!! Flat 50% sale on all men's product
It is a Spam Message
Hello!! Offer offer offer!! just for you grab it
It is a Spam Message
Not a Spam Message
Not a Spam Message
