### Email Spam Classifier

In [1]:
import numpy as np
import pandas as pd

# Read the raw dataset; the file is decoded as latin-1.
df = pd.read_csv('spam.csv', encoding='latin-1')
print(df.shape)

# Discard the three 'Unnamed' columns and give the two remaining columns
# descriptive names: v1 -> target (the label), v2 -> text (the message).
df = (
    df
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={'v1': 'target', 'v2': 'text'})
)


(5572, 5)


#### 1. Data Cleaning

In [2]:
from sklearn.preprocessing import LabelEncoder
# Replace the string labels in `target` with integer codes. Later cells treat
# 0 as ham and 1 as spam.
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

# Data-quality checks: missing values per column, then the number of rows that
# are exact duplicates of an earlier row.
df.isna().sum()
df.duplicated().sum()

# Keep only the first occurrence of each duplicated row, then re-check the
# duplicate count and the resulting shape.
df = df.drop_duplicates(keep="first")
df.duplicated().sum()
df.shape

(5169, 2)

#### 2. EDA (Exploratory Data Analysis)

In [None]:
import nltk

# Class balance: how many messages carry each encoded label.
df['target'].value_counts()

# Fetch the Punkt tokenizer data; presumably this is what the
# nltk.word_tokenize / nltk.sent_tokenize calls later in the notebook rely on.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Character count of every message: inspected once, then stored as a column.
df['text'].apply(len)
df['num_char'] = df['text'].apply(len)

# Token count and sentence count per message, using NLTK's tokenizers.
df['num_words'] = df['text'].apply(lambda message: len(nltk.word_tokenize(message)))
df['num_sent'] = df['text'].apply(lambda message: len(nltk.sent_tokenize(message)))

# Summary statistics of the three length features over the whole dataset.
df[['num_char', 'num_words', 'num_sent']].describe()

# Same summary restricted to ham messages (target == 0).
df.loc[df['target'] == 0, ['num_char', 'num_words', 'num_sent']].describe()
# Same summary restricted to spam messages (target == 1).
df.loc[df['target'] == 1, ['num_char', 'num_words', 'num_sent']].describe()

#### 3. Text Preprocessing

In [None]:
import nltk
from nltk.corpus import stopwords
import string

from nltk.stem.porter import PorterStemmer
# transform_text() below filters tokens against stopwords.words('english'),
# which needs the NLTK 'stopwords' corpus on disk. The notebook otherwise only
# downloads the punkt tokenizer data, so fetch the corpus here to keep a
# fresh-environment run from failing with a LookupError.
nltk.download('stopwords')

# Porter stemmer shared by transform_text(); the call below is a quick
# sanity check of the stemmer on a single word.
ps = PorterStemmer()
ps.stem('dancing')

# Cache for the stop-word set so the NLTK corpus is read once per kernel
# session instead of once per token.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Processing steps, in order:
      1. lower-case the text;
      2. tokenise it with ``nltk.word_tokenize``;
      3. keep only tokens made up entirely of alphanumeric characters;
      4. drop English stop words;
      5. stem every remaining token with the module-level Porter stemmer ``ps``.

    Args:
        text: The message to clean, as a ``str``.

    Returns:
        The stemmed tokens joined by single spaces. The result is an empty
        string when no token survives the filters.
    """
    stop_words = _english_stop_words()
    tokens = nltk.word_tokenize(text.lower())

    # str.isalnum() already rejects every token that contains a punctuation
    # character, so no separate string.punctuation check is needed.
    kept = [token for token in tokens if token.isalnum() and token not in stop_words]

    return " ".join(ps.stem(token) for token in kept)
    

# Smoke-test the cleaner on a sample sentence.
transform_text("heelo my name is priyank")

# Clean every message and keep the result next to the raw text.
df['transformed_text'] = df['text'].apply(lambda message: transform_text(message))
df.head()

#### 4. Word Cloud

In [None]:
import matplotlib.pyplot as plt

from wordcloud import WordCloud
# 500x500 word-cloud renderer on a white background.
wc = WordCloud(
    width=500,
    height=500,
    min_font_size=10,
    background_color="white",
)

# Join every cleaned spam message (target == 1) into one space-separated
# string and build the cloud from it.
spam_wc = wc.generate(
    df.loc[df['target'] == 1, 'transformed_text'].str.cat(sep=" ")
)

plt.figure(figsize=(10, 6))
plt.imshow(spam_wc)

In [None]:
from collections import Counter


def _label_corpus(frame, label):
    """Collect every word of the cleaned messages that carry a given label.

    Args:
        frame: DataFrame with a `target` column and a `transformed_text`
            column of strings.
        label: Value of `target` selecting the rows to include.

    Returns:
        A flat list with the whitespace-separated words of each selected
        message, in row order.
    """
    selected = frame.loc[frame['target'] == label, 'transformed_text'].tolist()
    return [word for msg in selected for word in msg.split()]


# Words of all spam messages (target == 1) and their 30 most frequent entries.
spam_corpus = _label_corpus(df, 1)

len(spam_corpus)

spam = pd.DataFrame(Counter(spam_corpus).most_common(30))


# Words of all ham messages (target == 0) and their 30 most frequent entries.
Ham_corpus = _label_corpus(df, 0)

len(Ham_corpus)

ham = pd.DataFrame(Counter(Ham_corpus).most_common(30))

#### 5. Model Building

In [None]:
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer 

# Plain bag-of-words vectorizer; created here but not used by the code below.
cv = CountVectorizer()
# TF-IDF vectorizer capped at 3000 features.
tfidf = TfidfVectorizer(max_features=3000)

# Dense feature matrix: one row per message, one column per retained term.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split that follows, so held-out messages also shape the vocabulary and IDF
# weights. Confirm this is acceptable for the reported scores.
X = tfidf.fit_transform(df['transformed_text'])
X = X.toarray()

X.shape

# Encoded labels as a NumPy array, aligned row-for-row with X.
y = df['target'].values
y

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

def _fit_and_report(model):
    """Fit `model` on the training split and print its test-set metrics.

    Prints, in this order, the accuracy, the confusion matrix and the
    precision of the predictions for `X_test` measured against `y_test`.

    Args:
        model: Estimator exposing `fit(X, y)` and `predict(X)`.

    Returns:
        The predictions produced for `X_test`.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(accuracy_score(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
    print(precision_score(y_test, predictions))
    return predictions


# The three Naive Bayes variants being compared.
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

# Train and evaluate each variant on the same split; the predictions stay
# available under their original names.
y_pred1 = _fit_and_report(gnb)

y_pred2 = _fit_and_report(mnb)

y_pred3 = _fit_and_report(bnb)

In [None]:
import pickle
# Persist the fitted TF-IDF vectorizer and the Multinomial Naive Bayes model.
# The `with` blocks close each file as soon as the dump finishes, so the
# pickles are fully flushed to disk and no file handle is left open.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)