# Spam Email Detection ( Naive Bayes using Sklearn )

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv("spam.csv", encoding='ISO-8859-1')

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
le = LabelEncoder()

In [5]:
data = df.to_numpy()

In [6]:
X = data[:, 1]
y = data[:,0]

In [7]:
X.shape, y.shape

((5572,), (5572,))

In [8]:
tokenizer = RegexpTokenizer("\w+")
sw = set(stopwords.words("english"))
ps = PorterStemmer()

In [9]:
def getstem(review):
    review = review.lower()
    tokens = tokenizer.tokenize(review)   # breaking into small words
    removed_stopwords = [w for w in tokens if w not in sw]
    stemmed_words = [ps.stem(w) for w in removed_stopwords]
    clean_review = ' '.join(stemmed_words)
    return clean_review

In [10]:
# get a clean document
def getDoc(document):
    d = []
    for doc in document:
        d.append(getstem(doc))
        
    return d

In [11]:
stemmed_doc = getDoc(X)

In [12]:
stemmed_doc[:10]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send å 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea å 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030']

In [13]:
X

array(['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
       'Ok lar... Joking wif u oni...',
       "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
       ..., 'Pity, * was in mood for that. So...any other suggestions?',
       "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free",
       'Rofl. Its true to its name'], dtype=object)

In [14]:
cv = CountVectorizer()

In [15]:
# create my vocab
vc = cv.fit_transform(stemmed_doc)

In [16]:
X = vc.todense()
X

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)

In [18]:
# NB from sklearn

In [19]:
from sklearn.naive_bayes import MultinomialNB

In [20]:
model = MultinomialNB()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.977705274605764

In [21]:
messages = [
    """
    Congrats for your #AirtelThanks GOLD membership
You have access to 10000+ movies & Live TV on Airtel Xstream App Premium, Unlimited #
Claim NOW u.airtel.in/Gold7
    """,
    """Kotak 811 - a bank account that's ready in minutes. Get your account Now http://kotk.in/5pAVIC *T&C apply.""",

    """You will get certificate of internship after completion of program and if you want certificate for project report then you have to submit project report of your choice topic.....
""",
    """We have received your application. Please watch for an email for updates on the status of your application."""
]

In [22]:
def prepare(messages):
    d = getDoc(messages)
    # dont do fit_transform!! it will create new vocab.
    return cv.transform(d)

messages = prepare(messages)

In [23]:
y_pred = model.predict(messages)
y_pred

array(['spam', 'spam', 'ham', 'ham'], dtype='<U4')