In [120]:
import numpy as np
import pandas as pd
df = pd.read_csv('/content/train-dataset.csv')

# shuffling all our data
df = df.sample(frac=1)

# reading only Message_body and labal
df = df[['Message_body','Label']]
df

Unnamed: 0,Message_body,Label
1574,Santa Calling! Would your little ones like a c...,Spam
1672,We tried to contact you re your reply to our o...,Spam
1566,URGENT! Last weekend's draw shows that you hav...,Spam
1401,"Hi, the SEXYCHAT girls are for you to text the...",Spam
732,HELLO PEACH! MY CAKE TASTS LUSH!,Non-Spam
...,...,...
263,Its too late:)but its k.wish you the same.,Non-Spam
128,Ok k..sry i knw 2 siva..tats y i askd..,Non-Spam
1238,Natalie (20 / F) is _ you to be her _. Reply Y...,Spam
1540,"ou guaranteed are the Nokia latest, Phone a iP...",Spam


In [121]:
# reading our test data
df_test = pd.read_csv('/content/train-dataset.csv',encoding='cp1252')
df_test

Unnamed: 0.1,Unnamed: 0,S. No.,Message_body,Label
0,0,1.0,Rofl. Its true to its name,Non-Spam
1,1,2.0,The guy did some bitching but I acted like i'd...,Non-Spam
2,2,3.0,"Pity, * was in mood for that. So...any other s...",Non-Spam
3,3,4.0,Will Ã¼ b going to esplanade fr home?,Non-Spam
4,6,7.0,Huh y lei...,Non-Spam
...,...,...,...,...
1672,920,921.0,We tried to contact you re your reply to our o...,Spam
1673,924,925.0,You are being ripped off! Get your mobile cont...,Spam
1674,938,939.0,Todays Voda numbers ending with 7634 are selec...,Spam
1675,939,940.0,Please call our customer service representativ...,Spam


## Remove Punctuation

In [122]:
#library that contains punctuation
import string
# list of all punctuations we have
print(string.punctuation)

#defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree


#storing the puntuation free text for both training and testing data
df['clean_msg']= df['Message_body'].apply(lambda x:remove_punctuation(x))
df_test['clean_msg'] = df_test['Message_body'].apply(lambda x:remove_punctuation(x))

df.head()

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


Unnamed: 0,Message_body,Label,clean_msg
1574,Santa Calling! Would your little ones like a c...,Spam,Santa Calling Would your little ones like a ca...
1672,We tried to contact you re your reply to our o...,Spam,We tried to contact you re your reply to our o...
1566,URGENT! Last weekend's draw shows that you hav...,Spam,URGENT Last weekends draw shows that you have ...
1401,"Hi, the SEXYCHAT girls are for you to text the...",Spam,Hi the SEXYCHAT girls are for you to text them...
732,HELLO PEACH! MY CAKE TASTS LUSH!,Non-Spam,HELLO PEACH MY CAKE TASTS LUSH


## Message Lower

In [123]:
df['clean_msg']= df['clean_msg'].apply(lambda x: x.lower())
df_test['clean_msg']= df_test['clean_msg'].apply(lambda x: x.lower())

## Remove Stopwords

Before stopwords removal tokenisation is required

In [124]:
#defining function for tokenization
import re
#whitespace tokenizer
from nltk.tokenize import WhitespaceTokenizer

def tokenization(text):
    tk = WhitespaceTokenizer()
    return tk.tokenize(text)

#applying function to the column for making tokens in both Training and Testing data
df['tokenised_clean_msg']= df['clean_msg'].apply(lambda x: tokenization(x))
df_test['tokenised_clean_msg']= df_test['clean_msg'].apply(lambda x: tokenization(x))

In [125]:
#importing nlp library
import nltk
nltk.download('stopwords')
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')


#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

#applying the function for removal of stopwords
df['cleaned_tokens']= df['tokenised_clean_msg'].apply(lambda x:remove_stopwords(x))

df_test['cleaned_tokens']= df_test['tokenised_clean_msg'].apply(lambda x:remove_stopwords(x))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Stemming

In [126]:
#importing the Stemming function from nltk library
from nltk.stem.porter import PorterStemmer
#defining the object for stemming
porter_stemmer = PorterStemmer()

#defining a function for stemming
def stemming(text):

  stem_text = [porter_stemmer.stem(word) for word in text]
  return stem_text

# applying function for stemming
df['cleaned_tokens']=df['cleaned_tokens'].apply(lambda x: stemming(x))
df_test['cleaned_tokens']=df_test['cleaned_tokens'].apply(lambda x: stemming(x))

In [127]:
def join_text(text):
  return " ".join(text)

# join text because till now we have tokens
df['final_txt'] = df['cleaned_tokens'].apply(lambda x  : join_text(x))
df_test['final_txt'] = df_test['cleaned_tokens'].apply(lambda x  : join_text(x))


In [128]:
df.head()

Unnamed: 0,Message_body,Label,clean_msg,tokenised_clean_msg,cleaned_tokens,final_txt
1574,Santa Calling! Would your little ones like a c...,Spam,santa calling would your little ones like a ca...,"[santa, calling, would, your, little, ones, li...","[santa, call, would, littl, one, like, call, s...",santa call would littl one like call santa xma...
1672,We tried to contact you re your reply to our o...,Spam,we tried to contact you re your reply to our o...,"[we, tried, to, contact, you, re, your, reply,...","[tri, contact, repli, offer, video, phone, 750...",tri contact repli offer video phone 750 anytim...
1566,URGENT! Last weekend's draw shows that you hav...,Spam,urgent last weekends draw shows that you have ...,"[urgent, last, weekends, draw, shows, that, yo...","[urgent, last, weekend, draw, show, £1000, cas...",urgent last weekend draw show £1000 cash spani...
1401,"Hi, the SEXYCHAT girls are for you to text the...",Spam,hi the sexychat girls are for you to text them...,"[hi, the, sexychat, girls, are, for, you, to, ...","[hi, sexychat, girl, text, great, night, chat,...",hi sexychat girl text great night chat stop st...
732,HELLO PEACH! MY CAKE TASTS LUSH!,Non-Spam,hello peach my cake tasts lush,"[hello, peach, my, cake, tasts, lush]","[hello, peach, cake, tast, lush]",hello peach cake tast lush


## Mapping Text Labels to Numeric Value

In [129]:
df = df[['final_txt','Label']]

df_test = df[['final_txt',"Label"]]

# as our target variable is categorical it is important to convert our categorical varibale to Numeric variable
# Non-Spam --> 0
# SPam --> 1

final_dict = {'Non-Spam':0,'Spam':1}
df['Label'] = df['Label'].map(final_dict)
df_test['Label'] = df_test['Label'].map(final_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


## Converting Emoticons

In [130]:
# Thanks : https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py
EMOTICONS = {
    u":‑\)":"Happy face or smiley",
    u":\)":"Happy face or smiley",
    u":-\]":"Happy face or smiley",
    u":\]":"Happy face or smiley",
    u":-3":"Happy face smiley",
    u":3":"Happy face smiley",
    u":->":"Happy face smiley",
    u":>":"Happy face smiley",
    u"8-\)":"Happy face smiley",
    u":o\)":"Happy face smiley",
    u":-\}":"Happy face smiley",
    u":\}":"Happy face smiley",
    u":-\)":"Happy face smiley",
    u":c\)":"Happy face smiley",
    u":\^\)":"Happy face smiley",
    u"=\]":"Happy face smiley",
    u"=\)":"Happy face smiley",
    u":‑D":"Laughing, big grin or laugh with glasses",
    u":D":"Laughing, big grin or laugh with glasses",
    u"8‑D":"Laughing, big grin or laugh with glasses",
    u"8D":"Laughing, big grin or laugh with glasses",
    u"X‑D":"Laughing, big grin or laugh with glasses",
    u"XD":"Laughing, big grin or laugh with glasses",
    u"=D":"Laughing, big grin or laugh with glasses",
    u"=3":"Laughing, big grin or laugh with glasses",
    u"B\^D":"Laughing, big grin or laugh with glasses",
    u":-\)\)":"Very happy",
    u":‑\(":"Frown, sad, andry or pouting",
    u":-\(":"Frown, sad, andry or pouting",
    u":\(":"Frown, sad, andry or pouting",
    u":‑c":"Frown, sad, andry or pouting",
    u":c":"Frown, sad, andry or pouting",
    u":‑<":"Frown, sad, andry or pouting",
    u":<":"Frown, sad, andry or pouting",
    u":‑\[":"Frown, sad, andry or pouting",
    u":\[":"Frown, sad, andry or pouting",
    u":-\|\|":"Frown, sad, andry or pouting",
    u">:\[":"Frown, sad, andry or pouting",
    u":\{":"Frown, sad, andry or pouting",
    u":@":"Frown, sad, andry or pouting",
    u">:\(":"Frown, sad, andry or pouting",
    u":'‑\(":"Crying",
    u":'\(":"Crying",
    u":'‑\)":"Tears of happiness",
    u":'\)":"Tears of happiness",
    u"D‑':":"Horror",
    u"D:<":"Disgust",
    u"D:":"Sadness",
    u"D8":"Great dismay",
    u"D;":"Great dismay",
    u"D=":"Great dismay",
    u"DX":"Great dismay",
    u":‑O":"Surprise",
    u":O":"Surprise",
    u":‑o":"Surprise",
    u":o":"Surprise",
    u":-0":"Shock",
    u"8‑0":"Yawn",
    u">:O":"Yawn",
    u":-\*":"Kiss",
    u":\*":"Kiss",
    u":X":"Kiss",
    u";‑\)":"Wink or smirk",
    u";\)":"Wink or smirk",
    u"\*-\)":"Wink or smirk",
    u"\*\)":"Wink or smirk",
    u";‑\]":"Wink or smirk",
    u";\]":"Wink or smirk",
    u";\^\)":"Wink or smirk",
    u":‑,":"Wink or smirk",
    u";D":"Wink or smirk",
    u":‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"X‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"XP":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"d:":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"=p":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u">:P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":-[.]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":S":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":‑\|":"Straight face",
    u":\|":"Straight face",
    u":$":"Embarrassed or blushing",
    u":‑x":"Sealed lips or wearing braces or tongue-tied",
    u":x":"Sealed lips or wearing braces or tongue-tied",
    u":‑#":"Sealed lips or wearing braces or tongue-tied",
    u":#":"Sealed lips or wearing braces or tongue-tied",
    u":‑&":"Sealed lips or wearing braces or tongue-tied",
    u":&":"Sealed lips or wearing braces or tongue-tied",
    u"O:‑\)":"Angel, saint or innocent",
    u"O:\)":"Angel, saint or innocent",
    u"0:‑3":"Angel, saint or innocent",
    u"0:3":"Angel, saint or innocent",
    u"0:‑\)":"Angel, saint or innocent",
    u"0:\)":"Angel, saint or innocent",
    u":‑b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"0;\^\)":"Angel, saint or innocent",
    u">:‑\)":"Evil or devilish",
    u">:\)":"Evil or devilish",
    u"\}:‑\)":"Evil or devilish",
    u"\}:\)":"Evil or devilish",
    u"3:‑\)":"Evil or devilish",
    u"3:\)":"Evil or devilish",
    u">;\)":"Evil or devilish",
    u"\|;‑\)":"Cool",
    u"\|‑O":"Bored",
    u":‑J":"Tongue-in-cheek",
    u"#‑\)":"Party all night",
    u"%‑\)":"Drunk or confused",
    u"%\)":"Drunk or confused",
    u":-###..":"Being sick",
    u":###..":"Being sick",
    u"<:‑\|":"Dump",
    u"\(>_<\)":"Troubled",
    u"\(>_<\)>":"Troubled",
    u"\(';'\)":"Baby",
    u"\(\^\^>``":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(\^_\^;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(~_~;\) \(・\.・;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-\)zzz":"Sleeping",
    u"\(\^_-\)":"Wink",
    u"\(\(\+_\+\)\)":"Confused",
    u"\(\+o\+\)":"Confused",
    u"\(o\|o\)":"Ultraman",
    u"\^_\^":"Joyful",
    u"\(\^_\^\)/":"Joyful",
    u"\(\^O\^\)／":"Joyful",
    u"\(\^o\^\)／":"Joyful",
    u"\(__\)":"Kowtow as a sign of respect, or dogeza for apology",
    u"_\(\._\.\)_":"Kowtow as a sign of respect, or dogeza for apology",
    u"<\(_ _\)>":"Kowtow as a sign of respect, or dogeza for apology",
    u"<m\(__\)m>":"Kowtow as a sign of respect, or dogeza for apology",
    u"m\(__\)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"m\(_ _\)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"\('_'\)":"Sad or Crying",
    u"\(/_;\)":"Sad or Crying",
    u"\(T_T\) \(;_;\)":"Sad or Crying",
    u"\(;_;":"Sad of Crying",
    u"\(;_:\)":"Sad or Crying",
    u"\(;O;\)":"Sad or Crying",
    u"\(:_;\)":"Sad or Crying",
    u"\(ToT\)":"Sad or Crying",
    u";_;":"Sad or Crying",
    u";-;":"Sad or Crying",
    u";n;":"Sad or Crying",
    u";;":"Sad or Crying",
    u"Q\.Q":"Sad or Crying",
    u"T\.T":"Sad or Crying",
    u"QQ":"Sad or Crying",
    u"Q_Q":"Sad or Crying",
    u"\(-\.-\)":"Shame",
    u"\(-_-\)":"Shame",
    u"\(一一\)":"Shame",
    u"\(；一_一\)":"Shame",
    u"\(=_=\)":"Tired",
    u"\(=\^\·\^=\)":"cat",
    u"\(=\^\·\·\^=\)":"cat",
    u"=_\^=	":"cat",
    u"\(\.\.\)":"Looking down",
    u"\(\._\.\)":"Looking down",
    u"\^m\^":"Giggling with hand covering mouth",
    u"\(\・\・?":"Confusion",
    u"\(?_?\)":"Confusion",
    u">\^_\^<":"Normal Laugh",
    u"<\^!\^>":"Normal Laugh",
    u"\^/\^":"Normal Laugh",
    u"\（\*\^_\^\*）" :"Normal Laugh",
    u"\(\^<\^\) \(\^\.\^\)":"Normal Laugh",
    u"\(^\^\)":"Normal Laugh",
    u"\(\^\.\^\)":"Normal Laugh",
    u"\(\^_\^\.\)":"Normal Laugh",
    u"\(\^_\^\)":"Normal Laugh",
    u"\(\^\^\)":"Normal Laugh",
    u"\(\^J\^\)":"Normal Laugh",
    u"\(\*\^\.\^\*\)":"Normal Laugh",
    u"\(\^—\^\）":"Normal Laugh",
    u"\(#\^\.\^#\)":"Normal Laugh",
    u"\（\^—\^\）":"Waving",
    u"\(;_;\)/~~~":"Waving",
    u"\(\^\.\^\)/~~~":"Waving",
    u"\(-_-\)/~~~ \($\·\·\)/~~~":"Waving",
    u"\(T_T\)/~~~":"Waving",
    u"\(ToT\)/~~~":"Waving",
    u"\(\*\^0\^\*\)":"Excited",
    u"\(\*_\*\)":"Amazed",
    u"\(\*_\*;":"Amazed",
    u"\(\+_\+\) \(@_@\)":"Amazed",
    u"\(\*\^\^\)v":"Laughing,Cheerful",
    u"\(\^_\^\)v":"Laughing,Cheerful",
    u"\(\(d[-_-]b\)\)":"Headphones,Listening to music",
    u'\(-"-\)':"Worried",
    u"\(ーー;\)":"Worried",
    u"\(\^0_0\^\)":"Eyeglasses",
    u"\(\＾ｖ\＾\)":"Happy",
    u"\(\＾ｕ\＾\)":"Happy",
    u"\(\^\)o\(\^\)":"Happy",
    u"\(\^O\^\)":"Happy",
    u"\(\^o\^\)":"Happy",
    u"\)\^o\^\(":"Happy",
    u":O o_O":"Surprised",
    u"o_0":"Surprised",
    u"o\.O":"Surpised",
    u"\(o\.o\)":"Surprised",
    u"oO":"Surprised",
    u"\(\*￣m￣\)":"Dissatisfied",
    u"\(‘A`\)":"Snubbed or Deflated"
}

In [131]:
def convert_emoticons(text):
    for emot in EMOTICONS:
        text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), text)
    return text
df['final_txt']=df['final_txt'].apply(lambda x: convert_emoticons(x))
df_test['final_txt']=df_test['final_txt'].apply(lambda x: convert_emoticons(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


## Spell checker

In [132]:
# !!pip install pyspellchecker

In [133]:
# from spellchecker import SpellChecker

# spell = SpellChecker()
# def correct_spellings(text):
#     corrected_text = []
#     misspelled_words = spell.unknown(text.split())
#     for word in text.split():
#         if word in misspelled_words:
#             corrected_text.append(spell.correction(word))
#         else:
#             corrected_text.append(word)
#     return " ".join(corrected_text)


In [134]:
# df['final_txt']=df['final_txt'].apply(lambda x: correct_spellings(x))
# df_test['final_txt']=df_test['final_txt'].apply(lambda x: correct_spellings(x))

## BAG OF WORDS MODEL


In [135]:
# importing our Bag of Words model from sklearn
from sklearn.feature_extraction.text import CountVectorizer

train_documents_for_bow  = df['final_txt'].tolist()

test_docs = df_test['final_txt'].tolist()

# Create a Vectorizer Object
vectorizer = CountVectorizer(max_features=100)

vectorizer.fit(train_documents_for_bow )

# Printing the identified Unique words along with their indices
print("Vocabulary: ", vectorizer.vocabulary_)

# Encode the Document
X_train = vectorizer.fit_transform(train_documents_for_bow )


# Naive Bayes 
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()
classifier.fit(X_train.toarray(), df['Label'])


X_test = vectorizer.transform(test_docs)
# Predict Class
y_pred = classifier.predict(X_test.toarray())

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(df_test['Label'].tolist(), y_pred)
print("Accuracy is --> ",accuracy*100)

Vocabulary:  {'call': 13, 'one': 59, 'like': 45, 'xma': 98, 'time': 85, 'tri': 88, 'contact': 22, 'repli': 71, 'phone': 61, 'min': 50, 'line': 46, 'urgent': 92, 'last': 43, 'weekend': 95, 'draw': 26, 'show': 77, '1000': 1, 'cash': 15, 'holiday': 38, 'claim': 16, '150ppm': 3, 'text': 83, 'great': 36, 'stop': 80, 'servic': 76, 'today': 86, 'number': 56, 'select': 74, 'receiv': 70, 'reward': 72, 'pleas': 64, 'code': 18, 'rate': 69, 'want': 93, 'ltgt': 48, 'free': 28, 'msg': 52, 'txt': 89, 'ok': 58, 'take': 81, 'ur': 91, 'mobil': 51, 'www': 97, 'win': 96, 'co': 17, 'uk': 90, 'ye': 99, 'award': 12, 'camera': 14, 'think': 84, 'know': 41, 'latest': 44, 'go': 33, 'im': 40, 'point': 65, 'per': 60, 'com': 20, 'guarante': 37, '100': 0, 'game': 30, '50': 7, 'give': 32, 'prize': 68, 'send': 75, 'pick': 62, 'custom': 23, 'need': 53, 'pl': 63, 'collect': 19, '5000': 8, 'day': 24, '18': 5, '150p': 2, 'sm': 78, 'nokia': 55, 'new': 54, 'get': 31, 'poli': 66, 'tone': 87, 'oh': 57, 'privat': 67, '2003': 6