In [1]:
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
from warnings import filterwarnings as fw
fw('ignore')
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import nltk
from string import punctuation
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [2]:
data = pd.read_csv('SMSSpamCollection.tsv', sep = '\t', names = ['type', 'msg'])
data

Unnamed: 0,type,msg
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
...,...,...
5563,spam,This is the 2nd time we have tried 2 contact u...
5564,ham,Will ü b going to esplanade fr home?
5565,ham,"Pity, * was in mood for that. So...any other s..."
5566,ham,The guy did some bitching but I acted like i'd...


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5568 entries, 0 to 5567
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    5568 non-null   object
 1   msg     5568 non-null   object
dtypes: object(2)
memory usage: 87.1+ KB


In [4]:
data.describe().T

Unnamed: 0,count,unique,top,freq
type,5568,2,ham,4822
msg,5568,5165,"Sorry, I'll call later",30


In [5]:
data.groupby('type').describe()

Unnamed: 0_level_0,msg,msg,msg,msg
Unnamed: 0_level_1,count,unique,top,freq
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4822,4513,"Sorry, I'll call later",30
spam,746,652,Please call our customer service representativ...,4


In [6]:
for i in data['msg'][data['type']=='ham']:
    print(i)
    print('-')

I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.
-
Nah I don't think he goes to usf, he lives around here though
-
Even my brother is not like to speak with me. They treat me like aids patent.
-
I HAVE A DATE ON SUNDAY WITH WILL!!
-
As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
-
I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.
-
Oh k...i'm watching here:)
-
Eh u remember how 2 spell his name... Yes i did. He v naughty make until i v wet.
-
Fine if thats the way u feel. Thats the way its gota b
-
Is that seriously how you spell his name?
-
I‘m going to try for 2 months ha ha only joking
-
So ü pay first lar... Then when is da stock comin...
-
Aft i finish my lunch t

In [7]:
data['length'] = data['msg'].str.len()
data.sample(5)

Unnamed: 0,type,msg,length
1149,ham,Ok i go change also...,22
4328,ham,Nope i'll come online now..,27
3811,ham,Can. Dunno wat to get 4 her...,30
3435,ham,Its good to hear from you,25
3915,ham,No need to ke qi... Ü too bored izzit y sudden...,63


In [8]:
data.groupby('type').describe()

Unnamed: 0_level_0,length,length,length,length,length,length,length,length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ham,4822.0,71.487764,58.451963,2.0,33.0,52.0,93.0,910.0
spam,746.0,138.659517,28.891361,13.0,133.0,149.0,157.0,223.0


In [9]:
data['word_count'] = data['msg'].str.split().str.len()
data.sample(5)

Unnamed: 0,type,msg,length,word_count
3952,ham,I knew it... U slept v late yest? Wake up so l...,52,12
2386,ham,Boo. How's things? I'm back at home and a litt...,66,13
956,ham,Where @,7,2
644,spam,PRIVATE! Your 2003 Account Statement for shows...,136,20
5430,ham,Jesus armand really is trying to tell everybod...,59,11


In [10]:
data.groupby('type').describe().T

Unnamed: 0,type,ham,spam
length,count,4822.0,746.0
length,mean,71.487764,138.659517
length,std,58.451963,28.891361
length,min,2.0,13.0
length,25%,33.0,133.0
length,50%,52.0,149.0
length,75%,93.0,157.0
length,max,910.0,223.0
word_count,count,4822.0,746.0
word_count,mean,14.311489,23.900804


In [11]:
data['msg'][(data['word_count'] == 2) & (data['type'] == 'spam')]

3738         2/2 146tf150p
3977    ringtoneking 84484
Name: msg, dtype: object

In [12]:
mm = data['msg'][data['length'] == max(data['length'])].values[0]
mm

"For me the love should start with attraction.i should feel that I need her every time around me.she should be the first thing which comes in my thoughts.I would start the day and end it with her.she should be there every time I dream.love will be then when my every breath has her name.my life should happen around her.my life will be named to her.I would cry for her.will give all my happiness and take all her sorrows.I will be ready to fight with anyone for her.I will be in love when I will be doing the craziest things for her.love will be when I don't have to proove anyone that my girl is the most beautiful lady on the whole planet.I will always be singing praises for her.love will be when I start up making chicken curry and end up makiing sambar.life will be the most beautiful then.will get every morning and thank god for the day because she is with me.I would like to say a lot..will tell later.."

In [13]:
data['msg'][data['length'] > 100]

0       I've been searching for the right words to tha...
1       Free entry in 2 a wkly comp to win FA Cup fina...
5       As per your request 'Melle Melle (Oru Minnamin...
6       WINNER!! As a valued network customer you have...
7       Had your mobile 11 months or more? U R entitle...
                              ...                        
5551    Yeh. Indians was nice. Tho it did kane me off ...
5553    No. I meant the calculation is the same. That ...
5562    REMINDER FROM O2: To get 2.50 pounds free call...
5563    This is the 2nd time we have tried 2 contact u...
5566    The guy did some bitching but I acted like i'd...
Name: msg, Length: 1763, dtype: object

In [14]:
m7 = data['msg'][7]
m7

'Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030'

In [15]:
lemma = WordNetLemmatizer()
stemmer = PorterStemmer()

In [16]:
sw = ["i've", "i'll", "we'll", "ve"]

In [47]:
def text_process(t):
    all_words = []
    t = t.replace('.', '. ').replace('!', '! ').replace('?', '? ')
    for sent in nltk.sent_tokenize(t.lower()):
        words = nltk.word_tokenize(sent)
        words = [word for word in words if (word not in stopwords.words('english')) and (word not in sw)]
#         print(words)
#         words = [lemma.lemmatize(word, wordnet.ADJ) for word in words]
        words = [stemmer.stem(word) for word in words]
        words = [word for word in words if word not in punctuation]
#         print(words)
        words = [word for word in words if not word.isnumeric()]
        all_words += words
#     print(all_words)
    all_words = ''.join([ch for ch in ' '.join(all_words) if (ch not in punctuation) and (not ch.isnumeric())])
    return all_words

In [18]:
m7

'Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030'

In [19]:
text7 = text_process(m7)
print(text7)

mobil month u r entitl updat latest colour mobil camera free call mobil updat co free
mobil month u r entitl updat latest colour mobil camera free call mobil updat co free


In [20]:
print(mm)
print()
textm = text_process(mm)
print(textm)
# . ke baad space nahi h to i alag nahi horha 
# to . ke baad space laga det hain

For me the love should start with attraction.i should feel that I need her every time around me.she should be the first thing which comes in my thoughts.I would start the day and end it with her.she should be there every time I dream.love will be then when my every breath has her name.my life should happen around her.my life will be named to her.I would cry for her.will give all my happiness and take all her sorrows.I will be ready to fight with anyone for her.I will be in love when I will be doing the craziest things for her.love will be when I don't have to proove anyone that my girl is the most beautiful lady on the whole planet.I will always be singing praises for her.love will be when I start up making chicken curry and end up makiing sambar.life will be the most beautiful then.will get every morning and thank god for the day because she is with me.I would like to say a lot..will tell later..

love start attract feel need everi time around first thing come thought would start day 

In [21]:
data['msg'].head(4).apply(text_process)

ve search right word thank breather promis wont take help grant fulfil promis wonder bless time
free entri wkli comp win fa cup final tkt st may  text fa receiv entri question std txt rate c s appli over s
nah nt think goe usf live around though
even brother like speak treat like aid patent


0    ve search right word thank breather promis won...
1    free entri wkli comp win fa cup final tkt st m...
2              nah nt think goe usf live around though
3        even brother like speak treat like aid patent
Name: msg, dtype: object

In [22]:
"i've" in stopwords.words('english')

False

In [23]:
# i've stopwords me nahi h haww

In [48]:
tfidfV = TfidfVectorizer()
tf_idf = tfidfV.fit_transform(data['msg'].apply(text_process))

In [25]:
tf_df = pd.DataFrame(tf_idf.toarray(), columns = tfidfV.get_feature_names())
tf_df.head()

Unnamed: 0,aa,aah,aaniy,aaooooright,aathi,ab,abbey,abdomen,abeg,abel,...,zero,zf,zhong,zindgi,zoe,zogtoriu,zoom,zouk,zyada,èn
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
tf_df.columns[0].isnumeric()

False

In [27]:
tfidfV.get_feature_names()

['aa',
 'aah',
 'aaniy',
 'aaooooright',
 'aathi',
 'ab',
 'abbey',
 'abdomen',
 'abeg',
 'abel',
 'aberdeen',
 'abi',
 'abil',
 'abiola',
 'abj',
 'abl',
 'abnorm',
 'abouta',
 'abroad',
 'absenc',
 'absolut',
 'absolutli',
 'abstract',
 'abt',
 'abta',
 'aburo',
 'abus',
 'ac',
 'academ',
 'acc',
 'accent',
 'accentur',
 'accept',
 'access',
 'accid',
 'accident',
 'accommod',
 'accommodationvouch',
 'accomod',
 'accordin',
 'accordingli',
 'account',
 'accumul',
 'ach',
 'achan',
 'achiev',
 'acid',
 'acknowledg',
 'aclpm',
 'acnt',
 'aco',
 'across',
 'acsmsreward',
 'act',
 'actin',
 'action',
 'activ',
 'actor',
 'actual',
 'acubootydeli',
 'acugoldvik',
 'acuhmmross',
 'acunat',
 'acunataliek',
 'acwicmbcktzr',
 'ad',
 'adam',
 'add',
 'addamsfa',
 'addi',
 'addict',
 'address',
 'adewal',
 'adi',
 'adjust',
 'admin',
 'administr',
 'admir',
 'admiss',
 'admit',
 'ador',
 'adp',
 'adress',
 'adrian',
 'adrink',
 'adsens',
 'adult',
 'advanc',
 'adventur',
 'advic',
 'advis',
 'a

In [28]:
tf_df.shape

(5568, 6534)

In [29]:
# whenever dealing with text data use naive bayes

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
xtrain, xtest, ytrain, ytest = train_test_split(tf_df, data['type'], test_size = 0.25, random_state = 0)

In [32]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report

In [33]:
model_bnb = BernoulliNB().fit(xtrain, ytrain)
print(model_bnb.score(xtrain, ytrain))
print(model_bnb.score(xtest, ytest))

0.9846743295019157
0.9734195402298851


In [34]:
model_bnb = MultinomialNB().fit(xtrain, ytrain)
print(model_bnb.score(xtrain, ytrain))
print(model_bnb.score(xtest, ytest))

0.9755747126436781
0.9640804597701149


In [35]:
# next project
# web scraping -> some mobiles reviews 
# not .text -> we have find out attributes to go on a particular link
# anchor tag ka attribute h href
# instead of .text we have to use attributes

In [36]:
# collect reviews and stars
# try to build a system to predict ki usko product kesa laga
# sirf review daalta h to hamko predict krna h ki neg reviw tha ya pos rating tha
# atleast 10 pages ka

In [37]:
xtrainy, xtesty, ytrainy, ytesty = train_test_split(data['msg'], data['type'], test_size = 0.25, random_state = 0)

In [38]:
from sklearn.pipeline import Pipeline

In [39]:
ddd =map(text_process, data['msg'])

In [40]:
TfidfVectorizer().fit_transform(ddd)

ve search right word thank breather promis wont take help grant fulfil promis wonder bless time
free entri wkli comp win fa cup final tkt st may  text fa receiv entri question std txt rate c s appli over s
nah nt think goe usf live around though
even brother like speak treat like aid patent
date sunday
per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun
winner valu network custom select receivea £ prize reward claim call  claim code kl valid hour
mobil month u r entitl updat latest colour mobil camera free call mobil updat co free
m gon na home soon nt want talk stuff anymor tonight k ve cri enough today
six chanc win cash  pound txt csh send  cost pday day  tsandc appli repli hl info
urgent week free membership £ prize jackpot txt word claim c www dbuk net lccltd pobox ldnwarw
xxxmobilemovieclub use credit click wap link next txt messag click http wap xxxmobilemovieclub com nqjkgighjjgcbl
oh k m watch
eh u rememb spell name ye v nau

<5568x6534 sparse matrix of type '<class 'numpy.float64'>'
	with 45746 stored elements in Compressed Sparse Row format>

In [41]:
model_pipe = Pipeline([
    ('bow', CountVectorizer(analyzer = text_process)),
    ('tfidf', TfidfTransformer()),
    ('clf', BernoulliNB())
])

In [42]:
model_pipe.fit(xtrainy, ytrainy)

sure result offer
lor
free messag activ free text messag repli messag word free term condit visit www  com
actual first time went bed long spoke woke  night
sent like
stop club tone repli  stop mix  see myton comenjoy html term club tone cost gbp week mfl po box mk wt 
k need login anyth
aiyar dun disturb u liao thk u lot aft ur cupboard come
shall call dear food
u re welcom caught u use broken english
pleas reserv ticket saturday eve chennai thirunelvali tirunelvali chennai sunday eve alreadi see net ticket avail want book ticket tackl
jay say re doublefaggot
lol mad first woke gave
year mani mile
allo brave buse taken train triumph mean ‘ b ‘ ham jolli good rest week
yo howz u girl never rang india l
p alfi moon s children need song ur mob tell ur m txt tone chariti nokia poli chariti poli zed profit chariti
oh fuck juswok bed boatin dock slept wid year old spinout giv u da gossip lr xxx
ha would nt say nt read anyth way u seem nt like judgement save friday pub
sm auction nokia i get

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x0000028F34BE4C10>)),
                ('tfidf', TfidfTransformer()), ('clf', BernoulliNB())])

In [43]:
model_pipe.score(xtrainy, ytrainy)

sure result offer
lor
free messag activ free text messag repli messag word free term condit visit www  com
actual first time went bed long spoke woke  night
sent like
stop club tone repli  stop mix  see myton comenjoy html term club tone cost gbp week mfl po box mk wt 
k need login anyth
aiyar dun disturb u liao thk u lot aft ur cupboard come
shall call dear food
u re welcom caught u use broken english
pleas reserv ticket saturday eve chennai thirunelvali tirunelvali chennai sunday eve alreadi see net ticket avail want book ticket tackl
jay say re doublefaggot
lol mad first woke gave
year mani mile
allo brave buse taken train triumph mean ‘ b ‘ ham jolli good rest week
yo howz u girl never rang india l
p alfi moon s children need song ur mob tell ur m txt tone chariti nokia poli chariti poli zed profit chariti
oh fuck juswok bed boatin dock slept wid year old spinout giv u da gossip lr xxx
ha would nt say nt read anyth way u seem nt like judgement save friday pub
sm auction nokia i get

0.8366858237547893

In [44]:
model_pipe.score(xtesty, ytesty)

bank granit issu strongbuy explos pick member nasdaq symbol cdgt per
purchas stuff today mail po box number
hey book kb sat alreadi lesson go ah keep sat night free need meet confirm lodg
hey leav friday wait ask superior tell
m happi babe woo hoo parti dude
tell bad charact u dnt lik ll tri chang lt gt add tat new year resolut wait ur repli frank good morn
werear free give otherwis nalla adi entey nattil kittum
pleas call immedi urgent messag wait
s said s bad dat e gal know u wat u
read shame tell take run blame u ever realli ever long time
mean come chase state watch mani movi want
congratul thank good friend u £ xma prize claim easi call p per minut btnationalr
love holiday monday feel even go dentist hour
lol ok ill tri send warn sprint dead slow ll prolli get tomorrow
lmao s fish memori need
freemsg award free mini digit camera repli snap collect prize quizclub opt stop pwk sp rwm ph
check room befor activ
urgent nd attempt contact u £ prize yesterday still await collect claim ca

0.860632183908046

In [45]:
ypredy = model_pipe.predict(xtesty)

bank granit issu strongbuy explos pick member nasdaq symbol cdgt per
purchas stuff today mail po box number
hey book kb sat alreadi lesson go ah keep sat night free need meet confirm lodg
hey leav friday wait ask superior tell
m happi babe woo hoo parti dude
tell bad charact u dnt lik ll tri chang lt gt add tat new year resolut wait ur repli frank good morn
werear free give otherwis nalla adi entey nattil kittum
pleas call immedi urgent messag wait
s said s bad dat e gal know u wat u
read shame tell take run blame u ever realli ever long time
mean come chase state watch mani movi want
congratul thank good friend u £ xma prize claim easi call p per minut btnationalr
love holiday monday feel even go dentist hour
lol ok ill tri send warn sprint dead slow ll prolli get tomorrow
lmao s fish memori need
freemsg award free mini digit camera repli snap collect prize quizclub opt stop pwk sp rwm ph
check room befor activ
urgent nd attempt contact u £ prize yesterday still await collect claim ca

In [46]:
print(classification_report(ytesty, ypredy))

              precision    recall  f1-score   support

         ham       0.97      0.86      0.91      1190
        spam       0.51      0.86      0.64       202

    accuracy                           0.86      1392
   macro avg       0.74      0.86      0.78      1392
weighted avg       0.91      0.86      0.87      1392

