In [82]:
import numpy as np
import pandas as pd

In [83]:
dt = pd.read_csv('spam1.csv')
dt.pop("Unnamed: 3")
dt.pop("Unnamed: 2")
dt.pop("Unnamed: 4")
dt.head(10)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [84]:

dt['spam'] = dt['v1'].map({'spam' : 1, "ham" : 0}).astype(int)
dt.head(5)

Unnamed: 0,v1,v2,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [85]:
#printing the colums in the folder
print("columns in the given table are :")
for i in dt.columns:
    print(i)

columns in the given table are :
v1
v2
spam


In [86]:
#finding the no.of rows in the type and text
t=len(dt['v1'])
print("no.of rows type column",t)
t=len(dt['v2'])
print("no of rows in text column :",t)


no.of rows type column 5572
no of rows in text column : 5572


# from here we convert the whole text to tokens called as tokenisation

In [87]:
dt['v2'][1]

'Ok lar... Joking wif u oni...'

In [88]:
def tokenizer(text):
    return text.split()

In [89]:
dt['v2'] = dt['v2'].apply(tokenizer)

In [90]:
dt['v2'][1]

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

In [91]:
# stemming
#it means that the removing additional suffix or prefixes : all having the same name
#for example waits, waiting,waited are all same sow e apply stemming making all to on word wait
# there are three types of stemmer snowball,porter,landcaster out of which snowball is latest and fast

In [92]:
from nltk.stem.snowball import SnowballStemmer
porter = SnowballStemmer("english", ignore_stopwords=False)

In [93]:
def stem_it(v2):
    return [porter.stem(word) for word in v2]


In [94]:
dt['v2']=dt['v2'].apply(stem_it)

In [95]:
dt['v2'][1] #this code is after the stemming

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

In [96]:
dt['v2'][152]

['ok...', 'ur', 'typic', 'reply...']

In [97]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [98]:
def lemmit_it(v2):
    return [lemmatizer.lemmatize(word,pos="a") for word in v2] #here a denotes adjective in position

In [99]:
dt['v2']=dt['v2'].apply(lemmit_it)

In [100]:
dt['v2'][152]

['ok...', 'ur', 'typic', 'reply...']

# STOP WORD REMOVAL

In [101]:
dt['v2'][216] # this is before stop word removal

['tired.', 'i', "haven't", 'slept', 'well', 'the', 'past', 'few', 'nights.']

In [102]:
#import nltk
#nltk.download('stopwords')


In [103]:
from nltk.corpus import stopwords
stop_words = stopwords.words("english")

In [104]:
def stop_it(v2):
    review = [word for word in v2 if not word in stop_words ]
    return review

In [105]:
dt["v2"] = dt["v2"].apply(stop_it)

In [106]:
dt['v2'][216] # afetr stop words removal

['tired.', 'slept', 'well', 'past', 'nights.']

In [107]:
dt.head(15)

Unnamed: 0,v1,v2,spam
0,ham,"[go, jurong, point,, crazy.., avail, onli, bug...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",1
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa...",0
4,ham,"[nah, think, goe, usf,, live, around, though]",0
5,spam,"[freemsg, hey, darl, 3, week, word, back!, i'd...",1
6,ham,"[even, brother, like, speak, me., treat, like,...",0
7,ham,"[per, request, mell, mell, (oru, minnaminungin...",0
8,spam,"[winner!!, valu, network, custom, select, rece...",1
9,spam,"[mobil, 11, month, more?, u, r, entitl, updat,...",1


In [108]:
dt['v2'] = dt['v2'].apply(' '.join)  #adds up space and joins all the words in it
#string.join()  ==> whatever the string is the join joins the list with that string

In [109]:
dt.head(15)

Unnamed: 0,v1,v2,spam
0,ham,"go jurong point, crazy.. avail onli bugi n gre...",0
1,ham,ok lar... joke wif u oni...,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,1
3,ham,u dun say earli hor... u c alreadi say...,0
4,ham,"nah think goe usf, live around though",0
5,spam,freemsg hey darl 3 week word back! i'd like fu...,1
6,ham,even brother like speak me. treat like aid pat...,0
7,ham,per request mell mell (oru minnaminungint nuru...,0
8,spam,winner!! valu network custom select receivea �...,1
9,spam,mobil 11 month more? u r entitl updat late col...,1


# TIME TO CONVERT TEXT DATA INTO MATRIX

In [117]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
y = dt.spam
X = tfidf.fit_transform(dt['v2'])
X.shape


(5572, 8077)

In [118]:
y.shape

(5572,)

In [119]:
from sklearn.model_selection import train_test_split
X_train,X_text,y_train,y_text = train_test_split(X,y,random_state=1,test_size=0.2)

# Logistic_Regression 

In [121]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_text)
from sklearn.metrics import accuracy_score
acc_log = accuracy_score(y_pred,y_text)*100
print("it is ",acc_log)

it is  96.50224215246637
