In [28]:
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
import re
from nltk.tokenize import word_tokenize

## read data set 

In [29]:
data= pd.read_csv('spam.csv.csv', encoding="ISO-8859-1")
data.head(7)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,


# to remove URL 

In [30]:
def clean_url(text1):
    return re.sub(r'http\S+', ' ', text1)
data['ClaenEmails']= data['v2'].apply(clean_url)
text1=data['ClaenEmails'][0:10]

In [31]:
text1

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: ClaenEmails, dtype: object

In [32]:
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,ClaenEmails
0,ham,"Go until jurong point, crazy.. Available only ...",,,,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...,,,,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,,,,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,"Nah I don't think he goes to usf, he lives aro..."
...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?,,,,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s...",,,,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...,,,,The guy did some bitching but I acted like i'd...


# To remove Numbers and Punctuation

In [33]:
def clean_non_alphanumeric(text2):
    return re.sub('[^a-zA-Z]',' ',text2)
data['CleanEmails']=data['v2'].apply(clean_non_alphanumeric)

In [64]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,ClaenEmails,CleanEmails
0,ham,"Go until jurong point, crazy.. Available only ...",,,,"Go until jurong point, crazy.. Available only ...",Go jurong point crazy .. Available bugis...
1,ham,Ok lar... Joking wif u oni...,,,,Ok lar... Joking wif u oni...,Ok lar ... Joking wif oni ...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry wkly comp win FA Cup final ...
3,ham,U dun say so early hor... U c already then say...,,,,U dun say so early hor... U c already then say...,dun say early hor ... already say ...
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,"Nah I don't think he goes to usf, he lives aro...",Nah n't think go usf live around though


## remove white space 

In [35]:
def clear_space(text3):
    return re.sub(r'\s+', ' ',text3)
data['CleanEmails'] = data['v2'].apply(clear_space)

text3=data['CleanEmails'][0:10]

In [36]:
text3

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: CleanEmails, dtype: object

## tokenize

In [37]:
import nltk

In [38]:
import nltk

# Download the 'punkt' resource
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [39]:
def tokenize(text4):
    return word_tokenize(text4)

data['CleanEmails']=data['CleanEmails'].apply(tokenize)

In [40]:
data['CleanEmails']

0       [Go, until, jurong, point, ,, crazy, .., Avail...
1                [Ok, lar, ..., Joking, wif, u, oni, ...]
2       [Free, entry, in, 2, a, wkly, comp, to, win, F...
3       [U, dun, say, so, early, hor, ..., U, c, alrea...
4       [Nah, I, do, n't, think, he, goes, to, usf, ,,...
                              ...                        
5567    [This, is, the, 2nd, time, we, have, tried, 2,...
5568     [Will, Ì_, b, going, to, esplanade, fr, home, ?]
5569    [Pity, ,, *, was, in, mood, for, that, ., So, ...
5570    [The, guy, did, some, bitching, but, I, acted,...
5571                  [Rofl, ., Its, true, to, its, name]
Name: CleanEmails, Length: 5572, dtype: object

## find stop word

In [41]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
from nltk.corpus import stopwords

stop_words=set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

## remove stop word

In [43]:
def clean_stopword(token):
    return [item for item in token if item not in stop_words]
data['CleanEmails']=data['CleanEmails'].apply(clean_stopword)

In [44]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...


True

In [50]:
from nltk.stem import WordNetLemmatizer 

In [51]:
lem=WordNetLemmatizer()

def clean_lemmatization(token):
    return[lem.lemmatize(word=w,pos='v') for w in token]

data['CleanEmails']=data['CleanEmails'].apply(clean_lemmatization)

In [52]:
data['CleanEmails']

0       [Go, jurong, point, ,, crazy, .., Available, b...
1                [Ok, lar, ..., Joking, wif, u, oni, ...]
2       [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3       [U, dun, say, early, hor, ..., U, c, already, ...
4       [Nah, I, n't, think, go, usf, ,, live, around,...
                              ...                        
5567    [This, 2nd, time, try, 2, contact, u., U, å£75...
5568            [Will, Ì_, b, go, esplanade, fr, home, ?]
5569       [Pity, ,, *, mood, ., So, ..., suggestions, ?]
5570    [The, guy, bitch, I, act, like, 'd, interest, ...
5571                           [Rofl, ., Its, true, name]
Name: CleanEmails, Length: 5572, dtype: object

## Remove meaningless characters

In [53]:
CleanEmailText=[]
def clean_lenght(token):
    filtered_tokens=[]
    for i in token:
        if len(i)>=2:
            filtered_tokens.append(i)
    return filtered_tokens
data['CleanEmails']=data['CleanEmails'].apply(clean_lenght)

In [54]:
data['CleanEmails']

0       [Go, jurong, point, crazy, .., Available, bugi...
1                   [Ok, lar, ..., Joking, wif, oni, ...]
2       [Free, entry, wkly, comp, win, FA, Cup, final,...
3          [dun, say, early, hor, ..., already, say, ...]
4        [Nah, n't, think, go, usf, live, around, though]
                              ...                        
5567    [This, 2nd, time, try, contact, u., å£750, Pou...
5568                  [Will, Ì_, go, esplanade, fr, home]
5569                   [Pity, mood, So, ..., suggestions]
5570    [The, guy, bitch, act, like, 'd, interest, buy...
5571                              [Rofl, Its, true, name]
Name: CleanEmails, Length: 5572, dtype: object

## convert list of words back to string

In [56]:
def convert_to_string(listReview):
    return '  '.join(listReview)

data['CleanEmails']=data['CleanEmails'].apply(convert_to_string)

In [57]:
data['CleanEmails'][5]

"FreeMsg  Hey  darling  's  week  's  word  back  'd  like  fun  still  Tb  ok  XxX  std  chgs  send  å£1.50  rcv"

# Question 2 

## Splitting data into training and testing.

In [59]:
Y = data['v2']
X = data['CleanEmails']

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = 0.3)


## Extract Features using TfidfVectorizer

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
vectorizer.get_feature_names_out()

array(['00', '000', '008704050406', ..., 'ûªve', 'ûï', 'ûò'], dtype=object)

### Build the model

In [62]:
model = svm.SVC()
model.fit(X_train, y_train)
print("Accuracy :", model.score(X_test, y_test))

Accuracy : 0.07954545454545454
