# Import the libaries

In [1]:
import numpy as np
import pandas as pd

# Load the dataset

In [2]:
dt = pd.read_csv('SpamData.csv', encoding= 'ISO-8859-1')
dt.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
len(dt)    #there are total 116 data in the spam dataset.

116

# Map the Spam and ham text as 1 & 0

In [4]:
dt['spam']= dt['type'].map({'spam' :1, 'ham' : 0}).astype(int)
dt.head()

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


# No. of columns in Datasets

In [5]:
print('Number of columns in the dataset: ')
for col in dt.columns:
    print(col)

Number of columns in the dataset: 
type
text
spam


# Number of rows in review and liked columns

In [6]:
t = len(dt['type'])
print('Number of rows in review columns :',t)
t = len(dt['text'])
print('Number of rows in liked columns:',t)

Number of rows in review columns : 116
Number of rows in liked columns: 116


# Tokenization

In [7]:
dt['text'][1]      #we can take any of the rows, here we took 1.

'Ok lar... Joking wif u oni...'

In [8]:
def tokenizer(text):
    return text.split()

In [9]:
dt['text']= dt['text'].apply(tokenizer)

In [10]:
dt['text'][1]     #all the words has been converted into tokens

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

# Stemming

In [11]:
dt['text'][1]     #in stemming the suffix of the word will be removed. In this we have taken rows 1 data. The word joking will be changed into joke

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

In [12]:

from nltk.stem.snowball import SnowballStemmer
porter = SnowballStemmer('english',ignore_stopwords= False)


In [13]:

def stem_it(text):
    return [porter.stem(word) for word in text]

In [14]:
dt['text'] = dt['text'].apply(stem_it)

In [15]:
dt['text'][1]       #the word joking has been changed into joke after stemming

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

# Lemmitization

In [16]:
dt['text'][109]

['i',
 'know!',
 'grumpi',
 'old',
 'people.',
 'my',
 'mom',
 'was',
 'like',
 'you',
 'better',
 'not',
 'be',
 'lying.',
 'then',
 'again',
 'i',
 'am',
 'alway',
 'the',
 'one',
 'to',
 'play',
 'jokes...']

In [17]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [18]:
def lemmit_it(text):
    return [lemmatizer.lemmatize(word, pos='a') for word in text]


In [19]:
dt['text'] = dt['text'].apply(lemmit_it)

In [20]:
dt['text'][109]     #the initial word better has been changed into 'good'

['i',
 'know!',
 'grumpi',
 'old',
 'people.',
 'my',
 'mom',
 'was',
 'like',
 'you',
 'good',
 'not',
 'be',
 'lying.',
 'then',
 'again',
 'i',
 'am',
 'alway',
 'the',
 'one',
 'to',
 'play',
 'jokes...']

# Stopwords Removal

In [21]:
dt['text'][111]   #in stopwords removal useless words which are not that much imformative is being removed (word like a,an,the etc)

['what', 'is', 'the', 'plural', 'of', 'the', 'noun', 'research?']

In [22]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')


In [23]:
def stop_it(text):
    review = [word for word in text if not word in stop_words]
    return review

In [24]:
dt['text'] = dt['text'].apply(stop_it)

In [25]:
dt['text'][111]     #after removing the useless words

['plural', 'noun', 'research?']

# Join the words

In [26]:
dt['text'] = dt['text'].apply(''.join)

In [27]:
dt.head()

Unnamed: 0,type,text,spam
0,ham,"gojurongpoint,crazy..availonlibugingreatworldl...",0
1,ham,oklar...jokewifuoni...,0
2,spam,freeentri2wklicompwinfacupfinaltkts21stmay2005...,1
3,ham,udunsayearlihor...ucalreadisay...,0
4,ham,"nahthinkgoeusf,livearoundthough",0


# Vectorization (Transform text data into TDF/TF-IDF Vectors)

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
y = dt.spam.values
x = tfidf.fit_transform(dt['text'])

In [32]:
from sklearn.model_selection import train_test_split
x_train, x_text, y_train, y_text = train_test_split(x,y,random_state=1, test_size = 0.2, shuffle = False)

# Classification using logistic regression

In [34]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(x_train,y_train)
y_pred = clf.predict(x_text)

from sklearn.metrics import accuracy_score
acc_log = accuracy_score(y_pred, y_text)*100
print('Accuracuy:', acc_log)                     #since our dataset is small , so we are getting this much accuracy. With big dataset we can achieve more accuracy

Accuracuy: 87.5


# Classification using LinearSVC Accuracy

In [35]:
from sklearn.svm import LinearSVC

linear_svc = LinearSVC(random_state=0)
linear_svc.fit(x_train,y_train)
y_pred = linear_svc.predict(x_text)
acc_linear_svc = accuracy_score(y_pred, y_text)*100
print('accuracy:', acc_linear_svc)

accuracy: 87.5


                                                    ---XXX----