# This notebook uses NLP techniques to classify whether a received message is spam or not, using a support vector classifier (SVC).
## The text is preprocessed using three different vectorizers
## 1: count vectorizer
## 2: TF-IDF vectorizer
## 3: hashing vectorizer

In [13]:
# importing the necessary libraries
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import GridSearchCV
import wordcloud
import string
import nltk
from nltk.corpus import stopwords


In [14]:
# Load the raw SMS spam dataset; the file is decoded as latin-1.
csv_path = '/content/spam (1).csv'
dataset = pd.read_csv(csv_path, encoding='latin-1')

In [15]:
# Preview the first five rows of the freshly loaded frame.
dataset.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [16]:
# Drop the three trailing "Unnamed" columns (empty in the previewed rows) by
# name rather than by position. The positional slice `iloc[:, 0:-3]` removed
# three more columns every time the cell was re-executed; dropping by name
# with errors='ignore' keeps only v1/v2 and makes the cell safe to re-run.
dataset = dataset.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    errors='ignore',
)

In [17]:
# Confirm that only the label and text columns remain.
dataset.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
# Give the label and text columns descriptive names.
column_names = {'v1': 'Target', 'v2': 'Message'}
dataset = dataset.rename(columns=column_names)

In [19]:
# Check the renamed columns.
dataset.head(n=5)

Unnamed: 0,Target,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
# Encode the string labels in 'Target' as integers; the encoder is kept in
# `le` so the mapping can be inverted later if needed.
le = LabelEncoder()
le.fit(dataset['Target'])
dataset['Target'] = le.transform(dataset['Target'])

In [21]:
# Verify that 'Target' now holds integer class labels.
dataset.head(n=5)

Unnamed: 0,Target,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [22]:
# Separate the input text (X) from the encoded labels (y).
X, y = dataset['Message'], dataset['Target']


In [23]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Model 1 using count vectorizer

In [24]:
# Learn the vocabulary from the training messages only and encode those
# messages as token-count vectors in a single pass.
count_vector = CountVectorizer(
    lowercase=True,
)
extracted_features = count_vector.fit_transform(X_train)

In [25]:
# Hyper-parameter grid for the SVC, split per kernel. `gamma` has no effect on
# the linear kernel, so a single combined grid fitted every linear candidate
# once per gamma value for identical results (16 candidates instead of 12).
params = [
    {'kernel': ['rbf'], 'gamma': [1e-2, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# Cross-validated grid search over the count-vector features; with the default
# refit behaviour the best candidate is retrained on the full training matrix,
# so `model` can be scored directly afterwards.
model = GridSearchCV(SVC(), params)
model.fit(extracted_features, y_train)

In [26]:
# Encode the held-out messages with the vocabulary learned on the training
# split, then report the tuned model's accuracy as a percentage.
test_features = count_vector.transform(X_test)
print('Accuracy:', model.score(test_features, y_test)*100)

Accuracy: 98.11659192825111


# Model 2 using Tf-Idf vectorizer

In [27]:
# TF-IDF features over lower-cased unigrams with English stop words removed;
# the vectorizer is fitted on the training messages only.
tf_idf = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
)
extracted_features = tf_idf.fit_transform(X_train)

In [None]:
# Hyper-parameter grid for the SVC, split per kernel. `gamma` has no effect on
# the linear kernel, so a single combined grid fitted every linear candidate
# once per gamma value for identical results (16 candidates instead of 12).
params = [
    {'kernel': ['rbf'], 'gamma': [1e-2, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# Cross-validated grid search over the TF-IDF features; with the default refit
# behaviour the best candidate is retrained on the full training matrix, so
# `model` can be scored directly afterwards.
model = GridSearchCV(SVC(), params)
model.fit(extracted_features, y_train)

In [None]:
# Encode the held-out messages with the fitted TF-IDF vectorizer, then report
# the tuned model's accuracy as a percentage.
test_features = tf_idf.transform(X_test)
print('Accuracy:', model.score(test_features, y_test)*100)

# Model 3 using hashing vectorizer

In [None]:
# Hashed word-level unigram features, lower-cased and with English stop words
# removed, computed for the training messages.
hashing = HashingVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
    analyzer='word',
)
extracted_features = hashing.fit_transform(X_train)

In [None]:
# Inspect the dimensions (rows, columns) of the hashed training feature matrix.
extracted_features.shape

In [None]:
# Hyper-parameter grid for the SVC, split per kernel. `gamma` has no effect on
# the linear kernel, so a single combined grid fitted every linear candidate
# once per gamma value for identical results (16 candidates instead of 12).
params = [
    {'kernel': ['rbf'], 'gamma': [1e-2, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# Cross-validated grid search over the hashed features; with the default refit
# behaviour the best candidate is retrained on the full training matrix, so
# `model` can be scored directly afterwards.
model = GridSearchCV(SVC(), params)
model.fit(extracted_features, y_train)

In [None]:
# Encode the held-out messages with the same hashing vectorizer, then report
# the tuned model's accuracy as a percentage.
test_features = hashing.transform(X_test)
print('Accuracy:', model.score(test_features, y_test)*100)