# ***Q5. Support Vector Machine***

In [0]:
from google.colab import drive 
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import text
from scipy.spatial import distance
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import os
import string
from sklearn import svm
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix



In [0]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### **Reading Data from CSV**

In [0]:
ds1=pd.read_csv("/content/drive/My Drive/Datasets/Question-5/Train(1).csv")
ds1 = ds1.sample(frac=1).reset_index(drop=True)
ds1

Unnamed: 0.1,Unnamed: 0,text,author
0,13192,"""Readily; I have solved others of an abstrusen...",EAP
1,10787,"Therein, he has dreamed of flames and suffocat...",EAP
2,14743,I was too much exhausted to reflect upon this ...,EAP
3,14275,I found that the berries were spoiled by this ...,MWS
4,3917,"""Am I for ever,"" he cried, ""to be the sport of...",MWS
...,...,...,...
15658,3163,"Upon Zann the effect was terrible, for droppin...",HPL
15659,2077,"The visiter, shooting suddenly into this bay f...",EAP
15660,14123,"The expression of his smile, however, was by n...",EAP
15661,2047,The next January gossips were mildly intereste...,HPL


# **Data Preprocessing and Cleaning**

**Removing numbers and punctuations:**



Numbers:- 12,160  or 160M doesn't cary any particular inormation so we have to remove it

Punctuation:-All the punctuation marks according to the priorities should be dealt with. For example: “.”, “,”,”?” are important punctuations that should be retained while others need to be removed.



In [0]:
corpus=ds1.iloc[:,1].to_numpy()


for i in range(len(corpus)):

  regex = re.compile('[^a-zA-Z]')
  corpus[i]=regex.sub(' ', corpus[i])
  txt = ''.join(corpus[i])
  corpus[i]=txt
        
corpus


array([' Readily  I have solved others of an abstruseness ten thousand times greater ',
       'Therein  he has dreamed of flames and suffocation of mountains upon mountains of Pelion upon Ossa ',
       'I was too much exhausted to reflect upon this circumstance  but ate and drank with avidity ',
       ...,
       'The expression of his smile  however  was by no means unpleasing  as might be supposed  but it had no variation whatever ',
       'The next January gossips were mildly interested in the fact that  Lavinny s black brat  had commenced to talk  and at the age of only eleven months ',
       'I sympathized with and partly understood them  but I was unformed in mind  I was dependent on none and related to none '],
      dtype=object)

**Removing small words :-**

A small word with length less than or equal to 3 does not carry any significance information so we can drop that words have lenght less than 3.

In [0]:
for i in range(len(corpus)):
  
  txt1=corpus[i].split(' ')
  txt=""
  
  for j in txt1:
      if(len(j)>3):
          txt+=" "+j
  corpus[i]=txt

corpus
  

array([' Readily have solved others abstruseness thousand times greater',
       ' Therein dreamed flames suffocation mountains upon mountains Pelion upon Ossa',
       ' much exhausted reflect upon this circumstance drank with avidity',
       ...,
       ' expression smile however means unpleasing might supposed variation whatever',
       ' next January gossips were mildly interested fact that Lavinny black brat commenced talk only eleven months',
       ' sympathized with partly understood them unformed mind dependent none related none'],
      dtype=object)

**Stemming:-**

The goal of both stemming is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form. With that being said, stemming helps us reduce the number of overall terms to certain “root” terms.

For example:
Organizer, organizes, organization, organized all these get reduced to a root term, maybe “organiz”.

In [0]:
for i in range(len(corpus)):

  stemmer= PorterStemmer()

  txt1=word_tokenize(corpus[i])
  txt=""
  for word in txt1:
      txt+=" "+stemmer.stem(word)

  corpus[i]=txt
corpus

array([' readili have solv other abstrus thousand time greater',
       ' therein dream flame suffoc mountain upon mountain pelion upon ossa',
       ' much exhaust reflect upon thi circumst drank with avid', ...,
       ' express smile howev mean unpleas might suppos variat whatev',
       ' next januari gossip were mildli interest fact that lavinni black brat commenc talk onli eleven month',
       ' sympath with partli understood them unform mind depend none relat none'],
      dtype=object)

**Removing stop words:-**

When data analysis needs to be data driven at the word level, the commonly occurring words (stop-words) should be removed. One can either create a long list of stop-words or one can use predefined language specific libraries.

In [0]:
my_stop_words = text.ENGLISH_STOP_WORDS
print(my_stop_words)

frozenset({'anywhere', 'along', 'bill', 'herein', 'out', 'those', 'hundred', 'behind', 'how', 'it', 'five', 'every', 'at', 'meanwhile', 'might', 'ie', 'while', 'con', 'after', 'one', 'themselves', 'the', 'thick', 'himself', 'some', 'between', 'done', 'itself', 'who', 'almost', 'with', 'former', 'hence', 'fill', 'had', 'many', 'herself', 'well', 'here', 'so', 'myself', 'third', 'even', 'though', 'whoever', 'amount', 'amoungst', 'have', 'nine', 'others', 'besides', 'latter', 'system', 'us', 'enough', 'do', 'somewhere', 'were', 'has', 'under', 'sometime', 'be', 'been', 'beside', 'mill', 'from', 'noone', 'not', 'often', 'take', 'cannot', 'everywhere', 'around', 'ltd', 'rather', 'hereupon', 'very', 'whereupon', 'see', 'their', 're', 'fifteen', 'above', 'few', 'elsewhere', 'front', 'per', 'will', 'becoming', 'due', 'his', 'cant', 'amongst', 'forty', 'nobody', 'find', 'her', 'on', 'eight', 'for', 'below', 'most', 'nowhere', 'except', 'by', 'give', 'him', 'thereby', 'he', 'show', 'thin', 'must

# **Vectorizer**

**TfidfVectorizer:-**

It will transform the text into the feature vectors and used as input to the estimator.

The vocabulary is the dictionary that will convert each token or word in the matrix and it will get the feature index.

In [0]:
vectorizer = TfidfVectorizer(stop_words=my_stop_words)
X = vectorizer.fit_transform(corpus)
features=vectorizer.get_feature_names()
X=X.toarray()
print(len(features),features)
print(X,X.shape)

13798 ['aaem', 'aback', 'abaft', 'abandon', 'abaout', 'abas', 'abash', 'abat', 'abbey', 'abbrevi', 'abdic', 'abdomen', 'abdul', 'abernethi', 'aberr', 'abey', 'abhor', 'abhorr', 'abid', 'abigail', 'abijah', 'abil', 'abject', 'abjur', 'abl', 'ablaz', 'abli', 'abnorm', 'aboard', 'abod', 'abolish', 'abomin', 'abort', 'abound', 'abov', 'abreast', 'abroad', 'abrupt', 'abruptli', 'abscond', 'absenc', 'absent', 'absolut', 'absolv', 'absorb', 'absorbingli', 'abstain', 'abstemi', 'abstract', 'abstractli', 'abstrus', 'absurd', 'absurdli', 'absurdum', 'abund', 'abundantli', 'abus', 'abut', 'abxut', 'abysm', 'abyss', 'academi', 'accaount', 'acceler', 'accent', 'accentu', 'accept', 'access', 'accessori', 'accid', 'accident', 'acclam', 'accliv', 'accommod', 'accompani', 'accomplic', 'accomplish', 'accord', 'accordingli', 'accost', 'account', 'accoutr', 'accredit', 'accru', 'acct', 'accumul', 'accur', 'accuraci', 'accurs', 'accursedli', 'accus', 'accustom', 'acet', 'acetylen', 'ach', 'acheron', 'acher

# **PCA**

In [0]:
from sklearn.decomposition import PCA
pca = PCA(n_components=500)
X=pca.fit_transform(X)

# **Test and Validation**

In [0]:
print(ds1.iloc[:12500,2])

train_data=X[:12500]
test_data=X[12500:]

train_labels=ds1.iloc[:12500,2]
test_labels=ds1.iloc[12500:,2]


print("Train data shape:- ",train_data.shape)
print("Train labels shape:- ",train_labels.shape)

print("Test data shape:- ",test_data.shape)
print("Test labels shape:- ",test_labels.shape)


0        EAP
1        EAP
2        EAP
3        MWS
4        MWS
        ... 
12495    MWS
12496    MWS
12497    EAP
12498    MWS
12499    EAP
Name: author, Length: 12500, dtype: object
Train data shape:-  (12500, 500)
Train labels shape:-  (12500,)
Test data shape:-  (3163, 500)
Test labels shape:-  (3163,)


# **SVM**

SVM is a supervised machine learning algorithm which can be used for classification or regression problems. It uses a technique called the kernel trick to transform your data and then based on these transformations it finds an optimal boundary between the possible outputs. Simply put, it does some extremely complex data transformations, then figures out how to seperate your data based on the labels or outputs you've defined.

## **Kernel rbf**

In [0]:
clf = svm.SVC()
clf.fit(train_data, train_labels)
prediction=clf.predict(test_data)
prediction

array(['EAP', 'EAP', 'HPL', ..., 'EAP', 'HPL', 'MWS'], dtype=object)

In [0]:
a_c=accuracy_score(test_labels,prediction)
f_1=f1_score(test_labels, prediction, average=None)
c_m=confusion_matrix(test_labels, prediction)
print("accuracy score -> ", a_c)
print("\n\nF1 socre -> ", f_1)
print("\n\nConfusion Matrix ->\n", c_m)

accuracy score ->  0.7372747391716724


F1 socre ->  [0.75267284 0.72576832 0.72428884]


Confusion Matrix ->
 [[1056  103  126]
 [ 224  614   75]
 [ 241   62  662]]


## **Linear kernel**

**C=1**

In [0]:
clf = svm.SVC(kernel='linear',C=1)
clf.fit(train_data, train_labels)
prediction=clf.predict(test_data)
prediction

array(['EAP', 'EAP', 'HPL', ..., 'EAP', 'HPL', 'MWS'], dtype=object)

In [0]:
a_c=accuracy_score(test_labels,prediction)
f_1=f1_score(test_labels, prediction, average=None)
c_m=confusion_matrix(test_labels, prediction)
print("accuracy score -> ", a_c)
print("\n\nF1 socre -> ", f_1)
print("\n\nConfusion Matrix ->\n", c_m)

accuracy score ->  0.710085361998103


F1 socre ->  [0.73080931 0.6827262  0.7025788 ]


Confusion Matrix ->
 [[1052  129  104]
 [ 269  581   63]
 [ 273   79  613]]


**C=10**

In [0]:
clf = svm.SVC(kernel='linear',C=10)
clf.fit(train_data, train_labels)
prediction=clf.predict(test_data)
prediction

array(['EAP', 'EAP', 'HPL', ..., 'EAP', 'HPL', 'MWS'], dtype=object)

In [0]:
a_c=accuracy_score(test_labels,prediction)
f_1=f1_score(test_labels, prediction, average=None)
c_m=confusion_matrix(test_labels, prediction)
print("accuracy score -> ", a_c)
print("\n\nF1 socre -> ", f_1)
print("\n\nConfusion Matrix ->\n", c_m)

accuracy score ->  0.7243123616819476


F1 socre ->  [0.73281882 0.71356218 0.72210066]


Confusion Matrix ->
 [[997 155 133]
 [209 634  70]
 [230  75 660]]


## **Kernel Polynomial**

In [0]:
clf = svm.SVC(kernel='poly',C=1)
clf.fit(train_data, train_labels)
prediction=clf.predict(test_data)
prediction

array(['EAP', 'EAP', 'EAP', ..., 'EAP', 'EAP', 'EAP'], dtype=object)

In [0]:
a_c=accuracy_score(test_labels,prediction)
f_1=f1_score(test_labels, prediction, average=None)
c_m=confusion_matrix(test_labels, prediction)
print("accuracy score -> ", a_c)
print("\n\nF1 socre -> ", f_1)
print("\n\nConfusion Matrix ->\n", c_m)

accuracy score ->  0.5165981662978185


F1 socre ->  [0.61619073 0.32642916 0.40092166]


Confusion Matrix ->
 [[1176   54   55]
 [ 695  197   21]
 [ 661   43  261]]
