# Import Libraries

In [4]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import re
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
nltk.download('stopwords')
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Preprocessing Data

In [5]:
def preprocess_data(train_data):
    """Turn the raw text column of a dataset into cleaned, stemmed strings.

    For every row the text in column index 1 is stripped of every character
    that is not an ASCII letter, lower-cased, split on whitespace, filtered
    against the NLTK English stopword list, Porter-stemmed and re-joined
    with single spaces.

    Args:
        train_data: Row-indexable 2-D data (e.g. a NumPy array) whose
            column 1 holds the raw text as ``str``. Despite the parameter
            name, this is called for both the training and the test split.

    Returns:
        A 1-D NumPy object array with one cleaned string per input row.
    """
    # A set gives O(1) membership tests; the NLTK list would be scanned
    # linearly for every single token.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    # Compile once instead of looking the pattern up on every row.
    non_letters = re.compile(r'[^a-zA-Z]')

    corpus = np.empty(len(train_data), dtype=object)
    print("=====================")
    for i, row in enumerate(train_data):
        text = row[1]
        # Keep letters only, then normalise case and tokenise.
        tokens = non_letters.sub(' ', text).lower().split()
        # Stopwords are removed before stemming, so the comparison is made
        # against the unstemmed token.
        stems = [stemmer.stem(word) for word in tokens
                 if word not in stop_words]
        corpus[i] = ' '.join(stems)
    return corpus


# Training

In [7]:
import re
def train(path):
    """Train and evaluate TF-IDF + SVC text classifiers with four kernels.

    The CSV at ``path`` is loaded with pandas. The first 15000 rows form the
    training split and all remaining rows form the test split. In every row
    the last column is the label; the other columns are handed to
    ``preprocess_data``, which reads the text from column index 1. A
    ``TfidfVectorizer`` is fitted on the training corpus only and reused for
    the test corpus. For each of the linear, poly, rbf and sigmoid kernels an
    ``SVC`` is fitted and its accuracy, macro-averaged F1 score and confusion
    matrix on the test split are printed.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        None. All results are written to standard output.
    """
    data = pd.read_csv(path).to_numpy()

    # read_csv already consumes the header line, so row 0 is a real sample.
    # Starting the training slice at 1 silently discarded it.
    train_size = 15000
    train_labels = data[:train_size, -1]
    train_data = data[:train_size, :-1]
    test_labels = data[train_size:, -1]
    test_data = data[train_size:, :-1]

    corpus_train_data = preprocess_data(train_data)
    corpus_test_data = preprocess_data(test_data)

    # The corpus is already lower-cased by preprocess_data.
    termidf = TfidfVectorizer(lowercase=False)
    tfidf_train_data = termidf.fit_transform(corpus_train_data)
    # Transform only: the vocabulary and IDF weights must come from the
    # training split.
    tfidf_test_data = termidf.transform(corpus_test_data)

    # (section header, accuracy label, SVC keyword arguments) per kernel.
    # NOTE(review): the recorded notebook output shows poly/rbf/sigmoid
    # predicting a single class; gamma='auto' is 1 / n_features, which is
    # tiny for a TF-IDF vocabulary. Consider gamma='scale' - confirm before
    # changing the experiment.
    kernel_runs = (
        ("KERNEL =   linear", "SVC using TFIDF and linear kernel ",
         {"kernel": "linear"}),
        ("KERNEL = poly", "SVC using TFIDF and poly kernel ",
         {"kernel": "poly", "gamma": 'auto'}),
        ("KERNEL = rbf", "SVC using TFIDF and rbf kernel",
         {"kernel": "rbf", "gamma": 'auto'}),
        ("KERNEL = sigmoid", "SVC using TFIDF and sigmoid kernel",
         {"kernel": "sigmoid", "gamma": 'auto'}),
    )

    for header, accuracy_label, svc_params in kernel_runs:
        print("============================================")
        print()
        print()
        print(header)

        svc = SVC(**svc_params)
        svc.fit(tfidf_train_data, train_labels)
        y_pred = svc.predict(tfidf_test_data)

        print("Printing Accuracy Score")
        print(accuracy_label, accuracy_score(test_labels, y_pred))

        print("Printing F1 Score")
        print(f1_score(test_labels, y_pred, average='macro'))

        print("Printing COnfusion Matrix")
        print(confusion_matrix(test_labels, y_pred))






# Run the whole train-and-evaluate pipeline on the assignment's CSV file.
train(
    path='/home/ubuntu/Desktop/sem2/smai/assignment2/Datasets/Question-5/Train.csv',
)




KERNEL =   linear
Printing Accuracy Score
SVC using TFIDF and linear kernel  0.8099547511312217
Printing F1 Score
0.807764436921372
Printing COnfusion Matrix
[[227  17  13]
 [ 36 136   8]
 [ 37  15 174]]


KERNEKL = poly
Printing Accuracy Score
SVC using TFIDF and poly kernel  0.38763197586727
Printing F1 Score
0.186231884057971
Printing COnfusion Matrix
[[257   0   0]
 [180   0   0]
 [226   0   0]]


KERNEL = rbf


  'precision', 'predicted', average, warn_for)


Printing Accuracy Score
SVC using TFIDF and rbf kernel 0.38763197586727
Printing F1 Score
0.186231884057971
Printing COnfusion Matrix
[[257   0   0]
 [180   0   0]
 [226   0   0]]


KERNEL = sigmoid
Printing Accuracy Score
SVC using TFIDF and sigmoid kernel 0.38763197586727
Printing F1 Score
0.186231884057971
Printing COnfusion Matrix
[[257   0   0]
 [180   0   0]
 [226   0   0]]
