In [20]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.preprocessing import LabelEncoder
import numpy as np
import re
from sklearn.decomposition import TruncatedSVD
import gensim
from gensim.models import KeyedVectors, Word2Vec, FastText
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.svm import SVC, LinearSVC
from sklearn.utils import class_weight

In [21]:
# Read the sentiment dataset directly from its raw GitHub URL, then discard
# the three title columns and the `value` column in one chained step.
link = 'https://raw.githubusercontent.com/python-for-ml/sent-analysis/master/finaldataset.csv'
data = pd.read_csv(link).drop(columns=['title_x', 'title_y', 'title', 'value'])

In [22]:
# Learn the tag vocabulary first, then overwrite the `tag` column with the
# corresponding integer codes. `le` is kept so the codes can be mapped back
# to the original tag values later (via le.classes_ / le.inverse_transform).
le = LabelEncoder().fit(data['tag'])
data['tag'] = le.transform(data['tag'])

In [23]:
# Collapse the encoded tags into a binary target:
#   code 3 is folded into 0  (positive class, per the original author's note)
#   code 4 is folded into 1  (negative class, per the original author's note)
# and every row still carrying code 2 (the neutral class) is removed.
# NOTE(review): which sentiment each integer stands for depends on the class
# ordering learned by LabelEncoder, which is not visible here - confirm
# against le.classes_.
data['tag'] = data['tag'].replace({3: 0, 4: 1})
data = data[data['tag'].ne(2)]

In [24]:
# Pull the raw texts (`data` column) and their binary targets (`tag` column)
# out of the frame as plain Python lists.
d, label = (data[column].tolist() for column in ('data', 'tag'))

In [25]:
# Strip punctuation: delete every character that is neither a word character
# nor whitespace. Slice assignment rewrites the existing list in place.
d[:] = [re.sub(r'[^\w\s]', '', text) for text in d]

In [26]:
# Fetch the NLTK 'punkt' tokenizer data used by word_tokenize in the
# tokenisation cell below; the call reports "already up-to-date" when the
# package is present locally.
# NOTE(review): this import sits mid-notebook rather than in the import cell
# at the top - consider moving it there so all dependencies are listed once.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Lakshaya
[nltk_data]     Karthikeyan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [38]:
# Tokenise every cleaned document; `corpus` is a list of token lists, one per
# entry of `d` and in the same order.
corpus = [word_tokenize(text) for text in d]

In [39]:
# Train Word2Vec embeddings on the tokenised corpus.
# Embedding dimension = 664 (the `size` argument), context window = 3.
# min_count=1 means no token is discarded for being rare, so every word in
# `corpus` receives a vector.
# NOTE(review): no seed is passed, so the learned vectors may differ between
# runs - confirm whether reproducibility matters here.
wvmodel = Word2Vec(corpus, size=664, window=3, min_count=1)

In [40]:
# Embedding dimensionality used to shape the feature matrix. Read it back from
# the trained model instead of repeating the literal, so it can never drift
# from the value given to Word2Vec (currently 664).
size = wvmodel.wv.vector_size

In [41]:
# Build the feature matrix: one row per document, one column per embedding
# dimension. Each row is the SUM (not the mean) of the word vectors of the
# document's tokens, so its magnitude grows with document length; a document
# with no tokens keeps an all-zero row.
#
# Vectors are looked up through `wvmodel.wv[...]`: indexing the Word2Vec model
# itself (`wvmodel[w]`) is deprecated in gensim 3.x and removed in gensim 4.
word_vectors = wvmodel.wv
X = np.zeros((len(corpus), size))  # float64 accumulator, initialised to zeros
for row, tokens in enumerate(corpus):
    for w in tokens:
        # X[row] is a view into X, so this accumulates in place.
        X[row] += word_vectors[w]

  


In [42]:
# Hold out 20% of the documents for evaluation; the fixed random_state makes
# the partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    label,
    test_size=0.2,
    random_state=42,
)

In [43]:
# Compute 'balanced' class weights from the training labels so the minority
# class is weighted up when fitting the classifier.
# The arguments are passed by keyword: scikit-learn deprecated positional use
# of compute_class_weight in 0.24 and rejects it from 1.0 onwards, while the
# keyword form works on older releases as well.
class_labels = np.unique(y_train)
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=y_train,
)
# Map each label to its weight. Weights come back in the same order as
# `classes`, so zipping keeps label and weight aligned without hard-coding
# the indices.
cw = {int(lbl): weight for lbl, weight in zip(class_labels, class_weights)}




In [49]:
# RBF-kernel support vector classifier, regularisation C = 0.85, using the
# per-class weights computed from the training labels.
# NOTE(review): the features are raw sums of word vectors and no scaling step
# is visible before this fit - confirm that is intended for an RBF kernel.
clf = SVC(
    C=0.85,
    kernel='rbf',
    class_weight=cw,
)
clf.fit(X_train, y_train)

SVC(C=0.85, class_weight={0: 1.443779108449767, 1: 0.764892492069087})

In [50]:
# Predict a class label (0 or 1) for every held-out document.
y_pred = clf.predict(X_test)

In [51]:
# Per-class precision / recall / F1 and support on the held-out set, plus the
# accuracy, macro and weighted averages.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.35      0.39      0.37       719
           1       0.68      0.65      0.66      1452

    accuracy                           0.56      2171
   macro avg       0.52      0.52      0.52      2171
weighted avg       0.57      0.56      0.57      2171



In [52]:
# Headline metrics on the held-out set, printed in a fixed order.
# Precision, F1 and recall are left at their defaults, so they describe
# class 1 only (they match the class-1 row of the classification report).
for caption, metric in (
    ("Accuracy  :", accuracy_score),
    ("Precision :", precision_score),
    ("F1-Score  :", f1_score),
    ("Recall    :", recall_score),
):
    print(caption, metric(y_test, y_pred))

Accuracy  : 0.5619530170428374
Precision : 0.6808664259927798
F1-Score  : 0.6647867465632711
Recall    : 0.6494490358126722


In [53]:
# Confusion matrix on the held-out set: rows are the true classes and columns
# the predicted classes (each row sums to that class's support in the report).
confusion_matrix(y_test, y_pred)

array([[277, 442],
       [509, 943]], dtype=int64)