[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/prabuscihero/NLP-Basic-to-Bert/blob/master/LinearSvm_Glovefeatures.ipynb)

Model - Linear SVM with Glove Features

In [0]:
from sklearn.svm import LinearSVC
import pandas as pd
from sklearn.pipeline import Pipeline
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report
import numpy as np
from torchtext.vocab import GloVe
import pickle
from sklearn.model_selection import GridSearchCV

In [3]:
from google.colab import files
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [4]:
cd '/content/gdrive/My Drive/Colab Notebooks/NLP-Basic-to-Bert'

/content/gdrive/My Drive/Colab Notebooks/NLP-Basic-to-Bert


In [0]:
# Load the dataset
train_df = pd.read_csv('training_data.csv')
test_df = pd.read_csv('testing_data.csv')

In [0]:
### Download the glove embedding 
glove = GloVe(name = '6B', dim = 300)
np.save('embed.npy', glove.vectors.numpy())
pickle.dump(glove.stoi, open('stoi.pkl', 'wb'))

In [0]:
#### Load the embeddings and corresponding word indics
embeddings = np.load('embed.npy')
word_to_indx = pickle.load(open('stoi.pkl', 'rb'))

In [0]:
### Find the number of words and remove review with zero words
train_df['length']= train_df.user_review.str.split().apply(len)
test_df['length'] = test_df.user_review.str.split().apply(len)
train_df=train_df[train_df.length!=0]
test_df = test_df[test_df.length!=0]

In [0]:
# Create embbedding featues by averaging the embedding values of all the words in the review

train_df['user_review'] = train_df.user_review.str.lower()
train_df['emb_feature'] = train_df['user_review'].apply(lambda x:np.array([embeddings[word_to_indx.get(word, 0)] for word in x.split()]).mean(0))

test_df['user_review'] = test_df.user_review.str.lower()
test_df['emb_feature'] = test_df['user_review'].apply(lambda x:np.array([embeddings[word_to_indx.get(word, 0)] for word in x.split()]).mean(0))

In [0]:
# Convert them to np array for optimization
train_features = np.stack(train_df['emb_feature'])
np.save('train_features.npy', train_features)

test_features = np.stack(test_df['emb_feature'])
np.save('test_features.npy', test_features)

train_class = train_df['user_rating']
test_class = test_df['user_rating']

np.save('train_class.npy',train_class)
np.save('test_class.npy',test_class)


In [0]:
train_features = np.load('train_features.npy')
test_features = np.load('test_features.npy')
train_class = np.load('train_class.npy')
test_class = np.load('test_class.npy')

In [13]:
# Find the best parameters using Grid Search
model =LinearSVC()
parameters = {'C': [0.01,0.1,1,10,100]}
grid_search = GridSearchCV(LinearSVC(), parameters, n_jobs=-1)
grid_search.fit(train_features, train_class)

print('best parameters: ', grid_search.best_params_)



best parameters:  {'C': 10}




In [14]:
# Build the model using the best parameter

model = LinearSVC(C= 10)
model.fit(train_features,train_class)



LinearSVC(C=10, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [15]:
# Find the Classification report for the model

prediction=model.predict(test_features)
print(classification_report(test_class, prediction))

              precision    recall  f1-score   support

           0       0.82      0.83      0.83     24626
           1       0.84      0.83      0.83     25374

    accuracy                           0.83     50000
   macro avg       0.83      0.83      0.83     50000
weighted avg       0.83      0.83      0.83     50000

