**Importing Libraries**

In [0]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from gensim.models import Word2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics import accuracy_score, confusion_matrix,f1_score

import re
import seaborn as sns
import matplotlib.pyplot as plt
import logging

In [2]:
# Mount Google Drive inside the Colab runtime and make the thesis data folder
# the working directory, so the relative './...' paths used by the later
# read_csv calls resolve against it.
# NOTE(review): the path is hard-coded to one Drive layout; adjust it when
# running from a different account or outside Colab.
from google.colab import drive
import os
drive.mount('/content/drive/')
os.chdir('/content/drive/My Drive/thesis/original_data')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


**Reading train and test data**

In [0]:
# Load the cleaned training split from a tab-separated UTF-8 file; the first
# column of the file is used as the DataFrame index.
train=pd.read_csv('./data_train_clean.tsv',delimiter='\t',encoding='utf-8',index_col=[0])

In [4]:
# Show the last rows to sanity-check the loaded columns (review text and rating).
train.tail()

Unnamed: 0,review,rating
161292,write first report mid october not alcohol sin...,1
161293,give iv surgey immediately become anxious coul...,0
161294,limited improvement month develop bad rash md ...,0
161295,thyroid medication year spend first synthroid ...,1
161296,chronic constipation adult life try linzess wo...,1


In [0]:
# Load the cleaned test split with the same parsing options as the training split.
test=pd.read_csv('./data_test_clean.tsv',delimiter='\t',encoding='utf-8',index_col=[0])

In [7]:
# Count the distinct rating labels present in the test split.
test['rating'].nunique()

2

In [8]:
# (rows, columns) of the training frame.
train.shape


(161291, 2)

In [9]:
# (rows, columns) of the test frame.
test.shape

(53764, 2)

**Downloading pretrained Google news word vector**

In [18]:
# Fetch the pretrained GoogleNews word2vec archive (about 1.5 GB according to
# the wget log) into the current working directory.
# NOTE(review): re-running this cell downloads the archive again; consider
# skipping the download when the file is already present.
!wget "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2019-08-06 19:12:03--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.106.174
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.106.174|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2019-08-06 19:12:52 (32.1 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



**Importing and initializing gensim implementation of word2vec**

In [0]:
# NOTE(review): Word2Vec is already imported in the first cell and is not used
# in this cell.
from gensim.models import Word2Vec
# Load the pretrained vectors from the downloaded archive (binary word2vec format).
wv = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
# word_averaging reads wv.syn0norm, so the normalized vectors have to be
# initialized here before any averaging is done.
# NOTE(review): replace=True presumably overwrites the raw vectors with their
# normalized form to save memory — confirm against the docs of the installed
# gensim version.
wv.init_sims(replace=True)

**Function for averaging word vector**

In [0]:
def word_averaging(wv, words):
    """Return the unit-normalized mean of the word vectors for one tokenized text.

    Parameters
    ----------
    wv : gensim KeyedVectors-like object
        Must expose ``vocab`` (token -> entry with an ``index`` attribute),
        ``syn0norm`` (vector matrix indexable by that index) and
        ``vector_size``.
    words : iterable
        Tokens to look up in ``wv.vocab`` and/or ready-made ``np.ndarray``
        vectors, which are used as-is. Tokens missing from the vocabulary
        are skipped.

    Returns
    -------
    np.ndarray
        float32 vector. When at least one vector was collected, this is their
        mean passed through ``gensim.matutils.unitvec``; otherwise it is an
        all-zero vector of length ``wv.vector_size`` and a warning is logged.
    """
    vectors = []

    for word in words:
        if isinstance(word, np.ndarray):
            # Caller supplied a vector directly; no vocabulary lookup needed.
            vectors.append(word)
        elif word in wv.vocab:
            vectors.append(wv.syn0norm[wv.vocab[word].index])

    if not vectors:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        # Use float32 like the normal return path: a float64 zero row would
        # make np.vstack upcast the whole stacked feature matrix.
        return np.zeros(wv.vector_size, dtype=np.float32)

    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Stack the averaged vector of every tokenized text into one 2-D array.

    Each element of ``text_list`` is passed to ``word_averaging`` together
    with ``wv``; the resulting vectors become the rows of the returned array,
    in the same order as ``text_list``.
    """
    averaged_rows = []
    for post in text_list:
        averaged_rows.append(word_averaging(wv, post))
    return np.vstack(averaged_rows)


In [0]:
# Install NLTK into the runtime and download its data packages so the
# tokenizers used in the next cell are available.
# NOTE(review): nltk.download('all') pulls every NLTK data package; the visible
# cells only call sent_tokenize/word_tokenize, so a much smaller download is
# probably sufficient — confirm before narrowing it.
# NOTE(review): the install is unpinned, so the NLTK version can differ between runs.
!pip install nltk
import nltk
nltk.download('all')

**Tokenization of reviews and creating feature vector**

In [0]:
def w2v_tokenize_text(text):
    """Tokenize ``text`` into words, dropping tokens shorter than two characters.

    The text is first split into sentences and each sentence into word tokens,
    both with NLTK's English tokenizers. Tokens are returned in document order
    as a flat list.
    """
    return [
        word
        for sent in nltk.sent_tokenize(text, language='english')
        for word in nltk.word_tokenize(sent, language='english')
        if len(word) >= 2
    ]
    


# Tokenize every review. The tokenizer is applied to the 'review' column
# directly: DataFrame.apply(axis=1) would build a full row Series per record
# only to read this one field, which is needless overhead on frames this size.
test_tokenized = test['review'].apply(w2v_tokenize_text).values
train_tokenized = train['review'].apply(w2v_tokenize_text).values

# Turn each token list into one averaged word2vec vector; rows stay aligned
# with the rows of train / test.
X_train_word_average = word_averaging_list(wv, train_tokenized)
X_test_word_average = word_averaging_list(wv, test_tokenized)

**Importing and training Logistic Regression with word2vec feature**

In [0]:
from sklearn.linear_model import LogisticRegression

# Build the classifier and fit it on the averaged training vectors in one
# chained call (fit hands back the estimator, as the original reassignment relied on).
logreg = LogisticRegression(n_jobs=1, C=1e5).fit(X_train_word_average, train['rating'])
# Predict labels for the averaged test vectors.
y_pred = logreg.predict(X_test_word_average)


**Evaluating Logistic Regression with Word2vec features on test data**

In [16]:
# Report accuracy and weighted F1 of the logistic-regression predictions
# against the test labels (bracket column access, as in the SVM metrics cell).
print('accuracy %s' % accuracy_score(test['rating'], y_pred))
print('Testing F1 score: {}'.format(f1_score(test['rating'], y_pred, average='weighted')))


accuracy 0.7791272970761104
Testing F1 score: 0.7663903671071017


**Importing and training SVM with word2vec feature**

In [18]:
# Import scikit-learn's SVM module
from sklearn import svm

# Create an SVM classifier with a linear kernel; every other hyperparameter is left at its default
clf = svm.SVC(kernel='linear') # Linear Kernel

# Train on the averaged word2vec features. As the cell's last expression, the fitted estimator's repr is displayed.
clf.fit(X_train_word_average, train['rating'])



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [0]:
# Predict labels for the test features with the fitted SVM.
# Note: this rebinds y_pred, which previously held the logistic-regression predictions.
y_pred = clf.predict(X_test_word_average)

**Evaluating SVM with Word2vec features on test data**

In [20]:
# Report accuracy and weighted F1 of the SVM predictions against the test labels.
print('Testing accuracy %s' % accuracy_score(test['rating'], y_pred))
print('Testing F1 score: {}'.format(f1_score(test['rating'], y_pred, average='weighted')))

Testing accuracy 0.7776951119708355
Testing F1 score: 0.7600319621939203
