**Importing Libraries**

In [0]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from gensim.models import Word2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics import accuracy_score, confusion_matrix,f1_score

import re
import seaborn as sns
import matplotlib.pyplot as plt
import logging

In [15]:
import os
from google.colab import drive

# Mount Google Drive at /content/drive/, then make the data folder the working
# directory so that relative './...' file paths resolve against it.
DRIVE_MOUNT_POINT = '/content/drive/'
DATA_DIR = '/content/drive/My Drive/thesis/original_data'

drive.mount(DRIVE_MOUNT_POINT)
os.chdir(DATA_DIR)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


**Reading train and test data**

In [0]:
# Load data_train_clean.tsv (tab-separated, UTF-8); the file's first column
# becomes the DataFrame index instead of a data column.
train = pd.read_csv(
    './data_train_clean.tsv',
    sep='\t',
    encoding='utf-8',
    index_col=[0],
)

In [17]:
# Show the (rows, columns) shape of the training frame.
train.shape

(161291, 2)

In [18]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,review,rating
0,no side effect take combination bystolic mg fi...,1
1,son halfway fourth week intuniv become concern...,1
2,use take another oral contraceptive pill cycle...,0
3,first time use form birth control glad go patc...,1
4,suboxone completely turn life around feel heal...,1


In [0]:
# Load data_test_clean.tsv (tab-separated, UTF-8); the file's first column
# becomes the DataFrame index instead of a data column.
test = pd.read_csv(
    './data_test_clean.tsv',
    sep='\t',
    encoding='utf-8',
    index_col=[0],
)

In [20]:
# Show the (rows, columns) shape of the test frame.
test.shape

(53764, 2)

In [23]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,review,rating
0,try antidepressant year citalopram fluoxetine ...,1
1,son crohn disease well asacol no complaint sho...,1
2,quick reduction symptom,1
3,contrave combine drug use alcohol smoking opio...,1
4,birth control one cycle read review type simil...,1


**TF-IDF feature vectors**

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the TF-IDF vectorizer; it is only constructed here, not fitted.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.80,          # skip terms present in more than 80% of documents
    min_df=2,             # skip terms present in fewer than 2 documents
    max_features=100000,  # upper bound on vocabulary size
)

In [0]:
# Model inputs come from the 'review' column and targets from the 'rating'
# column, taken separately from the train and test frames.
X_train, y_train = train['review'], train['rating']
X_test, y_test = test['review'], test['rating']

In [0]:
# Fit the vectorizer on the training reviews only and transform them, then
# transform the test reviews with that same fitted vectorizer.
train_tv = tfidf_vectorizer.fit_transform(X_train)
test_tv = tfidf_vectorizer.transform(X_test)

In [29]:
# Display the summary repr of the training feature matrix.
train_tv

<161291x27618 sparse matrix of type '<class 'numpy.float64'>'
	with 5620125 stored elements in Compressed Sparse Row format>

In [30]:
# Display the summary repr of the test feature matrix.
test_tv

<53764x27618 sparse matrix of type '<class 'numpy.float64'>'
	with 1865340 stored elements in Compressed Sparse Row format>

**Importing and training Logistic Regression with TF-IDF features, then evaluating accuracy and weighted F1 score on the test data**

In [32]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the TF-IDF training matrix and predict the test set.
# NOTE(review): C=1e5 is a very weak regularisation setting and max_iter=100
# caps the solver's iterations -- worth confirming that the fit converges.
logreg = LogisticRegression(n_jobs=1, C=1e5, max_iter=100)
logreg.fit(train_tv, y_train)
y_pred = logreg.predict(test_tv)

# Report accuracy and class-frequency-weighted F1 on the test labels.
logreg_accuracy = accuracy_score(y_test, y_pred)
logreg_f1_weighted = f1_score(y_test, y_pred, average='weighted')
print('accuracy %s' % logreg_accuracy)
print('Testing F1 score: {}'.format(logreg_f1_weighted))



accuracy 0.8350755152146417
Testing F1 score: 0.8336465464951974




**Importing and training a linear-kernel SVM with TF-IDF features**

In [85]:
# Import scikit-learn's svm module.
from sklearn import svm

# Create a support-vector classifier with a linear kernel; every other setting
# is left at its default.
# NOTE(review): scikit-learn's documentation warns that SVC fit time grows at
# least quadratically with the number of samples. With ~161k training rows this
# cell may be very slow; LinearSVC is the documented alternative, but results
# can differ -- confirm before switching.
clf = svm.SVC(kernel='linear') # linear kernel

# Train on the TF-IDF training matrix. As the cell's last expression, the
# fitted estimator's repr is displayed.
clf.fit(train_tv, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [0]:
# Predict labels for the TF-IDF test matrix with the fitted SVM. This rebinds
# y_pred, so the logistic-regression predictions are no longer available under
# that name.
y_pred = clf.predict(test_tv)

**Printing accuracy and weighted F1 score of the SVM with TF-IDF features on the test data**

In [87]:
# Score whatever y_pred currently holds against the test labels; when the
# notebook is run top to bottom, that is the SVM's predictions.
svm_accuracy = accuracy_score(y_test, y_pred)
svm_f1_weighted = f1_score(y_test, y_pred, average='weighted')
print('Testing accuracy %s' % svm_accuracy)
print('Testing F1 score: {}'.format(svm_f1_weighted))

Testing accuracy 0.837158693549587
Testing F1 score: 0.8333496619893974
