In [29]:
import numpy as np
import pandas as pd
import sklearn

Import the restaurant review dataset

In [30]:
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

Preprocessing the text

In [31]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
corpus_tokenised=[] 
###
sentence='He was relaxed despite the danger'   # sentence of which sentiment has to be found
###
sentence = sentence.lower()       # lowercase
sentence = re.sub('[^a-zA-Z]', ' ', sentence)
sentence = sentence.split()
ps = PorterStemmer()              # stemming
sentence = [ps.stem(word) for word in sentence if not word in set(stopwords.words('english'))]        # removal of stopwords
sentence = ' '.join(sentence)
for i in range(0, 1000):          # the same preprocessing for dataset sentences
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Importing the bag of words vectorization module

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

Splitting into train and test

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 0)


Adding few manually labelled sentences containing discourse relations due to unavailability of the discourse dataset

In [34]:
sentence_list_discourse = ['I am quite excited about Tintin despite not really liking the original comics',
                           'If Micromax improved its battery life, it would have been a great product',
                           'My daughter is off school very poorly but brightened up when we saw you on television today',
                           'I do not like this house',
                           'I was feeling happy throughout the day until I saw my marks which made me upset',
                           'Rohan was very angry with his performance in spite of being the winner of the tournament',
                           'You should be ashamed of yourself',
                           'Tomorrow might be a better day for us',
                           'Ram misbehaved with his boss and as a result, he lost his job',
                           'You had no real knowledge and therefore no way to make a wise decision',
                           'He gave his best effort yet failed in completing the task',
                           'Rahul was never rude to anyone',
                           'I was filled with joy until I heard about the sad news of him passing away despite being so young and active']
                        
label_list_discourse = [1,
                        1,
                        1,
                        0,
                        0,
                        0,
                        0,
                        1,
                        0,
                        0,
                        0,
                        1,
                        0]

In [35]:
y_train=y_train.tolist()
X_train = X_train.tolist()
for sent in sentence_list_discourse:    #appending the above sentences into to the training set
  rv=cv.transform([sent]).toarray()
  
  X_train.append(rv[0])

for label in label_list_discourse:
  y_train.append(label) 


Using SVM to train the model

In [36]:
import time
from sklearn import svm
from sklearn.metrics import classification_report
classifier_linear = svm.SVC(kernel='linear',probability=True)
t0 = time.time()
classifier_linear.fit(X_train, y_train)
t1 = time.time()
prediction_linear = classifier_linear.predict(X_test)
prob = classifier_linear.predict_proba(X_test)
#print(prob)
#print(prediction_linear)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(y_test, prediction_linear, output_dict=True)
print('positive: ', report['1'])
print('negative: ', report['0'])

Training time: 8.091170s; Prediction time: 0.269671s
positive:  {'precision': 0.7307692307692307, 'recall': 0.7755102040816326, 'f1-score': 0.7524752475247524, 'support': 49}
negative:  {'precision': 0.7708333333333334, 'recall': 0.7254901960784313, 'f1-score': 0.7474747474747475, 'support': 51}


In [37]:
review_vector = cv.transform([sentence]).toarray()   # vectorising the test sentence

In [38]:
print(classifier_linear.predict(review_vector))      # predicting the sentiment. 0 is negative and 1 is positive
print(classifier_linear.predict_proba(review_vector)) # probability for each prediction. The first element depicts probabiltity for negative sentiment

[0]
[[0.60087977 0.39912023]]
