In [14]:
import numpy as np
import pandas as pd
import sklearn

Discourse relations

In [15]:
Conj_Fol = ['but','however','nevertheless','otherwise','yet','still','nonetheless']
Conj_Prev = ['till','until','despite','in spite','though','although']
Conj_Infer = ['therefore','furthermore','consequently','thus','as a result','subsequently','eventually','hence']
Conditionals = ['if']
Strong_Mod = ['might','could','can','would','may']
Weak_Mod = ['should','ought to','need not','shall','will','must']
Neg = ['not','neither','never','no','nor']
A = ['but','however','nevertheless','otherwise','yet','still','nonetheless','till','until','despite','in spite','though','although','therefore',
'furthermore','consequently','thus','as a result','subsequently','eventually','hence','if','might','could','can','would','may','should','ought to','need not','shall','will','must',
'not','neither','never','no','nor']

Neg_window = 5

Applying the discourse algorithm on the tokenised sentence

In [16]:
def discourse(sentence):
  fij = [1]*len(sentence)
  hypij=[0]*len(sentence)
  flipij=[1]*len(sentence)

  for i,word in enumerate(sentence):
    if (word in Conditionals) or (word in Strong_Mod):
      hypij[i]=1

    if (word in Conj_Fol) or (word in Conj_Infer) or (i<=len(sentence)-3 and sentence[i]=='as' and sentence[i+1]=='a' and sentence[i+2]=='result'):
      for k in range(i+1,len(sentence)):
        if (sentence[k] not in A):
          fij[k]+=1

    if (word in Conj_Prev) or (i<=len(sentence)-2 and sentence[i]=='in' and sentence[i+1]=='spite'):
      for k in range(0,i):
        if (sentence[k] not in A):
          fij[k]+=1

    if (word in Neg):
      for k in range(1,Neg_window):
        if (i+k<len(sentence)):
          if (sentence[i+k] in Conj_Prev) or (sentence[i+k] in Conj_Fol):
            break
          else:  
            flipij[i+k]=-1

  return sentence,fij,flipij,hypij

Import the restaurant review dataset

In [17]:
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

Preprocessing the text

In [27]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
corpus_tokenised=[]
max_length = 0         # maximum length of the sentence in the dataset, required for padding
###
sentence='He was relaxed despite the danger'     # sentence of which sentiment has to be found
###
sentence = sentence.lower()        # lowercase
sentence_tokenised=nltk.word_tokenize(sentence)       # tokenization
sentence = re.sub('[^a-zA-Z]', ' ', sentence)
sentence = sentence.split()
ps = PorterStemmer()
sentence = [ps.stem(word) for word in sentence if not word in set(stopwords.words('english'))]      # removal of stopwords
sentence = ' '.join(sentence)
for i in range(0, 1000):              # the same preprocessing for dataset sentences
    review=dataset['Review'][i]
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review_tokenised=nltk.word_tokenize(review)
    
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    
    max_length=max(max_length,len(review_tokenised))
    corpus_tokenised.append(review_tokenised)
    corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Importing the bag of words vectorization module

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
new_X=[]

appending the discourse features to the sentence vectors

In [20]:
for key,l in enumerate(X):
  wij,fij,flipij,hypij = discourse(corpus_tokenised[key])
  row=X[key].tolist()
  for i in range(max_length):
    if(i<len(wij)):
      row.append(fij[i])
      row.append(flipij[i])
      row.append(hypij[i])
    else:                         #padding with zeros
      row.append(0)
      row.append(0)
      row.append(0)  
  new_X.append(row)    

Splitting into train and test

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size = 0.1, random_state = 0)


Adding few manually labelled sentences containing discourse relations due to unavailability of the discourse dataset

In [22]:
sentence_list_discourse = ['I am quite excited about Tintin despite not really liking the original comics',
                           'If Micromax improved its battery life, it would have been a great product',
                           'My daughter is off school very poorly but brightened up when we saw you on television today',
                           'I do not like this house',
                           'I was feeling happy throughout the day until I saw my marks which made me upset',
                           'Rohan was very angry with his performance in spite of being the winner of the tournament',
                           'You should be ashamed of yourself',
                           'Tomorrow might be a better day for us',
                           'Ram misbehaved with his boss and as a result, he lost his job',
                           'You had no real knowledge and therefore no way to make a wise decision',
                           'He gave his best effort yet failed in completing the task',
                           'Rahul was never rude to anyone',
                           'I was filled with joy until I heard about the sad news of him passing away despite being so young and active']
                        
label_list_discourse = [1,
                        1,
                        1,
                        0,
                        0,
                        0,
                        0,
                        1,
                        0,
                        0,
                        0,
                        1,
                        0]

In [23]:
y_train=y_train.tolist()
for sent in sentence_list_discourse:              #appending the above sentences into to the training set
  rv=cv.transform([sent]).toarray()
  rv=rv.tolist()
  wij,fij,flipij,hypij = discourse(sent)
  for i in range(max_length):
    if(i<len(wij)):
      rv[0].append(fij[i])
      rv[0].append(flipij[i])
      rv[0].append(hypij[i])
    else:
      rv[0].append(0)
      rv[0].append(0)
      rv[0].append(0)
  X_train.append(rv[0])

for label in label_list_discourse:
  y_train.append(label) 


Using SVM to train the model

In [24]:
import time
from sklearn import svm
from sklearn.metrics import classification_report
classifier_linear = svm.SVC(kernel='linear',probability=True)
t0 = time.time()
classifier_linear.fit(X_train, y_train)
t1 = time.time()
prediction_linear = classifier_linear.predict(X_test)
prob = classifier_linear.predict_proba(X_test)
#print(prob)
#print(prediction_linear)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(y_test, prediction_linear, output_dict=True)
print('positive: ', report['1'])
print('negative: ', report['0'])

Training time: 7.235210s; Prediction time: 0.252825s
positive:  {'precision': 0.7450980392156863, 'recall': 0.7755102040816326, 'f1-score': 0.76, 'support': 49}
negative:  {'precision': 0.7755102040816326, 'recall': 0.7450980392156863, 'f1-score': 0.76, 'support': 51}


In [28]:
review_vector = cv.transform([sentence]).toarray()                        #vectorising the test sentence
review_vector=review_vector.tolist()
wij,fij,flipij,hypij = discourse(sentence_tokenised)                      # adding discourse features
for i in range(max_length):
    if(i<len(wij)):
      review_vector[0].append(fij[i])
      review_vector[0].append(flipij[i])
      review_vector[0].append(hypij[i])
    else:
      review_vector[0].append(0)
      review_vector[0].append(0)
      review_vector[0].append(0)

In [29]:
print(classifier_linear.predict(review_vector))                       # predicting the sentiment. 0 is negative and 1 is positive
print(classifier_linear.predict_proba(review_vector))                 # probability for each prediction. The first element depicts probabiltity for negative sentiment

[0]
[[0.50758284 0.49241716]]
