In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import json
import random

In [4]:
class Sentiment:
  """String labels assigned to every review; they are also the target values the classifiers are trained on."""
  negative="NEGATIVE"
  positive="POSITIVE"
class Review:
  """One review: its raw text, its numeric score, and the sentiment label derived from the score."""

  def __init__(self,text,score):
    """Store the text and score, then derive the sentiment label once at construction time."""
    self.text,self.score=text,score
    self.sentiment=self.get_sentiment()

  def get_sentiment(self):
    """Return Sentiment.negative for scores below 3, otherwise Sentiment.positive.

    A score of exactly 3 is deliberately counted as positive, together with 4 and 5.
    """
    scored_low=self.score<3
    return Sentiment.negative if scored_low else Sentiment.positive
class ReviewContainer:
  """Holds a list of reviews and can rebalance it so both sentiment classes are equally represented."""

  def __init__(self,reviews):
    """Keep a reference to the given list of review objects (each must expose `.sentiment`)."""
    self.reviews=reviews

  def evenly_distibute(self,seed=None):
    """Down-sample the larger sentiment class so both classes have the same size, then shuffle.

    The first `n` reviews of each class (in their current order) are kept, where `n` is
    the size of the smaller class. Previously only the positive class was truncated,
    which left the data unbalanced whenever negatives outnumbered positives.

    seed: optional; when given, the shuffle uses a private random.Random(seed) so the
          resulting order is reproducible. When None, the module-level `random` state
          is used, exactly as before.

    Replaces self.reviews with a new list; the list passed to __init__ is not modified.
    """
    negative=[r for r in self.reviews if r.sentiment==Sentiment.negative]
    positive=[r for r in self.reviews if r.sentiment==Sentiment.positive]
    # Truncate BOTH classes to the smaller size so the result is balanced in either direction
    keep=min(len(negative),len(positive))
    balanced=negative[:keep]+positive[:keep]
    if seed is None:
      random.shuffle(balanced)
    else:
      random.Random(seed).shuffle(balanced)
    self.reviews=balanced

  # Correctly spelled alias; the original (misspelled) name is kept for existing callers
  evenly_distribute=evenly_distibute

In [5]:
# Parse the JSON-lines file: every non-empty line is one JSON object holding the
# review text ('reviewText') and its star score ('overall').
reviews=[]
# Explicit UTF-8 so decoding does not depend on the platform's default encoding
with open("rev.json",encoding="utf-8") as f:
  for line in f:
    line=line.strip()
    if not line:
      # tolerate blank lines (e.g. a trailing newline at the end of the file)
      continue
    review_json=json.loads(line)
    review=Review(review_json['reviewText'], review_json['overall'])
    reviews.append(review)

In [6]:
# Spot-check the first parsed review. None of these expressions is assigned or
# printed, so only the value of the final one (the sentiment) is echoed by the cell.
reviews[0].text
reviews[0].score
reviews[0].sentiment

'POSITIVE'

## Train test split

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the reviews for evaluation. random_state pins the shuffle so a
# fresh "Restart & Run All" produces the same partition (and comparable scores).
training,testing=train_test_split(reviews,test_size=0.33,random_state=42)

In [8]:

# Number of reviews in the training partition (before any class balancing)
len(training)

6700

In [9]:
# Balance the sentiment classes in both partitions so neither label dominates.
# evenly_distibute() shuffles with the global `random` module, so seed it first;
# without this the order of the balanced lists changes on every run.
random.seed(42)
traincontainer=ReviewContainer(training)
traincontainer.evenly_distibute()
testcontainer=ReviewContainer(testing)
testcontainer.evenly_distibute()

In [10]:

# Separate each balanced partition into model inputs (review text) and targets (sentiment label)
# -- training partition
x_train=[review.text for review in traincontainer.reviews]
y_train=[review.sentiment for review in traincontainer.reviews]
# -- test partition
x_test=[review.text for review in testcontainer.reviews]
y_test=[review.sentiment for review in testcontainer.reviews]

In [11]:
# Class-balance check on the training labels. The first count is computed but not
# shown; only the last expression (the NEGATIVE count) is echoed by the cell.
y_train.count(Sentiment.positive)
y_train.count(Sentiment.negative)

451

## Vectorization

In [12]:
# Turn the review texts into TF-IDF feature vectors (CountVectorizer would be an alternative).
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer()
# Fit on the training texts only, then apply the already-fitted vectorizer to the test texts
x_train_vector=vectorizer.fit_transform(x_train)
x_test_vector=vectorizer.transform(x_test)

## Accuracy and f1 score for SVC 

In [13]:

from sklearn.metrics import accuracy_score,f1_score
from sklearn.svm import SVC

# Support-vector classifier with scikit-learn's default settings, trained on the TF-IDF vectors
clf_svm=SVC().fit(x_train_vector,y_train)
y_pred=clf_svm.predict(x_test_vector)

# Overall accuracy on the balanced test set; the value is kept in clf_svm_score
clf_svm_score=accuracy_score(y_test,y_pred)
print(clf_svm_score)
# Per-class F1, reported in the order [POSITIVE, NEGATIVE]
print(f1_score(y_test,y_pred,average=None,labels=[Sentiment.positive,Sentiment.negative]))

0.7953367875647669
[0.79265092 0.79795396]


## Accuracy and f1 score for Decision Tree

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,f1_score

# Decision tree on the same TF-IDF features. random_state fixes the tree's internal
# randomness so the reported scores are reproducible between runs.
clf_dec=DecisionTreeClassifier(random_state=42)
clf_dec.fit(x_train_vector,y_train)
y_pred=clf_dec.predict(x_test_vector)

# Overall accuracy on the balanced test set; the value is kept in clf_dec_score
clf_dec_score=accuracy_score(y_test,y_pred)
print(clf_dec_score)
# Per-class F1, reported in the order [POSITIVE, NEGATIVE]
print(f1_score(y_test,y_pred,average=None,labels=[Sentiment.positive,Sentiment.negative]))

0.6373056994818653
[0.64467005 0.62962963]


## Accuracy and f1 score for LogisticRegression

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score

# Logistic regression with default settings, trained on the TF-IDF vectors
clf_log=LogisticRegression()
clf_log.fit(x_train_vector,y_train)

# Predict the held-out texts and score them; clf_log_score keeps the accuracy
y_pred=clf_log.predict(x_test_vector)
clf_log_score=accuracy_score(y_test,y_pred)
print(clf_log_score)
# Per-class F1, reported in the order [POSITIVE, NEGATIVE]
print(f1_score(y_test,y_pred,average=None,labels=[Sentiment.positive,Sentiment.negative]))

0.810880829015544
[0.80839895 0.81329923]


## Prediction

In [94]:
# Try the logistic-regression model on a few hand-written phrases.
# The phrases go through the already-fitted TF-IDF vectorizer before prediction.
x=clf_log.predict(
    vectorizer.transform([
        "this is execellent",
        "execellent",
        "awesome",
        "worst one",
        "highly prefferable",
        "tasty",
        "the food is good and the toilets are bad. The managment is good. There are good discounts.",
    ])
)
print(x)
# Number of phrases that were classified
len(x)

['POSITIVE' 'POSITIVE' 'POSITIVE' 'NEGATIVE' 'POSITIVE' 'POSITIVE'
 'POSITIVE']


7

## Developing a 5-star rating for restaurants

In [95]:
# Reshape the predicted labels into a single-column 2-D array (one row per phrase)
y = x.reshape(-1,1)
print (y)

[['POSITIVE']
 ['POSITIVE']
 ['POSITIVE']
 ['NEGATIVE']
 ['POSITIVE']
 ['POSITIVE']
 ['POSITIVE']]


In [78]:
# Show the row at index 1, i.e. the prediction for the second phrase
print(y[1])


['POSITIVE']


In [79]:
# Count how many of the sample predictions are not NEGATIVE; j feeds the star-rating cell.
# y must be the column form of the current x, otherwise this count and len(x) disagree
# (easy to hit when cells are re-run out of order).
assert len(y) == len(x), "y is stale: re-run the reshape cell after changing x"
j = 0
k = []
for row in y:
    # row is a length-1 array, so compare its single label instead of the whole row
    if row[0] != Sentiment.negative:
        k = row
        print(k)
        j += 1
print(j)

['POSITIVE']
['POSITIVE']
['POSITIVE']
['POSITIVE']
['POSITIVE']
5


In [80]:
# Heuristic star rating: the share of sample phrases predicted non-negative, mapped onto
# a 5-star scale and scaled by the model's test accuracy.
# (100*j)/(len(x)*20) is the same quantity as 5*j/len(x).
# The sample predictions came from clf_log, so use its stored accuracy (clf_log_score)
# rather than re-scoring y_pred, which holds the output of whichever model cell ran last.
rating = clf_log_score*(100*j)/(len(x)*20)
# Built-in round() works whether the score is a NumPy float or a plain Python float
# (a plain float has no .round method)
print(round(rating, 1))

3.4
