<a href="https://colab.research.google.com/github/subhash-ranjan/sentiment-classifier/blob/main/ReviewsClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Class

In [36]:
import random

class Sentiment:
  NEGATIVE = 'NEGATIVE'
  NEUTRAL = 'NEUTRAL'
  POSITIVE = 'POSITIVE'

class Review:
  def __init__(self, text, score):
    self.text = text
    self.score = score
    self.sentiment = self.get_sentiment()

  def get_sentiment(self):
    if self.score <3:
      return Sentiment.NEGATIVE
    elif self.score == 3:
      return Sentiment.NEUTRAL
    else:
      return Sentiment.POSITIVE

class ReviewContainer:
  def __init__(self,reviews):
    self.reviews = reviews

  def get_text(self):
    return [x.text  for x in self.reviews]

  def get_sentiment(self):
    return [x.sentiment  for x in self.reviews]

  def evenly_distribute(self):
    negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
    positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
    neutral = list(filter(lambda x: x.sentiment == Sentiment.NEUTRAL, self.reviews))
    positive_new= positive[:len(negative)]
    self.reviews = negative + positive_new + neutral

    random.shuffle(self.reviews)


In [18]:
import json 

file_name = 'amazon-data.json'

reviews =[]
with open(file_name) as f:
  count =0

  for line in f:
    # count=count+1
    # if (count>1000):
    #   break

    review = json.loads(line)
    reviews.append(Review(review['reviewText'], review['overall']))


10000


Prepare data

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

training, test = train_test_split(reviews, test_size=0.33, random_state=41)

training_container = ReviewContainer(training)
test_container = ReviewContainer(test)

training_container.evenly_distribute()
test_container.evenly_distribute()

train_x = training_container.get_text()
train_y = training_container.get_sentiment()

test_x = test_container.get_text()
test_y = test_container.get_sentiment()


# Bag of words vectorization
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

# print(train_x_vectors[0].toarray())


## Classification

#### Liner SVM

In [50]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

clf_svm.predict(test_x_vectors[236])

        

array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [42]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[236])


array(['POSITIVE'], dtype='<U8')

#### Naive Bais

In [44]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors.toarray(), train_y)

#  clf_gnb.predict(test_x_vectors.toarray()[236])


GaussianNB()

#### Logistic Regession

In [45]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression( max_iter=500)
clf_log.fit(train_x_vectors, train_y)
clf_log.predict(test_x_vectors[236])

array(['POSITIVE'], dtype='<U8')

## Evaluation

In [51]:
# Mean Accuracy

print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

# F1 Scores
from sklearn.metrics import f1_score
f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])


0.6504854368932039
0.4563106796116505
0.5284327323162274


array([0.65927978, 0.68032787, 0.57879656])

In [54]:
# Evaluate

test_new = ['this is great', 'not that good','this is good','very bad', 'this is worse']
new_data = vectorizer.transform(test_new)
clf_svm.predict(new_data)

array(['POSITIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE'],
      dtype='<U8')

Tuning the model further

In [58]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel':('linear','rbf'), 'C':(1,4,8,16)}
# clf = GridSearchCV(svm.SVC(), parameters, cv=5)
# clf.fit(train_x_vectors, train_y)
# print(clf.score(test_x_vectors, test_y))



Save/Load model

In [59]:
import pickle

with open('/sentiment_classifier.pkl','wb') as f:
  pickle.dump(clf, f)

In [74]:
with open('/sentiment_classifier.pkl','rb') as f:
  loaded_model = pickle.load(f)

print(loaded_model.predict(test_x_vectors[39]))


['POSITIVE']
