<a href="https://colab.research.google.com/github/subhash-ranjan/sentiment-classifier/blob/main/ReviewsClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Class

In [1]:
class Sentiment:
  NEGATIVE = 'NEGATIVE'
  NEUTRAL = 'NEUTRAL'
  POSITIVE = 'POSITIVE'

class Review:
  def __init__(self, text, score):
    self.text = text
    self.score = score
    self.sentiment = self.get_sentiment()

  def get_sentiment(self):
    if self.score <3:
      return Sentiment.NEGATIVE
    elif self.score == 3:
      return Sentiment.NEUTRAL
    else:
      return Sentiment.POSITIVE


In [5]:
import json 

file_name = 'amazon-data.json'

reviews =[]
with open(file_name) as f:
  count =0

  for line in f:
    count=count+1
    if (count>1000):
      break

    review = json.loads(line)
    reviews.append(Review(review['reviewText'], review['overall']))

  # print(len(reviews))


Prepare data

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

training, test = train_test_split(reviews, test_size=0.33, random_state=41)

train_x = [x.text  for x in training]
train_y = [x.sentiment  for x in training]

test_x = [x.text  for x in test]
test_y = [x.sentiment  for x in test]


# Bag of words vectorization
vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

# print(train_x_vectors[0].toarray())
print(len(train_x))


670


## Classification

#### Liner SVM

In [47]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

clf_svm.predict(test_x_vectors[236])



array(['NEGATIVE'], dtype='<U8')

#### Decision Tree

In [48]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[236])


array(['POSITIVE'], dtype='<U8')

#### Naive Bais

In [55]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors.toarray(), train_y)

# clf_gnb.predict(test_x_vectors.toarray()[236])


GaussianNB()

#### Logistic Regession

In [59]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression( max_iter=500)
clf_log.fit(train_x_vectors, train_y)
clf_log.predict(test_x_vectors[236])

array(['NEGATIVE'], dtype='<U8')

## Evaluation

In [66]:
# Mean Accuracy

# print(clf_svm.score(test_x_vectors, test_y))
# print(clf_dec.score(test_x_vectors, test_y))
# print(clf_log.score(test_x_vectors, test_y))

# F1 Scores
from sklearn.metrics import f1_score
f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])


array([0.89774697, 0.18867925, 0.13333333])