In [1]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
import json
import random

## Preparing data

### Getting data

In [2]:
# enum class to prevent floating strings
class Sentiment:
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'

# Record positive, negative or neutral from overall rating
class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
    
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: #Score of 4 or 5
            return Sentiment.POSITIVE
        
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
        
    def get_sentiment(self): 
        return [x.sentiment for x in self.reviews]
                
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        
        # Shuffle order
        random.shuffle(self.reviews)
#         print('jjjy', len(self.reviews))
        

In [3]:
j_file = 'sklearn-master/data/sentiment/Books_small_10000.json'

# Put comment and rating in a reviews list
reviews = []
with open(j_file) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        

### Split to train set and test set

In [4]:
### Split to train set and test set

from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)


In [5]:
# split training tuple in training list
train_container.evenly_distribute()

# The text/comments
train_x = train_container.get_text()
# The category/sentiment
train_y = train_container.get_sentiment()

# Same Thing
test_container.evenly_distribute()
test_x = train_container.get_text()
test_y = train_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

436
436


### Converting text values to vectors

In [6]:
# Bag of words
# print(test_x)
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

## Classification

### Linear SVM

In [7]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

clf_svm.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

### Decision Tree

In [8]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

### Naive Bayes

In [9]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
# clf_gnb.fit(train_x_vectors, train_y)

# clf_gnb.predict(test_x_vectors[0])

### Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])



array(['NEGATIVE'], dtype='<U8')

## Evaluation

### Mean Accuracy

In [11]:
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))
# print(clf_gnb.score(test_x_vectors, test_y))


1.0
1.0
0.9988532110091743


### F1 scores

In [12]:
from sklearn.metrics import f1_score

print(f1_score(test_y, clf_svm.predict(test_x_vectors)
         , average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE, Sentiment.NEUTRAL]))
print(f1_score(test_y, clf_dec.predict(test_x_vectors)
         , average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE, Sentiment.NEUTRAL]))
print(f1_score(test_y, clf_log.predict(test_x_vectors)
         , average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE, Sentiment.NEUTRAL]))

[1. 1. 0.]
[1. 1. 0.]
[0.99885452 0.99885189 0.        ]


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [14]:
test_y.count(Sentiment.POSITIVE)

436

In [31]:
new_test_set = ['Thanks for building me, it was great', 'Thumbs up', 'Stan, you suck', 'My first model is really stupid',
                'good stuff', 'Needs some upgrades, but it is great with book related coments']


new_test = vectorizer.transform(new_test_set)

print(clf_svm.predict(new_test))
# print(clf_log.predict(new_test))

['NEGATIVE' 'NEGATIVE' 'NEGATIVE' 'NEGATIVE' 'POSITIVE' 'POSITIVE']
