# Load in the necessary libraries

In [76]:
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

### Data Class

In [54]:
class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"
    
class Review:
    
    def __init__(self,text,score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE

### Load Data

In [55]:
file_name = "Downloads/Books_small.json"

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review["reviewText"],review["overall"]))
        
reviews[5].sentiment


'POSITIVE'

### Prep Data

In [64]:
train,test = train_test_split(reviews, test_size=0.33, random_state=42)

In [69]:
train_x = [x.text for x in train]
train_y = [y.sentiment for y in train]

test_x = [x.text for x in test]
test_y = [y.sentiment for y in test]

train_y[10]

'POSITIVE'

### Bag of word vectorization

In [86]:
vectorizer = CountVectorizer()
train_x_vector = vectorizer.fit_transform(train_x)
test_x_vector = vectorizer.transform(test_x)

### Classsification

#### Linear SVM

In [121]:
from sklearn import svm


clf_svm = svm.SVC(kernel="linear")
clf_svm.fit(train_x_vector, train_y)

test_x[100]

clf_svm.predict(test_x_vector[100])

array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [122]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier(random_state=0)
clf_dec.fit(train_x_vector, train_y)

test_x[100]

clf_dec.predict(test_x_vector[100])

array(['POSITIVE'], dtype='<U8')

#### Naive Bayes

In [123]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vector.toarray(), train_y)

test_x[100]

clf_gnb.predict(test_x_vector[100].toarray())

array(['POSITIVE'], dtype='<U8')

#### Logistic Regression

In [124]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression(random_state=0)

clf_log.fit(train_x_vector, train_y)

test_x[200]

clf_log.predict(test_x_vector[200])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array(['POSITIVE'], dtype='<U8')

### Evaluation

In [132]:
# Mean accuracy
print(clf_svm.score(test_x_vector,test_y))
print(clf_dec.score(test_x_vector,test_y))
print(clf_gnb.score(test_x_vector.toarray(),test_y))
print(clf_log.score(test_x_vector,test_y))

0.8242424242424242
0.7696969696969697
0.8121212121212121
0.8303030303030303


In [143]:
# F1 scores
from sklearn.metrics import f1_score

print(f1_score(test_y,clf_svm.predict(test_x_vector), average=None))
print(f1_score(test_y,clf_dec.predict(test_x_vector), average=None))
print(f1_score(test_y,clf_gnb.predict(test_x_vector.toarray()), average=None))
print(f1_score(test_y,clf_log.predict(test_x_vector), average=None))

[0.22222222 0.21052632 0.91319444]
[0.06896552 0.10169492 0.87412587]
[0.09090909 0.08510638 0.89678511]
[0.1        0.12244898 0.91370558]
