## Loading Libraries


In [1]:
import json

from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer #<-- to vectorize bag of words
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

## Data Class


In [2]:
class Sentiment:
    """String labels for the three sentiment classes a review can be given."""
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'


class Review:
    """A single review: its text, its numeric score, and the derived sentiment label."""

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Derived once, at construction time, from the score.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to a ``Sentiment`` label.

        Returns:
            ``Sentiment.NEGATIVE`` for scores of 2 or lower,
            ``Sentiment.POSITIVE`` for scores of 4 or higher,
            ``Sentiment.NEUTRAL`` for anything in between.
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        if self.score >= 4:
            return Sentiment.POSITIVE
        # Everything strictly between 2 and 4 is neutral. Testing `== 3` here would
        # let a fractional score such as 2.5 fall through to the POSITIVE branch.
        return Sentiment.NEUTRAL


## Load Data

In [3]:
file_name = './data/sentiments/books_small.json'

# Each line of the file is parsed as its own JSON object holding one review.
reviews = []
# Read as UTF-8 explicitly so decoding does not depend on the platform's default encoding.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

print(reviews[5].sentiment)
        

POSITIVE


## Prep Data

In [4]:
# Hold out 33% of the reviews for testing. Fixing random_state makes the shuffle
# deterministic, so every run produces the same train/test partition.
training, test = train_test_split(
    reviews,
    test_size=0.33,
    random_state=42,
)
training[0].text

"Vivid characters and descriptions. The author has created a tale that grabs your attention and I couldn't put it down."

In [5]:
# Separate each partition into model inputs (review text) and targets (sentiment label).
train_x, train_y = [r.text for r in training], [r.sentiment for r in training]
test_x, test_y = [r.text for r in test], [r.sentiment for r in test]

train_x[0]

"Vivid characters and descriptions. The author has created a tale that grabs your attention and I couldn't put it down."

#### Bag-of-words vectorization

In [6]:
# Learn the vocabulary from the training texts and encode them as word-count
# vectors in one call. (vectorizer.fit(train_x) followed by
# vectorizer.transform(train_x) would do the same thing in two steps.)
vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

# The test texts are only transformed, reusing the vocabulary fitted above.
test_x_vectors = vectorizer.transform(test_x)

# Show the first training review followed by its count vector.
print(train_x[0], train_x_vectors[0].toarray(), sep='\n')

Vivid characters and descriptions. The author has created a tale that grabs your attention and I couldn't put it down.
[[0 0 0 ... 0 0 0]]


## Classification

### Linear SVM

In [7]:
# Support vector classifier with a linear kernel.
# `svm` is imported in the import cell at the top of the notebook.
clf_svm = svm.SVC(kernel = 'linear')

In [13]:
# Train the linear SVM on the bag-of-words vectors of the training reviews.
clf_svm.fit(train_x_vectors, train_y)

# Classify the first held-out review (its raw text is test_x[0]). A bare
# `test_x[0]` in the middle of the cell displayed nothing, so it was dropped.
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Decision Tree

In [9]:
# `DecisionTreeClassifier` is imported in the import cell at the top of the notebook.
# A fixed random_state makes the fitted tree reproducible from run to run; the
# same seed (42) is used for the train/test split.
clf_dec = DecisionTreeClassifier(random_state=42)

In [14]:
# Train the decision tree on the bag-of-words vectors of the training reviews.
clf_dec.fit(X=train_x_vectors, y=train_y)

# Predict the sentiment of the first held-out review.
clf_dec.predict(X=test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')