## Loading Libraries


In [22]:
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer #<-- to vectorize bag of words

## Data Class


In [2]:
class Sentiment:
    """String labels for the three sentiment categories."""
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'


class Review:
    """A review's text and numeric score, plus the sentiment derived from the score."""

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Derived once at construction from the score assigned just above.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the Sentiment label for ``self.score``.

        A score <= 2 maps to NEGATIVE, a score equal to 3 maps to NEUTRAL,
        and any other score maps to POSITIVE.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        # Everything that is neither <= 2 nor == 3 (e.g. 4 or 5).
        return Sentiment.POSITIVE


## Load Data

In [3]:
file_name = './data/sentiments/books_small.json'

# The file is parsed line by line: each non-blank line is decoded as one JSON
# object from which 'reviewText' and 'overall' are read.
reviews = []
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a stray empty line at the end of the file)
        # instead of letting json.loads raise on an empty string.
        if not line.strip():
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# Sanity check: show the sentiment derived for one loaded review.
print(reviews[5].sentiment)
        

POSITIVE


## Prep Data

In [16]:
# Hold out 33% of the reviews as the test set; the fixed random_state
# gives the same split on every run.
training, test = train_test_split(
    reviews,
    test_size=0.33,
    random_state=42,
)

# Peek at the text of the first training review.
training[0].text

"Vivid characters and descriptions. The author has created a tale that grabs your attention and I couldn't put it down."

In [21]:
# Inputs are the raw review texts; targets are the matching sentiment labels,
# both taken from `training` in the same order.
train_x = [sample.text for sample in training]
train_y = [sample.sentiment for sample in training]

# Show the first training text.
train_x[0]

"Vivid characters and descriptions. The author has created a tale that grabs your attention and I couldn't put it down."

#### Bag of words vectorization

In [26]:
# Bag-of-words vectorizer, created with its default settings.
vectorizer = CountVectorizer()
# Fit the vectorizer on the training texts and vectorize them in one call.
train_x_vectors = vectorizer.fit_transform(train_x) #vectorize the train_x

# Show the first training text next to its vector; the printed vector is a
# list of (row, column) positions with their values.
print(train_x[0])
print(train_x_vectors[0])

Vivid characters and descriptions. The author has created a tale that grabs your attention and I couldn't put it down.
  (0, 7086)	1
  (0, 1148)	1
  (0, 350)	2
  (0, 1800)	1
  (0, 6595)	1
  (0, 562)	1
  (0, 3054)	1
  (0, 1558)	1
  (0, 6475)	1
  (0, 6593)	1
  (0, 2895)	1
  (0, 7353)	1
  (0, 539)	1
  (0, 1515)	1
  (0, 5197)	1
  (0, 3545)	1
  (0, 2007)	1
