In [21]:
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import ssl
import heapq
import re
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
pd.options.mode.chained_assignment = None

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/samuelwang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/samuelwang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
training = pd.read_json('Grocery_and_Gourmet_Food_Reviews_training.json', lines=True)

In [5]:
textOnly = training[['overall', 'reviewText']]

In [6]:
textOnly.dropna();

In [7]:
textOnlyRatings = textOnly['overall']
del textOnly['overall']

In [8]:
# textOnly is a dataframe with only text of review
# textOnlyRatings is a dataframe with only rating of review.
# Each index in textOnly corresponds to the same index in textOnlyRatings

In [9]:
textOnly.values.flatten()

array(["Can't wait to get cooking with these. I have heard about capers and had never tasted them. However aster tasting them I can't wait to get experimenting with some different recipes.",
       'Large and good tasting', 'wonderful', ...,
       "Quinine bitterness not the least bit detectable. In fact not bitter even when drunk straight up.. Overwhelming sourness from all the citric acid. May make a decent albeit pricey whiskey sour. Will neither buy again nor recommend it to anyone. Grocery store tonics are far far better at a small fraction of the price. C'est la vie!",
       "i guess i'm so used to regular tonic, this did not impress me that much, even though it gets great reviews. thought i'd try it at least once.",
       "Maybe this works when added to a drink, but I wanted to use it to make tonic water and the result tasted so bad when I added it to some soda water I made on my Sodastream that I couldn't drink it!  The remainder of the bottle of concentrate went down the dr

In [10]:
dataset = textOnly.values.flatten()[:100000] 
for i in range(len(dataset)): 
    dataset[i] = str(dataset[i]).lower() 
    dataset[i] = re.sub(r'\W', ' ', dataset[i]) 
    dataset[i] = re.sub(r'\s+', ' ', dataset[i])

In [11]:
print(dataset)

['can t wait to get cooking with these i have heard about capers and had never tasted them however aster tasting them i can t wait to get experimenting with some different recipes '
 'large and good tasting' 'wonderful' ...
 'great never had these before now i eat them when i need a chocolate fix as they are big oh yeah and for baking too he he great nestle flavor yummy'
 'love dark chocolate morsels right out of the package in my cookies cakes melted for decorating '
 'absolutely delicious makes the best chocolate chip cookies my order came in perfect shape no issues some commenters said their order arrived melted mine did not it s worth noting these chips are a bit larger than the regular semi sweet chips which makes them a bit better if you ask me ']


In [12]:
word2count = {} 
stop = stopwords.words('english')
for data in dataset: 
    words = nltk.word_tokenize(data) 
    for word in words: 
        if word not in stop:
            if word not in word2count.keys(): 
                word2count[word] = 1
            else: 
                word2count[word] += 1

In [13]:
print(word2count)



In [14]:
freq_words = heapq.nlargest(100, word2count, key=word2count.get)
# 100 says that we should only pay attention to the top 100 most frequent words. Change as youd like

In [15]:
# create word count vector for each review
X = [] 
for data in dataset: 
    vector = [] 
    for word in freq_words: 
        if word in nltk.word_tokenize(data): 
            vector.append(1) 
        else: 
            vector.append(0) 
    X.append(vector) 
X = np.asarray(X) 

In [50]:
print(X)

[[0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [17]:
# store awesome or not for each review within Y
Y = []
scores = textOnlyRatings.values.flatten()[:100000] 
for score in scores:
    Y.append(score > 4.4) # awesome or not is determined by whether score is greater than 4.4
Y = np.asarray(Y)
df = pd.DataFrame(X) 
print("Number of samples: {} Number of Features: {}".format(df.shape[0], df.shape[1]))

Number of samples: 100000 Number of Features: 100


In [31]:
# split the data into training and testing
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

In [33]:
# create model, fit, and score
model = GaussianNB()
model.fit(X_train, Y_train)
model.score(X_test, Y_test)

0.6904242424242424