In [2]:
import codecs
import pandas as pd
import itertools as it

from pandas import DataFrame
import os

In [3]:
# Input files read by the cells below: the review texts and their star ratings.
# NOTE(review): line i of each file is presumably the same review - confirm
# against whatever produced these subsets.
review_txt_filepath = 'review_text_rest_subset.txt'
stars_filepath = 'review_stars_rest_subset.txt'

In [7]:
def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text

    filename: path of a UTF-8 text file holding one record per line

    yields each line with every escaped line break (a backslash followed
    by 'n') turned back into a real newline character; the line's own
    terminator is kept exactly as stored in the file
    """

    # The built-in open() with newline='' splits records only on \n, \r and
    # \r\n and hands the terminator back untranslated. The codecs stream
    # reader used previously split on every str.splitlines() boundary
    # (U+2028, form feed, NEL, ...), so a review containing one of those
    # characters was cut into several records and shifted all later indices.
    with open(filename, encoding='utf-8', newline='') as f:
        for review in f:
            yield review.replace('\\n', '\n')

In [6]:
def get_sample_review(review_number):
    """
    fetch a single review, selected by its zero-based
    line position in the review text file
    """

    all_reviews = line_review(review_txt_filepath)
    # islice skips ahead lazily, so only the lines up to the wanted one are read
    wanted = it.islice(all_reviews, review_number, review_number + 1)
    return list(wanted)[0]

In [5]:
def get_stars(review_number):
    """
    retrieve the star rating stored at a particular
    zero-based line index of the stars file and return it

    the rating is returned as the raw line read from the file,
    including any trailing line terminator

    raises IndexError if review_number is past the end of the file
    """

    selected = it.islice(line_review(stars_filepath),
                         review_number, review_number + 1)
    for rating in selected:
        return rating
    # same exception type as indexing an empty list, but with a message
    # that says which index was missing
    raise IndexError(
        'no star rating at line index {}'.format(review_number))

In [8]:
# Show the first star rating. strip() drops the whole line terminator
# (including a '\r' from CRLF files), matching how the ratings are cleaned
# when the full file is loaded further down.
get_stars(0).strip()

'4'

In [9]:
# Display the review stored at zero-based line index 8 (line terminator included).
get_sample_review(8)

"Came here for a burger as one of my friends said they had good burgers and people from work would come by twice a month for a burger run.  Ordered the original was was ok, wasn't as amazing as everyone says it to be. I think the veggie option would have been better.  A little pricy for a take-out burger. \r\n"

In [10]:
# Load every star rating into memory, one list entry per line of the file.
# The encoding is spelled out (the same one line_review uses) so decoding
# does not depend on the platform's default locale encoding.
with open(stars_filepath, encoding='utf-8') as f:
    # remove whitespace characters like `\n` at the end of each line;
    # iterating the file directly avoids holding a second, unstripped copy
    stars = [line.strip() for line in f]

In [11]:
# Number of star ratings loaded.
len(stars)

1570963

In [12]:
# Load every review text into memory, one list entry per line of the file.
# The encoding is spelled out (the same one line_review uses); relying on the
# platform default can fail or garble non-ASCII reviews on some systems.
with open(review_txt_filepath, encoding='utf-8') as f:
    # remove whitespace characters like `\n` at the end of each line;
    # iterating the file directly avoids holding a second, unstripped copy
    texts = [line.strip() for line in f]

In [13]:
# Texts and ratings are paired purely by list position, so the two lists
# must have the same length before they are used together.
assert len(texts) == len(stars), 'review texts and star ratings are misaligned'
len(texts)

1570963

In [14]:
from collections import Counter
 
def balance_classes(xs, ys):
    """Undersample xs, ys to balance classes.

    Every class is capped at the size of the rarest class. Samples keep
    their original order: the first ``n`` occurrences of each label are
    retained, where ``n`` is the count of the least common label. No
    shuffling is done, so the selection follows the order of the input.

    Args:
        xs: indexable sequence of samples, paired with ``ys`` by position.
        ys: sequence of hashable class labels (iterated twice).

    Returns:
        A ``(new_xs, new_ys)`` tuple of lists. Both lists are empty when
        ``ys`` is empty.
    """
    freqs = Counter(ys)
    if not freqs:
        # no labels at all: nothing to balance
        return [], []

    # the least common class is the maximum number we want for all classes
    max_allowable = min(freqs.values())
    # total number of samples still to collect across all classes
    remaining = max_allowable * len(freqs)
    num_added = dict.fromkeys(freqs, 0)
    new_xs = []
    new_ys = []
    for i, y in enumerate(ys):
        if num_added[y] < max_allowable:
            new_xs.append(xs[i])
            new_ys.append(y)
            num_added[y] += 1
            remaining -= 1
            if remaining == 0:
                # every class is full, so no later sample can be accepted
                break
    return new_xs, new_ys

In [15]:
# Class distribution before balancing.
print(Counter(stars))
# Undersample so that every star rating has as many reviews as the rarest one.
balanced_x, balanced_y = balance_classes(texts, stars)
# Class distribution after balancing.
print(Counter(balanced_y))



Counter({'4': 546277, '5': 468813, '3': 302214, '2': 154033, '1': 99626})
Counter({'4': 99626, '3': 99626, '5': 99626, '1': 99626, '2': 99626})


In [16]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

# This vectorizer breaks text into single words and bi-grams
# (ngram_range=(1, 2)) and then calculates the TF-IDF representation.
vectorizer = TfidfVectorizer(ngram_range=(1,2))


# The 'fit' step builds up the vocabulary from all the balanced reviews,
# while the 'transform' step turns each individual text into one row of
# the resulting matrix of TF-IDF weights.
# NOTE(review): the vocabulary and IDF weights are fitted on every balanced
# review, including the ones a later cell holds out as the test set.
vectors = vectorizer.fit_transform(balanced_x)



  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


CPU times: user 3min 34s, sys: 3.09 s, total: 3min 37s
Wall time: 3min 37s


In [20]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the balanced samples for evaluation; the fixed seed keeps
# the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    vectors,
    balanced_y,
    test_size=0.33,
    random_state=42,
)

In [21]:
%%time
from sklearn.svm import LinearSVC

# initialise the SVM classifier; the solver is seeded (same seed as the
# train/test split) so that re-running this cell trains the same model
classifier = LinearSVC(random_state=42)

# train the classifier
classifier.fit(X_train, y_train)


CPU times: user 1min 16s, sys: 336 ms, total: 1min 16s
Wall time: 1min 16s


In [24]:
# Predict a star rating for every held-out review.
preds = classifier.predict(X_test)
# Eyeball the first 20 predictions (first printed line) against the
# corresponding true ratings (second printed line).
print(list(preds[:20]))
print(y_test[:20])
 


['2', '2', '1', '2', '5', '1', '4', '2', '2', '3', '2', '4', '3', '4', '5', '1', '4', '4', '3', '3']
['2', '2', '1', '2', '4', '1', '3', '2', '4', '2', '1', '5', '3', '4', '4', '2', '5', '4', '3', '3']


In [25]:
# Show two of the loaded review texts (indices 0 and 8), one per line.
print(texts[0], texts[8], sep='\n')

Who would have guess that you would be able to get fairly decent Vietnamese restaurant in East York?   Not quite the same as Chinatown in terms of pricing (slightly higher) but definitely one of the better Vietnamese restaurants outside of the neighbourhood. When I don't have time to go to Chinatown, this is the next best thing as it is down the street from me.  So far the only items I have tried are the phos (beef, chicken & vegetarian) - and they have not disappointed me! Especially the chicken pho.  Next time I go back, I'm going to try the banh cuon (steamed rice noodle) and the vermicelli!
Came here for a burger as one of my friends said they had good burgers and people from work would come by twice a month for a burger run.  Ordered the original was was ok, wasn't as amazing as everyone says it to be. I think the veggie option would have been better.  A little pricy for a take-out burger.


In [26]:
from sklearn.metrics import accuracy_score

# Share of held-out reviews whose predicted rating matches the true rating.
print(accuracy_score(y_true=y_test, y_pred=preds))

0.5919286057560696


In [27]:
from sklearn.metrics import classification_report

# Per-rating precision, recall, F1 and support on the held-out reviews.
print(classification_report(y_true=y_test, y_pred=preds))


             precision    recall  f1-score   support

          1       0.69      0.77      0.73     32656
          2       0.54      0.49      0.51     33054
          3       0.52      0.50      0.51     32829
          4       0.51      0.47      0.49     32732
          5       0.66      0.73      0.69     33112

avg / total       0.59      0.59      0.59    164383



In [29]:
from sklearn.metrics import confusion_matrix

# Rows are the true ratings and columns the predicted ratings, both in
# sorted label order.
print(confusion_matrix(y_true=y_test, y_pred=preds))
 


[[25212  5966  1029   229   220]
 [ 8665 16033  6920  1071   365]
 [ 1761  6516 16574  6360  1618]
 [  364   940  5903 15430 10095]
 [  289   245  1173  7351 24054]]
