In [3]:

import json
 
# Read the reviews from disk. The file is parsed as one JSON object per line,
# so each line is decoded on its own rather than loading the file as a single
# JSON document.
# Blank lines (e.g. a trailing empty line at the end of the file) are skipped,
# so an empty file yields an empty list instead of a JSON decode error.
with open("xaa.json", encoding="utf8") as f:
    reviews = [json.loads(line) for line in f if line.strip()]

# Parallel lists: texts[i] and stars[i] both come from reviews[i].
texts = [review['text'] for review in reviews]
stars = [review['stars'] for review in reviews]

In [36]:
# Inspect one parsed review record to see which fields are available.
reviews[3]

{'business_id': 'uYHaNptLzDLoV_JZ_MuzUA',
 'cool': 0,
 'date': '2016-08-21',
 'funny': 0,
 'review_id': 'llmdwOgDReucVoWEry61Lw',
 'stars': 4,
 'text': "Location is everything and this hotel has it! The reception is inviting and open 24 hours. They are very helpful and have a lot of patience answering all my questions about where to go etc. there is also a lounge open 24 hours with snack-type food. Breakfast is continental-style so if you want heartier fare look elsewhere though you don't have to go far. The bus and train stations are right across the street so it's easy access to the airport or anywhere else you may want to go. Turn uphill to old town or cross the bridge to new town. The room with a view i got was spacious and comfortable though it's a bit of a maze to find it-just follow the signs. The windows are double paned so the room is quiet plus i was on the 5th floor which helps. It's a bit pricey but still one of the best values i found!",
 'useful': 0,
 'user_id': 'oU2SSOms

In [13]:
from collections import Counter
 
def balance_classes(xs, ys):
    """Downsample ``xs``/``ys`` so that every class is equally represented.

    The size of the rarest class is used as the cap for all classes. For each
    label, the first ``cap`` occurrences (in input order) are kept and any
    later occurrences are dropped; the relative order of the kept items is
    preserved.

    Args:
        xs: Indexable sequence of samples, aligned with ``ys``.
        ys: Sequence of hashable class labels; ``ys[i]`` is the label of ``xs[i]``.

    Returns:
        A ``(new_xs, new_ys)`` tuple of lists holding the kept samples and
        their labels. Both lists are empty when ``ys`` is empty.
    """
    freqs = Counter(ys)

    # Nothing to balance: without this guard, looking up the rarest class on
    # an empty Counter would fail.
    if not freqs:
        return [], []

    # the least common class is the maximum number we want for all classes
    max_allowable = min(freqs.values())

    # A Counter returns 0 for labels it has not seen yet, so no
    # pre-initialisation per class is needed.
    num_added = Counter()
    new_xs = []
    new_ys = []
    for i, y in enumerate(ys):
        if num_added[y] < max_allowable:
            new_xs.append(xs[i])
            new_ys.append(y)
            num_added[y] += 1
    return new_xs, new_ys

In [14]:
# Label distribution before balancing.
print(Counter(stars))
balanced_x, balanced_y = balance_classes(texts, stars)
# Label distribution after balancing: every class should now have the same count.
print(Counter(balanced_y))

Counter({5: 8416, 4: 5114, 3: 2499, 1: 2235, 2: 1736})
Counter({1: 1736, 2: 1736, 3: 1736, 4: 1736, 5: 1736})


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
 
# This vectorizer breaks text into single words and bi-grams
# (ngram_range=(1,2)) and then calculates the TF-IDF representation
vectorizer = TfidfVectorizer(ngram_range=(1,2))

 
# the 'fit' step builds up the vocabulary from all the reviews
# while the 'transform' step turns each individual text into
# a row of numbers; together the rows form the matrix `vectors`.
# NOTE(review): the vectorizer is fitted on all of balanced_x and `vectors`
# is only split into train/test afterwards, so the fitted vocabulary/weights
# also reflect the held-out reviews -- consider fitting on the training
# split only.
vectors = vectorizer.fit_transform(balanced_x)


In [20]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the balanced data for evaluation; the fixed random_state
# is meant to make the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(vectors, balanced_y, test_size=0.33, random_state=42)

In [21]:
from sklearn.svm import LinearSVC
 
# initialise the SVM classifier with its default settings
# NOTE(review): no random_state is passed here, so results may differ
# slightly between runs -- confirm whether that matters for this analysis.
classifier = LinearSVC()
 
# train the classifier on the training split only
classifier.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [22]:
# Predict labels for the held-out reviews, then print the first ten
# predictions (first line) next to the first ten true labels (second line)
# for a quick visual comparison.
preds = classifier.predict(X_test)
print(list(preds[:10]))
print(y_test[:10])

[4, 5, 4, 1, 3, 5, 3, 2, 3, 4]
[4, 1, 4, 1, 4, 2, 4, 2, 3, 3]


In [23]:
from sklearn.metrics import accuracy_score
# Overall accuracy of the predictions on the held-out split.
print(accuracy_score(y_test, preds))

0.592670157068


In [39]:
# Pre-processing step 1: convert every review text from upper case to lower case.
# This rebinds `texts` to a new list; objects built from the earlier `texts`
# (e.g. balanced_x and vectors) are not updated by this cell.
texts = [text.lower() for text in texts]

In [40]:
# Spot-check one review to confirm the text is lower-cased.
texts[1]

"if you need an inexpensive place to stay for a night or two then you may consider this place but for a longer stay i'd recommend somewhere with better amenities. \n\npros:\ngreat location- you're right by the train station, central location to get to old town and new town, and right by sight seeing his tours. food, bars, and shopping all within walking distance. location, location, location.\nvery clean and very good maid service\n\ncons:\ntiny rooms \nuncomfortable bed \nabsolutely no amenities \nno phone in room \nno wardrobe \n\nwas given a lot of attitude about me and my husband sharing a room which was quite strange and we were charged 15 pounds more for double occupancy not sure why that matters i felt like it was a money grab. it was just handled in a kind of odd manner to me... \n\nif you book this hotel all you get is a bed, desk, and a bathroom. it isn't awful but know what you're getting into."