In [1]:
from collections import Counter

import json

In [2]:
%%time
# Load the review dump. Every line of the file is parsed as its own JSON
# document, and the parsed records are collected in file order.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default (presumably the dump is UTF-8, as JSON
# interchange normally is -- confirm against the dataset documentation).
with open("yelp_academic_dataset_review.json", encoding="utf-8") as f:
    reviews = [json.loads(line) for line in f]

CPU times: user 39 s, sys: 4.73 s, total: 43.7 s
Wall time: 43.2 s


In [5]:
# Peek at the first parsed record to see which fields a review carries.
print(reviews[0])

{'cool': 0, 'useful': 0, 'funny': 0, 'review_id': 'NxL8SIC5yqOdnlXCg18IBg', 'business_id': '2aFiy99vNLklCx3T_tGS9A', 'stars': 5, 'user_id': 'KpkOkG6RIf4Ra25Lhhxf1A', 'text': "If you enjoy service by someone who is as competent as he is personable, I would recommend Corey Kaplan highly. The time he has spent here has been very productive and working with him educational and enjoyable. I hope not to need him again (though this is highly unlikely) but knowing he is there if I do is very nice. By the way, I'm not from El Centro, CA. but Scottsdale, AZ.", 'date': '2011-10-10', 'type': 'review'}


In [28]:
# Keep only reviews whose text contains more than 300 space characters
# (a cheap stand-in for a word count).
long_reviews = list(filter(lambda review: review['text'].count(" ") > 300, reviews))

In [35]:
# Tally how many long reviews each user_id wrote.
author_counts = Counter(review['user_id'] for review in long_reviews)

In [36]:
# Drop reviews by authors who have only a single long review: a pair needs
# at least two texts from the same author.
no_unique_authors = [
    review
    for review in long_reviews
    if author_counts[review['user_id']] >= 2
]

In [37]:
# Number of long reviews whose author wrote more than one long review.
len(no_unique_authors)

180085

In [38]:
# Keep exactly two reviews per author (the first two encountered).

In [39]:
# {author_id: [review1, review2]}
author_review = {}

for review in no_unique_authors:
    uid = review['user_id']
    kept = author_review.setdefault(uid, [])
    # Only the first two reviews seen for an author are retained;
    # any further reviews by the same author are ignored.
    if len(kept) < 2:
        kept.append(review)

In [40]:
# Sanity check: every retained author should have exactly two reviews.
counts = [len(kept_reviews) for kept_reviews in author_review.values()]
all(c == 2 for c in counts)

True

In [41]:
# Number of distinct authors kept (one dictionary key per user_id).
len(author_review)

30930

In [42]:
known_texts = []
unknown_texts = []

# Split each author's reviews down the middle: the first half goes to the
# "known" side and the second half to the "unknown" side. Both lists grow
# by the same amount per author, so they stay aligned index by index.
for author, author_reviews in author_review.items():
    total_texts = len(author_reviews)
    half_texts = total_texts // 2
    total_texts = 2 * half_texts  # remove last one if odd
    known_texts.extend(review['text'] for review in author_reviews[:half_texts])
    unknown_texts.extend(review['text'] for review in author_reviews[half_texts:total_texts])

In [43]:
# Spot-check the known-side text at position 245.
known_texts[245]

'It had been about a year and a half since I last saw her, but she came blazing back into my life on my holiday trip to Vegas. Without much debate, it was agreed that we would spend Christmas day out in the desert to celebrate our happy reunion. I felt the winter desert chill under my jacket as the sun began its daily journey below the horizon. And she, dressed in green, looked as beautiful as the last time I saw her. She had not changed one bit, still living life like it would burn out within the span of a half hour\'s time. Like always, she stormed back into my life and took over, burning my lips with her kiss and devastating my emotions with her words. In her presence, every fleeting thought escaped into a rabbit hole into which I felt compelled to pursue with singular obsession and undivided attention. Why was the desert so brown? Why was this road that we were on so long and windy? And what the hell did I do with my meager existence in the years 2002 and 2003? Time slowed and ever

In [44]:
# Spot-check the unknown-side text at the same position; the two lists are
# filled in lockstep per author, so this is by the same author as known_texts[245].
unknown_texts[245]

"Wow Hopper Hut.  Where do I begin?  The food here was awesome.  My sister and I visited Toronto recently and I had started a thread in the Toronto Talk page asking for suggestions on restaurants to try.  Kat F chimed in to say that she has a List Of Things For New Yorkers To Do In Toronto list.  Hahhah!  It's Yelp sarcasm.  Very funny.\n\nBut wait.  She wasn't kidding.  She really does have such a list.  And it's got 28 items on it.  At the time of this review, Kat F has 101 lists.  She is out of control with those lists.  If it is possible to have lists of lists, she'd probably have a few of those, too.\n\nAnd so it was, from this list I picked out Hopper Hut.  I have never had Sri Lankan food before, and neither has my aunt, two cousins, and my sister.  Altogether there were five of us ready to try Sri Lankan food for the first time.  We had no idea what to expect.  A few of us thought it would be like Indian food.  But there are different regional varieties of Indian food, I mainta

In [45]:
# Number of known-side texts available for pairing.
len(known_texts)

30930

In [46]:
# Use the first `total` aligned texts: the first half become same-author
# pairs, the second half become different-author pairs.
total = 30000
half = total // 2

# Same-author pairs: known and unknown text at the same index.
known_same, unknown_same = known_texts[:half], unknown_texts[:half]

# Different-author pairs start from the aligned slices ...
known_diff, unknown_diff = known_texts[half:total], unknown_texts[half:total]

# move unknown diffs up by one
# (rotating the unknown side by one position pairs every known text with
# the unknown text that originally sat at the next index)
unknown_diff = unknown_diff[1:] + [unknown_diff[0]]

knowns = known_same + known_diff
unknowns = unknown_same + unknown_diff

In [47]:
# Sanity check: total number of (known, unknown) pairs.
len(knowns)

30000

In [48]:
# The first half of the pairs is labelled 1 and the second half 0, matching
# the order in which the same-author and different-author pairs were joined.
n = len(knowns) // 2
labels = [1] * n + [0] * n

In [60]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion

# Two TF-IDF views of each text, both case-sensitive and both with min_df=5:
#   * character n-grams of length 2 to 4
#   * word unigrams and bigrams
char_tf = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 4),
    min_df=5,
    lowercase=False,
)
word_tf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    lowercase=False,
)

# The union joins the character and word feature blocks into one vectorizer.
vectorizer = FeatureUnion(transformer_list=[('char', char_tf), ('word', word_tf)])

# vectorizer = TfidfVectorizer(lowercase=False, min_df=5)

# NOTE(review): the vectorizer is fitted on every pair, including the ones
# that are later held out as the test split.
vectorizer.fit(knowns + unknowns)

CPU times: user 5min 8s, sys: 4.31 s, total: 5min 12s
Wall time: 5min 12s


In [61]:
%%time
# Vectorize both sides of every pair with the already-fitted vectorizer.
known_vecs = vectorizer.transform(knowns)
print(".")  # progress marker: the first transform has finished
unknown_vecs = vectorizer.transform(unknowns)

.
CPU times: user 6min 11s, sys: 8.11 s, total: 6min 19s
Wall time: 6min 20s


In [62]:
# Sanity check: the number of input texts on the known side.
len(knowns)

30000

In [63]:
# Shape of the transformed known side: (number of texts, number of features).
known_vecs.shape

(30000, 709706)

In [64]:
# Sanity check: there should be one label per pair.
len(labels)

30000

In [65]:
from random import Random, shuffle  # `shuffle` stays imported in case other cells use it

# Seed for the train/test shuffle. A dedicated, seeded generator makes the
# split (and every metric derived from it) identical across kernel restarts
# without touching the global `random` state.
SPLIT_SEED = 42

# Shuffle pair positions rather than the data itself, so the known vectors,
# unknown vectors and labels can all be reordered with the same index list.
indices = list(range(len(labels)))
Random(SPLIT_SEED).shuffle(indices)
indices[:10]

[13891, 17312, 21909, 14342, 14699, 8793, 14256, 17930, 23093, 1466]

In [66]:
# Sanity check: shuffling must not change the number of indices.
len(indices)

30000

In [67]:
import numpy as np
# Convert the label list to a NumPy array so it can be indexed with a list
# of positions when the train/test split is taken.
labels = np.array(labels)

In [68]:
# The first 20000 shuffled positions form the training set, the rest the test set.
train_indices, test_indices = indices[:20000], indices[20000:]

# Apply the same row selection to the known vectors, the unknown vectors
# and the labels so that the three stay aligned pair by pair.
known_train, known_test = known_vecs[train_indices, :], known_vecs[test_indices, :]
unknown_train, unknown_test = unknown_vecs[train_indices, :], unknown_vecs[test_indices, :]
train_labels, test_labels = labels[train_indices], labels[test_indices]

In [69]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

In [70]:
%%time
# Represent each (known, unknown) pair by the element-wise absolute
# difference of its two feature vectors; the classifier then learns which
# feature differences separate label-1 pairs from label-0 pairs.
train_pairs = np.abs(known_train - unknown_train)
test_pairs = np.abs(known_test - unknown_test)

# random_state pins any data shuffling done by the solver, so refitting on
# the same split yields the same model and the same report.
svm = LinearSVC(random_state=0)
svm.fit(train_pairs, train_labels)
preds = svm.predict(test_pairs)
print(classification_report(test_labels, preds))

# Cross-validated alternatives over all pairs, kept for reference:
# print(cross_val_score(LinearSVC(), np.abs(known_vecs - unknown_vecs), labels, cv=5))
# print(np.mean(cross_val_score(LinearSVC(), np.abs(known_vecs - unknown_vecs), labels, cv=3)))

             precision    recall  f1-score   support

          0       0.77      0.76      0.77      5011
          1       0.77      0.78      0.77      4989

avg / total       0.77      0.77      0.77     10000

CPU times: user 28.9 s, sys: 1.45 s, total: 30.3 s
Wall time: 30.3 s
