# Word2Vec (W2V) average + Random Forest
### Data is based on [Kaggle Yelp Review](https://www.kaggle.com/c/yelp-recruiting/data)

# 1. Yelp data

In [1]:
import re
import json
import pandas as pd

## Preprocessing data 

In [2]:
# Compiled pattern that matches runs of word characters; used to split review text into tokens.
words_pat = re.compile(r'\w+', flags=re.IGNORECASE)

In [3]:
def get_words(text):
    """Return all non-overlapping matches of the module-level `words_pat` in `text`, in order."""
    tokens = words_pat.findall(text)
    return tokens

In [4]:
def preprocess(f_dir):
    """Load a JSON-lines review file into a DataFrame of star ratings and tokens.

    Parameters
    ----------
    f_dir : str
        Path of a text file holding one JSON object per line; each object
        must provide the keys 'text' and 'stars'.

    Returns
    -------
    pandas.DataFrame
        One row per review with columns 'stars' (the value of the 'stars'
        key) and 'txt' (the lower-cased text tokenized by `get_words`).
    """
    data = []
    # `with` guarantees the file handle is closed, even if a line fails to parse.
    with open(f_dir, 'r') as review_file:
        for line in review_file:
            if not line.strip():
                # Skip blank lines (e.g. a trailing newline) rather than
                # letting json.loads raise on them.
                continue
            d = json.loads(line)
            txt = get_words(d['text'].lower())
            data.append([d['stars'], txt])
    return pd.DataFrame(data=data, columns=['stars', 'txt'])

In [5]:
%time train_df = preprocess('data/yelp_review_small_set/yelp_training_set/yelp_training_set_review.json')

CPU times: user 26.2 s, sys: 2.39 s, total: 28.6 s
Wall time: 28.6 s


In [6]:
# Preview only the first rows instead of rendering the whole frame.
train_df.head(10)

Unnamed: 0,stars,txt
0,5,"[my, wife, took, me, here, on, my, birthday, f..."
1,5,"[i, have, no, idea, why, some, people, give, b..."
2,4,"[love, the, gyro, plate, rice, is, so, good, a..."
3,5,"[rosie, dakota, and, i, love, chaparral, dog, ..."
4,5,"[general, manager, scott, petello, is, a, good..."
5,4,"[quiessence, is, simply, put, beautiful, full,..."
6,5,"[drop, what, you, re, doing, and, drive, here,..."
7,4,"[luckily, i, didn, t, have, to, travel, far, t..."
8,4,"[definitely, come, for, happy, hour, prices, a..."
9,5,"[nobuo, shows, his, unique, talents, with, eve..."


In [7]:
%time test_df = preprocess('data/yelp_review_small_set/yelp_test_set/yelp_test_set_review.json')

CPU times: user 2 s, sys: 117 ms, total: 2.12 s
Wall time: 2.12 s


In [8]:
# Preview only the first rows instead of rendering the whole frame.
test_df.head(10)

Unnamed: 0,stars,txt
0,5,"[nice, place, big, patio, now, offering, live,..."
1,5,"[friendly, staff, make, sure, you, order, the,..."
2,5,"[love, love, love, this, place, for, breakfast..."
3,1,"[disgusting, sandwich, i, should, have, known,..."
4,4,"[always, a, fan, of, cafe, zupas, and, their, ..."
5,5,"[when, i, first, get, there, i, check, the, lo..."
6,4,"[great, salsa, especially, if, you, mix, the, ..."
7,4,"[aj, s, unsweeted, tea, selection, is, amazing..."
8,4,"[i, stop, in, here, from, time, to, time, with..."
9,2,"[ugh, i, want, to, love, this, place, like, so..."


### Load W2V model

In [12]:
from gensim.models import Word2Vec
%time w2v_model = Word2Vec.load_word2vec_format("/home/ruoxu/opt/word2vec/training_result/GoogleNews-vectors-negative300.bin",binary=True)

CPU times: user 2min 11s, sys: 12.4 s, total: 2min 23s
Wall time: 2min 24s


In [23]:
def sent_vec(words, model):
    """Average the vectors of the words that `model` knows.

    Parameters
    ----------
    words : iterable
        Tokens to look up, one at a time, via ``model[w]``.
    model : mapping-like
        Object supporting ``model[w]``; a ``KeyError`` is taken to mean the
        word is unknown (true for dicts; assumed for the word2vec model).

    Returns
    -------
    numpy.ndarray or float
        Element-wise mean of the vectors that were found, or ``nan`` when
        none of the words could be looked up.
    """
    vecs = []
    for w in words:
        try:
            vecs.append(model[w])
        except KeyError:
            # Unknown word: skip it, but let any other error surface instead
            # of silently swallowing it.
            continue
    if not vecs:
        # Same value the empty case produced before, without NumPy's
        # "mean of empty slice" RuntimeWarning.
        return np.nan
    # axis=0 averages across words. Without it NumPy flattens the stacked
    # vectors and returns a single scalar instead of a sentence vector.
    return np.average(vecs, axis=0)

In [16]:
import numpy as np
from functools import partial

In [21]:
def sent_vec_mean(words, model, dim=300):
    """Mean vector of the words in `words` that are present in `model`.

    Parameters
    ----------
    words : iterable
        Tokens to look up; a token counts only if ``w in model`` is true.
    model : mapping-like
        Object supporting ``w in model`` and ``model[w]``, where each vector
        has length `dim`.
    dim : int, optional
        Length of the word vectors. Defaults to 300, the size that was
        previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(dim,)`` holding the mean of the found vectors. If
        no word is found, every entry is NaN (the value the 0/0 division
        used to produce), so callers that mask NaNs keep working.
    """
    # Running sum instead of a (len(words), dim) scratch matrix: same mean,
    # far less memory for long texts.
    total = np.zeros(dim)
    n_found = 0
    for w in words:
        if w in model:
            total += model[w]
            n_found += 1
    if n_found == 0:
        # Explicit NaN vector rather than relying on a 0/0 division warning.
        return np.full(dim, np.nan)
    return total / n_found

In [22]:
%time review_vecs = train_df.txt.apply(partial(sent_vec_mean, model = w2v_model))

CPU times: user 2min 10s, sys: 3.26 s, total: 2min 13s
Wall time: 2min 14s


In [24]:
%time review_vecs_ = train_df.txt.apply(partial(sent_vec, model = w2v_model))

CPU times: user 1min 27s, sys: 3.02 s, total: 1min 30s
Wall time: 1min 30s




In [29]:
# Sanity check: tally the distinct shapes of the per-review vectors.
vec_shapes = review_vecs.apply(lambda vec: vec.shape)
vec_shapes.value_counts()

(300,)    229907
dtype: int64

In [48]:
# Stack the per-review vectors into one 2-D feature matrix, one row per review.
# np.vstack replaces np.row_stack, which is a deprecated alias of it.
# The earlier Series-level fillna(0) is dropped: the entries of review_vecs
# are arrays, so it could not reach NaNs inside them; the mask below does that.
X_train = np.vstack(review_vecs.tolist())
# Rows that are NaN (reviews where no word was found in the model) become zeros.
X_train[np.isnan(X_train)] = 0
y_train = train_df.stars
# A formatted string prints the same text under Python 2 and Python 3.
print("%s %s" % (X_train.shape, y_train.shape))

(229907, 300) (229907,)


In [37]:
from sklearn.ensemble import RandomForestClassifier
# Fixed seed so the trained forest is reproducible from run to run.
RANDOM_SEED = 42
# 100 trees; n_jobs=-1 asks scikit-learn to use all available cores.
classifier = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=RANDOM_SEED)

In [None]:
%time classifier.fit(X_train,y_train)