# Deepir implementation
### The code is based on [deepir](https://github.com/TaddyLab/deepir) with slight modification
### Data is based on [Kaggle Yelp Review](https://www.kaggle.com/c/yelp-recruiting/data)

# 1. Yelp data

In [11]:
import re
import json
import pandas as pd

## Preprocessing data 

In [12]:
# all non alphanumeric
contractions = re.compile(r"'|-")
symbols = re.compile(r'(\W+)', re.U)
numeric = re.compile(r'(?<=\s)(\d+|\w\d+|\d+\w)(?=\s)', re.I)
swrd = re.compile(r'(?<=\s)(,|"|\(|\)|to|a|as|the|an|and|or|for|are|is)(?=\s)', re.I)
suffix = re.compile(r'(?<=\w)(s|ings*|ly|(?<=e)[sd]+)(?=\s)')
seps = re.compile(r'\s+')

In [13]:
# cleaner (order matters)
def clean(text): 
    text = u' ' +  text.lower() + u' '
    text = contractions.sub('', text)
    text = symbols.sub(r' \1 ', text)
    text = numeric.sub('000', text)
    text = swrd.sub(' ', text)
    #text = suffix.sub('', text)
    text = seps.sub(' ', text)
    return text

In [14]:
def preprocess(f_dir):
    data = []
    for line in open(f_dir,'r'):
        d = json.loads(line)
        txt = clean(d['text'])
        data.append([d['stars'],txt])
    return pd.DataFrame(data=data,columns=['stars','txt'])

In [16]:
%time train_df = preprocess('data/yelp_review_small_set/yelp_training_set/yelp_training_set_review.json')

CPU times: user 2min 59s, sys: 2.13 s, total: 3min 1s
Wall time: 3min 1s


In [17]:
train_df

Unnamed: 0,stars,txt
0,5,my wife took me here on my birthday breakfast...
1,5,i have no idea why some people give bad revie...
2,4,love gyro plate . rice so good i also dig the...
3,5,rosie dakota i love chaparral dog park !!! it...
4,5,general manager scott petello good egg !!! no...
5,4,quiessence simply put beautiful . full window...
6,5,drop what youre doing drive here . after i at...
7,4,luckily i didnt have travel far make my conne...
8,4,definitely come happy hour ! prices amazing s...
9,5,nobuo shows his unique talents with everythin...


In [18]:
%time test_df = preprocess('data/yelp_review_small_set/yelp_test_set/yelp_test_set_review.json')

CPU times: user 15.8 s, sys: 218 ms, total: 16 s
Wall time: 16 s


In [19]:
test_df

Unnamed: 0,stars,txt
0,5,nice place big patio . now offering live sket...
1,5,friendly staff . make sure you order gyro pla...
2,5,love love love this place breakfast . they al...
3,1,disgusting sandwich . i should have known bet...
4,4,always fan of cafe zupas their very friendly ...
5,5,when i first get there i check lot see deals ...
6,4,great salsa especially if you mix red green t...
7,4,ajs unsweeted tea selection amazing ! with so...
8,4,i stop in here from time time with friend who...
9,2,ugh . i want love this place like some of my ...


## Re-write of deepir

In [20]:
import numpy as np
from gensim.models import Word2Vec
from gensim.models import Phrases
from copy import deepcopy

import warnings
warnings.filterwarnings("ignore")

Couldn't import dot_parser, loading of dot files will not be possible.


Using gpu device 0: Quadro K2000 (CNMeM is disabled)


In [21]:
# define a review generator
alteos = re.compile(r'( [!\?] )')

def revsplit(l):
    l = alteos.sub(r' \1 . ', l).rstrip("( \. )*\n")
    return [s.split() for s in l.split(" . ")]

def YelpReviews(df):
    data = []
    for star,txt in df.values:
        data.append([star,revsplit(txt)])
    return pd.DataFrame(data=data,columns=['stars','txt'])

In [22]:
%time preprocessed_train_df = YelpReviews(train_df)

CPU times: user 15.3 s, sys: 1.54 s, total: 16.9 s
Wall time: 16.8 s


In [23]:
all_sentences = [s for r in preprocessed_train_df.txt for s in r]
len(all_sentences)

2027394

In [28]:
jointmodel = Word2Vec(workers=-1)
np.random.shuffle(all_sentences)
jointmodel.build_vocab(all_sentences)

In [24]:
def trainW2V(g, T=25):
    sent = [l for r in reviews[g] for l in r]
    model[g].min_alpha = model[g].alpha
    for epoch in range(T):
        print epoch, " "
        np.random.shuffle(sent)
        model[g].train(sent)
        model[g].alpha *= 0.9  
        model[g].min_alpha = model[g].alpha  
    print(".")

In [25]:
def getprobs(rev, grp,by = 'average'):
    if by == 'average':
        sentences =  [(i,s) for i,r in enumerate(rev) for s in r]
        eta = pd.DataFrame(
                { g: model[g].score([s for i,s in sentences])  
                 for g in grp } )
        probs = eta.subtract( eta.max('columns'), 'rows') 
        probs = np.exp( probs )
        probs = probs.divide(probs.sum('columns'), "rows")
        probs['cnt'] = 1
        probs = probs.groupby([i for i,s in sentences]).sum()
        probs = probs.divide(probs["cnt"], 'rows').drop("cnt", 1)
    elif by == 'product':
#         sentences =  [(i,s) for i,r in enumerate(rev) for s in r]
#         eta = pd.DataFrame(
#                 { g: model[g].score([s for i,s in sentences])  
#                  for g in grp } )
#         probs = eta.groupby([i for i,s in sentences]).sum()
#         probs = probs.subtract( probs.max('columns'), 'rows')
#         probs = np.exp( probs )
#         probs = probs.divide(probs.sum('columns'), "rows")
        sentences =  [(i,s) for i,r in enumerate(rev) for s in r]
        eta = pd.DataFrame(
                { g: model[g].score([s for i,s in sentences])  
                 for g in grp } )
        probs = eta.groupby([i for i,s in sentences]).sum()
        probs = probs.subtract( probs.min('columns'), 'rows')
        #probs = np.exp( probs )
        probs = probs.divide(probs.sum('columns'), "rows")
    return(probs)

### classification by log product probability-- task c

In [26]:
docgrp_fine = {str(i) : i for i in range(1,6)} 
docgrp_fine

{'1': 1, '2': 2, '3': 3, '4': 4, '5': 5}

In [29]:
model = {}
reviews = {}
model = { g: deepcopy(jointmodel) for g in docgrp_fine }
for g in docgrp_fine:
    print g, ":"
    reviews[g] = preprocessed_train_df[preprocessed_train_df['stars']==docgrp_fine[g]].txt
    model[g] = deepcopy(jointmodel)
    trainW2V( g,T=25 )

1 :
0



  
1



  
2



  
3



  
4



  
5



  
6



  
7



  
8



  
9



  
10



  
11



  
12



  
13



  
14



  
15



  
16



  
17



  
18



  
19



  
20



  
21



  
22



  
23



  
24



  
.
3 :
0



  
1



  
2



  
3



  
4



  
5



  
6



  
7



  
8



  
9



  
10



  
11



  
12



  
13



  
14



  
15



  
16



  
17



  
18



  
19



  
20



  
21



  
22



  
23



  
24



  
.
2 :
0



  
1



  
2



  
3



  
4



  
5



  
6



  
7



  
8



  
9



  
10



  
11



  
12



  
13



  
14



  
15



  
16



  
17



  
18



  
19



  
20



  
21



  
22



  
23



  
24



  
.
5 :
0



  
1



  
2



  
3



  
4



  
5



  
6



  
7



  
8



  
9



  
10



  
11



  
12



  
13



  
14



  
15



  
16



  
17



  
18



  
19



  
20



  
21



  
22



  
23



  
24



  
.
4 :
0



  
1



  
2



  
3



  
4



  
5



  
6



  
7



  
8



  
9



  
10



  
11



  
12



  
13



  
14



  
15



  
16



  
17



  
18



  
19



  
20



  
21



  
22



  
23



  
24



  
.


In [30]:
## by average 25 iterations
%time preprocessed_test_df = YelpReviews(test_df)
testrev = {}
probs = {}
yhat = {}
for g in docgrp_fine:
    testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars']==docgrp_fine[g]].txt.values
    probs[g] = getprobs(testrev[g], docgrp_fine)
    yhat[g] = probs[g].idxmax("columns")

mc_fine = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_fine},
    'n': {g: len(testrev[g]) for g in docgrp_fine}
    })
print(mc_fine)

ntest = mc_fine['n'].sum()
overall_fine = mc_fine.product("columns").sum()/ntest
print("\nOverall Fine-Scale MCR: %.3f" %overall_fine)

CPU times: user 1.19 s, sys: 192 ms, total: 1.38 s
Wall time: 1.29 s
        mcr     n
1  0.253782  2380
2  0.656082  2047
3  0.640576  2849
4  0.550923  6883
5  0.264408  8797

Overall Fine-Scale MCR: 0.431


In [31]:
## by product 25 iterations
for g in docgrp_fine:
    testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars']==docgrp_fine[g]].txt.values
    probs[g] = getprobs(testrev[g], docgrp_fine, by='product')
    yhat[g] = probs[g].idxmax("columns")

mc_fine = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_fine},
    'n': {g: len(testrev[g]) for g in docgrp_fine}
    })
print(mc_fine)

ntest = mc_fine['n'].sum()
overall_fine = mc_fine.product("columns").sum()/ntest
print("\nOverall Fine-Scale MCR: %.3f" %overall_fine)

        mcr     n
1  0.301681  2380
2  0.606742  2047
3  0.595648  2849
4  0.462880  6883
5  0.316585  8797

Overall Fine-Scale MCR: 0.419


In [43]:
groups = ['1','2','3','4','5']
prediction_a = getprobs(preprocessed_test_df.txt.values,groups)
prediction_a['label'] = pd.Series(preprocessed_test_df.stars.values,index=prediction_a.index)
prediction_a['prediction'] = pd.Series(prediction_a[groups].idxmax('columns').values,index=prediction_a.index).astype(np.float64)
prediction_a['txt'] = pd.Series(preprocessed_test_df.txt.values,index=prediction_a.index)


prediction_p = getprobs(preprocessed_test_df.txt.values,groups,by='product')
prediction_p['label'] = pd.Series(preprocessed_test_df.stars.values,index=prediction_p.index)
prediction_p['prediction'] = pd.Series(prediction_p[groups].idxmax('columns').values,index=prediction_p.index).astype(np.float64)
prediction_p['txt'] = pd.Series(preprocessed_test_df.txt.values,index=prediction_p.index)

In [44]:
from sklearn.metrics import confusion_matrix
confusion_matrix(prediction_a.prediction.values,prediction_a.label.values)

array([[1776,  636,  327,  297,  383],
       [ 386,  704,  385,  158,   72],
       [ 104,  479, 1024,  841,  200],
       [  43,  137,  782, 3091, 1671],
       [  71,   91,  331, 2496, 6471]])

In [45]:
confusion_matrix(prediction_p.prediction.values.astype(np.int),prediction_p.label.values)

array([[   0,    9,    4,    3,    4,    9],
       [   0, 1662,  458,  227,  196,  321],
       [   0,  497,  805,  408,  152,   60],
       [   0,  119,  573, 1152,  901,  183],
       [   0,   45,  154,  831, 3697, 2212],
       [   0,   48,   53,  228, 1933, 6012]])