# Deepir implementation
### The code is based on [deepir](https://github.com/TaddyLab/deepir), with slight modifications
### The data comes from the [TripAdvisor reviews](http://www.cs.cmu.edu/~jiweil/html/hotel-review.html) dataset

In [27]:
import re
import json
import pandas as pd

## Preprocessing data 

In [28]:
# Pre-compiled patterns used by clean().
# Apostrophes and hyphens.
contractions = re.compile(r"'|-")
# Runs of non-word characters, captured so the match can be re-emitted with padding.
symbols = re.compile(r'(\W+)', re.U)
# Whitespace-delimited tokens that are all digits, or digits with a single
# leading or trailing word character.
numeric = re.compile(r'(?<=\s)(\d+|\w\d+|\d+\w)(?=\s)', re.I)
# Whitespace-delimited stop words plus stray commas, double quotes and parentheses.
swrd = re.compile(r'(?<=\s)(,|"|\(|\)|to|a|as|the|an|and|or|for|are|is)(?=\s)', re.I)
# Word endings (s, ing/ings, ly, es/ed) followed by whitespace.
# NOTE: the only use of this pattern in clean() is commented out.
suffix = re.compile(r'(?<=\w)(s|ings*|ly|(?<=e)[sd]+)(?=\s)')
# Runs of whitespace.
seps = re.compile(r'\s+')

In [29]:
# cleaner (order matters)
def clean(text):
    """Normalise one raw review string.

    The text is lower-cased, padded with a space on both sides (so the
    whitespace look-arounds in the patterns also match at the edges) and then
    passed through the module-level patterns in a fixed order.

    Args:
        text: raw review text.

    Returns:
        The cleaned text, with every whitespace run collapsed to one space.
    """
    padded = u' ' + text.lower() + u' '
    # Order matters: symbols must be space-padded before the token-level
    # patterns run, and whitespace is collapsed last.
    steps = (
        (contractions, ''),
        (symbols, r' \1 '),
        (numeric, '000'),
        (swrd, ' '),
        # (suffix, '') is intentionally left disabled.
        (seps, ' '),
    )
    for pattern, replacement in steps:
        padded = pattern.sub(replacement, padded)
    return padded

In [30]:
def preprocess(f_dir):
    """Read a JSON-lines review file and return the cleaned reviews.

    Each non-blank line must be a JSON object with a 'text' field and a
    'ratings' object containing 'overall'.

    Args:
        f_dir: path of the JSON-lines file.

    Returns:
        pandas.DataFrame with columns 'stars' (the overall rating) and 'txt'
        (the review text after clean()).

    Raises:
        ValueError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks 'text' or 'ratings'/'overall'.
    """
    rows = []
    # The context manager guarantees the handle is closed even if a record
    # fails to parse half-way through the file.
    with open(f_dir, 'r') as handle:
        for line in handle:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            record = json.loads(line)
            txt = clean(record['text'])
            rows.append([record['ratings']['overall'], txt])
    return pd.DataFrame(data=rows, columns=['stars', 'txt'])

In [31]:
# Load and clean the full review dump.
# NOTE: the path is absolute and specific to the original author's machine.
%time tripadvisor_df = preprocess('/home/ruoxu/workspace/data/public/tripadvisor_review/review.txt')

CPU times: user 11min 53s, sys: 6.15 s, total: 11min 59s
Wall time: 12min


In [32]:
# Preview the cleaned reviews (columns: stars, txt).
tripadvisor_df

Unnamed: 0,stars,txt
0,5,stayed in king suite 000 nights yes it cots u...
1,5,on every visit nyc hotel beacon place we love...
2,4,this great property in midtown . we two diffe...
3,4,andaz nice hotel in central location of manha...
4,4,i have stayed at each of us andaz properties ...
5,5,excellent staff they remembered our names fro...
6,5,i stayed at setai 000 nights last week my com...
7,5,my husband i stayed at chatwal 000 nights in ...
8,5,wonderful boutique hotel located next times s...
9,4,this hotel nice stay nyc because rooms very c...


In [33]:
# Hold out 30% of the reviews for testing.
# `sklearn.cross_validation` was deprecated and later removed from
# scikit-learn; prefer `sklearn.model_selection` and fall back to the old
# module only on installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Split positional row ids rather than the frame itself so both halves can be
# taken with .iloc; a fixed seed makes the split reproducible across runs.
train_id, test_id = train_test_split(
    range(tripadvisor_df.shape[0]), test_size=0.3, random_state=0)
train_df = tripadvisor_df.iloc[train_id]
test_df = tripadvisor_df.iloc[test_id]

In [34]:
# (rows, columns) of the training and test splits.
train_df.shape,test_df.shape

((614992, 2), (263569, 2))

## Re-write of deepir

In [35]:
import numpy as np
from gensim.models import Word2Vec
from gensim.models import Phrases
from copy import deepcopy

import warnings
warnings.filterwarnings("ignore")

In [36]:
# define a review generator
# A '!' or '?' standing alone between spaces also ends a sentence.
alteos = re.compile(r'( [!\?] )')

def revsplit(l):
    """Split one cleaned review string into tokenised sentences.

    A ' . ' is inserted after every stand-alone '!' or '?', trailing
    spaces, periods and newlines are stripped, and the text is split on
    ' . '. Each sentence is then split on whitespace.

    Args:
        l: cleaned review text, with punctuation already space-separated.

    Returns:
        A list of sentences, each a list of tokens. An empty input yields
        [[]]; consecutive separators yield empty token lists.
    """
    # str.rstrip takes a *set of characters*, not a pattern. The previous
    # argument "( \. )*\n" therefore also stripped trailing '(', ')', '*' and
    # backslash tokens; only sentence terminators and whitespace are meant.
    l = alteos.sub(r' \1 . ', l).rstrip(" .\n")
    return [s.split() for s in l.split(" . ")]

def YelpReviews(df):
    """Tokenise every review of a two-column (stars, txt) frame.

    Args:
        df: DataFrame whose rows unpack to (stars, text).

    Returns:
        A new DataFrame with columns 'stars' and 'txt', where 'txt' holds the
        result of revsplit() on the original text.
    """
    rows = [[star, revsplit(txt)] for star, txt in df.values]
    return pd.DataFrame(data=rows, columns=['stars', 'txt'])

In [37]:
# Tokenise the training reviews into lists of sentences.
# NOTE: despite its name, YelpReviews is applied to the TripAdvisor split here.
%time preprocessed_train_df = YelpReviews(train_df)

CPU times: user 1min 1s, sys: 20.3 s, total: 1min 22s
Wall time: 13min 7s


In [38]:
# Flatten the per-review sentence lists into one list of tokenised sentences,
# then show how many sentences there are.
all_sentences = [s for r in preprocessed_train_df.txt for s in r]
len(all_sentences)

5596506

In [39]:
jointmodel = Word2Vec(workers=-1)
np.random.shuffle(all_sentences)
%time jointmodel.build_vocab(all_sentences)

CPU times: user 1min 13s, sys: 20.2 s, total: 1min 33s
Wall time: 9min 44s


In [40]:
def trainW2V(g, T=25):
    sent = [l for r in reviews[g] for l in r]
    model[g].min_alpha = model[g].alpha
    for epoch in range(T):
        print epoch, " "
        np.random.shuffle(sent)
        model[g].train(sent)
        model[g].alpha *= 0.9  
        model[g].min_alpha = model[g].alpha  
    print(".")

In [110]:
def getprobs(rev, grp, by='average'):
    """Return per-review group probabilities from the per-group models.

    Every sentence of every review is scored by `model[g].score` for each
    group `g` (presumably sentence log-likelihoods, as with gensim's
    Word2Vec.score - confirm), and the scores are turned into probabilities
    with a softmax across groups.

    Args:
        rev: sequence of reviews, each a list of tokenised sentences.
        grp: iterable of group keys; each must be a key of the global
            `model` dict.
        by: 'average' applies the softmax per sentence and averages the
            sentence probabilities within each review; 'product' sums the
            sentence scores within each review first and applies the softmax
            to the sums.

    Returns:
        pandas.DataFrame indexed by the review's position in `rev`, with one
        column per group. Reviews that contain no sentences do not appear.

    Raises:
        ValueError: if `by` is neither 'average' nor 'product'.
    """
    if by not in ('average', 'product'):
        # Previously this fell through to an UnboundLocalError on `probs`.
        raise ValueError("by must be 'average' or 'product', got %r" % (by,))

    # Flatten the reviews, remembering which review each sentence came from.
    review_ids = []
    flat_sentences = []
    for i, r in enumerate(rev):
        for s in r:
            review_ids.append(i)
            flat_sentences.append(s)

    eta = pd.DataFrame({g: model[g].score(flat_sentences) for g in grp})

    if by == 'product':
        # Adding scores per review corresponds to multiplying likelihoods.
        eta = eta.groupby(review_ids).sum()

    # Softmax across groups; subtracting the row maximum first keeps exp()
    # from underflowing on strongly negative scores.
    probs = eta.subtract(eta.max(axis='columns'), axis='rows')
    probs = np.exp(probs)
    probs = probs.divide(probs.sum(axis='columns'), axis='rows')

    if by == 'average':
        probs = probs.groupby(review_ids).mean()
    return probs

### Task c: classification by log product probability

In [61]:
# Fine-grained groups: one per star rating 0-5, keyed by the rating as a string.
docgrp_fine = dict((str(star), star) for star in range(6))
docgrp_fine

{'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5}

In [44]:
model = {}
reviews = {}
model = { g: deepcopy(jointmodel) for g in docgrp_fine }
for g in docgrp_fine:
    print g, ":"
    reviews[g] = preprocessed_train_df[preprocessed_train_df['stars']==docgrp_fine[g]].txt
    model[g] = deepcopy(jointmodel)
    trainW2V( g,T=25 )

1 :
0



  
1



  
2



  
3



  
4



  
5



  
6



  
7



  
8



  
9



  
10



  
11



  
12



  
13



  
14



  
15



  
16



  
17



  
18



  
19



  
20



  
21



  
22



  
23



  
24



  
.
0 :
0



  
1



  
2  
3  
4



  
5  
6  
7  
8



  
9  
10  
11



  
12  
13  
14



  
15  
16  
17  
18



  
19  
20  
21



  
22  
23  
24  
.
3 :
0



  
1



  
2



  
3



  
4



  
5



  
6



  
7



  
8



  
9



  
10



  
11



  
12



  
13



  
14



  
15



  
16



  
17



  
18



  
19



  
20



  
21



  
22



  
23



  
24



  
.
2 :
0



  
1



  
2



  
3



  
4



  
5



  
6



  
7



  
8



  
9



  
10



  
11



  
12



  
13



  
14



  
15



  
16



  
17



  
18



  
19



  
20



  
21



  
22



  
23



  
24



  
.
5 :
0



  
1



  
2



  
3



  
4



  
5



  
6



  
7



  
8



  
9



  
10



  
11



  
12



  
13



  
14



  
15



  
16



  
17



  
18



  
19



  
20



  
21



  
22



  
23



  
24



  
.
4 :
0



  
1



  
2



  
3



  
4



  
5



  
6



  
7



  
8



  
9



  
10



  
11



  
12



  
13



  
14



  
15



  
16



  
17



  
18



  
19



  
20



  
21



  
22



  
23



  
24



  
.


### Test the difference between the product and average aggregation methods

In [108]:
# Spot check of the 'average' aggregation (the getprobs default) on the first
# 20 five-star test reviews.
# NOTE(review): preprocessed_test_df is only assigned in the later
# "by average 25 iterations" cell, so that cell has to be run before this one.
testrev = {}
probs = {}
yhat = {}
g = '5'
testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars']==docgrp_fine[g]].txt.values[:20]
print testrev[g][6]
probs[g] = getprobs(testrev[g], docgrp_fine)
print probs
# Predicted group = column with the highest probability in each row.
yhat[g] = probs[g].idxmax("columns")
print yhat

[[u'i', u'was', u'tired', u'when', u'i', u'reached', u'this', u'hotel', u'after', u'arrival', u'on', u'victoria', u'clipper'], [u'checkin', u'was', u'easy', u'fast'], [u'receptionist', u'was', u'warm', u'friendly'], [u'she', u'told', u'me', u'about', u'dining', u'options', u'in', u'hotel', u'in', u'area', u'told', u'me', u'that', u'they', u'would', u'be', u'happy', u'help', u'me', u'during', u'my', u'stay', u'seemed', u'genuinely', u'mean', u'it'], [u'i', u'was', u'very', u'pleased', u'with', u'my', u'room', u'which', u'had', u'view', u'of', u'harbor'], [u'it', u'took', u'me', u'long', u'time', u'locate', u'switch', u'room', u'darkening', u'shades'], [u'they', u'electronic', u'book', u'in', u'room', u'simply', u'says', u'switch', u'on', u'wallbut', u'what', u'wall', u'?'], [u'switch', u'was', u'behind', u'lamp', u'on', u'bedside', u'table', u'with', u'radio', u'.........', u'after', u'leaving', u'my', u'bags', u'i', u'walked', u'downstairs', u'find', u'something', u'eat'], [u'there', u

In [111]:
# Same spot check on the first 20 five-star test reviews, this time with the
# 'product' aggregation.
# NOTE(review): preprocessed_test_df is only assigned in the later
# "by average 25 iterations" cell, so that cell has to be run before this one.
testrev = {}
probs = {}
yhat = {}
g ='5'
testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars']==docgrp_fine[g]].txt.values[:20]
print testrev[g][6]
probs[g] = getprobs(testrev[g], docgrp_fine, by='product')
print probs
# Predicted group = column with the highest probability in each row.
yhat[g] = probs[g].idxmax("columns")
print yhat

[[u'i', u'was', u'tired', u'when', u'i', u'reached', u'this', u'hotel', u'after', u'arrival', u'on', u'victoria', u'clipper'], [u'checkin', u'was', u'easy', u'fast'], [u'receptionist', u'was', u'warm', u'friendly'], [u'she', u'told', u'me', u'about', u'dining', u'options', u'in', u'hotel', u'in', u'area', u'told', u'me', u'that', u'they', u'would', u'be', u'happy', u'help', u'me', u'during', u'my', u'stay', u'seemed', u'genuinely', u'mean', u'it'], [u'i', u'was', u'very', u'pleased', u'with', u'my', u'room', u'which', u'had', u'view', u'of', u'harbor'], [u'it', u'took', u'me', u'long', u'time', u'locate', u'switch', u'room', u'darkening', u'shades'], [u'they', u'electronic', u'book', u'in', u'room', u'simply', u'says', u'switch', u'on', u'wallbut', u'what', u'wall', u'?'], [u'switch', u'was', u'behind', u'lamp', u'on', u'bedside', u'table', u'with', u'radio', u'.........', u'after', u'leaving', u'my', u'bags', u'i', u'walked', u'downstairs', u'find', u'something', u'eat'], [u'there', u

In [19]:
## by average 25 iterations
# Tokenise the held-out reviews, then classify them per true star rating with
# the default ('average') aggregation.
%time preprocessed_test_df = YelpReviews(test_df)
testrev = {}
probs = {}
yhat = {}
for g in docgrp_fine:
    testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars']==docgrp_fine[g]].txt.values
    probs[g] = getprobs(testrev[g], docgrp_fine)
    # Predicted group = column with the highest probability.
    yhat[g] = probs[g].idxmax("columns")

# Per-group misclassification rate (share of predictions that differ from the
# true group g) and number of test reviews in the group.
mc_fine = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_fine},
    'n': {g: len(testrev[g]) for g in docgrp_fine}
    })
print(mc_fine)

# Overall rate = per-group rates weighted by group size (mcr * n, summed, / total).
ntest = mc_fine['n'].sum()
overall_fine = mc_fine.product("columns").sum()/ntest
print("\nOverall Fine-Scale MCR: %.3f" %overall_fine)

CPU times: user 18.7 s, sys: 2.09 s, total: 20.8 s
Wall time: 20.6 s
        mcr       n
0  1.000000       4
1  0.248641   16003
2  0.718632   17980
3  0.648147   36717
4  0.534994   88186
5  0.235988  104679

Overall Fine-Scale MCR: 0.427


In [None]:
## by product 25 iterations
# Same evaluation as the 'average' run, using the 'product' aggregation.
# The testrev / probs / yhat dicts are not re-created here, so they must
# already exist from an earlier cell; their entries are overwritten per group.
for g in docgrp_fine:
    testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars']==docgrp_fine[g]].txt.values
    probs[g] = getprobs(testrev[g], docgrp_fine, by='product')
    # Predicted group = column with the highest probability.
    yhat[g] = probs[g].idxmax("columns")

# Per-group misclassification rate and group size.
mc_fine = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_fine},
    'n': {g: len(testrev[g]) for g in docgrp_fine}
    })
print(mc_fine)

# Overall rate = per-group rates weighted by group size.
ntest = mc_fine['n'].sum()
overall_fine = mc_fine.product("columns").sum()/ntest
print("\nOverall Fine-Scale MCR: %.3f" %overall_fine)

In [21]:
### by combining two methods 25 iterations
# Uses 'average' for the true groups '0', '1' and '5' and 'product' for the rest.
# NOTE(review): the aggregation method is selected from g, the *true* rating
# used to filter testrev[g]. That label is unknown at prediction time, so the
# rate printed here is not the error of a deployable classifier.
for g in docgrp_fine:
    testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars']==docgrp_fine[g]].txt.values
    if g == '1' or g == '5' or g =='0':
        probs[g] = getprobs(testrev[g], docgrp_fine)
    else:
        probs[g] = getprobs(testrev[g], docgrp_fine, by='product')
    # Predicted group = column with the highest probability.
    yhat[g] = probs[g].idxmax("columns")

# Per-group misclassification rate and group size.
mc_fine = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_fine},
    'n': {g: len(testrev[g]) for g in docgrp_fine}
    })
print(mc_fine)

# Overall rate = per-group rates weighted by group size.
ntest = mc_fine['n'].sum()
overall_fine = mc_fine.product("columns").sum()/ntest
print("\nOverall Fine-Scale MCR: %.3f" %overall_fine)

        mcr       n
0  1.000000       4
1  0.248641   16003
2  0.604394   17980
3  0.545388   36717
4  0.439843   88186
5  0.235988  104679

Overall Fine-Scale MCR: 0.373


## Task a

In [22]:
# Task a: binary sentiment groups, each listing the star ratings it covers.
docgrp_a = {
    'neg': [0, 1, 2],
    'pos': [3, 4, 5],
}
docgrp_a

{'neg': [0, 1, 2], 'pos': [3, 4, 5]}

In [23]:
# Train one language model per Task a group (the star ratings listed in docgrp_a).
# Rebinding `model` drops whatever models it held before; `reviews` is not
# reset and simply gains the new group keys.
model = {}
for g in docgrp_a:
    print g, ":"
    reviews[g] = preprocessed_train_df[preprocessed_train_df['stars'].isin(docgrp_a[g])].txt
    model[g] = deepcopy(jointmodel)
    trainW2V( g )

neg :
0



  
1



  
2



  
3



  
4



  
5



  
6



  
7



  
8



  
9



  
10



  
11



  
12



  
13



  
14



  
15



  
16



  
17



  
18



  
19



  
20



  
21



  
22



  
23



  
24



  
.
pos :
0



  
1



  
2



  
3



  
4



  
5



  
6



  
7



  
8



  
9



  
10



  
11



  
12



  
13



  
14



  
15



  
16



  
17



  
18



  
19



  
20



  
21



  
22



  
23



  
24



  
.


In [24]:
## results by average
# Classify the test reviews of each Task a group with the default
# ('average') aggregation.
# NOTE(review): the recorded run of this cell ended in `KeyError: 0`; the
# traceback was not kept, so the cause cannot be determined from here.
for g in docgrp_a:
    testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars'].isin(docgrp_a[g])].txt.values
    probs[g] = getprobs(testrev[g], docgrp_a)
    # Predicted group = column with the highest probability.
    yhat[g] = probs[g].idxmax("columns")

# Per-group misclassification rate and group size.
mc_a = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_a},
    'n': {g: len(testrev[g]) for g in docgrp_a}
    })
print(mc_a)

# Overall rate = per-group rates weighted by group size.
ntest = mc_a['n'].sum()
overall_a = mc_a.product("columns").sum()/ntest
print("\nOverall Task A MCR: %.3f" %overall_a)



KeyError: 0

In [None]:
## results by product
# Classify the test reviews of each Task a group with the 'product' aggregation.
for g in docgrp_a:
    testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars'].isin(docgrp_a[g])].txt.values
    probs[g] = getprobs(testrev[g], docgrp_a, by='product')
    # Predicted group = column with the highest probability.
    yhat[g] = probs[g].idxmax("columns")

# Per-group misclassification rate and group size.
mc_a = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_a},
    'n': {g: len(testrev[g]) for g in docgrp_a}
    })
print(mc_a)

# Overall rate = per-group rates weighted by group size.
ntest = mc_a['n'].sum()
overall_a = mc_a.product("columns").sum()/ntest
print("\nOverall Task A MCR: %.3f" %overall_a)

## Task b

In [None]:
# Task b: three sentiment groups, each listing the star ratings it covers.
docgrp_b = {
    'neg': [0, 1],
    'neu': [2, 3],
    'pos': [4, 5],
}
docgrp_b

In [None]:
# Train one language model per Task b group (the star ratings listed in docgrp_b).
# Rebinding `model` drops whatever models it held before; `reviews` is not
# reset and simply gains the new group keys.
model = {}
for g in docgrp_b:
    print g, ":"
    reviews[g] = preprocessed_train_df[preprocessed_train_df['stars'].isin(docgrp_b[g])].txt
    model[g] = deepcopy(jointmodel)
    trainW2V( g,T=25 )

In [None]:
## by average
# Classify the test reviews of each Task b group with the default
# ('average') aggregation.
for g in docgrp_b:
    testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars'].isin(docgrp_b[g])].txt.values
    probs[g] = getprobs(testrev[g], docgrp_b)
    # Predicted group = column with the highest probability.
    yhat[g] = probs[g].idxmax("columns")

# Per-group misclassification rate and group size.
mc_b = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_b},
    'n': {g: len(testrev[g]) for g in docgrp_b}
    })
print(mc_b)

# Overall rate = per-group rates weighted by group size.
ntest = mc_b['n'].sum()
overall_b = mc_b.product("columns").sum()/ntest
# Label typo fixed ("Taks" -> "Task").
print("\nOverall Task b MCR: %.3f" %overall_b)

In [None]:
# by product
# Classify the test reviews of each Task b group with the 'product' aggregation.
for g in docgrp_b:
    testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars'].isin(docgrp_b[g])].txt.values
    probs[g] = getprobs(testrev[g], docgrp_b,by='product')
    # Predicted group = column with the highest probability.
    yhat[g] = probs[g].idxmax("columns")

# Per-group misclassification rate and group size.
mc_b = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_b},
    'n': {g: len(testrev[g]) for g in docgrp_b}
    })
print(mc_b)

# Overall rate = per-group rates weighted by group size.
ntest = mc_b['n'].sum()
overall_b = mc_b.product("columns").sum()/ntest
# Label typo fixed ("Taks" -> "Task").
print("\nOverall Task b MCR: %.3f" %overall_b)

In [None]:
### by combining two methods 25 iterations
# Uses 'average' for the true groups 'neg' and 'pos' and 'product' for 'neu'.
# NOTE(review): the aggregation method is selected from g, the *true* group
# used to filter testrev[g]. That label is unknown at prediction time, so the
# rate printed here is not the error of a deployable classifier.
for g in docgrp_b:
    testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars'].isin(docgrp_b[g])].txt.values
    if g == 'neg' or g == 'pos':
        probs[g] = getprobs(testrev[g], docgrp_b)
    else:
        probs[g] = getprobs(testrev[g], docgrp_b, by='product')
    # Predicted group = column with the highest probability.
    yhat[g] = probs[g].idxmax("columns")

# Per-group misclassification rate and group size.
mc_b = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_b},
    'n': {g: len(testrev[g]) for g in docgrp_b}
    })
print(mc_b)

# Overall rate = per-group rates weighted by group size.
ntest = mc_b['n'].sum()
overall_b = mc_b.product("columns").sum()/ntest
# This is the Task b result; the label was copy-pasted from the fine-scale
# cells and wrongly read "Fine-Scale".
print("\nOverall Task b MCR: %.3f" %overall_b)