# Deepir implementation
### The code is based on [deepir](https://github.com/TaddyLab/deepir) with slight modification
### Data is based on [Tripadvisor Review](http://www.cs.cmu.edu/~jiweil/html/hotel-review.html)

# 1. Yelp data

In [1]:
import re
import json
import pandas as pd

## Preprocessing data 

In [2]:
# all non alphanumeric
contractions = re.compile(r"'|-")
symbols = re.compile(r'(\W+)', re.U)
numeric = re.compile(r'(?<=\s)(\d+|\w\d+|\d+\w)(?=\s)', re.I)
swrd = re.compile(r'(?<=\s)(,|"|\(|\)|to|a|as|the|an|and|or|for|are|is)(?=\s)', re.I)
suffix = re.compile(r'(?<=\w)(s|ings*|ly|(?<=e)[sd]+)(?=\s)')
seps = re.compile(r'\s+')

In [3]:
# cleaner (order matters)
def clean(text): 
    text = u' ' +  text.lower() + u' '
    text = contractions.sub('', text)
    text = symbols.sub(r' \1 ', text)
    text = numeric.sub('000', text)
    text = swrd.sub(' ', text)
    #text = suffix.sub('', text)
    text = seps.sub(' ', text)
    return text

In [4]:
def preprocess(f_dir):
    data = []
    for line in open(f_dir,'r'):
        d = json.loads(line)
        txt = clean(d['text'])
        data.append([d['ratings']['overall'],txt])
    return pd.DataFrame(data=data,columns=['stars','txt'])

In [5]:
%time tripadvisor_df = preprocess('data/tripadivisor_review/review.txt')

CPU times: user 7min 47s, sys: 1.48 s, total: 7min 48s
Wall time: 7min 48s


In [6]:
tripadvisor_df

Unnamed: 0,stars,txt
0,5,stayed in king suite 000 nights yes it cots u...
1,5,on every visit nyc hotel beacon place we love...
2,4,this great property in midtown . we two diffe...
3,4,andaz nice hotel in central location of manha...
4,4,i have stayed at each of us andaz properties ...
5,5,excellent staff they remembered our names fro...
6,5,i stayed at setai 000 nights last week my com...
7,5,my husband i stayed at chatwal 000 nights in ...
8,5,wonderful boutique hotel located next times s...
9,4,this hotel nice stay nyc because rooms very c...


In [8]:
from sklearn.cross_validation import train_test_split
train_id, test_id = train_test_split(range(tripadvisor_df.shape[0]),test_size=0.3)
train_df = tripadvisor_df.iloc[train_id]
test_df = tripadvisor_df.iloc[test_id]

In [10]:
train_df.shape,test_df.shape

((614992, 2), (263569, 2))

## Re-write of deepir

In [11]:
import numpy as np
from gensim.models import Word2Vec
from gensim.models import Phrases
from copy import deepcopy

import warnings
warnings.filterwarnings("ignore")

Couldn't import dot_parser, loading of dot files will not be possible.


In [12]:
# define a review generator
alteos = re.compile(r'( [!\?] )')

def revsplit(l):
    l = alteos.sub(r' \1 . ', l).rstrip("( \. )*\n")
    return [s.split() for s in l.split(" . ")]

def YelpReviews(df):
    data = []
    for star,txt in df.values:
        data.append([star,revsplit(txt)])
    return pd.DataFrame(data=data,columns=['stars','txt'])

In [13]:
%time preprocessed_train_df = YelpReviews(train_df)

CPU times: user 27.1 s, sys: 1.54 s, total: 28.7 s
Wall time: 28.5 s


In [14]:
all_sentences = [s for r in preprocessed_train_df.txt for s in r]
len(all_sentences)

5595595

In [15]:
jointmodel = Word2Vec(workers=8)
np.random.shuffle(all_sentences)
jointmodel.build_vocab(all_sentences)

In [17]:
def trainW2V(g, T=25):
    sent = [l for r in reviews[g] for l in r]
    model[g].min_alpha = model[g].alpha
    for epoch in range(T):
        print epoch, " "
        np.random.shuffle(sent)
        model[g].train(sent)
        model[g].alpha *= 0.9  
        model[g].min_alpha = model[g].alpha  
    print(".")

In [18]:
def getprobs(rev, grp,by = 'average'):
    if by == 'average':
        sentences =  [(i,s) for i,r in enumerate(rev) for s in r]
        eta = pd.DataFrame(
                { g: model[g].score([s for i,s in sentences])  
                 for g in grp } )
        probs = eta.subtract( eta.max('columns'), 'rows') 
        probs = np.exp( probs )
        probs = probs.divide(probs.sum('columns'), "rows")
        probs['cnt'] = 1
        probs = probs.groupby([i for i,s in sentences]).sum()
        probs = probs.divide(probs["cnt"], 'rows').drop("cnt", 1)
    elif by == 'product':
        sentences =  [(i,s) for i,r in enumerate(rev) for s in r]
        eta = pd.DataFrame(
                { g: model[g].score([s for i,s in sentences])  
                 for g in grp } )
        probs = eta.groupby([i for i,s in sentences]).sum()
        probs = probs.subtract( probs.max('columns'), 'rows')
        probs = np.exp( probs )
        probs = probs.divide(probs.sum('columns'), "rows")
    return(probs)

In [19]:
train_df.stars.unique()

array([ 4.,  1.,  2.,  5.,  3.,  0.])

### classification by log product probability-- task c

In [20]:
docgrp_fine = {str(i) : i for i in range(1,6)} 
docgrp_fine

{'1': 1, '2': 2, '3': 3, '4': 4, '5': 5}

In [None]:
model = {}
reviews = {}
model = { g: deepcopy(jointmodel) for g in docgrp_fine }
for g in docgrp_fine:
    print g, ":"
    reviews[g] = preprocessed_train_df[preprocessed_train_df['stars']==docgrp_fine[g]].txt
    model[g] = deepcopy(jointmodel)
    trainW2V( g,T=20 )

1 :
0



  
1



  
2



  
3



  
4



  
5



  
6



  
7



  
8



  
9



  
10



  
11



  
12



  
13



  
14



  
15



  
16



  
17



  
18



  
19



  
.
3 :
0



  
1



  
2



  
3



  
4



  
5



  
6



  
7



  
8



  
9



  
10



  
11



  
12



  
13



  
14



  
15



  
16



  
17



  
18



  
19

In [None]:
## by average 25 iterations
%time preprocessed_test_df = YelpReviews(test_df)
testrev = {}
probs = {}
yhat = {}
for g in docgrp_fine:
    testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars']==docgrp_fine[g]].txt.values
    probs[g] = getprobs(testrev[g], docgrp_fine)
    yhat[g] = probs[g].idxmax("columns")

mc_fine = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_fine},
    'n': {g: len(testrev[g]) for g in docgrp_fine}
    })
print(mc_fine)

ntest = mc_fine['n'].sum()
overall_fine = mc_fine.product("columns").sum()/ntest
print("\nOverall Fine-Scale MCR: %.3f" %overall_fine)

In [None]:
## by product 25 iterations
for g in docgrp_fine:
    testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars']==docgrp_fine[g]].txt.values
    probs[g] = getprobs(testrev[g], docgrp_fine, by='product')
    yhat[g] = probs[g].idxmax("columns")

mc_fine = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_fine},
    'n': {g: len(testrev[g]) for g in docgrp_fine}
    })
print(mc_fine)

ntest = mc_fine['n'].sum()
overall_fine = mc_fine.product("columns").sum()/ntest
print("\nOverall Fine-Scale MCR: %.3f" %overall_fine)

In [None]:
### by combining two methods 25 iterations
for g in docgrp_fine:
    testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars']==docgrp_fine[g]].txt.values
    if g == '1' or g == '5':
        probs[g] = getprobs(testrev[g], docgrp_fine)
    else:
        probs[g] = getprobs(testrev[g], docgrp_fine, by='product')
    yhat[g] = probs[g].idxmax("columns")

mc_fine = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_fine},
    'n': {g: len(testrev[g]) for g in docgrp_fine}
    })
print(mc_fine)

ntest = mc_fine['n'].sum()
overall_fine = mc_fine.product("columns").sum()/ntest
print("\nOverall Fine-Scale MCR: %.3f" %overall_fine)

## Task a

In [136]:
docgrp_a = {}
docgrp_a['neg'] = [1,2]
docgrp_a['pos'] = [3,4,5]
docgrp_a

{'neg': [1, 2], 'pos': [3, 4, 5]}

In [133]:
model = {}
for g in docgrp_a:
    print g, ":"
    reviews[g] = preprocessed_train_df[preprocessed_train_df['stars'].isin(docgrp_a[g])].txt
    model[g] = deepcopy(jointmodel)
    trainW2V( g )

neg :
0



  
1



  
2



  
3



  
4



  
5



  
6



  
7



  
8



  
9



  
10



  
11



  
12



  
13



  
14



  
15



  
16



  
17



  
18



  
19



  
.
pos :
0



  
1



  
2



  
3



  
4



  
5



  
6



  
7



  
8



  
9



  
10



  
11



  
12



  
13



  
14



  
15



  
16



  
17



  
18



  
19



  
.


In [138]:
## results by average
for g in docgrp_a:
    testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars'].isin(docgrp_a[g])].txt.values
    probs[g] = getprobs(testrev[g], docgrp_a)
    yhat[g] = probs[g].idxmax("columns")

mc_a = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_a},
    'n': {g: len(testrev[g]) for g in docgrp_a}
    })
print(mc_a)

ntest = mc_a['n'].sum()
overall_a = mc_a.product("columns").sum()/ntest
print("\nOverall Task A MCR: %.3f" %overall_a)

          mcr      n
neg  0.165575   4427
pos  0.096659  18529

Overall Task A MCR: 0.110


In [139]:
## results by product
for g in docgrp_a:
    testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars'].isin(docgrp_a[g])].txt.values
    probs[g] = getprobs(testrev[g], docgrp_a, by='product')
    yhat[g] = probs[g].idxmax("columns")

mc_a = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_a},
    'n': {g: len(testrev[g]) for g in docgrp_a}
    })
print(mc_a)

ntest = mc_a['n'].sum()
overall_a = mc_a.product("columns").sum()/ntest
print("\nOverall Task A MCR: %.3f" %overall_a)

          mcr      n
neg  0.162187   4427
pos  0.102326  18529

Overall Task A MCR: 0.114


# Task b

In [68]:
docgrp_b = {}
docgrp_b['neg'] = [1,2]
docgrp_b['neu'] = [3]
docgrp_b['pos'] = [4,5]
docgrp_b

{'neg': [1, 2], 'neu': [3], 'pos': [4, 5]}

In [None]:
model = {}
for g in docgrp_b:
    print g, ":"
    reviews[g] = preprocessed_train_df[preprocessed_train_df['stars'].isin(docgrp_b[g])].txt
    model[g] = deepcopy(jointmodel)
    trainW2V( g,T=25 )

neg :
0



  
1



  
2



  
3



  
4

In [None]:
## by average
for g in docgrp_b:
    testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars'].isin(docgrp_b[g])].txt.values
    probs[g] = getprobs(testrev[g], docgrp_b)
    yhat[g] = probs[g].idxmax("columns")

mc_b = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_b},
    'n': {g: len(testrev[g]) for g in docgrp_b}
    })
print(mc_b)

ntest = mc_b['n'].sum()
overall_b = mc_b.product("columns").sum()/ntest
print("\nOverall Taks b MCR: %.3f" %overall_b)

In [None]:
# by product
for g in docgrp_b:
    testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars'].isin(docgrp_b[g])].txt.values
    probs[g] = getprobs(testrev[g], docgrp_b,by='product')
    yhat[g] = probs[g].idxmax("columns")

mc_b = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_b},
    'n': {g: len(testrev[g]) for g in docgrp_b}
    })
print(mc_b)

ntest = mc_b['n'].sum()
overall_b = mc_b.product("columns").sum()/ntest
print("\nOverall Taks b MCR: %.3f" %overall_b)

In [None]:
### by combining two methods 25 iterations
for g in docgrp_b:
    testrev[g] =  preprocessed_test_df[preprocessed_test_df['stars'].isin(docgrp_b[g])].txt.values
    if g == 'neg' or g == 'pos':
        probs[g] = getprobs(testrev[g], docgrp_b)
    else:
        probs[g] = getprobs(testrev[g], docgrp_b, by='product')
    yhat[g] = probs[g].idxmax("columns")

mc_b = pd.DataFrame({
    'mcr': {g: (yhat[g] != g).mean() for g in docgrp_b},
    'n': {g: len(testrev[g]) for g in docgrp_b}
    })
print(mc_b)

ntest = mc_b['n'].sum()
overall_b = mc_b.product("columns").sum()/ntest
print("\nOverall Fine-Scale MCR: %.3f" %overall_b)