# Modeling Yelp Data

Paul Lim

## Libraries

In [None]:
# Main imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

# sklearn
from sklearn.model_selection import train_test_split, learning_curve, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import pipeline, feature_selection, decomposition
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.externals import joblib
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA

# NLP 
from nltk.corpus import stopwords
from textblob import TextBlob
import spacy

# Misc.
import re
import datetime
import time
import logging
import math

% matplotlib inline

sns.set_style("white")
sns.set_style('ticks')
sns.set_style({'xtick.direction': u'in', 'ytick.direction': u'in'})
sns.set_style({'legend.frameon': True})

## Running List of Functions/Classes

### Classes

In [95]:
class DataframeToSeries(BaseEstimator, TransformerMixin):
        
    def __init__(self, col=None):
        self.col = col
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.col:
            return X[self.col]
        else:
            return X
        
class SeparateFeatures(BaseEstimator, TransformerMixin):
    
    def __init__(self, num_cols=None):
        self.num_cols = num_cols
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.num_cols:
            return X.loc[:, self.num_cols]
        else:
            return X
        
class WilsonAverageTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, num_col=None, biz_list=None):
        self.num_col = num_col
        self.biz_list = biz_list
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.num_col and self.biz_list.all():
            scores = get_average_rating(X, self.biz_list)
            
            X_avg = pd.DataFrame({'average': scores})
            
            return X_avg
        else:
            return X
        
class CleanText(BaseEstimator, TransformerMixin):

    def __init__(self, text_col=None):
        self.text_col = text_col
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        NLP = spacy.load('en')
        
        if self.text_col:
            df = pd.DataFrame()
            clean_review_list = []
            
            for review in X.loc[:, self.text_col]:
                clean_review = ''
                
                for word in NLP(review):
                    if word.is_stop == False:
                        clean_review += word.lemma_ + ' '
                        
#                 clean_review = NLP(clean_review)
                clean_review_list.append(clean_review)
                        
            df['clean_reviews'] = clean_review_list
            
            return df
        else:
            return X

### Functions

In [3]:
def confidence(pos, neg):
    '''
    Calculates the Wilson confidence where pos is the number of positive ratings
    and neg is the number of negative ratings.
    '''
    n = pos + neg
    
    if n == 0:
        return 0
    z = 1.96 # 95% confidence interval
    phat = float(pos) / n
    return (((phat + z*z/(2*n) - z * np.sqrt((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n)))

def get_average_rating(df, biz_list):
    '''
    Compiles the list of average ratings for each business in biz_list.
    '''
    wils_list = []
    
    for biz in biz_list:
        ind_biz = df[df['name'] == biz]
        
        pos_count = 0
        neg_count = 0
        
        for rating in ind_biz['rating']:
            if rating > 3:
                pos_count += 1
            else:
                neg_count += 1
        
        wils_conf = confidence(pos_count, neg_count)
        wils_list.append(wils_conf)
    return wils_list

## Data Preprocessing

### Load in data

In [4]:
df_init = joblib.load('../data/df_tot')

### Reorder columns

In [5]:
df_init = df_init[['name', 'rating' ,'reviews']]

### Change rating to int type

In [6]:
df_init['rating'] = df_init['rating'].apply(lambda x: int(x))

### Create holdout set

In [7]:
df_shuffled = df_init.sample(frac=1)

holdout_size = round(len(df_shuffled)*0.2)

df_holdout = df_shuffled.iloc[:holdout_size, :]
df_model = df_shuffled.iloc[holdout_size:, :]

print("Holdout Size: ", len(df_holdout))
print("CV Size: ", len(df_model))

Holdout Size:  40939
CV Size:  163757


### Sort the dataframe for cross-validation

In [8]:
df_model = df_model.sort()

  if __name__ == '__main__':


### Create transformers for splitting text and num columns

In [10]:
t = SeparateFeatures(['reviews'])
n = SeparateFeatures()

print(t.transform(df_model).head())
print(n.transform(df_model).head())

                                             reviews
2  Great place to grab a coffee and you can get t...
3  Coffee is amazing! Overpriced as usual. There'...
4  Big time coffee lover. First time customer and...
6  I was attracted by the significant smell of co...
7  Great coffee!  The best pick-me-up latte in SF...
                 name  rating  \
2  0_FourBarrelCoffee       5   
3  0_FourBarrelCoffee       4   
4  0_FourBarrelCoffee       4   
6  0_FourBarrelCoffee       4   
7  0_FourBarrelCoffee       5   

                                             reviews  
2  Great place to grab a coffee and you can get t...  
3  Coffee is amazing! Overpriced as usual. There'...  
4  Big time coffee lover. First time customer and...  
6  I was attracted by the significant smell of co...  
7  Great coffee!  The best pick-me-up latte in SF...  


In [11]:
t.transform(df_model).shape
n.transform(df_model).shape

(163757, 3)

### Create transformer for averaging ratings for each business (may not need this since I might not average across the each business)

#### Get Wilson average for one business

In [12]:
one_biz = df_model[df_model['name'] == '0_FourBarrelCoffee']

In [13]:
pos_count = 0
neg_count = 0

for rating in one_biz['rating']:
    if rating > 3:
        pos_count += 1
    else:
        neg_count += 1
        
wil_conf = confidence(pos_count, neg_count)

#### Generalize to all businesses

In [14]:
biz_list = df_model['name'].unique()

In [15]:
wils_scores = get_average_rating(df_model, biz_list)

#### Create the transformer class

In [16]:
wat = WilsonAverageTransformer(num_col='rating', biz_list=biz_list)

In [17]:
wat.transform(df_model)

Unnamed: 0,average
0,0.700674
1,0.859991
2,0.803384
3,0.722555
4,0.828241
5,0.777013
6,0.696305
7,0.892771
8,0.606479
9,0.937218


### Create a transformer for preprocessing the reviews

#### Get the lemmatized review for just one review

In [32]:
one_review = df_model.iloc[0, 2]

In [34]:
nlp = spacy.load('en')

In [35]:
sample_doc = nlp(one_review)

In [44]:
clean_doc = ''
for word in sample_doc:
    if word.is_stop == False:
        clean_doc += word.lemma_ + ' '
clean_doc = nlp(clean_doc)

In [46]:
for sent in clean_doc.sents:
    print(sent)

great place grab coffee fancy stuff regular old coffee .    
go cold brew disappointed !    decor place spot - .


#### Generalize to all reviews

In [64]:
test_df = df_model.iloc[:100, :]

In [65]:
ct = CleanText('reviews')

test_ct = ct.transform(test_df)

In [67]:
tf = TfidfVectorizer()

tf_t = tf.fit_transform(test_ct.clean_reviews)

In [70]:
tf_t.todense()

matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [55]:
type(test_ct.clean_reviews[0])

spacy.tokens.doc.Doc

In [57]:
print(test_ct.clean_reviews[0])

great place grab coffee fancy stuff regular old coffee .    go cold brew disappointed !    decor place spot - . 


## Build the Pipeline

In [19]:
DB_baseline = Pipeline([
        ('combined_features', FeatureUnion([
                    
            ('split_num', SeparateFeatures(num_cols=['rating'])),
            ('text_vect', Pipeline([
                                
                ('split_text', SeparateFeatures()),
                ('to_series', DataframeToSeries(col='reviews')),
                ('tfidf', TfidfVectorizer())
            ]))           
        ])),
        ('pca', PCA(n_components=2))
        ('model', DBSCAN())
    ])

### Since the entire pipeline is crashing the kernel, the pipeline will be broken down into steps

In [124]:
text_processing = Pipeline([
                ('clean', CleanText('reviews')),
                ('to_series', DataframeToSeries(col='clean_reviews')),
                ('tfidf', TfidfVectorizer())
            ])

In [None]:
tp = text_processing.fit_transform(df_model)
tp_dense = tp.todense()
tp_dense[0].shape

In [None]:
text_processing2 = Pipeline([
#                 ('clean', CleanText('reviews')),
                ('to_series', DataframeToSeries(col='reviews')),
                ('tfidf', TfidfVectorizer())
            ])

In [None]:
tp2 = text_processing2.fit_transform(df_model)
tp_dense2 = tp2.todense()
tp_dense2[0].shape

In [228]:
test = Pipeline([
        ('combined_features', FeatureUnion([
                    
            ('split_num', SeparateFeatures(num_cols=['rating'])),
            ('text_vect', Pipeline([
                                
                ('split_text', SeparateFeatures()),
                ('to_series', DataframeToSeries(col='reviews')),
                ('tfidf', TfidfVectorizer())
            ]))           
        ]))
    ])

In [229]:
test.fit(df_model)

Pipeline(steps=[('combined_features', FeatureUnion(n_jobs=1,
       transformer_list=[('split_num', SeparateFeatures(num_cols=['rating'])), ('text_vect', Pipeline(steps=[('split_text', SeparateFeatures(num_cols=None)), ('to_series', DataframeToSeries(col='reviews')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class...      token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None))]))],
       transformer_weights=None))])

In [230]:
tr = test.transform(df_model.loc[:, ['rating','reviews']])

In [223]:
test2 = Pipeline([                
                ('split_text', SeparateTextFeatures(text_cols=['reviews'])),
                ('to_series', DataframeToSeries(col='reviews')),
                ('tfidf', TfidfVectorizer())
        ])

In [224]:
tr = test2.fit_transform(df_model.loc[:, ['rating','reviews']])

In [225]:
print(tr)

  (0, 29887)	0.213129580924
  (0, 13931)	0.149201488054
  (0, 26276)	0.341041673582
  (0, 65820)	0.113467094109
  (0, 28911)	0.230539394172
  (0, 61184)	0.107249744207
  (0, 33559)	0.0962065260773
  (0, 27360)	0.0865870839582
  (0, 42353)	0.296836117074
  (0, 16756)	0.139752685236
  (0, 42151)	0.060035208411
  (0, 33269)	0.224782269641
  (0, 66102)	0.164212987356
  (0, 19539)	0.184879950741
  (0, 32656)	0.0598133518396
  (0, 40779)	0.154326019348
  (0, 7326)	0.185987239798
  (0, 55539)	0.0831169969863
  (0, 22774)	0.200889451567
  (0, 60388)	0.0908695900564
  (0, 35717)	0.197070142653
  (0, 65725)	0.111740422904
  (0, 36143)	0.145144205193
  (0, 4656)	0.047027555748
  (0, 23896)	0.263243343233
  :	:
  (163756, 35791)	0.0944716388189
  (163756, 53339)	0.0627826678407
  (163756, 31586)	0.0798230528509
  (163756, 42705)	0.0688598646156
  (163756, 23419)	0.0906473681525
  (163756, 27343)	0.0728647693251
  (163756, 23162)	0.147887676035
  (163756, 26363)	0.0865313437794
  (163756, 29205)	0.