# Modeling Yelp Data

Paul Lim

## Libraries

In [173]:
# Main imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

# sklearn
from sklearn.model_selection import train_test_split, learning_curve, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import pipeline, feature_selection, decomposition
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.externals import joblib
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.cluster import DBSCAN

# NLP 
from nltk.corpus import stopwords
from textblob import TextBlob
import spacy

# Misc.
import re
import datetime
import time
import logging
import math

% matplotlib inline

sns.set_style("white")
sns.set_style('ticks')
sns.set_style({'xtick.direction': u'in', 'ytick.direction': u'in'})
sns.set_style({'legend.frameon': True})

## Running List of Functions/Classes

### Classes

In [160]:
class SeparateTextFeatures(BaseEstimator, TransformerMixin):
        
    def __init__(self, text_cols=None):
        self.text_cols = text_cols
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.text_cols:
            return X.loc[:, self.text_cols]
        else:
            return X
        
class SeparateNumFeatures(BaseEstimator, TransformerMixin):
    
    def __init__(self, num_cols=None):
        self.num_cols = num_cols
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.num_cols:
            return X.loc[:, self.num_cols]
        else:
            return X
        
class WilsonAverageTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, num_col=None, biz_list=None):
        self.num_col = num_col
        self.biz_list = biz_list
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.num_col and self.biz_list.all():
            scores = get_average_rating(X, self.biz_list)
            
            X_avg = pd.DataFrame({'average': scores})
            
            return X_avg
        else:
            return X

### Functions

In [153]:
def confidence(pos, neg):
    '''
    Calculates the Wilson confidence where pos is the number of positive ratings
    and neg is the number of negative ratings.
    '''
    n = pos + neg
    
    if n == 0:
        return 0
    z = 1.96 # 95% confidence interval
    phat = float(pos) / n
    return (((phat + z*z/(2*n) - z * np.sqrt((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n)))

def get_average_rating(df, biz_list):
    '''
    Compiles the list of average ratings for each business in biz_list.
    '''
    wils_list = []
    
    for biz in biz_list:
        ind_biz = df[df['name'] == biz]
        
        pos_count = 0
        neg_count = 0
        
        for rating in ind_biz['rating']:
            if rating > 3:
                pos_count += 1
            else:
                neg_count += 1
        
        wils_conf = confidence(pos_count, neg_count)
        wils_list.append(wils_conf)
    return wils_list

## Data Preprocessing

### Load in data

In [129]:
df_init = joblib.load('../data/df_tot')

### Reorder columns

In [130]:
df_init = df_init[['name', 'rating' ,'reviews']]

### Change rating to int type

In [131]:
df_init['rating'] = df_init['rating'].apply(lambda x: int(x))

### Create holdout set

In [132]:
df_shuffled = df_init.sample(frac=1)

holdout_size = round(len(df_shuffled)*0.2)

df_holdout = df_shuffled.iloc[:holdout_size, :]
df_model = df_shuffled.iloc[holdout_size:, :]

print("Holdout Size: ", len(df_holdout))
print("CV Size: ", len(df_model))

Holdout Size:  40939
CV Size:  163757


### Sort the dataframe for cross-validation

In [133]:
df_model = df_model.sort()

  if __name__ == '__main__':


### Create transformers for splitting text and num columns

In [136]:
t = SeparateTextFeatures(['reviews'])
n = SeparateNumFeatures(['rating'])

print(t.transform(df_model).head())
print(n.transform(df_model).head())

                                             reviews
0  Hipster coffee galore! What happened to just a...
1  A pretty hipster, modern spacious place to do ...
2  Great place to grab a coffee and you can get t...
3  Coffee is amazing! Overpriced as usual. There'...
4  Big time coffee lover. First time customer and...
   rating
0       2
1       4
2       5
3       4
4       4


In [188]:
t.transform(df_model).shape
n.transform(df_model).shape

(163757, 1)

### Create transformer for averaging ratings for each business (may not need this since I might not average across the each business)

#### Get Wilson average for one business

In [141]:
one_biz = df_model[df_model['name'] == '0_FourBarrelCoffee']

In [142]:
pos_count = 0
neg_count = 0

for rating in one_biz['rating']:
    if rating > 3:
        pos_count += 1
    else:
        neg_count += 1
        
wil_conf = confidence(pos_count, neg_count)

#### Generalize to all businesses

In [147]:
biz_list = df_model['name'].unique()

In [150]:
wils_scores = get_average_rating(df_model, biz_list)

#### Create the transformer class

In [158]:
wat = WilsonAverageTransformer(num_col='rating', biz_list=biz_list)

In [159]:
wat.transform(df_model)

Unnamed: 0,average
0,0.709366
1,0.883937
2,0.797450
3,0.723511
4,0.838055
5,0.771723
6,0.719067
7,0.892038
8,0.567943
9,0.924839


### Create transformer for vectorizing each review

In [None]:
class VectorizeTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, text_col=None, vectorizer=None):
        self.text_col = text_col
        self.vectorizer = vectorizer
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, X):
        if self.text_col:
            scores = get_average_rating(X, self.biz_list)
            
            X_avg = pd.DataFrame({'average': scores})
            
            return X_avg
        else:
            return X

## Build the Pipeline

In [176]:
DB_baseline = Pipeline([
        ('combined_features', FeatureUnion([
                    
            ('split_num', SeparateNumFeatures(num_cols=['rating'])),
            ('text_vect', Pipeline([
                                
                ('split_text', SeparateTextFeatures(text_cols=['reviews'])),
                ('tfidf', TfidfVectorizer())
            ]))           
        ])),
        
        ('model', DBSCAN())
    ])

In [198]:
DB_baseline.fit(df_model.loc[:, ['rating','reviews']])

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 1, expected 163757.

In [177]:
test = Pipeline([
        ('combined_features', FeatureUnion([
                    
            ('split_num', SeparateNumFeatures(num_cols=['rating'])),
            ('text_vect', Pipeline([
                                
                ('split_text', SeparateTextFeatures(text_cols=['reviews'])),
                ('tfidf', TfidfVectorizer())
            ]))           
        ]))
    ])

In [181]:
test.fit(df_model)

Pipeline(steps=[('combined_features', FeatureUnion(n_jobs=1,
       transformer_list=[('split_num', SeparateNumFeatures(num_cols=['rating'])), ('text_vect', Pipeline(steps=[('split_text', SeparateTextFeatures(text_cols=['reviews'])), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8',...      token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None))]))],
       transformer_weights=None))])

In [202]:
tr = test.transform(df_model.loc[:, ['rating','reviews']])

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 1, expected 163757.

In [189]:
test2 = Pipeline([                
                ('split_text', SeparateTextFeatures(text_cols=['reviews'])),
                ('tfidf', TfidfVectorizer())
        ])

In [199]:
tr = test2.fit_transform(df_model.loc[:, ['rating','reviews']])

In [200]:
tr

<1x1 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [201]:
df_model.shape

(163757, 3)