# Modeling Yelp Reviews of Cafes Near San Francisco

Paul Lim

## Libraries

In [None]:
# Main imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

# sklearn
from sklearn.pipeline import make_pipeline
from sklearn import pipeline, feature_selection, decomposition
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.cluster import DBSCAN, AgglomerativeClustering, Birch
from sklearn.decomposition import PCA, NMF
from sklearn.metrics import silhouette_score

# NLP 
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import spacy
import gensim
from gensim import models
from gensim.models import word2vec
import snowballstemmer

# Misc.
import datetime
import logging
import math
import os
import re
import time

# IPython line magic (not plain Python): asks the notebook to use matplotlib's
# inline backend.
% matplotlib inline

# Seaborn styling. The calls run in order: 'white' is set first and 'ticks'
# right after it, then two dict calls adjust individual style keys.
# NOTE(review): presumably the 'ticks' preset supersedes 'white' -- confirm
# against the seaborn documentation before removing either call.
sns.set_style("white")
sns.set_style('ticks')
# Tick direction 'in' on both axes.
sns.set_style({'xtick.direction': u'in', 'ytick.direction': u'in'})
# Turn the legend frame on.
sns.set_style({'legend.frameon': True})

# Root logger configuration: timestamp, level name and message, INFO level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Running List of Functions/Classes

### Classes

In [None]:
class DataframeToSeriesTransformer(BaseEstimator, TransformerMixin):
    '''
    DESCRIBE:
        - Pipeline step that selects X[col] from its input.
    INPUT:
        - col is the key used to index X. When it is falsy (the default,
          None) the step is a pass-through.
    '''

    def __init__(self, col=None):
        self.col = col

    def fit(self, x, y=None):
        '''Nothing is learned; the transformer itself is returned.'''
        return self

    def transform(self, X):
        '''Return X[self.col], or X unchanged when no column is configured.'''
        if not self.col:
            return X

        selected = X[self.col]
        # Debug trace of the shape flowing through the pipeline.
        print("DTST: ", selected.shape)
        return selected
        
class SeparateFeaturesTransformer(BaseEstimator, TransformerMixin):
    '''
    DESCRIBE:
        - Pipeline step that keeps a subset of columns via X.loc[:, cols].
    INPUT:
        - num_cols are the columns to keep; takes precedence when truthy.
        - text_cols are the columns to keep when num_cols is falsy.
        - When both are falsy the step is a pass-through.
    '''

    def __init__(self, num_cols=None, text_cols=None):
        self.num_cols = num_cols
        self.text_cols = text_cols

    def fit(self, x, y=None):
        '''Nothing is learned; the transformer itself is returned.'''
        return self

    def transform(self, X):
        '''Return the configured column subset of X, or X itself.'''
        # `or` picks num_cols when it is truthy, otherwise text_cols.
        wanted = self.num_cols or self.text_cols
        if not wanted:
            return X

        subset = X.loc[:, wanted]
        # Debug trace of the shape flowing through the pipeline.
        print("SFT: ", subset.shape)
        return subset
        
class CleanTextTransformer(BaseEstimator, TransformerMixin):
    '''
    DESCRIBE:
        - Pipeline step that lemmatizes every word of every review with
          TextBlob and returns the cleaned text.
    INPUT:
        - text_col is the label of the column holding the raw review strings.
          When it is falsy (the default, None) the step is a pass-through.
    '''

    def __init__(self, text_col=None):
        self.text_col = text_col

    def fit(self, x, y=None):
        '''Nothing is learned; the transformer itself is returned.'''
        return self

    @staticmethod
    def _lemmatize(review):
        '''Return the lemmatized words of review, each followed by one space.'''
        # The trailing space after every word (including the last one) is
        # kept so the output text is unchanged for downstream steps.
        return ''.join(word.lemmatize() + ' ' for word in TextBlob(review).words)

    def transform(self, X):
        '''
        OUTPUT:
            - A new dataframe with a single 'clean_reviews' column and a
              fresh default index, or X unchanged when text_col is falsy.
        '''
        # Check the configuration before touching X: looking the column up
        # first makes the pass-through branch fail when text_col is None.
        if not self.text_col:
            return X

        df = pd.DataFrame()
        df['clean_reviews'] = [
            self._lemmatize(review)
            for review in X.loc[:, self.text_col].tolist()
        ]
        # Debug trace of the shape flowing through the pipeline.
        print("CTT: ", df.shape)
        return df
        
class DensifyTransformer(BaseEstimator, TransformerMixin):
    '''
    DESCRIBE:
        - Pipeline step that calls X.toarray() and wraps the result in a
          dataframe.
        - NOTE(review): presumably X is a scipy sparse matrix coming out of
          a vectorizer -- confirm against the pipelines that use this step.
    '''

    def fit(self, X, y=None):
        '''Nothing is learned; the transformer itself is returned.'''
        return self

    def transform(self, X):
        '''Return pd.DataFrame(X.toarray()).'''
        dense = X.toarray()
        frame = pd.DataFrame(dense)
        # Debug trace of the shape flowing through the pipeline.
        print("DT: ", frame.shape)
        return frame
    
class SentimentTransformer(BaseEstimator, TransformerMixin):
    '''
    DESCRIBE:
        - Pipeline step that scores every document with TextBlob, summing the
          per-sentence sentiment values over the whole document.
    INPUT:
        - text_col is the label of the column holding the document strings.
          When it is falsy (the default, None) the step is a pass-through.
    '''

    def __init__(self, text_col=None):
        self.text_col = text_col

    def fit(self, x, y=None):
        '''Nothing is learned; the transformer itself is returned.'''
        return self

    def transform(self, X):
        '''
        OUTPUT:
            - A new dataframe with a fresh default index and the columns
              'pol' (sum of sentiment[0] per sentence), 'sub' (sum of
              sentiment[1] per sentence) and 'clean_reviews' (the input
              text, unchanged); or X unchanged when text_col is falsy.
        '''
        if not self.text_col:
            return X

        doc_list = X.loc[:, self.text_col].tolist()
        sum_pol_list = []
        sum_sub_list = []

        for doc in doc_list:
            sum_pol = 0
            sum_sub = 0

            for sent in TextBlob(doc).sentences:
                # Fetch the sentiment once and use both of its components.
                sentiment = sent.sentiment
                sum_pol += sentiment[0]
                sum_sub += sentiment[1]

            sum_pol_list.append(sum_pol)
            sum_sub_list.append(sum_sub)

        df = pd.DataFrame()
        df['pol'] = sum_pol_list
        df['sub'] = sum_sub_list
        # Need to keep the clean reviews for the W2V transformer. Assign the
        # plain list, not the Series: a Series is aligned on its index and
        # would turn into NaN whenever X does not carry a default index.
        df['clean_reviews'] = doc_list
        # Debug trace of the shape flowing through the pipeline.
        print("ST: ", df.shape)
        return df

class Word2VecTransformer(BaseEstimator, TransformerMixin):
    '''
    DESCRIBE:
        - Pipeline step that represents every review by the average of the
          word vectors of its in-vocabulary words.
    INPUT:
        - text_col is the label of the column holding the review strings.
          When it is falsy (the default, None) the step is a pass-through.
        - w2v is the word-vector model; it must provide word_vec(word) and
          raise KeyError for unknown words.
    '''

    # Vector size used when the model does not expose `vector_size`.
    DEFAULT_DIM = 300

    def __init__(self, text_col=None, w2v=None):
        self.text_col = text_col
        self.w2v = w2v

    def fit(self, x, y=None):
        '''Nothing is learned; the transformer itself is returned.'''
        return self

    def _average_vector(self, review, dim):
        '''Return the mean vector of the known words in review.'''
        total = np.zeros(dim)
        count = 0

        # Split on whitespace so whole words are looked up; iterating the
        # string directly would look up single characters instead.
        for word in review.split():
            try:
                vector = self.w2v.word_vec(word)
            except KeyError:
                # Out-of-vocabulary word: leave it out of the average.
                continue
            total += vector
            count += 1

        # A review without any known word stays a zero vector instead of
        # becoming NaN through a division by zero.
        return total / count if count else total

    def transform(self, X):
        '''
        OUTPUT:
            - A new dataframe with one row per review and one column per
              vector component, or X unchanged when text_col is falsy.
        RAISES:
            - ValueError if text_col is set but no w2v model was supplied.
        '''
        if not self.text_col:
            return X
        if self.w2v is None:
            raise ValueError("Word2VecTransformer needs a w2v model")

        # Use the model handed to this instance (not a module-level global)
        # and take the dimension from it when it is available.
        dim = getattr(self.w2v, 'vector_size', self.DEFAULT_DIM)
        avg_w2v_list = [
            self._average_vector(review, dim)
            for review in X.loc[:, self.text_col].tolist()
        ]

        df = pd.DataFrame(avg_w2v_list)
        # Debug trace of the shape flowing through the pipeline.
        print("W2V: ", df.shape)
        return df
        
class ToDataFrameTransformer(BaseEstimator, TransformerMixin):
    '''
    DESCRIBE:
        - Pipeline step that wraps its input in a pandas dataframe.
    '''

    def fit(self, x, y=None):
        '''Nothing is learned; the transformer itself is returned.'''
        return self

    def transform(self, X):
        '''Return pd.DataFrame(X).'''
        frame = pd.DataFrame(X)
        # Debug trace of the shape flowing through the pipeline.
        print("TDFT: ", frame.shape)
        return frame
        
class DropTextTransformer(BaseEstimator, TransformerMixin):
    '''
    DESCRIBE:
        - Pipeline step that removes the text column(s) from a dataframe.
    INPUT:
        - text_col is the column label (or labels) passed to X.drop. When it
          is falsy (the default, None) the step is a pass-through.
    '''

    def __init__(self, text_col=None):
        self.text_col = text_col

    def fit(self, x, y=None):
        '''Nothing is learned; the transformer itself is returned.'''
        return self

    def transform(self, X):
        '''
        OUTPUT:
            - X without the text_col column(s), or X unchanged when text_col
              is falsy.
        '''
        # Pass the data through when nothing is configured, like the other
        # transformers do, instead of implicitly returning None.
        if not self.text_col:
            return X

        df = X.drop(self.text_col, axis=1)
        # Debug trace of the shape flowing through the pipeline.
        print("DTT: ", df.shape)
        return df

### Functions

In [None]:
def clean_df():
    '''
    DESCRIBE:
        - Loads the dataframe stored at PATH_TO_DATA and preprocesses it.
    INPUT:
        - None. The file location is read from the module-level PATH_TO_DATA.
    OUTPUT:
        - A new dataframe holding only the columns 'name', 'rating' and
          'reviews' (in that order), with every rating converted by int().
        - False (after printing a message) when PATH_TO_DATA is not a file.
    '''
    if not os.path.isfile(PATH_TO_DATA):
        print("Invalid path to data")
        return False

    # NOTE: joblib.load unpickles the file, which can execute arbitrary
    # code -- only point PATH_TO_DATA at files from a trusted source.
    df = joblib.load(PATH_TO_DATA)

    # Copy the column subset so the assignment below writes to an independent
    # frame rather than to a slice of the loaded one (SettingWithCopyWarning).
    df = df[['name', 'rating', 'reviews']].copy()
    df['rating'] = df['rating'].apply(int)
    return df

In [None]:
def fit_model(pipe, model, df_orig):
    '''
    DESCRIBE:
        - Pushes the data through the pipeline, fits the model on the result
          and prints clustering metrics.
    INPUT:
        - pipe is the pipeline; its fit_transform is called on df_orig.
        - model is the model object; its fit_predict is called on the
          pipeline output and its labels_ attribute is read afterwards.
        - df_orig is the data.
    OUTPUT:
        - The pipeline output.
        - The value returned by model.fit_predict for that output.
    '''
    features = pipe.fit_transform(df_orig)
    assignments = model.fit_predict(features)

    n_clusters = len(np.unique(assignments))
    print("Number of Clusters: ", n_clusters)

    # The silhouette score is only requested when model.labels_ holds more
    # than one distinct label.
    fitted_labels = model.labels_
    if len(np.unique(fitted_labels)) > 1:
        score = silhouette_score(features, fitted_labels)
        print("Silhouette Coefficient: %0.3f" % score)

    return features, assignments

In [None]:
def model_metrics(model_dict, pipe, df):
    '''
    DESCRIBE:
        - Runs every model of a dictionary through fit_model, printing each
          model's name first, and collects the results.
    INPUT:
        - model_dict maps a name to the model object to fit.
        - pipe is the pipeline to run the data through.
        - df is the data.
    OUTPUT:
        - A dictionary keyed by model name. Each value is a two-item list
          holding, in order, the two values returned by fit_model for
          that model.
    '''
    results = {}
    for label in model_dict:
        print(label)
        transformed, outcome = fit_model(pipe, model_dict[label], df)
        results[label] = [transformed, outcome]
    return results

In [None]:
def add_feature_space(df, transformed_df, path):
    '''
    DESCRIBE:
        - Combines the feature space that was produced by the W2V model with
          the 'rating', 'name' and 'reviews' columns of the original
          dataframe and saves the result with joblib.
    INPUT:
        - df is the data; it must have 'rating', 'name' and 'reviews' columns
          and as many rows as transformed_df.
        - transformed_df is the transformed data (dataframe or 2-D array).
          With 301 columns the first one is taken to be the rating and the
          rest the features; with any other width every column is a feature
          and the rating is copied from df.
          NOTE(review): the 301-column layout is inferred from the column
          list this function has always used -- confirm with the pipeline
          that produces transformed_df.
        - path is the directory where the dataframe should be saved; it is
          created when missing.
    OUTPUT:
        - df_out is the dataframe with columns 'rating', 1..n, 'name' and
          'reviews'. It is also dumped to <path>/df_out.
    RAISES:
        - ValueError if transformed_df is not 2-D or its row count differs
          from df.
    '''
    w2v_dim = 300

    # Work on the raw values: passing a dataframe together with `columns=`
    # would select columns by label instead of renaming them.
    features = np.asarray(transformed_df)
    if features.ndim != 2:
        raise ValueError("transformed_df must be two-dimensional")

    n_rows, width = features.shape
    if n_rows != len(df):
        raise ValueError("df and transformed_df must have the same number of rows")

    if width == w2v_dim + 1:
        # Rating already sits in the first column.
        df_out = pd.DataFrame(features, columns=['rating'] + list(range(1, width)))
    else:
        df_out = pd.DataFrame(features, columns=list(range(1, width + 1)))
        df_out.insert(0, 'rating', df['rating'].tolist())

    # Take the descriptive columns from the original data; .tolist() copies
    # them by position so differing indexes cannot introduce NaN.
    df_out['name'] = df['name'].tolist()
    df_out['reviews'] = df['reviews'].tolist()

    os.makedirs(path, exist_ok=True)

    # Fixed file name: str(df_out) would be the printed frame, not a name.
    joblib.dump(df_out, os.path.join(path, 'df_out'))

    return df_out

## Data Processing

In [None]:
# Module-level settings read by the functions and the pipeline in this file.
# Plain assignments are used: `global NAME = value` is a syntax error, and
# names assigned at the top level are module globals already.

# Location of the joblib dump that clean_df() loads.
PATH_TO_DATA = '../data/df_tot'

# Word vectors in binary word2vec format. The home-directory shorthand is
# expanded explicitly so the load does not depend on the loader doing it.
w2v = models.KeyedVectors.load_word2vec_format(
    os.path.expanduser("~/Documents/GoogleNews-vectors-negative300.bin.gz"),
    binary=True)

In [None]:
# Load and preprocess the reviews. clean_df() signals a missing file by
# returning False, so stop here with a clear error instead of failing later
# inside the pipeline.
df = clean_df()
if df is False:
    raise FileNotFoundError("Invalid path to data: " + PATH_TO_DATA)

# Text pipeline: select the review column, lemmatize it, score sentiment
# (which also carries the cleaned text forward) and average the word vectors.
pipe = Pipeline([
    ('text_feat', Pipeline([
        ('split_text', SeparateFeaturesTransformer(text_cols=['reviews'])),
        ('clean', CleanTextTransformer('reviews')),
        ('sentiment', SentimentTransformer(text_col='clean_reviews')),
        ('vectorize', Word2VecTransformer(text_col='clean_reviews', w2v=w2v)),
    ])),
])

# Clustering models to compare, keyed by the name printed while fitting.
# NOTE(review): newer scikit-learn releases rename AgglomerativeClustering's
# `affinity` argument to `metric` -- adjust when upgrading.
model_dict = {
    "agg": AgglomerativeClustering(n_clusters=5,
                                   affinity="cosine",
                                   linkage="complete"),
    "birch": Birch(threshold=0.5,
                   n_clusters=5,
                   branching_factor=50),
    "db": DBSCAN(eps=0.5,
                 min_samples=10,
                 metric="euclidean"),
}

model_metrics_dict = model_metrics(model_dict, pipe, df)

# Save the feature space produced for the 'birch' run; item 0 of each entry
# is the transformed data. Uses the dictionary built just above.
df_out = add_feature_space(df, model_metrics_dict['birch'][0], '../data/')