# Attempts at Classifiers Derived from Yelp Reviews Text Data
## Loading text data:
### (See comments in code)

In [1]:
import pandas as pd
from numpy import nan
import numpy as np
import scipy.stats as stats
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
violations = pd.read_pickle('data/violations_wtext.pkl')
violations = violations[[
        'date',            # Date of inspection 
        'delay',           # Days since last inspection
        'restaurant_id',   # Restaurant identifier
        '*', '**', '***',  # Number of violations at three severity levels
        'reviews',         # Aggregated text of all Yelp reviews from previous year
        'cool',            # Aggregated text of Yelp reviews voted "cool" by at least one user
        'funny',           # Aggregated text of Yelp reviews voted "funny" by at least one user
        'useful',          # Aggregated text of Yelp reviews voted "useful" by at least one user
        'tips'             # Aggregated text of all Yelp tips from previous year
    ]]
violations

Unnamed: 0,date,delay,restaurant_id,*,**,***,reviews,cool,funny,useful,tips
15836,2011-08-01,,NbE1Bk3J,8,1,0,Manger une pizza a pas chers c est la place qu...,Their name is similar to another pizza shop in...,This place is Trouble with a capital T. Every...,This place is Trouble with a capital T. Every...,Potato pizza rocks my socks Love the food & be...
1401,2011-08-01,,7RO5vjEq,7,1,0,Buffet joints aren't usually where you go to g...,"Bad, bad, bad food. If you're marketed as a bu...",The buffet is right there in the window. Look...,A friend told me that there was all-you-can-ea...,Not the very best Korean but they're the only ...
25218,2011-08-01,,V430D43B,1,0,2,Was told it was a 5 min wait for a table. They...,We descended upon Florentine Cafe one chilly n...,Good place to take a date or dine with friends...,We descended upon Florentine Cafe one chilly n...,Lobster ravioli is a house fav. bartenders are...
34217,2011-08-01,,Y1EmaVEw,7,0,1,Dropped in with a friend during one of their s...,I liked the hot pot. They have a nice variety....,I came here with my family on Columbus Day. I ...,Dropped in with a friend during one of their s...,"Make reservations Awesome food, friendly wait ..."
27586,2011-08-01,,6Wo2Nyo9,11,2,1,,,,,
33235,2011-08-01,,8x3zgYok,2,0,1,I like Au Bon Pain very much. Whether it's th...,"You know, it's a funny thing - I never even th...","When I mention Au Bon Pain, I EMPHASIZE THE PA...","When I mention Au Bon Pain, I EMPHASIZE THE PA...",
6368,2011-08-01,,8x3zx2Ok,5,0,2,We stayed at this hotel for 3 nights for the f...,We stayed at this hotel for 3 nights for the f...,This hotel is prime property. I mean if you co...,"Great location \nService was good\n\nWe were ""...",Suad is here!!! Champions for NCAA action... v...
21033,2011-08-01,,qN3gvnEA,5,0,1,It's a love/hate relationship. There are times...,It's always so crowed here during lunch time. ...,I've been here twice now.\nThe first time I wa...,It's always so crowed here during lunch time. ...,Cash only! But so damned worth it. The owner c...
21523,2011-08-01,,VpoG57Er,1,0,0,,,,,
6157,2011-08-01,,dj3dlN39,1,0,2,Get the teriyaki chicken with EXTRA chicken! ...,,,,


In [7]:
violations = violations[violations.delay > 14]

## Classifier Attempts:
### Naive Bayes using CountVectorizer features

In [46]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD

In [26]:
vectorizer = CountVectorizer(
    ngram_range = (1,2), 
    stop_words = 'english'
)

X = violations.reviews
y = (violations['*'] + violations['**'] + violations['***'] > 0).values.astype(np.int)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=.8, stratify = y)
Xtrain = vectorizer.fit_transform(Xtrain)

clf = MultinomialNB(alpha = 50)
clf.fit(Xtrain, ytrain)
print(
    'Training score: ' + str(clf.score(Xtrain, ytrain)) + '\n' +
    'Testing score: ' + str(clf.score(vectorizer.transform(Xtest), ytest)) + '\n' +
    'Proportion of inspections in test set that failed: ' + str(sum(ytest)/len(ytest))
    )

Training score: 0.656768672199
Testing score: 0.656817003629
Proportion of inspections in test set that failed: 0.656817003629


### Stochastic gradient descent using TF-IDF features

In [41]:
vectorizer = TfidfVectorizer(
    ngram_range = (1,2), 
    stop_words = 'english'
)

X = violations.reviews
y = (violations['*'] + violations['**'] + violations['***'] > 0).values.astype(np.int)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=.8, stratify = y)
Xtrain = vectorizer.fit_transform(Xtrain)

clf = SGDClassifier(n_jobs = -1)
clf.fit(Xtrain, ytrain)
print(
    'Training score: ' + str(clf.score(Xtrain, ytrain)) + '\n' +
    'Testing score: ' + str(clf.score(vectorizer.transform(Xtest), ytest)) + '\n' +
    'Proportion of inspections in test set that failed: ' + str(sum(ytest)/len(ytest))
    )

Training score: 0.834154564315
Testing score: 0.637117677553
Proportion of inspections in test set that failed: 0.656817003629


### Random Forest using a selection of 1000 TF-IDF features

In [43]:
vectorizer = TfidfVectorizer(
    ngram_range = (1,2), 
    stop_words = 'english',
    max_features = 1000
)

X = violations.reviews
y = (violations['*'] + violations['**'] + violations['***'] > 0).values.astype(np.int)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=.8, stratify = y)
Xtrain = vectorizer.fit_transform(Xtrain)

gs = GridSearchCV(
        estimator = RandomForestClassifier(n_estimators = 500),
        param_grid = {
            'min_samples_leaf' : np.arange(1, 30, 3),
            'max_features' : np.arange(10, 40, 5)
        },
        n_jobs = -1
)
gs.fit(Xtrain, ytrain)
clf = gs.best_estimator_

print(
    'Training score: ' + str(clf.score(Xtrain, ytrain)) + '\n' +
    'Testing score: ' + str(clf.score(vectorizer.transform(Xtest), ytest)) + '\n' +
    'Proportion of inspections in test set that failed: ' + str(sum(ytest)/len(ytest))
    )

Training score: 0.772821576763
Testing score: 0.666666666667
Proportion of inspections in test set that failed: 0.656817003629


### SVM using a selection of 1000 TF-IDF features

In [44]:
vectorizer = TfidfVectorizer(
    ngram_range = (1,2), 
    stop_words = 'english',
    max_features = 1000
)

X = violations.reviews
y = (violations['*'] + violations['**'] + violations['***'] > 0).values.astype(np.int)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=.8, stratify = y)
Xtrain = vectorizer.fit_transform(Xtrain)

gs = GridSearchCV(
        estimator = SVC(probability = True),
        param_grid = {
            'C' : np.logspace(-3, 2, 6),
            'gamma' : np.logspace(-2, 0, 5)
        },
        n_jobs = -1
)
gs.fit(Xtrain, ytrain)
clf = gs.best_estimator_

print(
    'Training score: ' + str(clf.score(Xtrain, ytrain)) + '\n' +
    'Testing score: ' + str(clf.score(vectorizer.transform(Xtest), ytest)) + '\n' +
    'Proportion of inspections in test set that failed: ' + str(sum(ytest)/len(ytest))
    )

Training score: 0.747406639004
Testing score: 0.662519440124
Proportion of inspections in test set that failed: 0.656817003629


### SVM using *all* TF-IDF features, but reduced to 100 dimensions

In [None]:
vectorizer = TfidfVectorizer(
    ngram_range = (1,2), 
    stop_words = 'english',
)
svd = TruncatedSVD(n_components = 100)

X = violations.reviews
y = (violations['*'] + violations['**'] + violations['***'] > 0).values.astype(np.int)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=.8, stratify = y)
Xtrain = vectorizer.fit_transform(Xtrain)
Xtrain = svd.fit_transform(Xtrain)

gs = GridSearchCV(
        estimator = SVC(probability = True),
        param_grid = {
            'C' : np.logspace(-3, 2, 6),
            'gamma' : np.logspace(-2, 0, 5)
        },
        n_jobs = -1
)
gs.fit(Xtrain, ytrain)
clf = gs.best_estimator_

print(
    'Training score: ' + str(clf.score(Xtrain, ytrain)) + '\n' +
    'Testing score: ' + str(clf.score(svd.transform(vectorizer.transform(Xtest)), ytest)) + '\n' +
    'Proportion of inspections in test set that failed: ' + str(sum(ytest)/len(ytest))
    )