In [2]:

import numpy as np
import pandas as pd
from sklearn.cross_validation import cross_val_score
import metrics
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
import text_processors
from progressbar import ProgressBar
import data_grab
from time import time
from sklearn.feature_selection import VarianceThreshold
from sklearn.externals import joblib
from scipy.sparse import csr_matrix, hstack


In [3]:
def contest_metric(numpy_array_predictions, numpy_array_actual_values):
    """Score predictions against actual values with the contest weighting.

    Thin wrapper that forwards both arrays to ``metrics.weighted_rmsle`` and
    supplies ``metrics.KEEPING_IT_CLEAN_WEIGHTS`` as the ``weights`` argument.
    Returns whatever ``metrics.weighted_rmsle`` returns.
    """
    contest_weights = metrics.KEEPING_IT_CLEAN_WEIGHTS
    return metrics.weighted_rmsle(
        numpy_array_predictions,
        numpy_array_actual_values,
        weights=contest_weights,
    )

In [4]:
def contest_scoring(X, y, pipeline):
    """Fit ``pipeline`` per score level on a hold-out split and report the contest metric.

    X and y are split with ``train_test_split(random_state=42)``. The same
    pipeline object is refit for each of the three score columns in turn and
    used to predict the test split immediately after each fit. The stacked
    predictions are rounded before being scored with ``contest_metric``.

    The score is printed and returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    level_predictions = []
    for level_column in ('score_lvl_1', 'score_lvl_2', 'score_lvl_3'):
        fitted = pipeline.fit(X_train, y_train[level_column])
        level_predictions.append(fitted.predict(X_test))

    # dstack adds a leading axis; [0] selects the (rows, 3) prediction matrix.
    stacked = np.dstack(tuple(level_predictions))
    score = contest_metric(np.round(stacked[0]), np.array(y_test))
    print("Contest score of {}".format(score))
    return score

In [5]:
from sklearn.metrics import accuracy_score

def raw_scoring(p1, p2, p3, ytrue):
    '''Score raw per-level predictions: accuracy per level plus the contest metric.

    Needed since cross_val_score doesn't allow you to round the results
    beforehand; also usable for pymc3 and other non-sklearn models.

    The predictions are rounded and floored at zero once, and that single set
    of final predictions feeds the three accuracies, the contest metric and the
    comparison table, so every reported number describes the same values.
    (Previously only the table was floored at zero; the scores were computed
    on rounded values that could still be negative.)

    Parameters
    ----------
    p1, p2, p3 : array-like
        Raw predictions for score_lvl_1, score_lvl_2 and score_lvl_3.
        Assumed to be 1-D and of equal length -- TODO confirm against callers.
    ytrue : pandas.DataFrame
        True values; must hold exactly the three columns score_lvl_1,
        score_lvl_2 and score_lvl_3 (the comparison table names nine columns).

    Returns
    -------
    tuple
        (score1, score2, score3, score, compare.head(10)) where score1..3 are
        the per-level accuracies, score is the contest metric and the last
        item is the first ten rows of the prediction/rounded/true table.
    '''
    # dstack adds a leading axis; [0] selects the (rows, 3) prediction matrix.
    results = np.dstack((p1, p2, p3))[0]
    # Final predictions: nearest integer, never below zero.
    rounded = np.clip(np.round(results), 0, np.inf)

    score1 = accuracy_score(ytrue['score_lvl_1'], rounded[:, 0])
    print("Level 1 accuracy score of {}".format(score1))
    score2 = accuracy_score(ytrue['score_lvl_2'], rounded[:, 1])
    print("Level 2 accuracy score of {}".format(score2))
    score3 = accuracy_score(ytrue['score_lvl_3'], rounded[:, 2])
    print("Level 3 accuracy score of {}".format(score3))

    score = contest_metric(rounded, np.array(ytrue))
    print("Contest score of {}".format(score))

    # Side-by-side table: raw predictions, final predictions, true values.
    compare = pd.concat([pd.DataFrame(np.concatenate((results, rounded), axis=1)), ytrue.reset_index(drop=True)], axis=1)
    compare.columns = ['pred1','pred2','pred3','round1','round2','round3','true1','true2','true3']
    # Signed error of the final prediction per level.
    compare['offset1'] = compare.round1-compare.true1
    compare['offset2'] = compare.round2-compare.true2
    compare['offset3'] = compare.round3-compare.true3

    return score1, score2, score3, score, compare.head(10)

    
def raw_fit(X, y, pipeline):
    """Fit ``pipeline`` per score level and return raw hold-out predictions.

    X and y are split with ``train_test_split(random_state=42)``. For each of
    the three score columns, in order, the pipeline is refit on the training
    split and immediately used to predict the test split.

    Returns (p1, p2, p3, ytest): the three prediction arrays and the held-out
    true values.
    """
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=42)

    level_columns = ('score_lvl_1', 'score_lvl_2', 'score_lvl_3')
    p1, p2, p3 = [
        pipeline.fit(xtrain, ytrain[level_column]).predict(xtest)
        for level_column in level_columns
    ]

    return p1, p2, p3, ytest

In [6]:
def extract_features(df):
    """Split a frame into model features and the three score responses.

    Returns (features, response): ``features`` is ``df`` without the
    score_lvl_1/2/3 columns, ``response`` is those three columns cast to
    ``np.int8``. ``df`` itself is not modified.
    """
    score_columns = ['score_lvl_1', 'score_lvl_2', 'score_lvl_3']

    features = df.drop(score_columns, axis=1)
    response = df[score_columns].astype(np.int8)

    return features, response

In [7]:
# Load the main review frame from the local pickle cache.
# NOTE(review): unpickling can execute arbitrary code -- only load files produced by this project.
df = pd.read_pickle('pickle_jar/review_text_sentiment_hierarchical_df')
# prep = pd.read_pickle('pickle_jar/preprocessed_review_text_hierarchical_df')
# df = pd.concat([df, prep.preprocessed_review_text], axis=1)
# Frame whose per-topic columns are reduced and joined onto df in later cells.
sim = pd.read_pickle('pickle_jar/similarity_vectors_df')
# tfidf = joblib.load('pickle_jar/tfidf_preprocessed_ngram3_sublinear_1mil_hierarchical_dropna')
# matrix = joblib.load('pickle_jar/similarity_matrix')

In [8]:
# Convert previous_inspection_delta from a timedelta to a float number of days,
# with missing values becoming 0.
# The days are extracted *before* filling: filling a timedelta column with the
# integer 0 is pandas-version dependent, whereas filling the numeric day counts
# is not. The dtype guard ('m' is NumPy's kind code for timedelta64) also lets
# this cell be re-run after the column has already been converted.
if df['previous_inspection_delta'].dtype.kind == 'm':
    df['previous_inspection_delta'] = df['previous_inspection_delta'].dt.days
df['previous_inspection_delta'] = df['previous_inspection_delta'].fillna(0).astype(float)

In [9]:
def get_out(x):
    """Return ``x[0]`` when ``x`` can be indexed that way, otherwise ``x`` itself.

    Best-effort unwrapping: any ordinary error raised by ``x[0]`` (not
    indexable, empty, missing key, ...) means the value is handed back
    unchanged. Only ``Exception`` subclasses are caught, so KeyboardInterrupt
    and SystemExit still propagate and a long ``apply`` loop can be cancelled.
    """
    try:
        return x[0]
    except Exception:
        return x


# Topic keywords; each one names a column of `sim` that is processed below.
topics = ['manager', 'supervisor', 'training', 'safety', 'disease', 'ill', 'sick', 'poisoning', 'hygiene', 'raw', 'undercooked', 'cold', 'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 'inspection', 'violation', 'gloves', 'hairnet', 'nails', 'jewelry', 'sneeze', 'cough', 'runny', 'illegal', 'rotten', 'dirty', 'mouse', 'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old', 'parasite', 'reheat', 'frozen', 'broken', 'drip', 'bathroom', 'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 'pesticide', 'bugs', 'mold', ]
# Replace every cell of each topic column with get_out(cell): its first
# element when it can be indexed, otherwise the cell unchanged.
pbar = ProgressBar(maxval=len(topics)).start()
for index, i in enumerate(topics):
    sim[i] = sim[i].apply(get_out)
    # NOTE(review): reports the zero-based index, so the bar trails the number
    # of finished topics by one until finish() is called.
    pbar.update(index)
pbar.finish()

100% (53 of 53) |#########################| Elapsed Time: 0:03:42 Time: 0:03:42


In [10]:
# Append the reduced topic columns of `sim` to `df` side by side (axis=1).
# NOTE(review): not idempotent -- re-running this cell appends the topic columns again.
df = pd.concat([df,sim[topics]],axis=1)

In [39]:
# Fetch the 'train' selection via the project-local data_grab helper; later
# cells read its restaurant category / neighborhood columns.
train = data_grab.get_selects('train')

In [21]:
# Working frame: only the rows of df that have review text.
# NOTE(review): later cells assign columns onto this dropna() result; pandas may
# emit SettingWithCopyWarning for that -- confirm whether an explicit .copy() is wanted.
dropped = df.dropna(subset=['review_text'])

In [124]:
# Gap between the review date and the user's yelping-since date, cast to day resolution.
dropped['user_yelping_since_delta'] = (dropped.review_date - dropped.user_yelping_since).astype('timedelta64[D]')

In [136]:
# Review year minus the user's most recent elite year.
dropped['user_most_recent_elite_year_delta'] = (dropped.review_date.dt.year - dropped.user_most_recent_elite_year)

In [137]:
# Copy the list-valued category / neighborhood columns over from `train`, with
# each row's list sorted. Assignment aligns on the index shared by the frames.
for list_column in ('restaurant_categories', 'restaurant_neighborhoods'):
    dropped[list_column] = train[list_column].apply(sorted)

In [212]:
# Remove every column that is not wanted as a model input, in a single drop
# (previously four separate in-place drops, with 'inspection_date' listed twice).
columns_to_drop = [
    # raw text, identifiers, dates and sentiment columns
    'review_text', 'review_date', 'user_id', 'restaurant_full_address', 'restaurant_name',
    'inspection_date', 'inspection_id', 'sentiment', 'vader',
    # numbered neighborhood / category columns
    'restaurant_neighborhood_1',
    'restaurant_neighborhood_2',
    'restaurant_neighborhood_3',
    'restaurant_category_1',
    'restaurant_category_2',
    'restaurant_category_3',
    'restaurant_category_4',
    'restaurant_category_5',
    'restaurant_category_6',
    'restaurant_category_7',
    # inputs of the *_delta columns computed in earlier cells, plus two restaurant flags
    'user_yelping_since', 'restaurant_attributes_by_appointment_only', 'restaurant_open',
    'user_most_recent_elite_year',
    # calendar breakdowns of the review / inspection dates
    'review_year',
    'review_month',
    'review_day',
    'review_dayofweek',
    'review_quarter',
    'review_dayofyear',
    'inspection_dayofyear',
]
dropped = dropped.drop(columns_to_drop, axis=1)

In [213]:
# Fill missing values group by group. Each group's column list is written once
# and used on both sides of its assignment, so the two sides cannot drift apart.

# Missing star ratings become 0 before the column is made categorical.
dropped['review_stars'] = dropped.review_stars.fillna(0).astype('category')

# User compliment counts and check-in counts: missing -> 0.
count_columns = [
    'user_compliments_cool', 'user_compliments_cute', 'user_compliments_funny', 'user_compliments_hot',
    'user_compliments_list', 'user_compliments_more', 'user_compliments_note', 'user_compliments_photos',
    'user_compliments_plain', 'user_compliments_profile', 'user_compliments_writer', 'checkin_counts',
]
dropped[count_columns] = dropped[count_columns].fillna(0)

# Restaurant attribute, opening-hours and miscellaneous columns: soft-convert
# object columns, then fill the gaps with the literal string 'nan'.
# NOTE(review): convert_objects() only exists in old pandas releases; it is kept
# because the rest of the notebook targets the same old stack.
attribute_columns = [
    'restaurant_attributes_ages_allowed',
    'restaurant_attributes_alcohol',
    'restaurant_attributes_attire',
    'restaurant_attributes_byob_corkage',
    'restaurant_attributes_noise_level',
    'restaurant_attributes_smoking',
    'restaurant_attributes_wifi',
]
# restaurant_hours_<day>_close / restaurant_hours_<day>_open for every weekday.
hours_columns = [
    'restaurant_hours_{}_{}'.format(day, edge)
    for day in ('friday', 'monday', 'saturday', 'sunday', 'thursday', 'tuesday', 'wednesday')
    for edge in ('close', 'open')
]
misc_columns = [
    'restaurant_ambience',
    'restaurant_music',
    'restaurant_parking',
    'restaurant_zipcode',
]
for column_group in (attribute_columns, hours_columns, misc_columns):
    dropped[column_group] = dropped[column_group].convert_objects().fillna('nan')

# Topic columns: missing -> 0. `topics` (defined with the similarity columns)
# holds exactly the 53 names that used to be spelled out here twice.
dropped[topics] = dropped[topics].fillna(0)

# Missing elite-year deltas take the column median.
dropped['user_most_recent_elite_year_delta'] = dropped.user_most_recent_elite_year_delta.fillna(
    dropped.user_most_recent_elite_year_delta.median())

In [214]:
# Keep only rows that have a restaurant price range.
dropped = dropped.dropna(subset=['restaurant_attributes_price_range'])

In [216]:
# dropped = pd.read_pickle('pickle_jar/final_dropped')
# dropped.to_pickle('pickle_jar/final_dropped')

In [215]:
# Sanity check: (rows, columns) after cleaning.
dropped.shape

(1924336, 167)

In [143]:
# List the remaining column names.
dropped.columns.tolist()

['restaurant_id',
 'review_stars',
 'review_votes_cool',
 'review_votes_funny',
 'review_votes_useful',
 'user_average_stars',
 'user_compliments_cool',
 'user_compliments_cute',
 'user_compliments_funny',
 'user_compliments_hot',
 'user_compliments_list',
 'user_compliments_more',
 'user_compliments_note',
 'user_compliments_photos',
 'user_compliments_plain',
 'user_compliments_profile',
 'user_compliments_writer',
 'user_fans',
 'user_name',
 'user_review_count',
 'user_votes_cool',
 'user_votes_funny',
 'user_votes_useful',
 'restaurant_stars',
 'restaurant_attributes_accepts_credit_cards',
 'restaurant_attributes_ages_allowed',
 'restaurant_attributes_alcohol',
 'restaurant_attributes_attire',
 'restaurant_attributes_byob',
 'restaurant_attributes_byob_corkage',
 'restaurant_attributes_caters',
 'restaurant_attributes_coat_check',
 'restaurant_attributes_corkage',
 'restaurant_attributes_delivery',
 'restaurant_attributes_dietary_restrictions_dairy_free',
 'restaurant_attributes_d

In [145]:
# (rows, columns) again; the saved output shows one more column than the
# other shape cell, i.e. the two were run at different points of the session.
dropped.shape

(1924336, 168)

In [169]:
# Count of non-missing elite-year deltas.
dropped.user_most_recent_elite_year_delta.dropna().shape

(759039,)

In [178]:
# Distribution of elite-year deltas, missing values included.
# NOTE(review): the saved output is "TypeError: unhashable type: 'list'", which
# suggests the column held list values when this was run -- verify before relying on it.
dropped.user_most_recent_elite_year_delta.value_counts(dropna=False, sort=True)

TypeError: unhashable type: 'list'

In [63]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, LabelBinarizer

In [176]:
# Build `cats`: the sorted list of distinct restaurant categories found in the
# seven numbered category columns of `train`. proper_array() looks labels up in it.
cats = set()
for i in ['restaurant_category_1',
 'restaurant_category_2',
 'restaurant_category_3',
 'restaurant_category_4',
 'restaurant_category_5',
 'restaurant_category_6',
 'restaurant_category_7']:
    # Missing values are dropped per column instead of calling
    # set.remove(np.nan) afterwards: that raised KeyError when no NaN was
    # present and could miss NaN objects that are not the np.nan singleton.
    cats.update(train[i].dropna().unique().tolist())
cats = sorted(cats)

def proper_array(x, backfill_size=7):
    """Encode labels as their positions in the global ``cats`` list, zero-padded.

    Parameters
    ----------
    x : iterable
        Labels; each one must be present in ``cats``.
    backfill_size : int
        Length of the returned array.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``backfill_size`` holding the label positions,
        followed by zeros.

    Raises
    ------
    ValueError
        If a label is not in ``cats``, or if ``x`` has more than
        ``backfill_size`` labels.

    NOTE(review): the padding value 0 is also the position of ``cats[0]``, so
    "no label" and the first label are indistinguishable in the result.
    """
    # A list (not map()) so that len() works on every Python version.
    codes = [cats.index(label) for label in x]
    padded = np.zeros(backfill_size, dtype='int')
    padded[:len(codes)] = codes
    return padded

# Encode each row's category list as a fixed-length array of positions in `cats`.
t = dropped.restaurant_categories.apply(proper_array)

# One-hot encode the stacked (rows x backfill_size) position matrix.
# NOTE(review): padding zeros are encoded like the category at position 0 -- confirm that is intended.
enc = OneHotEncoder(sparse=True)
e = enc.fit_transform(np.vstack(t))

In [208]:
# Rebuild `cats`, this time as the sorted list of distinct neighborhoods found
# in the three numbered neighborhood columns of `train`.
cats = set()
for i in ['restaurant_neighborhood_1', 'restaurant_neighborhood_2', 'restaurant_neighborhood_3']:
    # Missing values are dropped per column instead of calling
    # set.remove(np.nan) afterwards: that raised KeyError when no NaN was
    # present and could miss NaN objects that are not the np.nan singleton.
    cats.update(train[i].dropna().unique().tolist())
cats = sorted(cats)

# Encode each row's neighborhood list as three positions in `cats`, zero-padded.
t = dropped.restaurant_neighborhoods.apply(proper_array, args=(3,))

# One-hot encode the stacked position matrix (overwrites the category `enc` / `e`).
enc = OneHotEncoder(sparse=True)
e = enc.fit_transform(np.vstack(t))

def proper_array(x, backfill_size=3):
    """Encode labels as their positions in the global ``cats`` list, zero-padded.

    Parameters
    ----------
    x : iterable
        Labels; each one must be present in ``cats``.
    backfill_size : int
        Length of the returned array.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``backfill_size`` holding the label positions,
        followed by zeros. (This used to return the unpadded position list, so
        rows had different lengths and could not be stacked.)

    Raises
    ------
    ValueError
        If a label is not in ``cats``, or if ``x`` has more than
        ``backfill_size`` labels.

    NOTE(review): this redefines a function of the same name from an earlier
    cell; only the default ``backfill_size`` differs. The padding value 0 is
    also the position of ``cats[0]``.
    """
    # A list (not map()) so that len() works on every Python version.
    codes = [cats.index(label) for label in x]
    padded = np.zeros(backfill_size, dtype='int')
    padded[:len(codes)] = codes
    return padded


In [64]:
# Label encoder reused by the next cell; the commented lines are earlier experiments.
enc_label = LabelEncoder()
# el = enc_label.fit_transform(np.vstack(t)[:,0])
# el = enc_label.fit_transform(dropped.restaurant_id)

In [67]:
# Fit the encoder on the accepts-credit-cards column and display the integer codes.
enc_label.fit_transform(dropped.restaurant_attributes_accepts_credit_cards)

array([1, 1, 1, ..., 1, 1, 1])

In [11]:
# Response frame: the three score columns.
# NOTE(review): the saved output under this cell is a sparse-matrix repr, which an
# assignment cannot produce -- it appears to be left over from a different version of the cell.
y = dropped[['score_lvl_1', 'score_lvl_2', 'score_lvl_3']]

<1923536x10881 sparse matrix of type '<type 'numpy.int64'>'
	with 1923536 stored elements in Compressed Sparse Row format>

In [16]:

# def get_out(x):
#     try:
#         return x[0]
#     except:
#         return x

# topics = ['supervisor', 'training', 'safety', 'disease', 'ill', 'sick', 'poisoning', 'hygiene', 'raw', 'undercooked', 'cold', 'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 'inspection', 'violation', 'gloves', 'hairnet', 'nails', 'jewelry', 'sneeze', 'cough', 'runny', 'illegal', 'rotten', 'dirty', 'mouse', 'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old', 'parasite', 'reheat', 'frozen', 'broken', 'drip', 'bathroom', 'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 'pesticide', 'bugs', 'mold', ]
# matrix = np.vstack(test.manager.apply(lambda x: x[0:2]))
# pbar = ProgressBar(maxval=len(topics)).start()
# for index, i in enumerate(topics):
#     t = np.vstack(test[i].apply(lambda x: x[0:2]))
#     matrix = np.concatenate((matrix, t), axis=1)
#     pbar.update(index)
# pbar.finish()
# matrix = csr_matrix(matrix)
# X = hstack([matrix, test[['review_delta', 'previous_inspection_delta']]])
# y = test[['score_lvl_1', 'score_lvl_2', 'score_lvl_3']]

In [24]:
# Load a cached matrix from the local pickle jar.
# NOTE(review): joblib.load unpickles, so only load files produced by this project.
matrix = joblib.load('pickle_jar/similarity_matrix5')

In [111]:
def make_bins(df, bin_size=10):
    """Add fixed-width bins (and their integer codes) for the two delta columns.

    For each of ``review_delta`` and ``previous_inspection_delta`` two columns
    are added to ``df`` in place: ``<column>_bin`` (the ``pd.cut`` interval)
    and ``<column>_bin_codes`` (the interval's integer code).

    The bin edges run from one below the column minimum to at least the column
    maximum. ``pd.cut`` intervals are open on the left, so starting exactly at
    the minimum leaves the smallest value unbinned; ``np.arange`` excludes its
    stop value, so stopping at the maximum leaves the largest values unbinned.
    Both used to end up as missing bins with code -1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric ``review_delta`` and ``previous_inspection_delta``
        columns. Modified in place.
    bin_size : number
        Width of each bin.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the four new columns.
    """
    def _bin_edges(values):
        # First edge below the minimum, last edge at or above the maximum.
        return np.arange(values.min() - 1, values.max() + bin_size, bin_size)

    for column in ('review_delta', 'previous_inspection_delta'):
        binned = pd.cut(df[column], _bin_edges(df[column]))
        df[column + '_bin'] = binned
        df[column + '_bin_codes'] = binned.astype('category').cat.codes
    return df
# Add the bin columns to df (make_bins also modifies df in place).
df = make_bins(df)

In [34]:
scores = ['score_lvl_1', 'score_lvl_2', 'score_lvl_3']
# model_features = ['review_stars', 'review_delta', 'previous_inspection_delta','review_delta_bin', 'previous_inspection_delta_bin', 'polarity', 'subjectivity', 'neg', 'pos', 'neu', 'compound']
# model_features = ['review_stars', 'review_delta', 'previous_inspection_delta', 'polarity', 'subjectivity', 'neg', 'pos', 'neu', 'compound']
model_features = ['review_delta', 'previous_inspection_delta']

# Keep only complete rows of the selected features, topic columns and scores,
# then split them into X (features) and y (scores).
# NOTE(review): `test` is not assigned by any live code above this cell (only in
# commented-out code, and later rebound to a feature-selection result), so this
# relies on hidden kernel state -- confirm which frame it is meant to be.
X, y = extract_features(test[model_features + topics + scores].dropna())

# print() calls with a single argument behave the same on Python 2 and 3.
print(df.shape)
print(X.shape)
print(y.shape)
# review_stars doesnt exist for every observation so reducing size even further
# review_stars doesnt exist for every observation so reducing size even further

(4071065, 185)
(1923536, 55)
(1923536, 3)


In [None]:
# from sklearn.feature_selection import RFECV
# rfecv = RFECV(estimator=SGDClassifier(n_jobs=-1), scoring=mean_squared_error)
# rfecv.fit(X, y.score_lvl_1)

from sklearn.metrics import mean_squared_error

# from sklearn.svm import LinearSVC
# X_new = LinearSVC(C=0.01, penalty="l1", dual=False).fit_transform(X, y.score_lvl_1)

# from sklearn.feature_selection import RFE
# rfe = RFE(estimator=LinearRegression(), n_features_to_select=3, step=1)
# rfe.fit(X, y.score_lvl_1)

from sklearn.feature_selection import SelectKBest
# SelectFwe is used below but was never imported, which raised NameError.
from sklearn.feature_selection import SelectFwe
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import f_classif
# Keep the features selected by SelectFwe with the f_classif score, against score_lvl_1.
# NOTE(review): this rebinds `test`, which an earlier cell indexes like a DataFrame.
test = SelectFwe(f_classif).fit_transform(X, y.score_lvl_1)

In [86]:
# Keep the 5 features with the best f_classif score against score_lvl_1.
X_new = SelectKBest(f_classif, k=5).fit_transform(X, y.score_lvl_1)

In [20]:
# Display the selected features.
# NOTE(review): the saved output is a NameError, i.e. this ran in a kernel where
# the cell defining X_new had not been executed.
X_new

NameError: name 'X_new' is not defined

In [None]:
# Start the wall-clock timer for the fit-and-score run below.
t0 = time()

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB


# set classifiers to test
# Exactly one estimator line is active; the commented lines are the alternatives tried.
estimator = LinearRegression()
# estimator = RandomForestClassifier(n_jobs=-1, random_state=42)
# estimator = SGDClassifier(n_jobs=-1, random_state=42)
# estimator = Perceptron(n_jobs=-1, random_state=42)  # gets some nuances
# estimator = SGDRegressor() # gets some nuances
# estimator = KNeighborsClassifier()
# estimator = KNeighborsRegressor()  # gets some nuances
# estimator = DecisionTreeClassifier()
# estimator = DecisionTreeRegressor()
# estimator = GaussianNB()

# Three active steps: variance-threshold feature removal, normalization, then the estimator.
pipeline = Pipeline([
        ('low_var_removal', VarianceThreshold()),
        ('normalizer', Normalizer()),
#         ('normalizer', Normalizer(norm='l2')), #  for text classification and clustering
#         ('scaler', StandardScaler()),
#         ('scaler', StandardScaler(with_mean=False)), #  for sparse matrix
        ('clf', estimator),
])

# Fit per score level on a hold-out split, then print accuracies and the contest metric.
# NOTE(review): `t1` is not assigned anywhere in the visible notebook, so this
# call relies on hidden kernel state -- confirm which feature matrix it should be.
p1,p2,p3,ytest = raw_fit(t1, y, pipeline)
raw_scoring(p1,p2,p3,ytest)


print("{} seconds elapsed".format(time()-t0))

# first representation of manager and mold plus deltas for SGDClassifier
# Level 1 accuracy score of 0.160254863959
# Level 2 accuracy score of 0.693050714933
# Level 3 accuracy score of 0.574146779681
# Contest score of 1.52839121903

# sparse matrix of just manager no-mean-false and the delta for SGD
# Level 1 accuracy score of 0.136340572778
# Level 2 accuracy score of 0.693059032948
# Level 3 accuracy score of 0.574184210745
# Contest score of 1.41908388602

# first representation of all similarity vecs plus deltas for SGD
# Level 1 accuracy score of 0.0958193660009
# Level 2 accuracy score of 0.693059032948
# Level 3 accuracy score of 0.574184210745
# Contest score of 1.2018434011

# first two representations of all similarity vecs plus deltas for SGD
# Level 1 accuracy score of 0.134818376157
# Level 2 accuracy score of 0.693059032948
# Level 3 accuracy score of 0.574184210745
# Contest score of 1.73133292154

# first five representations of all similarity vecs plus deltas for SGD
# Level 1 accuracy score of 0.140464228379
# Level 2 accuracy score of 0.693059032948
# Level 3 accuracy score of 0.574184210745
# Contest score of 1.43479163855

In [74]:
# baseline scores if guessing zero
# Share of rows whose true score equals `guess`, per level: the accuracy
# obtained by always predicting that value.
guess = 0
for index, score in enumerate(scores):
    level_shares = y[score].value_counts(normalize=True)
    print("level {}: {}".format(index+1, level_shares[guess]))

level 1: 0.22128560699
level 2: 0.692799495547
level 3: 0.573199692093
