In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.cross_validation import cross_val_score
import metrics
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from progressbar import ProgressBar
import data_grab
from time import time
from sklearn.externals import joblib
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_selection import VarianceThreshold



In [2]:
def contest_metric(numpy_array_predictions, numpy_array_actual_values):
    return metrics.weighted_rmsle(numpy_array_predictions, numpy_array_actual_values,
            weights=metrics.KEEPING_IT_CLEAN_WEIGHTS)

In [3]:
def contest_scoring(X, y, pipeline):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    s1 = pipeline.fit(X_train, y_train['score_lvl_1']).predict(X_test)
    s2 = pipeline.fit(X_train, y_train['score_lvl_2']).predict(X_test)
    s3 = pipeline.fit(X_train, y_train['score_lvl_3']).predict(X_test)
    results = np.dstack((s1, s2, s3))
    score = contest_metric(np.round(results[0]), np.array(y_test))
    print("Contest score of {}".format(score))
    return score

In [4]:
from sklearn.metrics import accuracy_score

def raw_scoring(p1, p2, p3, ytrue):
    '''since cross_val_score doesn't allow you to round the results beforehand. also for pymc3 and other non-sklearn models'''
    score1 = accuracy_score(ytrue['score_lvl_1'], np.clip(np.round(p1), 0, np.inf))
    print("Level 1 accuracy score of {}".format(score1))
    score2 = accuracy_score(ytrue['score_lvl_2'],np.clip(np.round(p2), 0, np.inf))
    print("Level 2 accuracy score of {}".format(score2))
    score3 = accuracy_score(ytrue['score_lvl_3'], np.clip(np.round(p3), 0, np.inf))
    print("Level 3 accuracy score of {}".format(score3))
    
    results = np.dstack((p1, p2, p3))[0]
    rounded = np.clip(np.round(results), 0, np.inf)
    score = contest_metric(rounded, np.array(ytrue))
    print("Contest score of {}".format(score))
    
    compare = pd.concat([pd.DataFrame(np.concatenate((results, rounded), axis=1)), ytrue.reset_index(drop=True)], axis=1)
    compare.columns = ['pred1','pred2','pred3','round1','round2','round3','true1','true2','true3']
    compare['offset1'] = compare.round1-compare.true1
    compare['offset2'] = compare.round2-compare.true2
    compare['offset3'] = compare.round3-compare.true3
        
    return score1, score2, score3, score, compare.head(10)

    
def raw_fit(X, y, pipeline):
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=42)
    
    p1 = pipeline.fit(xtrain, ytrain['score_lvl_1']).predict(xtest)
    p2 = pipeline.fit(xtrain, ytrain['score_lvl_2']).predict(xtest)
    p3 = pipeline.fit(xtrain, ytrain['score_lvl_3']).predict(xtest)
        
    return p1, p2, p3, ytest

In [5]:
def extract_features(df):
    features = df.drop(['score_lvl_1', 'score_lvl_2', 'score_lvl_3'], axis=1)
    response = df[['score_lvl_1', 'score_lvl_2', 'score_lvl_3']].astype(np.int8)
    
    return features, response

In [6]:
combo = pd.read_pickle('pickle_jar/pre-pivot_all_review_combo_365')
df = pd.read_pickle("pickle_jar/non_review_df_365")

In [160]:
df = data_grab.get_selects('train')

In [161]:
# not getting rid of everything. not getting rid of ancient reviews if they correspond with an ancient inspection.
# just getting rid of ancient reviews in relation to a specific inspection
df = df[df.review_delta <= 365]

In [162]:
# making a column that enumerates the sorted review delta.
# removes any nans (usually marked as such because the reviews happend after an inspection date) beforehand.
# this can completely remove a restaurant if it has no reviews left after nan drop. might add back in later
df = df.sort('review_delta')
df2 = df.dropna(subset=['review_delta']).groupby('inspection_id').cumcount()

In [163]:
# df3 = pd.concat([df.dropna(subset=['review_delta']), df2], axis=1)
df = pd.concat([df.dropna(subset=['review_delta']), df2], axis=1)

In [164]:
df = df.rename(columns={0:'enumerated_review_delta'})
del df2

In [165]:
# add back in the restaurants that were completely removed because none of their reviews were
# not a significant loss in the number of restaurants. going to just leave it out for now
# mask = list(set(df.restaurant_id) - set(df3.restaurant_id))
# pd.concat([df[df.restaurant_id.isin(mask)], df3])

In [166]:
df.shape

(500936, 137)

In [167]:
just_review_data = df[[ 
 'inspection_id',
 'enumerated_review_delta',
#  'review_id',
 'review_delta', 
#  'review_date', 
 'review_stars',
 'review_text',
#  'user_id',
 'review_votes_cool',
 'review_votes_funny',
 'review_votes_useful',
 'user_average_stars',
#  'user_fans',
 'user_review_count',
#  'user_votes_cool',
#  'user_votes_funny',
#  'user_votes_useful',
 'user_yelping_since_delta',
#  'review_year',
#  'review_month',
#  'review_day',
#  'review_dayofweek',
#  'review_quarter',
#  'review_dayofyear',
 'user_ever_elite',
 ]]

In [168]:
# this removes all the nans left. review stars as nans because reviews and tips combined and tips didnt give a star rating
# planning to treat this as a categorical variable later so making it zero shouldnt affect anything
just_review_data.review_stars = just_review_data.review_stars.fillna(0)

In [28]:
just_review_data.to_pickle('pickle_jar/pre-pivot_365')

In [127]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, LabelBinarizer
lbl_enc = LabelEncoder()
# df['restaurant_id_enc'] = lbl_enc.fit_transform(df.restaurant_id)

In [122]:
sentiment = pd.read_pickle('pickle_jar/review_text_sentiment_pivot')
tfidf = joblib.load('pickle_jar/tfidf_preprocessed_ngram3_sublinear_1mil_pivot')
similarity = pd.read_pickle('pickle_jar/similarity_vectors_pivot')

In [132]:
sentiment = sentiment[['inspection_id', 'enumerated_review_delta', 'polarity', 'subjectivity', 'neg', 'neu', 'pos', 'compound']]

In [137]:
similarity.drop(['review_delta', 'review_stars', 'review_text', 'review_votes_cool', 'review_votes_funny',
                 'review_votes_useful', 'user_average_stars', 'user_fans', 'user_review_count', 'user_votes_cool', 
                 'user_votes_funny', 'user_votes_useful', 'user_yelping_since_delta', 'user_ever_elite',
                 'preprocessed_review_text'], axis=1, inplace=True)

In [159]:
combo = pd.merge(sentiment, similarity, on=['inspection_id', 'enumerated_review_delta'])

In [170]:
just_review_data.drop('review_text', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [173]:
combo = pd.merge(combo, just_review_data, on=['inspection_id', 'enumerated_review_delta'])

In [174]:
combo.to_pickle('pickle_jar/pre-pivot_all_review_combo_365')

In [198]:
df.user_ever_elite.value_counts(dropna=False)

False    334845
True     166091
dtype: int64

In [199]:
df['review_stars'] = df.review_stars.fillna(0).astype('category')
df.restaurant_attributes_price_range = df.restaurant_attributes_price_range.fillna(df.restaurant_attributes_price_range.median())

In [206]:
df.drop(['review_text', 'review_date', 'user_id', 'restaurant_full_address', 'restaurant_name',
         'inspection_date', 'inspection_id', 'inspection_date',  
         'restaurant_attributes_by_appointment_only', 'restaurant_open',
        'review_year',
 'review_month',
 'review_day',
 'review_dayofweek',
 'review_quarter',
 'review_dayofyear',
 'inspection_dayofyear'], axis=1, inplace=True)

df.drop(['restaurant_neighborhoods', 'restaurant_categories'], axis=1, inplace=True)

In [207]:
df.to_pickle("pickle_jar/non_review_df_365")

#combine the user average stars and user review count etc into a trustworthiness variable

In [176]:
combo.sort(['inspection_id', 'enumerated_review_delta']).set_index(['inspection_id', 'enumerated_review_delta']).T

inspection_id,0,0,1,1,1,2,2,2,2,2,...,33858,33858,33858,33858,33858,33858,33858,33858,33858,33858
enumerated_review_delta,0,1,0,1,2,0,1,2,3,4,...,22,23,24,25,26,27,28,29,30,31
polarity,0.2229464,0.2376021,0.1899811,0.1151515,0.3242593,0.005024631,-0.2342262,0.7,0.1581019,0.01955128,...,0.3269231,0.3566667,0.6138889,0.2632275,0.2440449,0.58,0.58125,0.359987,0.23,0.2742187
subjectivity,0.7133929,0.4959382,0.5185783,0.5075758,0.5774074,0.6181281,0.6309524,0.6,0.462963,0.7005495,...,0.6929487,0.5005556,0.775,0.5545855,0.6175108,0.64,0.655,0.7594444,0.52,0.7005208
neg,0,0.008,0,0.03,0,0.054,0.152,0,0.134,0.044,...,0.041,0,0,0.047,0.02,0,0,0,0.086,0.011
neu,0.783,0.839,0.853,0.776,0.686,0.759,0.79,0.562,0.585,0.802,...,0.737,0.694,0.695,0.839,0.795,0.674,0.665,0.772,0.682,0.661
pos,0.217,0.153,0.147,0.194,0.314,0.187,0.058,0.438,0.28,0.154,...,0.222,0.306,0.305,0.114,0.185,0.326,0.335,0.228,0.232,0.327
compound,0.9636,0.9926,0.9537,0.9421,0.9716,0.9871,-0.8859,0.6662,0.8813,0.9217,...,0.9789,0.9769,0.9878,0.9037,0.9872,0.8918,0.9781,0.9188,0.4927,0.991
manager,"[0.116296313703, 0.116296313703, 0.09853885322...","[0.205153346062, 0.1396882236, 0.134564653039,...","[0.233340471983, 0.219487249851, 0.15279327333...","[0.170585080981, 0.149688065052, 0.14306488633...","[0.296981394291, 0.170585080981, 0.14968806505...","[0.481546670198, 0.205153346062, 0.16470108926...","[0.416887819767, 0.099450416863, 0.09853885322...","[0.143505915999, 0.0985388532281, 0.0437860004...","[0.10189691186, 0.0939451083541, 0.09234624356...","[0.481546670198, 0.354385614395, 0.12554837763...",...,"[0.481546670198, 0.144014567137, 0.12838408350...","[0.481546670198, 0.199039384723, 0.17022721469...","[0.170227214694, 0.145951986313, 0.14401456713...","[0.190560564399, 0.185790881515, 0.16470108926...","[0.48552531004, 0.198464393616, 0.164701089263...","[0.199039384723, 0.0923462435603, 0.0923462435...","[0.134564653039, 0.0985388532281, 0.0935578867...","[0.149701923132, 0.111775815487, 0.10959290713...","[0.0664640441537, 0.0462840273976, 0.033325906...","[0.170227214694, 0.170227214694, 0.14401456713..."
supervisor,"[0.180711179972, 0.128739342093, 0.12695443630...","[0.248682126403, 0.189695432782, 0.18734870851...","[0.329583823681, 0.222308069468, 0.19156195223...","[0.234472796321, 0.180711179972, 0.17012850940...","[0.319492906332, 0.168916925788, 0.13902001082...","[0.308876425028, 0.189695432782, 0.18965531885...","[0.296274900436, 0.153294757009, 0.13860934972...","[0.0852858126163, 0.0439815595746, 0.037485312...","[0.0956757366657, 0.0888340473175, 0.067390158...","[0.42653849721, 0.308876425028, 0.147516652942...",...,"[0.308876425028, 0.18734870851, 0.149399071932...","[0.308876425028, 0.189655318856, 0.18734870851...","[0.163845270872, 0.138609349728, 0.13439351320...","[0.232190147042, 0.135539248586, 0.11766532063...","[0.247589588165, 0.244587942958, 0.18360872566...","[0.172233775258, 0.088384449482, 0.08562444895...","[0.248682126403, 0.138609349728, 0.12167339772...","[0.138609349728, 0.0882289782166, 0.0856244489...","[0.0978016108274, 0.0670163482428, 0.053387433...","[0.138609349728, 0.114625647664, 0.11462564766..."
training,"[0.328027784824, 0.157236814499, 0.15316183865...","[0.332225173712, 0.218692332506, 0.21767824888...","[0.379619985819, 0.333332300186, 0.32802778482...","[0.328027784824, 0.173887431622, 0.15339475870...","[0.238453179598, 0.173887431622, 0.15316183865...","[0.328027784824, 0.266709834337, 0.21598142385...","[0.18020825088, 0.173887431622, 0.168361693621...","[0.163917139173, 0.153161838651, 0.05462649837...","[0.17098043859, 0.167760655284, 0.147666171193...","[0.215981423855, 0.20948125422, 0.179933622479...",...,"[0.332225173712, 0.179933622479, 0.17388743162...","[0.264780700207, 0.224284797907, 0.17702740430...","[0.217678248882, 0.18020825088, 0.177027404308...","[0.179933622479, 0.179933622479, 0.16083866357...","[0.18020825088, 0.173887431622, 0.166913762689...","[0.264780700207, 0.179933622479, 0.17993362247...","[0.217678248882, 0.18020825088, 0.158700019121...","[0.18020825088, 0.157236814499, 0.126504480839...","[0.107653222978, 0.0763488039374, 0.0655273273...","[0.18020825088, 0.179933622479, 0.177027404308..."
safety,"[0.358425438404, 0.313093900681, 0.22312483191...","[0.188115343451, 0.178278505802, 0.17643743753...","[0.223124831915, 0.188970014453, 0.18811534345...","[0.223124831915, 0.164930209517, 0.15204392373...","[0.223124831915, 0.176437437534, 0.16897088289...","[0.31819370389, 0.252442806959, 0.208838164806...","[0.229886591434, 0.223124831915, 0.20186658203...","[0.123344272375, 0.0986144989729, 0.0343389622...","[0.174457937479, 0.13840135932, 0.129685774446...","[0.223124831915, 0.188115343451, 0.17445793747...",...,"[0.31819370389, 0.164305880666, 0.150094628334...","[0.223124831915, 0.223124831915, 0.17445793747...","[0.223124831915, 0.201866582036, 0.17445793747...","[0.223124831915, 0.223124831915, 0.22090412676...","[0.201866582036, 0.188970014453, 0.17445793747...","[0.223124831915, 0.120276413858, 0.08856157958...","[0.223124831915, 0.201866582036, 0.17445793747...","[0.31819370389, 0.223124831915, 0.201866582036...","[0.136475786567, 0.101886935532, 0.08847917616...","[0.223124831915, 0.223124831915, 0.20186658203..."


In [9]:
# combo.pivot(index='inspection_id', columns='enumerated_review_delta')
combo.set_index(['inspection_id', 'enumerated_review_delta']).unstack(1)
# combo.pivot_table(index='inspection_id', columns='enumerated_review_delta')

Unnamed: 0_level_0,polarity,polarity,polarity,polarity,polarity,polarity,polarity,polarity,polarity,polarity,...,user_ever_elite,user_ever_elite,user_ever_elite,user_ever_elite,user_ever_elite,user_ever_elite,user_ever_elite,user_ever_elite,user_ever_elite,user_ever_elite
enumerated_review_delta,0,1,2,3,4,5,6,7,8,9,...,435,436,437,438,439,440,441,442,443,444
inspection_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,0.222946,0.237602,,,,,,,,,...,,,,,,,,,,
1,0.189981,0.115152,0.324259,,,,,,,,...,,,,,,,,,,
2,0.005025,-0.234226,0.700000,0.158102,0.019551,0.255694,0.250000,0.343308,0.173016,,...,,,,,,,,,,
7,0.193750,,,,,,,,,,...,,,,,,,,,,
8,0.214286,-0.059524,0.166833,0.030000,0.259574,0.500000,,,,,...,,,,,,,,,,
9,-0.225000,-0.156003,0.171605,,,,,,,,...,,,,,,,,,,
10,-0.300000,0.021298,0.164500,0.069048,0.247436,,,,,,...,,,,,,,,,,
12,0.070000,0.016346,-0.222222,0.600000,,,,,,,...,,,,,,,,,,
13,0.196212,0.156618,0.187500,0.422240,0.268182,0.050000,0.000000,0.047024,0.534500,0.259127,...,,,,,,,,,,
14,-0.059730,-0.283333,-0.208333,0.125757,-0.109040,0.266333,0.045290,0.085227,-0.433333,0.315758,...,,,,,,,,,,


In [186]:
X, y = extract_features(pd.merge(combo, df[['inspection_id', 'enumerated_review_delta', 'score_lvl_1', 'score_lvl_2', 'score_lvl_3']], on=['inspection_id', 'enumerated_review_delta']))

In [10]:
df

Unnamed: 0,restaurant_id,review_id,review_stars,review_votes_cool,review_votes_funny,review_votes_useful,user_average_stars,user_compliments_cool,user_compliments_cute,user_compliments_funny,...,restaurant_category_6,restaurant_category_7,review_delta,previous_inspection_delta,inspection_year,inspection_month,inspection_day,inspection_dayofweek,inspection_quarter,enumerated_review_delta
3673133,njoZyY3r,jlKVXCHaSrj-cgYBa-bshQ,3,0,0,0,3.27,,,,...,,,0,168,2014,6,23,0,2,0
3416909,eVOBNMOj,FFJmcIuQyG5EW9f4MvLC6w,4,0,0,1,3.68,8,,6,...,Restaurants,,0,115,2010,4,15,3,2,0
2640320,WwOaXpoB,miL2NBTI2P5dRQoTci0xgg,5,0,0,1,3.76,2,1,,...,,,0,14,2012,4,10,1,2,0
3415000,eVOBNMOj,8ehwidyYdEwLjh0Q9iJnvg,3,0,0,0,2.86,,,,...,Restaurants,,0,35,2011,9,30,4,3,0
2641188,0ZEDQG3D,__nql_JwgOdCOoVCvGHOJg,3,0,0,0,3.86,17,1,1,...,,,0,8,2014,1,7,1,1,0
2641304,0ZEDQG3D,NHCQTcV0kOCHqviPkjzUhQ,4,2,0,0,3.66,4,,1,...,,,0,169,2012,7,6,4,3,0
1720355,lnORKGEN,HupcUbv2BP7f_SSQy1gaAw,4,0,0,0,3.68,2,,6,...,,,0,176,2015,1,30,4,1,0
3769370,V430Lq3B,yy6bepLQ-nCR-Tdh1ucodw,5,0,0,0,4.50,,,,...,,,0,227,2013,8,14,2,3,0
3769371,V430Lq3B,rpi00iBTaLxMj7TCG9q2qw,3,0,0,0,3.79,,,,...,,,0,227,2013,8,14,2,3,1
2219340,dj3dqNo9,1rqAvSTcpNV5uf-MTBDczg,4,0,0,0,3.76,20,,20,...,,,0,83,2012,3,22,3,1,0


In [189]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
import sendMessage


from sklearn.preprocessing import MinMaxScaler

t0 = time()


# set classifiers to test
estimator = LinearRegression()
# estimator = RandomForestClassifier(n_jobs=-1, random_state=42)
# estimator = SGDClassifier(n_jobs=-1, random_state=42)
# estimator = Perceptron(n_jobs=-1, random_state=42)  # gets some nuances
# estimator = SGDRegressor() # gets some nuances
# estimator = KNeighborsClassifier()
# estimator = KNeighborsRegressor()  # gets some nuances
# estimator = DecisionTreeClassifier()
# estimator = DecisionTreeRegressor()
# estimator = GaussianNB()
# estimator = MultinomialNB()
# estimator = LinearSVC(random_state=42)

pipeline = Pipeline([
#         ('zero_variance_removal', VarianceThreshold()),
#         ('k_best', SelectKBest(score_func=f_classif, k=20)),
#         ('no_negative', MinMaxScaler()),
#         ('normalizer', Normalizer()),
#         ('normalizer', Normalizer(norm='l2')), #  for text classification and clustering
#         ('normalizer', Normalizer(copy=False)),
#         ('scaler', StandardScaler()),
#         ('scaler', StandardScaler(with_mean=False)), #  for sparse matrix
        ('clf', estimator),
])

p1,p2,p3,ytest = raw_fit(X, y, pipeline)
raw_scoring(p1,p2,p3,ytest)


print("{} seconds elapsed".format(time()-t0))

sendMessage.doneTextSend(t0, time(), 'model test')




ValueError: setting an array element with a sequence.