In [1]:

import numpy as np
import pandas as pd
from sklearn.cross_validation import cross_val_score
import metrics
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
import text_processors
from progressbar import ProgressBar
import data_grab
from time import time
from sklearn.feature_selection import VarianceThreshold
from sklearn.externals import joblib
from scipy.sparse import csr_matrix, hstack


In [2]:
def contest_metric(numpy_array_predictions, numpy_array_actual_values):
    return metrics.weighted_rmsle(numpy_array_predictions, numpy_array_actual_values,
            weights=metrics.KEEPING_IT_CLEAN_WEIGHTS)

In [3]:
def contest_scoring(X, y, pipeline):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    s1 = pipeline.fit(X_train, y_train['score_lvl_1']).predict(X_test)
    s2 = pipeline.fit(X_train, y_train['score_lvl_2']).predict(X_test)
    s3 = pipeline.fit(X_train, y_train['score_lvl_3']).predict(X_test)
    results = np.dstack((s1, s2, s3))
    score = contest_metric(np.round(results[0]), np.array(y_test))
    print("Contest score of {}".format(score))
    return score

In [4]:
from sklearn.metrics import accuracy_score

def raw_scoring(p1, p2, p3, ytrue):
    '''since cross_val_score doesn't allow you to round the results beforehand. also for pymc3 and other non-sklearn models'''
    score1 = accuracy_score(ytrue['score_lvl_1'], np.round(p1))
    print("Level 1 accuracy score of {}".format(score1))
    score2 = accuracy_score(ytrue['score_lvl_2'], np.round(p2))
    print("Level 2 accuracy score of {}".format(score2))
    score3 = accuracy_score(ytrue['score_lvl_3'], np.round(p3))
    print("Level 3 accuracy score of {}".format(score3))
    
    results = np.dstack((p1, p2, p3))[0]
    score = contest_metric(np.round(results), np.array(ytrue))
    print("Contest score of {}".format(score))
    
    rounded = np.clip(np.round(results), 0, np.inf)
    compare = pd.concat([pd.DataFrame(np.concatenate((results, rounded), axis=1)), ytrue.reset_index(drop=True)], axis=1)
    compare.columns = ['pred1','pred2','pred3','round1','round2','round3','true1','true2','true3']
    compare['offset1'] = compare.round1-compare.true1
    compare['offset2'] = compare.round2-compare.true2
    compare['offset3'] = compare.round3-compare.true3
        
    return score1, score2, score3, score, compare.head(10)

    
def raw_fit(X, y, pipeline):
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=42)
    
    p1 = pipeline.fit(xtrain, ytrain['score_lvl_1']).predict(xtest)
    p2 = pipeline.fit(xtrain, ytrain['score_lvl_2']).predict(xtest)
    p3 = pipeline.fit(xtrain, ytrain['score_lvl_3']).predict(xtest)
        
    return p1, p2, p3, ytest

In [5]:
def extract_features(df):
    features = df.drop(['score_lvl_1', 'score_lvl_2', 'score_lvl_3'], axis=1)
    response = df[['score_lvl_1', 'score_lvl_2', 'score_lvl_3']].astype(np.int8)
    
    return features, response

In [17]:
df = pd.read_pickle('pickle_jar/review_text_sentiment_hierarchical_df')
# prep = pd.read_pickle('pickle_jar/preprocessed_review_text_hierarchical_df')
# df = pd.concat([df, prep.preprocessed_review_text], axis=1)
sim = pd.read_pickle('pickle_jar/similarity_vectors_df')
# tfidf = joblib.load('pickle_jar/tfidf_preprocessed_ngram3_sublinear_1mil_hierarchical_dropna')
# matrix = joblib.load('pickle_jar/similarity_matrix')

In [18]:
df.previous_inspection_delta = df.previous_inspection_delta.fillna(0)
df.previous_inspection_delta = df.previous_inspection_delta.dt.days.astype(float)

In [19]:
def get_out(x):
    try:
        return x[0]
    except:
        return x


topics = ['manager', 'supervisor', 'training', 'safety', 'disease', 'ill', 'sick', 'poisoning', 'hygiene', 'raw', 'undercooked', 'cold', 'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 'inspection', 'violation', 'gloves', 'hairnet', 'nails', 'jewelry', 'sneeze', 'cough', 'runny', 'illegal', 'rotten', 'dirty', 'mouse', 'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old', 'parasite', 'reheat', 'frozen', 'broken', 'drip', 'bathroom', 'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 'pesticide', 'bugs', 'mold', ]
pbar = ProgressBar(maxval=len(topics)).start()
for index, i in enumerate(topics):
    df[i] = sim[i].apply(get_out)
    pbar.update(index)
pbar.finish()

100% (53 of 53) |#########################| Elapsed Time: 0:03:46 Time: 0:03:46


In [13]:
# df = pd.concat([df,sim[topics]],axis=1)

In [24]:
del sim

In [25]:
train = data_grab.get_selects('train')

In [26]:
dropped = df.dropna(subset=['review_text'])

In [31]:
dropped['user_yelping_since_delta'] = (dropped.review_date - dropped.user_yelping_since).astype('timedelta64[D]')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [32]:
dropped['user_most_recent_elite_year_delta'] = (dropped.review_date.dt.year - dropped.user_most_recent_elite_year)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [33]:
dropped['restaurant_categories'] = train.restaurant_categories.apply(lambda x: sorted(x))
dropped['restaurant_neighborhoods'] = train.restaurant_neighborhoods.apply(lambda x: sorted(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


In [35]:
dropped.drop(['review_text', 'review_date', 'user_id', 'restaurant_full_address', 'restaurant_name',
         'inspection_date', 'inspection_id', 'inspection_date', 'sentiment', 'vader'], axis=1, inplace=True)
dropped.drop(['user_yelping_since', 'restaurant_attributes_by_appointment_only', 'restaurant_open', 'user_most_recent_elite_year'] , axis=1, inplace=True)
dropped.drop(['review_year',
 'review_month',
 'review_day',
 'review_dayofweek',
 'review_quarter',
 'review_dayofyear',
 'inspection_dayofyear',], axis=1, inplace=True)
# dropped.drop(['restaurant_neighborhoods', 'restaurant_categories'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [36]:
dropped['review_stars'] = dropped.review_stars.fillna(0).astype('category')
dropped[['user_compliments_cool', 'user_compliments_cute', 'user_compliments_funny', 'user_compliments_hot',
 'user_compliments_list', 'user_compliments_more', 'user_compliments_note', 'user_compliments_photos', 'user_compliments_plain',
 'user_compliments_profile', 'user_compliments_writer', 'checkin_counts']] = dropped[['user_compliments_cool', 'user_compliments_cute', 'user_compliments_funny', 'user_compliments_hot',
 'user_compliments_list', 'user_compliments_more', 'user_compliments_note', 'user_compliments_photos', 'user_compliments_plain',
 'user_compliments_profile', 'user_compliments_writer', 'checkin_counts']].fillna(0)
dropped[['restaurant_attributes_ages_allowed',
         'restaurant_attributes_alcohol', 
         'restaurant_attributes_attire', 
         'restaurant_attributes_byob_corkage', 
         'restaurant_attributes_noise_level', 
         'restaurant_attributes_smoking', 
         'restaurant_attributes_wifi']] = dropped[['restaurant_attributes_ages_allowed',
                                                   'restaurant_attributes_alcohol', 
                                                   'restaurant_attributes_attire', 
                                                   'restaurant_attributes_byob_corkage', 
                                                   'restaurant_attributes_noise_level', 
                                                   'restaurant_attributes_smoking', 
                                                   'restaurant_attributes_wifi']].convert_objects().fillna('nan')
dropped[['restaurant_hours_friday_close',
 'restaurant_hours_friday_open',
 'restaurant_hours_monday_close',
 'restaurant_hours_monday_open',
 'restaurant_hours_saturday_close',
 'restaurant_hours_saturday_open',
 'restaurant_hours_sunday_close',
 'restaurant_hours_sunday_open',
 'restaurant_hours_thursday_close',
 'restaurant_hours_thursday_open',
 'restaurant_hours_tuesday_close',
 'restaurant_hours_tuesday_open',
 'restaurant_hours_wednesday_close',
 'restaurant_hours_wednesday_open']] = dropped[[ 'restaurant_hours_friday_close',
 'restaurant_hours_friday_open',
 'restaurant_hours_monday_close',
 'restaurant_hours_monday_open',
 'restaurant_hours_saturday_close',
 'restaurant_hours_saturday_open',
 'restaurant_hours_sunday_close',
 'restaurant_hours_sunday_open',
 'restaurant_hours_thursday_close',
 'restaurant_hours_thursday_open',
 'restaurant_hours_tuesday_close',
 'restaurant_hours_tuesday_open',
 'restaurant_hours_wednesday_close',
 'restaurant_hours_wednesday_open']].convert_objects().fillna('nan')
dropped[['restaurant_ambience',
         'restaurant_music',
         'restaurant_parking',
         'restaurant_zipcode']] = dropped[['restaurant_ambience',
                                            'restaurant_music',
                                            'restaurant_parking',
                                            'restaurant_zipcode']].convert_objects().fillna('nan')
dropped.user_most_recent_elite_year_delta = dropped.user_most_recent_elite_year_delta.fillna(dropped.user_most_recent_elite_year_delta.median())
dropped.restaurant_attributes_price_range = dropped.restaurant_attributes_price_range.fillna(dropped.restaurant_attributes_price_range.median())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [41]:
dropped[['manager', 'supervisor', 'training', 'safety', 'disease', 'ill', 'sick', 'poisoning', 'hygiene', 'raw', 
         'undercooked', 'cold', 'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 'inspection', 
         'violation', 'gloves', 'hairnet', 'nails', 'jewelry', 'sneeze', 'cough', 'runny', 'illegal', 'rotten', 
         'dirty', 'mouse', 'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old', 'parasite', 'reheat', 
         'frozen', 'broken', 'drip', 'bathroom', 'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 
         'pesticide', 'bugs', 'mold']] = dropped[['manager', 'supervisor', 'training', 'safety', 'disease', 'ill', 
                                                  'sick', 'poisoning', 'hygiene', 'raw', 'undercooked', 'cold', 
                                                  'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 
                                                  'inspection', 'violation', 'gloves', 'hairnet', 'nails', 'jewelry',
                                                  'sneeze', 'cough', 'runny', 'illegal', 'rotten', 'dirty', 'mouse', 
                                                  'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old',
                                                  'parasite', 'reheat', 'frozen', 'broken', 'drip', 'bathroom', 
                                                  'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 
                                                  'pesticide', 'bugs', 'mold']].fillna(0)

In [6]:
dropped = pd.read_pickle('pickle_jar/final_dropped')
# dropped.to_pickle('pickle_jar/final_dropped')

In [7]:
dropped.shape

(1925254, 177)

In [45]:
del df
del train

In [169]:
dropped.user_most_recent_elite_year_delta.dropna().shape

(759039,)

In [19]:
dropped.user_most_recent_elite_year_delta.value_counts(dropna=False, sort=True)

-3     1284912
-2      143340
-1      136361
-4       98223
-5       74546
-6       56323
 0       53969
-7       40708
-8       13722
 1        9392
-9        7155
 2        3224
 3        1631
-10        953
 4         541
 5         149
 6          51
-11         32
 7          22
dtype: int64

In [39]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, LabelBinarizer

In [176]:
cats = []
for i in ['restaurant_category_1',
 'restaurant_category_2',
 'restaurant_category_3',
 'restaurant_category_4',
 'restaurant_category_5',
 'restaurant_category_6',
 'restaurant_category_7']:
    cats.extend(train[i].unique().tolist())
cats = set(cats)
cats.remove(np.nan)
cats = sorted(cats)

def proper_array(x, backfill_size=7):
    encoder_prep = lambda x: cats.index(x)
    temp = map(encoder_prep, x)
    zeros = np.zeros(backfill_size, dtype='int')
    zeros[:len(temp)] = temp
    return zeros

t = dropped.restaurant_categories.apply(proper_array)

enc = OneHotEncoder(sparse=True)
e = enc.fit_transform(np.vstack(t))

In [208]:
cats = []
for i in ['restaurant_neighborhood_1', 'restaurant_neighborhood_2', 'restaurant_neighborhood_3']:
    cats.extend(train[i].unique().tolist())
cats = set(cats)
cats.remove(np.nan)
cats = sorted(cats)

t = dropped.restaurant_neighborhoods.apply(proper_array, args=(3,))

enc = OneHotEncoder(sparse=True)
e = enc.fit_transform(np.vstack(t))

def proper_array(x, backfill_size=3):
    encoder_prep = lambda x: cats.index(x)
    temp = map(encoder_prep, x)
    zeros = np.zeros(backfill_size, dtype='int')
    zeros[:len(temp)] = temp
    return temp


In [64]:
enc_label = LabelEncoder()
# el = enc_label.fit_transform(np.vstack(t)[:,0])
# el = enc_label.fit_transform(dropped.restaurant_id)

In [67]:
enc_label.fit_transform(dropped.restaurant_attributes_accepts_credit_cards)

array([1, 1, 1, ..., 1, 1, 1])

In [21]:
def add_categorical_to_matrix(matrix, df, columns):
    lb = LabelBinarizer(sparse_output=True)
    for i in columns:
        binarized = lb.fit_transform(df[i])
        matrix = hstack([matrix, binarized])
    return matrix

lb = LabelBinarizer(sparse_output=True)
m = lb.fit_transform(dropped.restaurant_id)

NameError: name 'LabelBinarizer' is not defined

In [251]:
# dropped.restauarant_stars = dropped.restaurant_stars.astype('category')
dropped.restaurant_stars = dropped.restaurant_stars.apply(lambda x: float(x))


# add_categorical_to_matrix(m, dropped, ['restaurant_stars'])


In [28]:
dropped.restaurant_stars.dtypes
lb = LabelBinarizer(sparse_output=True)
test = train.dropna(subset=['review_text']).restaurant_stars
lb.fit_transform(np.array(test, dtype='|S'))

<1925254x14 sparse matrix of type '<type 'numpy.int64'>'
	with 1925254 stored elements in Compressed Sparse Row format>

In [46]:
temp = pd.DataFrame(dropped.restaurant_categories.tolist())

(1925254, 82)
(1925254, 95)


In [66]:
# need to do it like this because pd.merge causes a memory overload
['restaurant_category_1',
 'restaurant_category_2',
 'restaurant_category_3',
 'restaurant_category_4',
 'restaurant_category_5',
 'restaurant_category_6',
 'restaurant_category_7']:
t0 = pd.get_dummies(temp[0])
for i in range(1, 7):
    new_dummies = pd.get_dummies(temp[i])
    pbar = ProgressBar(maxval=len(new_dummies.columns)).start()
    for index, column in enumerate(new_dummies.columns):
        if column not in t0.columns:
            t0 = pd.concat([t0, new_dummies[column]], axis=1)
        else:
            t0[column] = t0[column] + new_dummies[column]
        pbar.update(index)
    pbar.finish()
            

100% (95 of 95) |#########################| Elapsed Time: 0:02:32 Time: 0:02:32
100% (87 of 87) |#########################| Elapsed Time: 0:01:38 Time: 0:01:38
100% (46 of 46) |#########################| Elapsed Time: 0:00:20 Time: 0:00:20
100% (29 of 29) |#########################| Elapsed Time: 0:00:23 Time: 0:00:23
100% (15 of 15) |#########################| Elapsed Time: 0:00:00 Time: 0:00:00
100% (6 of 6) |###########################| Elapsed Time: 0:00:00 Time: 0:00:00


In [70]:
t0

(1925254, 158)

In [71]:
csr_matrix(t0)

<1925254x158 sparse matrix of type '<type 'numpy.float64'>'
	with 6416483 stored elements in Compressed Sparse Row format>

In [11]:
y = dropped[['score_lvl_1', 'score_lvl_2', 'score_lvl_3']]

<1923536x10881 sparse matrix of type '<type 'numpy.int64'>'
	with 1923536 stored elements in Compressed Sparse Row format>

In [24]:
matrix = joblib.load('pickle_jar/similarity_matrix5')

In [111]:
def make_bins(df, bin_size=10):
    # time delta bins
    tdmax = df.review_delta.max()
    tdmin = df.review_delta.min()
    df['review_delta_bin'] = pd.cut(df["review_delta"], np.arange(tdmin, tdmax, bin_size))
    df['review_delta_bin_codes'] = df.review_delta_bin.astype('category').cat.codes
    tdmax = df.previous_inspection_delta.max()
    tdmin = df.previous_inspection_delta.min()
    df['previous_inspection_delta_bin'] = pd.cut(df["previous_inspection_delta"], np.arange(tdmin-1, tdmax, bin_size))
    df['previous_inspection_delta_bin_codes'] = df.previous_inspection_delta_bin.astype('category').cat.codes
    return df
df = make_bins(df)

In [34]:
scores = ['score_lvl_1', 'score_lvl_2', 'score_lvl_3']
# model_features = ['review_stars', 'review_delta', 'previous_inspection_delta','review_delta_bin', 'previous_inspection_delta_bin', 'polarity', 'subjectivity', 'neg', 'pos', 'neu', 'compound']
# model_features = ['review_stars', 'review_delta', 'previous_inspection_delta', 'polarity', 'subjectivity', 'neg', 'pos', 'neu', 'compound']
model_features = ['review_delta', 'previous_inspection_delta']

X, y = extract_features(test[model_features + topics + scores].dropna())

print df.shape
print X.shape
print y.shape
# review_stars doesnt exist for every observation so reducing size even further

(4071065, 185)
(1923536, 55)
(1923536, 3)


##Feature selection

In [19]:
test = ['restaurant_attributes_accepts_credit_cards', 'restaurant_attributes_byob', 'restaurant_attributes_caters', 'restaurant_attributes_coat_check', 'restaurant_attributes_corkage', 'restaurant_attributes_delivery', 'restaurant_attributes_dietary_restrictions_dairy_free', 'restaurant_attributes_dietary_restrictions_gluten_free', 'restaurant_attributes_dietary_restrictions_halal', 'restaurant_attributes_dietary_restrictions_kosher', 'restaurant_attributes_dietary_restrictions_soy_free', 'restaurant_attributes_dietary_restrictions_vegan', 'restaurant_attributes_dietary_restrictions_vegetarian', 'restaurant_attributes_dogs_allowed', 'restaurant_attributes_drive_thr', 'restaurant_attributes_good_for_dancing', 'restaurant_attributes_good_for_groups', 'restaurant_attributes_good_for_breakfast', 'restaurant_attributes_good_for_brunch', 'restaurant_attributes_good_for_dessert', 'restaurant_attributes_good_for_dinner', 'restaurant_attributes_good_for_latenight', 'restaurant_attributes_good_for_lunch', 'restaurant_attributes_good_for_kids', 'restaurant_attributes_happy_hour', 'restaurant_attributes_has_tv', 'restaurant_attributes_open_24_hours', 'restaurant_attributes_order_at_counter', 'restaurant_attributes_outdoor_seating',  'restaurant_attributes_payment_types_amex', 'restaurant_attributes_payment_types_cash_only', 'restaurant_attributes_payment_types_discover', 'restaurant_attributes_payment_types_mastercard', 'restaurant_attributes_payment_types_visa', 'restaurant_attributes_take_out',  'restaurant_attributes_takes_reservations', 'restaurant_attributes_waiter_service', 'restaurant_attributes_wheelchair_accessible', ]

# from sklearn.feature_selection import RFECV
# rfecv = RFECV(estimator=SGDClassifier(n_jobs=-1), scoring=mean_squared_error)
# rfecv.fit(X, y.score_lvl_1)

from sklearn.metrics import mean_squared_error

# from sklearn.svm import LinearSVC
# X_new = LinearSVC(C=0.01, penalty="l1", dual=False).fit_transform(X, y.score_lvl_1)

# from sklearn.feature_selection import RFE
# rfe = RFE(estimator=LinearRegression(), n_features_to_select=3, step=1)
# rfe.fit(X, y.score_lvl_1)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import f_classif
# test = SelectFwe(f_classif).fit_transform(X, y.score_lvl_1)

X_new = SelectKBest(f_classif, k=5).fit_transform(X, y.score_lvl_1)

In [49]:
matrix = joblib.load('pickle_jar/final_matrix')
y = joblib.load('pickle_jar/final_y')

In [48]:
from sklearn.linear_model import RandomizedLogisticRegression
test = ['review_stars', 'user_name', 'restaurant_stars', 'restaurant_attributes_ages_allowed', 'restaurant_attributes_alcohol', 'restaurant_attributes_attire', 'restaurant_attributes_byob_corkage', 'restaurant_attributes_noise_level', 'restaurant_attributes_smoking', 'restaurant_attributes_wifi', 'restaurant_city',  'restaurant_hours_friday_close', 'restaurant_hours_friday_open', 'restaurant_hours_monday_close', 'restaurant_hours_monday_open', 'restaurant_hours_saturday_close', 'restaurant_hours_saturday_open', 'restaurant_hours_sunday_close', 'restaurant_hours_sunday_open', 'restaurant_hours_thursday_close', 'restaurant_hours_thursday_open', 'restaurant_hours_tuesday_close', 'restaurant_hours_tuesday_open', 'restaurant_hours_wednesday_close', 'restaurant_hours_wednesday_open', 'restaurant_ambience', 'restaurant_music', 'restaurant_parking', 'restaurant_street', 'restaurant_zipcode',  'inspection_year', 'inspection_month', 'inspection_day', 'inspection_dayofweek', 'inspection_quarter',]
rlr = RandomizedLogisticRegression()
rlr.fit_transform(dropped[test], y)

ValueError: could not convert string to float: Huntington Ave

In [53]:
VarianceThreshold().fit_transform(matrix)

<1925254x14170 sparse matrix of type '<type 'numpy.float64'>'
	with 298242155 stored elements in Compressed Sparse Row format>

In [55]:
matrix

<1925254x14175 sparse matrix of type '<type 'numpy.float64'>'
	with 300167409 stored elements in COOrdinate format>

In [8]:
tfidf = joblib.load( 'pickle_jar/tfidf_preprocessed_ngram3_sublinear_1mil_hierarchical_dropna')

In [10]:
y = joblib.load('pickle_jar/final_y')

In [12]:
y.shape

(1925254, 3)

In [14]:
tfidf.shape

(1924542, 1000000)

In [15]:
prep = pd.read_pickle('pickle_jar/preprocessed_review_text_hierarchical_df')

In [18]:
prep.preprocessed_review_text.dropna()

0          this pretty typical cafe the sandwich wrap goo...
1          i agree reviewer pretty typical financial dist...
2          decent enough food overprice just large soup a...
3          the muffin blueberry i never good blueberry su...
4          well well well look review restaurant lobby re...
5                                                           
6          this place i like go deli sandwich financial d...
7          delicato great place lunch go downtown crossin...
8          delicato surprisingly tasty place locate alley...
9          did ever notice matter get lunch downtown cost...
10         i think i eat every day solid six month last y...
11         whoa this place good monday friday morning mak...
12         this easily one favorite place get breakfast t...
13         everyday i love place the lady office i take g...
14         this beat path spot place find i would rate hi...
15         five star as i sit update review delicato i mu...
16         i always walk

In [56]:
t0 = time()

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB


# set classifiers to test
estimator = LinearRegression()
# estimator = RandomForestClassifier(n_jobs=-1, random_state=42)
# estimator = SGDClassifier(n_jobs=-1, random_state=42)
# estimator = Perceptron(n_jobs=-1, random_state=42)  # gets some nuances
# estimator = SGDRegressor() # gets some nuances
# estimator = KNeighborsClassifier()
# estimator = KNeighborsRegressor()  # gets some nuances
# estimator = DecisionTreeClassifier()
# estimator = DecisionTreeRegressor()
# estimator = GaussianNB()

pipeline = Pipeline([
#         ('low_var_removal', VarianceThreshold()),
#         ('normalizer', Normalizer()),
#         ('normalizer', Normalizer(norm='l2')), #  for text classification and clustering
#         ('scaler', StandardScaler()),
#         ('scaler', StandardScaler(with_mean=False)), #  for sparse matrix
        ('clf', estimator),
])

p1,p2,p3,ytest = raw_fit(matrix, y, pipeline)
raw_scoring(p1,p2,p3,ytest)


print("{} seconds elapsed".format(time()-t0))

# first representation of manager and mold plus deltas for SGDClassifier
# Level 1 accuracy score of 0.160254863959
# Level 2 accuracy score of 0.693050714933
# Level 3 accuracy score of 0.574146779681
# Contest score of 1.52839121903

# sparse matrix of just manager no-mean-false and the detla for sdg
# Level 1 accuracy score of 0.136340572778
# Level 2 accuracy score of 0.693059032948
# Level 3 accuracy score of 0.574184210745
# Contest score of 1.41908388602

# first representation of all similarity vecs plus deltas for SGD
# Level 1 accuracy score of 0.0958193660009
# Level 2 accuracy score of 0.693059032948
# Level 3 accuracy score of 0.574184210745
# Contest score of 1.2018434011

# first two representations of all similarity vecs plus deltas for SGD
# Level 1 accuracy score of 0.134818376157
# Level 2 accuracy score of 0.693059032948
# Level 3 accuracy score of 0.574184210745
# Contest score of 1.73133292154

# first five representations of all similarity vecs plus deltas for SGD
# Level 1 accuracy score of 0.140464228379
# Level 2 accuracy score of 0.693059032948
# Level 3 accuracy score of 0.574184210745
# Contest score of 1.43479163855

Exception KeyboardInterrupt in 'zmq.backend.cython.message.Frame.__dealloc__' ignored


KeyboardInterrupt: 

In [74]:
# baseline scores if guessing zero
guess = 0
for index, score in enumerate(scores):
    print("level {}: {}".format(index+1, y[score].value_counts(normalize=True)[guess]))

level 1: 0.22128560699
level 2: 0.692799495547
level 3: 0.573199692093
