In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.cross_validation import cross_val_score
import metrics
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
import text_processors
from progressbar import ProgressBar
import data_grab
from time import time
from sklearn.externals import joblib
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_selection import VarianceThreshold


In [4]:
def contest_metric(numpy_array_predictions, numpy_array_actual_values):
    return metrics.weighted_rmsle(numpy_array_predictions, numpy_array_actual_values,
            weights=metrics.KEEPING_IT_CLEAN_WEIGHTS)

In [5]:
def contest_scoring(X, y, pipeline):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    s1 = pipeline.fit(X_train, y_train['score_lvl_1']).predict(X_test)
    s2 = pipeline.fit(X_train, y_train['score_lvl_2']).predict(X_test)
    s3 = pipeline.fit(X_train, y_train['score_lvl_3']).predict(X_test)
    results = np.dstack((s1, s2, s3))
    score = contest_metric(np.round(results[0]), np.array(y_test))
    print("Contest score of {}".format(score))
    return score

In [6]:
from sklearn.metrics import accuracy_score

def raw_scoring(p1, p2, p3, ytrue):
    '''since cross_val_score doesn't allow you to round the results beforehand. also for pymc3 and other non-sklearn models'''
    score1 = accuracy_score(ytrue['score_lvl_1'], np.clip(np.round(p1), 0, np.inf))
    print("Level 1 accuracy score of {}".format(score1))
    score2 = accuracy_score(ytrue['score_lvl_2'],np.clip(np.round(p2), 0, np.inf))
    print("Level 2 accuracy score of {}".format(score2))
    score3 = accuracy_score(ytrue['score_lvl_3'], np.clip(np.round(p3), 0, np.inf))
    print("Level 3 accuracy score of {}".format(score3))
    
    results = np.dstack((p1, p2, p3))[0]
    rounded = np.clip(np.round(results), 0, np.inf)
    score = contest_metric(rounded, np.array(ytrue))
    print("Contest score of {}".format(score))
    
    compare = pd.concat([pd.DataFrame(np.concatenate((results, rounded), axis=1)), ytrue.reset_index(drop=True)], axis=1)
    compare.columns = ['pred1','pred2','pred3','round1','round2','round3','true1','true2','true3']
    compare['offset1'] = compare.round1-compare.true1
    compare['offset2'] = compare.round2-compare.true2
    compare['offset3'] = compare.round3-compare.true3
        
    return score1, score2, score3, score, compare.head(10)

    
def raw_fit(X, y, pipeline):
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=42)
    
    p1 = pipeline.fit(xtrain, ytrain['score_lvl_1']).predict(xtest)
    p2 = pipeline.fit(xtrain, ytrain['score_lvl_2']).predict(xtest)
    p3 = pipeline.fit(xtrain, ytrain['score_lvl_3']).predict(xtest)
        
    return p1, p2, p3, ytest

In [7]:
def extract_features(df):
    features = df.drop(['score_lvl_1', 'score_lvl_2', 'score_lvl_3'], axis=1)
    response = df[['score_lvl_1', 'score_lvl_2', 'score_lvl_3']].astype(np.int8)
    
    return features, response

In [10]:
df = pd.read_pickle('pickle_jar/review_text_sentiment_flat')

In [11]:
df

Unnamed: 0,restaurant_id,inspection_date,inspection_id,review_text,score_lvl_1,score_lvl_2,score_lvl_3,preprocessed_review_text
0,N6Ok7qOx,2007-11-08,23100,This is a pretty typical cafe. The sandwiches...,2,0,0,this pretty typical cafe the sandwich wrap goo...
1,p03824Om,2012-08-02,27889,This is the place I like to go for deli sandwi...,7,0,1,this place i like go deli sandwich financial d...
2,p03824Om,2011-11-03,11070,This is the place I like to go for deli sandwi...,2,0,0,this place i like go deli sandwich financial d...
3,p03824Om,2010-04-08,8714,This is the place I like to go for deli sandwi...,7,0,1,this place i like go deli sandwich financial d...
4,p03824Om,2009-12-10,5843,This is the place I like to go for deli sandwi...,0,0,0,this place i like go deli sandwich financial d...
5,p03824Om,2011-11-28,29966,This is the place I like to go for deli sandwi...,1,0,0,this place i like go deli sandwich financial d...
6,p03824Om,2008-07-03,19729,This is the place I like to go for deli sandwi...,2,0,1,this place i like go deli sandwich financial d...
7,p03824Om,2008-12-31,29636,This is the place I like to go for deli sandwi...,2,0,0,this place i like go deli sandwich financial d...
8,p03824Om,2010-04-01,11606,This is the place I like to go for deli sandwi...,7,0,1,this place i like go deli sandwich financial d...
9,p03824Om,2011-04-29,14517,This is the place I like to go for deli sandwi...,0,0,0,this place i like go deli sandwich financial d...


In [6]:
# df = pd.read_pickle('pickle_jar/review_text_sentiment_hierarchical_df')
# prep = pd.read_pickle('pickle_jar/preprocessed_review_text_hierarchical_df')
# df = pd.concat([df, prep.preprocessed_review_text], axis=1)
sim = pd.read_pickle('pickle_jar/similarity_vectors_df')
# tfidf = joblib.load('pickle_jar/tfidf_preprocessed_ngram3_sublinear_1mil_hierarchical_dropna')
# matrix = joblib.load('pickle_jar/similarity_matrix')

In [15]:
sim = sim[['manager', 'supervisor', 'training', 'safety', 'disease', 'ill', 'sick', 'poisoning', 'hygiene', 'raw', 'undercooked', 'cold', 'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 'inspection', 'violation', 'gloves', 'hairnet', 'nails', 'jewelry', 'sneeze', 'cough', 'runny', 'illegal', 'rotten', 'dirty', 'mouse', 'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old', 'parasite', 'reheat', 'frozen', 'broken', 'drip', 'bathroom', 'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 'pesticide', 'bugs', 'mold']]

In [20]:
def get_out(x):
    try:
        return list(x)
    except:
        return x
sim.manager.apply(get_out)


0          [0.102025292814, 0.102025292814, 0.09906487166...
1          [0.222249120474, 0.198464393616, 0.19058281183...
2          [0.164701089263, 0.0990648716688, 0.0968299135...
3          [0.0985388532281, 0.0823464468122, 0.071328066...
4          [0.222249120474, 0.158419817686, 0.14401456713...
5                                                        NaN
6          [0.196376562119, 0.152856841683, 0.15220764279...
7          [0.296981394291, 0.170809492469, 0.13077272474...
8          [0.221467807889, 0.170809492469, 0.12393315881...
9          [0.219487249851, 0.164701089263, 0.15220764279...
10         [0.481546670198, 0.296981394291, 0.17592954635...
11         [0.296981394291, 0.190560564399, 0.17080949246...
12         [0.170227214694, 0.149701923132, 0.12393315881...
13         [0.296981394291, 0.183491870761, 0.18349187076...
14         [0.296981394291, 0.207707315683, 0.14350591599...
15         [0.190560564399, 0.170809492469, 0.15279327333...
16         [0.1947998553

In [18]:
df.previous_inspection_delta = df.previous_inspection_delta.fillna(0)
df.previous_inspection_delta = df.previous_inspection_delta.dt.days.astype(float)

In [19]:
def get_out(x):
    try:
        return x[0]
    except:
        return x


topics = ['manager', 'supervisor', 'training', 'safety', 'disease', 'ill', 'sick', 'poisoning', 'hygiene', 'raw', 'undercooked', 'cold', 'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 'inspection', 'violation', 'gloves', 'hairnet', 'nails', 'jewelry', 'sneeze', 'cough', 'runny', 'illegal', 'rotten', 'dirty', 'mouse', 'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old', 'parasite', 'reheat', 'frozen', 'broken', 'drip', 'bathroom', 'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 'pesticide', 'bugs', 'mold', ]
pbar = ProgressBar(maxval=len(topics)).start()
for index, i in enumerate(topics):
    df[i] = sim[i].apply(get_out)
    pbar.update(index)
pbar.finish()

100% (53 of 53) |#########################| Elapsed Time: 0:03:46 Time: 0:03:46


In [13]:
# df = pd.concat([df,sim[topics]],axis=1)

In [24]:
del sim

In [25]:
train = data_grab.get_selects('train')

In [26]:
dropped = df.dropna(subset=['review_text'])
#1925254 

In [31]:
dropped['user_yelping_since_delta'] = (dropped.review_date - dropped.user_yelping_since).astype('timedelta64[D]')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [32]:
dropped['user_most_recent_elite_year_delta'] = (dropped.review_date.dt.year - dropped.user_most_recent_elite_year)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [33]:
dropped['restaurant_categories'] = train.restaurant_categories.apply(lambda x: sorted(x))
dropped['restaurant_neighborhoods'] = train.restaurant_neighborhoods.apply(lambda x: sorted(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


In [35]:
dropped.drop(['review_text', 'review_date', 'user_id', 'restaurant_full_address', 'restaurant_name',
         'inspection_date', 'inspection_id', 'inspection_date', 'sentiment', 'vader'], axis=1, inplace=True)
dropped.drop(['user_yelping_since', 'restaurant_attributes_by_appointment_only', 'restaurant_open', 'user_most_recent_elite_year'] , axis=1, inplace=True)
dropped.drop(['review_year',
 'review_month',
 'review_day',
 'review_dayofweek',
 'review_quarter',
 'review_dayofyear',
 'inspection_dayofyear',], axis=1, inplace=True)
# dropped.drop(['restaurant_neighborhoods', 'restaurant_categories'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [36]:
dropped['review_stars'] = dropped.review_stars.fillna(0).astype('category')
dropped[['user_compliments_cool', 'user_compliments_cute', 'user_compliments_funny', 'user_compliments_hot',
 'user_compliments_list', 'user_compliments_more', 'user_compliments_note', 'user_compliments_photos', 'user_compliments_plain',
 'user_compliments_profile', 'user_compliments_writer', 'checkin_counts']] = dropped[['user_compliments_cool', 'user_compliments_cute', 'user_compliments_funny', 'user_compliments_hot',
 'user_compliments_list', 'user_compliments_more', 'user_compliments_note', 'user_compliments_photos', 'user_compliments_plain',
 'user_compliments_profile', 'user_compliments_writer', 'checkin_counts']].fillna(0)
dropped[['restaurant_attributes_ages_allowed',
         'restaurant_attributes_alcohol', 
         'restaurant_attributes_attire', 
         'restaurant_attributes_byob_corkage', 
         'restaurant_attributes_noise_level', 
         'restaurant_attributes_smoking', 
         'restaurant_attributes_wifi']] = dropped[['restaurant_attributes_ages_allowed',
                                                   'restaurant_attributes_alcohol', 
                                                   'restaurant_attributes_attire', 
                                                   'restaurant_attributes_byob_corkage', 
                                                   'restaurant_attributes_noise_level', 
                                                   'restaurant_attributes_smoking', 
                                                   'restaurant_attributes_wifi']].convert_objects().fillna('nan')
dropped[['restaurant_hours_friday_close',
 'restaurant_hours_friday_open',
 'restaurant_hours_monday_close',
 'restaurant_hours_monday_open',
 'restaurant_hours_saturday_close',
 'restaurant_hours_saturday_open',
 'restaurant_hours_sunday_close',
 'restaurant_hours_sunday_open',
 'restaurant_hours_thursday_close',
 'restaurant_hours_thursday_open',
 'restaurant_hours_tuesday_close',
 'restaurant_hours_tuesday_open',
 'restaurant_hours_wednesday_close',
 'restaurant_hours_wednesday_open']] = dropped[[ 'restaurant_hours_friday_close',
 'restaurant_hours_friday_open',
 'restaurant_hours_monday_close',
 'restaurant_hours_monday_open',
 'restaurant_hours_saturday_close',
 'restaurant_hours_saturday_open',
 'restaurant_hours_sunday_close',
 'restaurant_hours_sunday_open',
 'restaurant_hours_thursday_close',
 'restaurant_hours_thursday_open',
 'restaurant_hours_tuesday_close',
 'restaurant_hours_tuesday_open',
 'restaurant_hours_wednesday_close',
 'restaurant_hours_wednesday_open']].convert_objects().fillna('nan')
dropped[['restaurant_ambience',
         'restaurant_music',
         'restaurant_parking',
         'restaurant_zipcode']] = dropped[['restaurant_ambience',
                                            'restaurant_music',
                                            'restaurant_parking',
                                            'restaurant_zipcode']].convert_objects().fillna('nan')
dropped.user_most_recent_elite_year_delta = dropped.user_most_recent_elite_year_delta.fillna(dropped.user_most_recent_elite_year_delta.median())
dropped.restaurant_attributes_price_range = dropped.restaurant_attributes_price_range.fillna(dropped.restaurant_attributes_price_range.median())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [41]:
dropped[['manager', 'supervisor', 'training', 'safety', 'disease', 'ill', 'sick', 'poisoning', 'hygiene', 'raw', 
         'undercooked', 'cold', 'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 'inspection', 
         'violation', 'gloves', 'hairnet', 'nails', 'jewelry', 'sneeze', 'cough', 'runny', 'illegal', 'rotten', 
         'dirty', 'mouse', 'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old', 'parasite', 'reheat', 
         'frozen', 'broken', 'drip', 'bathroom', 'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 
         'pesticide', 'bugs', 'mold']] = dropped[['manager', 'supervisor', 'training', 'safety', 'disease', 'ill', 
                                                  'sick', 'poisoning', 'hygiene', 'raw', 'undercooked', 'cold', 
                                                  'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 
                                                  'inspection', 'violation', 'gloves', 'hairnet', 'nails', 'jewelry',
                                                  'sneeze', 'cough', 'runny', 'illegal', 'rotten', 'dirty', 'mouse', 
                                                  'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old',
                                                  'parasite', 'reheat', 'frozen', 'broken', 'drip', 'bathroom', 
                                                  'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 
                                                  'pesticide', 'bugs', 'mold']].fillna(0)

In [12]:
dropped = pd.read_pickle('pickle_jar/final_dropped')
# dropped.to_pickle('pickle_jar/final_dropped')

In [7]:
dropped.shape

(1925254, 177)

In [45]:
del df
del train

In [169]:
dropped.user_most_recent_elite_year_delta.dropna().shape

(759039,)

In [19]:
dropped.user_most_recent_elite_year_delta.value_counts(dropna=False, sort=True)

-3     1284912
-2      143340
-1      136361
-4       98223
-5       74546
-6       56323
 0       53969
-7       40708
-8       13722
 1        9392
-9        7155
 2        3224
 3        1631
-10        953
 4         541
 5         149
 6          51
-11         32
 7          22
dtype: int64

In [39]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, LabelBinarizer

In [176]:
cats = []
for i in ['restaurant_category_1',
 'restaurant_category_2',
 'restaurant_category_3',
 'restaurant_category_4',
 'restaurant_category_5',
 'restaurant_category_6',
 'restaurant_category_7']:
    cats.extend(train[i].unique().tolist())
cats = set(cats)
cats.remove(np.nan)
cats = sorted(cats)

def proper_array(x, backfill_size=7):
    encoder_prep = lambda x: cats.index(x)
    temp = map(encoder_prep, x)
    zeros = np.zeros(backfill_size, dtype='int')
    zeros[:len(temp)] = temp
    return zeros

t = dropped.restaurant_categories.apply(proper_array)

enc = OneHotEncoder(sparse=True)
e = enc.fit_transform(np.vstack(t))

In [208]:
cats = []
for i in ['restaurant_neighborhood_1', 'restaurant_neighborhood_2', 'restaurant_neighborhood_3']:
    cats.extend(train[i].unique().tolist())
cats = set(cats)
cats.remove(np.nan)
cats = sorted(cats)

t = dropped.restaurant_neighborhoods.apply(proper_array, args=(3,))

enc = OneHotEncoder(sparse=True)
e = enc.fit_transform(np.vstack(t))

def proper_array(x, backfill_size=3):
    encoder_prep = lambda x: cats.index(x)
    temp = map(encoder_prep, x)
    zeros = np.zeros(backfill_size, dtype='int')
    zeros[:len(temp)] = temp
    return temp


In [64]:
enc_label = LabelEncoder()
# el = enc_label.fit_transform(np.vstack(t)[:,0])
# el = enc_label.fit_transform(dropped.restaurant_id)

In [67]:
enc_label.fit_transform(dropped.restaurant_attributes_accepts_credit_cards)

array([1, 1, 1, ..., 1, 1, 1])

In [21]:
def add_categorical_to_matrix(matrix, df, columns):
    lb = LabelBinarizer(sparse_output=True)
    for i in columns:
        binarized = lb.fit_transform(df[i])
        matrix = hstack([matrix, binarized])
    return matrix

lb = LabelBinarizer(sparse_output=True)
m = lb.fit_transform(dropped.restaurant_id)

NameError: name 'LabelBinarizer' is not defined

In [28]:
dropped.restaurant_stars.dtypes
lb = LabelBinarizer(sparse_output=True)
test = train.dropna(subset=['review_text']).restaurant_stars
lb.fit_transform(np.array(test, dtype='|S'))

<1925254x14 sparse matrix of type '<type 'numpy.int64'>'
	with 1925254 stored elements in Compressed Sparse Row format>

In [46]:
temp = pd.DataFrame(dropped.restaurant_categories.tolist())

(1925254, 82)
(1925254, 95)


In [66]:
# need to do it like this because pd.merge causes a memory overload
['restaurant_category_1',
 'restaurant_category_2',
 'restaurant_category_3',
 'restaurant_category_4',
 'restaurant_category_5',
 'restaurant_category_6',
 'restaurant_category_7']:
t0 = pd.get_dummies(temp[0])
for i in range(1, 7):
    new_dummies = pd.get_dummies(temp[i])
    pbar = ProgressBar(maxval=len(new_dummies.columns)).start()
    for index, column in enumerate(new_dummies.columns):
        if column not in t0.columns:
            t0 = pd.concat([t0, new_dummies[column]], axis=1)
        else:
            t0[column] = t0[column] + new_dummies[column]
        pbar.update(index)
    pbar.finish()
            

100% (95 of 95) |#########################| Elapsed Time: 0:02:32 Time: 0:02:32
100% (87 of 87) |#########################| Elapsed Time: 0:01:38 Time: 0:01:38
100% (46 of 46) |#########################| Elapsed Time: 0:00:20 Time: 0:00:20
100% (29 of 29) |#########################| Elapsed Time: 0:00:23 Time: 0:00:23
100% (15 of 15) |#########################| Elapsed Time: 0:00:00 Time: 0:00:00
100% (6 of 6) |###########################| Elapsed Time: 0:00:00 Time: 0:00:00


In [70]:
t0

(1925254, 158)

In [71]:
csr_matrix(t0)

<1925254x158 sparse matrix of type '<type 'numpy.float64'>'
	with 6416483 stored elements in Compressed Sparse Row format>

In [11]:
y = dropped[['score_lvl_1', 'score_lvl_2', 'score_lvl_3']]

<1923536x10881 sparse matrix of type '<type 'numpy.int64'>'
	with 1923536 stored elements in Compressed Sparse Row format>

In [24]:
matrix = joblib.load('pickle_jar/similarity_matrix5')

In [111]:
def make_bins(df, bin_size=10):
    # time delta bins
    tdmax = df.review_delta.max()
    tdmin = df.review_delta.min()
    df['review_delta_bin'] = pd.cut(df["review_delta"], np.arange(tdmin, tdmax, bin_size))
    df['review_delta_bin_codes'] = df.review_delta_bin.astype('category').cat.codes
    tdmax = df.previous_inspection_delta.max()
    tdmin = df.previous_inspection_delta.min()
    df['previous_inspection_delta_bin'] = pd.cut(df["previous_inspection_delta"], np.arange(tdmin-1, tdmax, bin_size))
    df['previous_inspection_delta_bin_codes'] = df.previous_inspection_delta_bin.astype('category').cat.codes
    return df
df = make_bins(df)

##Feature selection

In [13]:
X = dropped[['review_votes_cool', 'review_votes_funny', 'review_votes_useful', 'user_average_stars', 'user_compliments_cool', 'user_compliments_cute', 'user_compliments_funny', 'user_compliments_hot', 'user_compliments_list', 'user_compliments_more', 'user_compliments_note', 'user_compliments_photos', 'user_compliments_plain', 'user_compliments_profile', 'user_compliments_writer', 'user_fans', 'user_review_count', 'user_votes_cool', 'user_votes_funny', 'user_votes_useful', 'restaurant_attributes_price_range', 'restaurant_latitude', 'restaurant_longitude', 'restaurant_review_count', 'checkin_counts', 'review_delta', 'previous_inspection_delta', 'polarity', 'subjectivity', 'neg', 'neu', 'pos', 'compound', 'user_yelping_since_delta','manager', 'supervisor', 'training', 'safety', 'disease', 'ill', 'sick', 'poisoning', 'hygiene', 'raw', 'undercooked', 'cold', 'clean', 'sanitary', 'wash', 'jaundice', 'yellow', 'hazard', 'inspection', 'violation', 'gloves', 'hairnet', 'nails', 'jewelry', 'sneeze', 'cough', 'runny', 'illegal', 'rotten', 'dirty', 'mouse', 'cockroach', 'contaminated', 'gross', 'disgusting', 'stink', 'old', 'parasite', 'reheat', 'frozen', 'broken', 'drip', 'bathroom', 'toilet', 'leak', 'trash', 'dark', 'lights', 'dust', 'puddle', 'pesticide', 'bugs', 'mold']]

In [14]:
y = joblib.load('pickle_jar/final_y')

In [15]:
print dropped.shape
print X.shape
print y.shape

(1925254, 177)
(1925254, 87)
(1925254, 3)


In [13]:
from sklearn.feature_selection import VarianceThreshold

def varthresh(X):
    vt = VarianceThreshold()
    new = vt.fit_transform(X)
    print X.shape
    print new.shape

varthresh(X)

(1925254, 87)
(1925254, 87)


In [16]:
from sklearn.feature_selection import SelectKBest, chi2, f_regression, f_classif, SelectFpr
from sklearn.preprocessing import MinMaxScaler

def univariate(X, y, type):
#     mms = MinMaxScaler()
#     X_new = SelectKBest(chi2, k=2).fit_transform(mms.fit_transform(X), y)
    kb = SelectKBest(type)
    X_new = kb.fit_transform(X, y)
    # index number of column sorted from most important to least important
    ranking = np.argsort(kb.scores_)
    print("Five least important features: \n{}".format('\n'.join(X[ranking[:20]].columns)))
    print("\nFive most important features: \n{}".format('\n'.join(X[ranking[:-21:-1]].columns)))


In [17]:
X_new = univariate(X, y.score_lvl_1, f_classif)

Five least important features: 
user_compliments_photos
user_compliments_profile
user_compliments_list
user_compliments_more
user_compliments_cute
user_compliments_writer
user_compliments_plain
user_compliments_hot
user_compliments_cool
user_compliments_funny
user_compliments_note
user_fans
user_votes_cool
inspection
user_votes_funny
drip
broken
violation
user_votes_useful
hazard

Five most important features: 
previous_inspection_delta
checkin_counts
restaurant_attributes_price_range
restaurant_latitude
restaurant_review_count
restaurant_longitude
review_delta
sanitary
hygiene
user_yelping_since_delta
undercooked
lights
cockroach
parasite
reheat
contaminated
dark
frozen
poisoning
runny


In [18]:
X_new = univariate(X, y.score_lvl_2, f_classif)

Five least important features: 
user_compliments_photos
puddle
broken
user_compliments_profile
inspection
illegal
user_compliments_cute
user_compliments_more
wash
sneeze
trash
bathroom
user_compliments_list
mouse
user_compliments_plain
user_compliments_funny
user_compliments_note
user_compliments_cool
leak
user_compliments_hot

Five most important features: 
previous_inspection_delta
restaurant_longitude
restaurant_latitude
checkin_counts
restaurant_review_count
sanitary
hygiene
lights
restaurant_attributes_price_range
cockroach
dark
review_delta
contaminated
undercooked
poisoning
parasite
safety
reheat
pesticide
rotten


In [19]:
X_new = univariate(X, y.score_lvl_3, f_classif)

Five least important features: 
user_compliments_list
user_compliments_photos
user_compliments_profile
user_compliments_cute
user_compliments_note
user_compliments_funny
user_fans
hazard
user_compliments_hot
user_compliments_plain
sneeze
user_compliments_cool
user_compliments_more
violation
inspection
user_compliments_writer
user_votes_funny
disgusting
user_votes_cool
sick

Five most important features: 
previous_inspection_delta
restaurant_longitude
restaurant_review_count
restaurant_attributes_price_range
checkin_counts
restaurant_latitude
cockroach
undercooked
review_delta
bathroom
old
sanitary
hygiene
lights
runny
parasite
reheat
user_yelping_since_delta
frozen
toilet


In [113]:
from sklearn.feature_selection import RFECV, RFE

def recurs(X, y):
    rfe = RFE(estimator=LinearRegression(), n_features_to_select=10)
    X_new = rfe.fit(X, y)
    return X_new

def recurscv(X, y):
    rfecv = RFECV(estimator=SGDClassifier(n_jobs=-1), scoring='accuracy', cv=3)
    X_new = rfecv.fit(X, y)
    return X_new

In [116]:
# reduce dimensionality of data selecting for non-zero coefficients
from sklearn.linear_model import RandomizedLogisticRegression 
from sklearn.svm import LinearSVC

def svcbased(X, y):
    # can remove features if they are closely correlated
    svc = LinearSVC()
    m = svc.fit_transform(X, y)
    return m

def rlrbased(X, y):
    rlr = RandomizedLogisticRegression()
    m = rlr.fit_transform(X, y)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

def tree(X, y):
    forest = ExtraTreesClassifier(n_estimators=10, random_state=42, n_jobs=-1)
    forest.fit(X, y)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")

    for f in range(10):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(10), importances[indices],
           color="r", yerr=std[indices], align="center")
    plt.xticks(range(10), indices)
    plt.xlim([-1, 10])

tree(X, y)

In [None]:
dropped = pd.read_pickle('pickle_jar/final_dropped')

In [17]:
# X = joblib.load('pickle_jar/final_matrix')
# X = joblib.load('pickle_jar/specials_matrix')
X = joblib.load('pickle_jar/categorical_matrix')
y = joblib.load('pickle_jar/final_y')

In [11]:
tfidf = joblib.load( 'pickle_jar/tfidf_preprocessed_ngram3_sublinear_1mil_hierarchical_dropna')
# y = joblib.load('pickle_jar/final_y')

In [18]:
X.shape

(1925254, 14088)

In [19]:
y.shape

(1925254, 3)

In [14]:
tfidf.shape

(1925254, 1000000)

In [20]:
import sendMessage
from time import time

In [47]:
t0=time()

from sklearn.decomposition import TruncatedSVD
lsa = TruncatedSVD(n_components=2)
# lsa = TruncatedSVD(n_components=100)
tfidf = lsa.fit_transform(tfidf)

sendMessage.doneTextSend(t0, time(), 'LSA')

NameError: name 't1' is not defined

In [16]:
X = hstack([tfidf, X])
del tfidf

In [32]:
t0=time()
from sklearn.lda import LDA

ld = LDA()
X = ld.fit_transform(X, y)


sendMessage.doneTextSend(t0, time, 'LDA')

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [17]:
X.shape

(1925254, 1014175)

In [None]:
# np.hstack((np.array(dropped[['review_delta', 'previous_inspection_delta']]), X))

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
import sendMessage


from sklearn.preprocessing import MinMaxScaler

t0 = time()


# set classifiers to test
# estimator = LinearRegression()
estimator = RandomForestClassifier(n_jobs=-1, random_state=42)
# estimator = SGDClassifier(n_jobs=-1, random_state=42)
# estimator = Perceptron(n_jobs=-1, random_state=42)  # gets some nuances
# estimator = SGDRegressor() # gets some nuances
# estimator = KNeighborsClassifier()
# estimator = KNeighborsRegressor()  # gets some nuances
# estimator = DecisionTreeClassifier()
# estimator = DecisionTreeRegressor()
# estimator = GaussianNB()
# estimator = MultinomialNB()
# estimator = LinearSVC(random_state=42)

pipeline = Pipeline([
#         ('zero_variance_removal', VarianceThreshold()),
#         ('k_best', SelectKBest(score_func=f_classif, k=20)),
#         ('no_negative', MinMaxScaler()),
#         ('normalizer', Normalizer()),
#         ('normalizer', Normalizer(norm='l2')), #  for text classification and clustering
#         ('normalizer', Normalizer(copy=False)),
#         ('scaler', StandardScaler()),
#         ('scaler', StandardScaler(with_mean=False)), #  for sparse matrix
        ('clf', estimator),
])

p1,p2,p3,ytest = raw_fit(X, y, pipeline)
raw_scoring(p1,p2,p3,ytest)


print("{} seconds elapsed".format(time()-t0))

sendMessage.doneTextSend(t0, time(), 'just categorical matrix on rf')



In [31]:
# baseline scores if guessing zero
scores = ['score_lvl_1', 'score_lvl_2', 'score_lvl_3']
guess = 0
for index, score in enumerate(scores):
    print("level {}: {}".format(index+1, y[score].value_counts(normalize=True)[guess]))

level 1: 0.22128560699
level 2: 0.692799495547
level 3: 0.573199692093


In [15]:
p = np.zeros(1925254)
raw_scoring(p, p, p, y)

Level 1 accuracy score of 0.22128560699
Level 2 accuracy score of 0.692799495547
Level 3 accuracy score of 0.573199692093
Contest score of 2.14265412097


(0.22128560699003871,
 0.69279949554708109,
 0.57319969209257582,
 2.1426541209656755,
    pred1  pred2  pred3  round1  round2  round3  true1  true2  true3  offset1  \
 0      0      0      0       0       0       0      2      0      0       -2   
 1      0      0      0       0       0       0      2      0      0       -2   
 2      0      0      0       0       0       0      2      0      0       -2   
 3      0      0      0       0       0       0      2      0      0       -2   
 4      0      0      0       0       0       0      2      0      0       -2   
 5      0      0      0       0       0       0      7      0      1       -7   
 6      0      0      0       0       0       0      7      0      1       -7   
 7      0      0      0       0       0       0      7      0      1       -7   
 8      0      0      0       0       0       0      7      0      1       -7   
 9      0      0      0       0       0       0      7      0      1       -7   
 
    offset2  offset3