In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from collections import Counter
from evaluate import f1score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier

In [2]:
#
# Read in training (and validation) data
# 

print "Reading and merging orders tables ... ",

train = pd.read_csv('order_products__train.csv')
prior = pd.read_csv('order_products__prior.csv')
orders = pd.read_csv('orders.csv')
products = pd.read_csv('products.csv')

train = train.merge(
    orders, on='order_id', how='left').merge(
    products, on='product_id', how='left')
prior = prior.merge(
    orders, on='order_id', how='left').merge(
    products, on='product_id', how='left')

# compute training target
#target = train.groupby('user_id'
#    ).apply(lambda x: set(x[x['reordered'] == True]['product_id'])
#    ).reset_index(name='target')

print "done."

Reading and merging orders tables ...  done.


In [3]:
#
# Before splitting off the validation set, compute "product features".
# These are features that depend only on product_id; their values are the
# same for training, validation and testing.
#
nprods = len(products['product_id'].unique())
prod_features = pd.DataFrame(
    index=range(nprods),
    columns=['product_id'])
prod_features['product_id'] = products['product_id'].unique()
prod_features.head()

Unnamed: 0,product_id
0,1
1,2
2,3
3,4
4,5


In [4]:
#
#   1: Product reorder probability
#

#   number of users who ever had this product in their basket
x = prior.groupby('product_id').apply(
    lambda x: len(set(x['user_id']))).reset_index(name='nusers')

# number of users that reorderd the item
y = train.groupby('product_id').apply(
    lambda x: x['reordered'].sum()).reset_index(name='reordered')

x = x.merge(y, on='product_id', how='left').fillna(0)
x['reorder_prob'] =  x['reordered'] / 1.0 / x['nusers']

prod_features = prod_features.merge(
    x[['product_id', 'reorder_prob']], on='product_id', how='left')
prod_features.head()

Unnamed: 0,product_id,reorder_prob
0,1,0.068436
1,2,0.012821
2,3,0.081081
3,4,0.076923
4,5,0.166667


In [8]:
#
#   2: Department reorder probability
#
x = prior.groupby('department_id').apply(
    lambda x: sum(x['reordered']) / 1.0 / len(x) ).reset_index(name='dept_reorder_prob')
x = x.merge(products, on='department_id', how='left')

prod_features = prod_features.merge(
    x[['product_id', 'dept_reorder_prob']], on='product_id', how='left')

prod_features.head()
#
# CHECKME: Why do these numbers seem so different from (1)??
#

#
# ANSWER: Because you are computing slightly different things. The
# query below gives an number more similar to the category reorder
# rate.
#

#   number of users who ever had this product in their basket
# x = prior.groupby('product_id').apply(
#    lambda x: sum(x['reordered']) / 1.0 / len(x)).reset_index(name='reorder_prob2')

# prod_features = prod_features.merge(
#    x[['product_id', 'reorder_prob2']], on='product_id', how='left')
# prod_features.head()

Unnamed: 0,product_id,reorder_prob,dept_reorder_prob
0,1,0.068436,0.57418
1,2,0.012821,0.346721
2,3,0.081081,0.65346
3,4,0.076923,0.541885
4,5,0.166667,0.346721


In [24]:
#
#   3: Department ID heuristic split
#
prod_features = prod_features.merge(
    products[['product_id', 'department_id']], on='product_id', how='left')
prod_features['department_id'] = prod_features.apply(
    lambda x: 1 if x['department_id'] in [4, 16] else 0, axis=1)
prod_features.head()

# sanity check
# prod_features[prod_features['department_id'] == 1].merge(
#    products, on='product_id', how='left')['department_id_y'].unique()

Unnamed: 0,product_id,reorder_prob,dept_reorder_prob,department_id
0,1,0.068436,0.57418,0
1,2,0.012821,0.346721,0
2,3,0.081081,0.65346,0
3,4,0.076923,0.541885,0
4,5,0.166667,0.346721,0


In [30]:
#
#   4: People love to reorder organic 
#
prod_features = prod_features.merge(
    products[['product_id', 'product_name']], on='product_id', how='left')
prod_features['is_organic'] = prod_features.apply(
    lambda x: 1 if 'organic' in x['product_name'].lower() else 0, axis=1)
del prod_features['product_name']
prod_features.head()

Unnamed: 0,product_id,reorder_prob,dept_reorder_prob,department_id,is_organic
0,1,0.068436,0.57418,0,0
1,2,0.012821,0.346721,0,0
2,3,0.081081,0.65346,0,0
3,4,0.076923,0.541885,0,0
4,5,0.166667,0.346721,0,0


In [31]:
#
# Generate user features ... 
# FIXME make pretty and organized like prod_features
#

print "Computing user / user-product features ... "

prior_gb_up = prior.groupby(['user_id', 'product_id'])
prior_gb_u = prior.groupby('user_id')
# FIXME is there a way to reuse prior_gb_up here? 

Computing user / user-product features ... 


In [32]:
#   1: Number of times product has been ordered by user
print "\tnorders of product by user ... ",
user_prod_norders = prior_gb_up.apply(len).reset_index(name='user_prod_norders')
print "done."

	norders of product by user ...  done.


In [33]:
#   2: Total number of orders by user
print "\tnorders by user ... ",
user_norders = prior_gb_u.apply(
    lambda x: len(set(x['order_id']))).reset_index(name='user_norders')
# FIXME you can also get this from orders table ... probably quicker
print "done."

	norders by user ...  done.


In [34]:
#   3: Fraction of user baskets containing product
print "\torder rate of product by user ... ",
user_prod_rate = user_prod_norders.merge(user_norders, on='user_id', how='left')
user_prod_rate['user_prod_rate'] = user_prod_rate['user_prod_norders'] / 1.0 / user_prod_rate['user_norders']

del user_prod_rate['user_prod_norders'], user_prod_rate['user_norders']
# FIXME I guess you could delete the other tables instead, and this is then a bit
# more handy to carry around, but it doesn't play nice with jupyter, where
# I want cell execution independence ...

print "done."

	order rate of product by user ...  done.


In [35]:
#   4: Number of baskets since user last ordered this item
# FIXME why is this so slow?!
print "\tbaskets since last order of product ... ",
baskets_since = prior_gb_up.apply(
    lambda x: max(x['order_number'])).reset_index(name='last_basket')

baskets_since = baskets_since.merge(user_norders, on='user_id', how='left')
baskets_since['baskets_since'] = baskets_since['user_norders'] - baskets_since['last_basket']
del baskets_since['user_norders'], baskets_since['last_basket']
print "done."

	baskets since last order of product ...  done.


In [36]:
#   5: Number of baskets since user first ordered this item
# FIXME why is this so slow?!
print "\tbaskets since first order of product ... ",
baskets_since_1st = prior_gb_up.apply(
    lambda x: min(x['order_number'])).reset_index(name='last_basket')

baskets_since_1st = baskets_since_1st.merge(user_norders, on='user_id', how='left')
baskets_since_1st['baskets_since_1st'] = baskets_since_1st['user_norders'] - baskets_since_1st['last_basket']
del baskets_since_1st['user_norders'], baskets_since_1st['last_basket']
print "done."

	baskets since first order of product ...  done.


In [37]:
#   6: Mean basket position. Lower basket position means customer likes product?
print "\tmean basket position ... ",
baskets_pos = prior_gb_up.apply(
    lambda x: x['add_to_cart_order'].mean()).reset_index(name='basket_pos')
print "done."

	mean basket position ...  done.


In [38]:
#   7/8/9: Time of day / dow / days since prior order
print "\torder features ... ",
order_features = orders[orders['eval_set'] != 'prior'][[
    'user_id',
    'order_hour_of_day',
    'order_dow',
    'days_since_prior_order']]

assert len(order_features) == len(orders['user_id'].unique())
del prior_gb_up
print "done."

	order features ...  done.


In [39]:
#    10: Does user reorder in general?
print "\tuser reorder rate ... ",
user_reorder_rate = prior_gb_u.apply(
    lambda x: x['reordered'].sum() / 1.0 / len(x)
    ).reset_index(name='user_reorder_rate')
print "done."

	user reorder rate ...  done.


In [40]:
#   11: User order rate
print "\tuser general order rate (days) ... ",
user_order_dt = prior_gb_u.apply(
    lambda x: x['days_since_prior_order'].mean()
    ).reset_index(name='user_order_dt')
print "done."

	user general order rate (days) ...  done.


In [44]:
#
# Assemble all features together in one table
#
features = user_prod_norders.merge(
    user_norders, on='user_id', how='left'
    ).merge(user_prod_rate, on=['user_id', 'product_id'], how='left'
    ).merge(baskets_since, on=['user_id', 'product_id'], how='left'
    ).merge(baskets_since_1st, on=['user_id', 'product_id'], how='left'
    ).merge(baskets_pos, on=['user_id', 'product_id'], how='left'            
    ).merge(prod_features, on='product_id', how='left'
    ).merge(order_features, on='user_id', how='left'
    ).merge(user_reorder_rate, on='user_id', how='left'
    ).merge(user_order_dt, on='user_id', how='left')
# need to keep "product features" for submission stage

# Add training data, i.e., whether product was reordered by user
features = features.merge(
    train[['user_id','product_id','reordered']],
    on=['user_id','product_id'], how='left').fillna(0)

features.head(10)

Unnamed: 0,user_id,product_id,user_prod_norders,user_norders,user_prod_rate,baskets_since,baskets_since_1st,basket_pos,reorder_prob,dept_reorder_prob,department_id,is_organic,order_hour_of_day,order_dow,days_since_prior_order,user_reorder_rate,user_order_dt,reordered
0,1,196,10,10,1.0,0,9,1.4,0.152625,0.65346,0,0,8,4,14.0,0.694915,20.259259,1.0
1,1,10258,9,10,0.9,0,8,3.333333,0.140036,0.57418,0,0,8,4,14.0,0.694915,20.259259,1.0
2,1,10326,1,10,0.1,5,5,5.0,0.147166,0.649913,1,1,8,4,14.0,0.694915,20.259259,0.0
3,1,12427,10,10,1.0,0,9,3.3,0.095295,0.57418,0,0,8,4,14.0,0.694915,20.259259,0.0
4,1,13032,3,10,0.3,0,8,6.333333,0.090202,0.560922,0,0,8,4,14.0,0.694915,20.259259,1.0
5,1,13176,2,10,0.2,5,8,6.0,0.210303,0.649913,1,1,8,4,14.0,0.694915,20.259259,0.0
6,1,14084,1,10,0.1,9,9,2.0,0.148406,0.669969,1,1,8,4,14.0,0.694915,20.259259,0.0
7,1,17122,1,10,0.1,5,5,6.0,0.104819,0.649913,1,0,8,4,14.0,0.694915,20.259259,0.0
8,1,25133,8,10,0.8,0,7,4.0,0.086335,0.669969,1,1,8,4,14.0,0.694915,20.259259,1.0
9,1,26088,2,10,0.2,8,9,4.5,0.066208,0.57418,0,0,8,4,14.0,0.694915,20.259259,1.0


In [45]:
#
# Establish separate training and validation data
#

Nval = 50000 # seems like a good number
uids = train['user_id'].unique()
uids_train = uids[:-Nval]
uids_val = uids[-Nval:]

feature_list = [
    'reorder_prob', # product features
    'dept_reorder_prob',
    'department_id',
    'is_organic',
    'user_prod_norders', # user-product features
    'user_prod_rate',
    'baskets_since',
    'baskets_since_1st',
    'basket_pos',
    'order_dow', # order features
    'order_hour_of_day',
    'days_since_prior_order',
    'user_norders', # user features
    'user_reorder_rate',
    'user_order_dt']

#
# training
#
where = features['user_id'].isin(uids_train)
X_train = features[where][feature_list].as_matrix()
y_train = features[where]['reordered'].as_matrix()

#
# validation
#
where = features['user_id'].isin(uids_val)
X_val = features[where][feature_list].as_matrix()
y_val = features[where]['reordered'].as_matrix()

#
# test / submission
#
uids_test = orders[orders['eval_set'] == 'test']['user_id'].unique()
where = features['user_id'].isin(uids_test)
X_test = features[where][feature_list].as_matrix()

print len(uids_val), len(uids_train), len(uids_test), len(orders['user_id'].unique())

50000 81209 75000 206209


In [46]:
#
# Train the decision tree model(s)
#

#
# Apparently my solution was submitted with md = 12 on accident
#

trees = []
for mss in [500, 1000, 1500]:
    for md in [6, 10, 15]:
        for mf in [4, 6]:
            print mss, md, mf
            clf = RandomForestClassifier(
                n_estimators=15, 
                random_state=0, 
                min_samples_split=mss,
                max_features=mf,
                max_depth=md,
                n_jobs=2,
                verbose=0)
            clf.fit(X_train, y_train)
            trees.append(clf)
            #export_graphviz(clf, out_file='tree.dot')
            print clf.feature_importances_

500 6 4
[  5.22782517e-02   2.52159914e-02   3.43961871e-03   9.91890721e-06
   1.23557075e-01   3.83951627e-01   3.23079028e-01   5.47985394e-02
   3.27737294e-04   2.72938627e-05   1.99104439e-05   4.92618177e-03
   1.18329493e-02   1.00582501e-02   6.47762773e-03]
500 6 6
[  6.65723224e-02   1.49988595e-02   2.11877261e-04   0.00000000e+00
   1.26078737e-01   4.60570176e-01   2.83038466e-01   2.20525100e-02
   3.76083135e-04   4.18124190e-07   1.14948418e-05   6.35394165e-03
   8.91870670e-03   1.06509019e-02   1.65506193e-04]
500 10 4
[  6.70406572e-02   2.42204249e-02   3.29380856e-03   9.75898790e-05
   1.32678259e-01   3.49908410e-01   3.07467015e-01   5.62154126e-02
   3.12369682e-03   2.72499357e-04   5.18852115e-04   1.78814098e-02
   1.34826465e-02   1.87612245e-02   5.03809399e-03]
500 10 6
[  8.49194777e-02   1.45305737e-02   5.20305989e-04   4.59296106e-05
   9.90236992e-02   4.43157314e-01   2.74304740e-01   2.77079756e-02
   1.76183099e-03   2.25897396e-04   5.02477713e

In [47]:
#
# Let's see how we did
#

# compute training target(s)
target = train.groupby('user_id'
    ).apply(lambda x: set(x[x['reordered'] == True]['product_id'])
    ).reset_index(name='target')
train_target = target[target['user_id'].isin(uids_train)]
train_features = features[features['user_id'].isin(uids_train)]
val_target = target[target['user_id'].isin(uids_val)]
val_features = features[features['user_id'].isin(uids_val)]

for clf in trees:
    print clf
    print "Performance on training set ... "
    pred = clf.predict_proba(X_train)[:, 1]
    for p in [0.195]: # time and time again, this value wins
        train_features['prediction'] = pred > p
        out = train_features.groupby('user_id').apply(
            lambda x: set(x[x['prediction'] == True]['product_id'])
            ).reset_index(name='prediction')

        out = out.merge(train_target, on='user_id', how='left')
        print p, f1score(out['prediction'], out['target'])
        del train_features['prediction']
    
    print "Performance on validation set ... "
    pred = clf.predict_proba(X_val)[:, 1]
    for p in [0.195]:
        
        val_features['prediction'] = pred > p
        out = val_features.groupby('user_id').apply(
            lambda x: set(x[x['prediction'] == True]['product_id'])
            ).reset_index(name='prediction')

        out = out.merge(val_target, on='user_id', how='left')
        print p, f1score(out['prediction'], out['target'])
        del val_features['prediction']

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features=4, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=500, min_weight_fraction_leaf=0.0,
            n_estimators=15, n_jobs=2, oob_score=False, random_state=0,
            verbose=0, warm_start=False)
Performance on training set ... 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.195 0.371222172977
Performance on validation set ... 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.195 0.372486702285
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features=6, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=500, min_weight_fraction_leaf=0.0,
            n_estimators=15, n_jobs=2, oob_score=False, random_state=0,
            verbose=0, warm_start=False)
Performance on training set ... 
0.195 0.375337482873
Performance on validation set ... 
0.195 0.375829243142
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=4, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=500, min_weight_fraction_leaf=0.0,
            n_estimators=15, n_jobs=2, oob_score=False, random_state=0,
            verbose=0, warm_start=False)
Performance on training set ... 
0.195 0.383496478974
Performance on validation set ... 
0.195 0.381300113412
RandomFore

0.195 0.387639425926
Performance on validation set ... 
0.195 0.381690923176


In [49]:
#
# Optional. Retrain the model with the winning hyperparameters on the entire 
# training data set. I think this is always a good idea? Can it go wrong?
#   (This actually went horribly wrong, giving an *in sample* F1 score of 0.1,
#    so I'm not really sure what to make of that!)

#
# This model was somewhat over-fitted on the training data.
# But it had a decent validation score. let's give it more data 
# and hope for the best :/. Nothing else is working ...
#
clf = RandomForestClassifier(
                n_estimators=50, 
                random_state=0, 
                min_samples_split=500,
                max_features=6,
                max_depth=15,
                n_jobs=4,
                verbose=15)
X_all = np.vstack((X_train, X_val))
y_all = np.concatenate((y_train, y_val))
clf.fit(X_all, y_all)

building tree 1 of 50building tree 2 of 50building tree 4 of 50building tree 3 of 50





[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  3.2min


building tree 5 of 50


[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:  3.2min


building tree 6 of 50


[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:  3.2min


building tree 7 of 50


[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:  3.3min


building tree 8 of 50


[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:  5.5min


building tree 9 of 50


[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:  5.7min


building tree 10 of 50


[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:  5.7min


building tree 11 of 50


[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:  5.7min


building tree 12 of 50


[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:  7.8min


building tree 13 of 50


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  8.0min


building tree 14 of 50


[Parallel(n_jobs=4)]: Done  11 tasks      | elapsed:  8.0min


building tree 15 of 50


[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:  8.0min


building tree 16 of 50


[Parallel(n_jobs=4)]: Done  13 tasks      | elapsed: 10.2min


building tree 17 of 50


[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed: 10.2min


building tree 18 of 50


[Parallel(n_jobs=4)]: Done  15 tasks      | elapsed: 10.3min


building tree 19 of 50


[Parallel(n_jobs=4)]: Done  16 tasks      | elapsed: 10.3min


building tree 20 of 50


[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed: 12.5min


building tree 21 of 50


[Parallel(n_jobs=4)]: Done  18 tasks      | elapsed: 12.6min


building tree 22 of 50


[Parallel(n_jobs=4)]: Done  19 tasks      | elapsed: 12.7min


building tree 23 of 50


[Parallel(n_jobs=4)]: Done  20 tasks      | elapsed: 12.8min


building tree 24 of 50


[Parallel(n_jobs=4)]: Done  21 tasks      | elapsed: 14.9min


building tree 25 of 50


[Parallel(n_jobs=4)]: Done  22 tasks      | elapsed: 15.0min


building tree 26 of 50


[Parallel(n_jobs=4)]: Done  23 tasks      | elapsed: 15.0min


building tree 27 of 50


[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed: 15.0min


building tree 28 of 50


[Parallel(n_jobs=4)]: Done  25 tasks      | elapsed: 17.4min


building tree 29 of 50


[Parallel(n_jobs=4)]: Done  26 tasks      | elapsed: 17.5min


building tree 30 of 50


[Parallel(n_jobs=4)]: Done  27 tasks      | elapsed: 17.5min


building tree 31 of 50


[Parallel(n_jobs=4)]: Done  28 tasks      | elapsed: 17.6min


building tree 32 of 50


[Parallel(n_jobs=4)]: Done  29 tasks      | elapsed: 19.6min


building tree 33 of 50


[Parallel(n_jobs=4)]: Done  30 tasks      | elapsed: 19.7min


building tree 34 of 50


[Parallel(n_jobs=4)]: Done  31 tasks      | elapsed: 19.8min


building tree 35 of 50


[Parallel(n_jobs=4)]: Done  32 tasks      | elapsed: 19.9min


building tree 36 of 50


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed: 21.9min


building tree 37 of 50


[Parallel(n_jobs=4)]: Done  34 tasks      | elapsed: 22.1min


building tree 38 of 50


[Parallel(n_jobs=4)]: Done  35 tasks      | elapsed: 22.2min


building tree 39 of 50


[Parallel(n_jobs=4)]: Done  36 tasks      | elapsed: 22.4min


building tree 40 of 50


[Parallel(n_jobs=4)]: Done  37 tasks      | elapsed: 24.7min


building tree 41 of 50


[Parallel(n_jobs=4)]: Done  38 tasks      | elapsed: 24.7min


building tree 42 of 50


[Parallel(n_jobs=4)]: Done  39 tasks      | elapsed: 24.9min


building tree 43 of 50


[Parallel(n_jobs=4)]: Done  40 tasks      | elapsed: 25.0min


building tree 44 of 50


[Parallel(n_jobs=4)]: Done  41 tasks      | elapsed: 26.8min


building tree 45 of 50


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 26.8min


building tree 46 of 50


[Parallel(n_jobs=4)]: Done  43 tasks      | elapsed: 26.9min


building tree 47 of 50
building tree 48 of 50
building tree 49 of 50
building tree 50 of 50


[Parallel(n_jobs=4)]: Done  47 out of  50 | elapsed: 29.0min remaining:  1.9min
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed: 30.4min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features=6, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=500, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=4, oob_score=False, random_state=0,
            verbose=15, warm_start=False)

In [51]:
# check the model hasn't changed too much

print clf.feature_importances_ 
#print trees[2].feature_importances_

uids = np.concatenate((uids_train, uids_val))

where = features['user_id'].isin(uids)
X_all = features[where][feature_list].as_matrix()
pred = clf.predict_proba(X_all)[:, 1]

this_features = features[where]
this_features['prediction'] = pred > 0.195

out = this_features.groupby('user_id').apply(
    lambda x: set(x[x['prediction'] == True]['product_id'])
    ).reset_index(name='prediction')
out = out.merge(target, on='user_id', how='left')

print f1score(out['target'], out['prediction'])
#del this_features['prediction']

[  8.32157353e-02   1.45286270e-02   1.30824414e-03   3.21158235e-04
   1.27194507e-01   3.70572875e-01   2.56909167e-01   4.62078986e-02
   7.08888441e-03   1.59330826e-03   3.45355702e-03   2.69773994e-02
   1.56138169e-02   3.22271150e-02   1.27877073e-02]


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    2.9s
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:    2.9s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:    3.0s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:    3.0s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    6.6s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:    6.7s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:    6.8s
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:    6.8s
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:    9.4s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    9.4s
[Parallel(n_jobs=4)]: Done  11 tasks      | elapsed:    9.4s
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:    9.4s
[Parallel(n_jobs=4)]: Done  13 tasks      | elapsed:   11.9s
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed:   11.9s
[Parallel(n_jobs=4)]: Done  15 tasks      | elapsed:   11.9s
[Parallel(n_jobs=4)]: Done  16 tasks      | elapsed:   12.0s
[Parallel(n_jobs=4)]: Do

   user_id                                         prediction  \
0        1  {196, 46149, 13032, 39657, 12427, 25133, 35951...   
1        2  {7781, 47209, 22124, 21709, 16589, 19156, 1559...   
2        5  {11777, 40706, 13988, 21413, 8518, 26604, 4369...   
3        7  {37602, 31683, 12196, 17638, 21137, 47272, 276...   
4        8  {8193, 17794, 18531, 9839, 32030, 34358, 28985...   

                                              target  
0  {196, 26405, 13032, 39657, 25133, 38928, 26088...  
1  {41787, 33957, 22825, 45066, 16589, 45613, 229...  
2                       {21616, 40706, 15349, 21413}  
3  {29894, 17638, 47272, 45066, 13198, 37999, 408...  
4                       {15937, 41540, 23165, 21903}  


TypeError: unsupported operand type(s) for &: 'set' and 'float'

In [162]:
trees[4]

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features=4, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=500, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=4, oob_score=False, random_state=0,
            verbose=5, warm_start=False)

In [67]:
#
# Let's make a submission!!
#
print clf
#clf = trees[4]

#
# generate predictions for test set from model
#
oid_uid_test = orders[orders['eval_set'] == 'test'][['order_id', 'user_id']]
pred = clf.predict_proba(X_test)[:, 1] > 0.195
this_features = features[features['user_id'].isin(uids_test)]
this_features['prediction'] = pred
this_features = this_features.merge(oid_uid_test, on='user_id', how='left')
out = this_features.groupby('order_id').apply(
    lambda x: set(x[x['prediction'] == True]['product_id'])
    ).reset_index(name='prediction')

out.head()

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features=6, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=500, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=4, oob_score=False, random_state=0,
            verbose=15, warm_start=False)


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:    3.2s
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:    3.2s
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:    4.6s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    4.9s
[Parallel(n_jobs=4)]: Done  11 tasks      | elapsed:    4.9s
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:    5.0s
[Parallel(n_jobs=4)]: Done  13 tasks      | elapsed:    6.4s
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed:    6.5s
[Parallel(n_jobs=4)]: Done  15 tasks      | elapsed:    6.6s
[Parallel(n_jobs=4)]: Done  16 tasks      | elapsed:    6.7s
[Parallel(n_jobs=4)]: Do

Unnamed: 0,order_id,prediction
0,17,"{13107, 21463}"
1,34,"{47792, 2596, 44632, 39180, 43504, 21137, 1608..."
2,137,"{38689, 25890, 5134, 23794, 24852, 2326, 41787}"
3,182,"{11520, 5479, 33000, 47209, 39275, 32109, 4114..."
4,257,"{27104, 49235, 24838, 39475, 29837, 13870, 211..."


In [68]:
#
# Write predictions to disk for submission.
#

fd = open('submission.csv', 'w')
fd.write('order_id,products\n')

for oid, pr in zip(out['order_id'], out['prediction']):
    fd.write('%d,' % oid)

    if pr:
        fd.write(' '.join(map(str, pr)))
    else:
        fd.write('None')
    fd.write('\n')

fd.close()

In [168]:
#
#
# POST MORTEM: WHAT WENT WRONG?
#   what does the misclassified data look like?
#   what features could we add to better classify them?
#
this_features = features[features['user_id'].isin(uids_train)]
clf = trees[2]
pred = clf.predict_proba(X_train)[:, 1]
this_features['prediction'] = pred > 0.195
this_features.head()

Performance on training set ... 
   user_id  product_id  user_prod_norders  user_norders  user_prod_rate  \
0        1         196                 10            10             1.0   
1        1       10258                  9            10             0.9   
2        1       10326                  1            10             0.1   
3        1       12427                 10            10             1.0   
4        1       13032                  3            10             0.3   

   baskets_since  basket_pos  reorder_prob  order_hour_of_day  order_dow  \
0              0    1.400000      0.152625                  8          4   
1              0    3.333333      0.140036                  8          4   
2              5    5.000000      0.147166                  8          4   
3              0    3.300000      0.095295                  8          4   
4              0    6.333333      0.090202                  8          4   

   days_since_prior_order  user_reorder_rate  user_order_dt

[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   11.5s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  1.8min finished
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,user_id,product_id,user_prod_norders,user_norders,user_prod_rate,baskets_since,basket_pos,reorder_prob,order_hour_of_day,order_dow,days_since_prior_order,user_reorder_rate,user_order_dt,reordered,prediction
0,1,196,10,10,1.0,0,1.4,0.152625,8,4,14.0,4.1,20.259259,1.0,True
1,1,10258,9,10,0.9,0,3.333333,0.140036,8,4,14.0,4.1,20.259259,1.0,True
2,1,10326,1,10,0.1,5,5.0,0.147166,8,4,14.0,4.1,20.259259,0.0,False
3,1,12427,10,10,1.0,0,3.3,0.095295,8,4,14.0,4.1,20.259259,0.0,True
4,1,13032,3,10,0.3,0,6.333333,0.090202,8,4,14.0,4.1,20.259259,1.0,True


In [181]:
errors = this_features.groupby('product_id').apply(
    lambda x: pd.Series(
        {'false_pos':sum((x['prediction'] == 1) & (x['reordered'] == 0)),
         'false_neg':sum((x['prediction'] == 0) & (x['reordered'] == 1)),
         'true_pos':sum((x['prediction'] == 1) & (x['reordered'] == 1)),
         'true_neg':sum((x['prediction'] == 0) & (x['reordered'] == 0))})
).reset_index().merge(pd.read_csv('products.csv'), on='product_id')

Unnamed: 0_level_0,false_neg,false_pos,true_neg,true_pos
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,8,27,234,23
2,0,0,33,0
3,0,1,22,3
4,1,3,57,7
5,0,0,2,1
6,0,0,1,0
7,0,0,6,0
8,2,4,25,2
9,0,2,23,2
10,14,19,443,35


In [186]:
errors.sort_values('false_neg', ascending=0)


Unnamed: 0,product_id,false_neg,false_pos,true_neg,true_pos,product_name,aisle_id,department_id
25663,26209,568,2745,12352,2123,Limes,24,4
20694,21137,555,5524,12483,4728,Organic Strawberries,24,4
46628,47626,460,4111,10457,3230,Large Lemon,24,4
21446,21903,437,5957,10753,4511,Organic Baby Spinach,123,4
24446,24964,436,2120,9956,1448,Organic Garlic,83,4
38443,39275,379,1852,10809,1649,Organic Blueberries,123,4
30857,31506,375,280,9497,264,Extra Virgin Olive Oil,19,13
4509,4605,373,1622,8582,1205,Yellow Onions,83,4
22459,22935,372,2141,9431,1526,Organic Yellow Onion,83,4
16429,16797,368,3107,10883,2610,Strawberries,24,4


In [187]:
clf.feature_importances_

array([ 0.07906365,  0.03530213,  0.23368996,  0.1513633 ,  0.06063451,
        0.14184713,  0.02117745,  0.04417509,  0.05227964,  0.08449257,
        0.09597457])

In [None]:
feature_list = [
    'user_prod_norders',
    'user_norders',
    'user_prod_rate',
    'baskets_since',
    'basket_pos',
    'reorder_prob',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
    'user_reorder_rate',
    'user_order_dt']