In [125]:
import pandas as pd
import numpy as np
from scipy import stats
from collections import Counter
from evaluate import f1score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier

In [45]:
#
# Read in training (and validation) data
# 

print "Reading and merging orders tables ... ",

train = pd.read_csv('order_products__train.csv')
prior = pd.read_csv('order_products__prior.csv')
orders = pd.read_csv('orders.csv')
products = pd.read_csv('products.csv')

train = train.merge(
    orders, on='order_id', how='left').merge(
    products, on='product_id', how='left')
prior = prior.merge(orders, on='order_id', how='left').merge(
    products, on='product_id', how='left')

# compute training target
target = train.groupby('user_id'
    ).apply(lambda x: set(x[x['reordered'] == True]['product_id'])
    ).reset_index(name='target')

print "done."

Reading and merging orders tables ...  done.


In [47]:
#
# Before splitting off the validation set, compute "product features".
# These are features that depend only on product_id; their values are the
# same for training, validation and testing.
#

print "Computing product features ... ",

#   number of users who ever had this product in their basket
x = prior.groupby('product_id').apply(
    lambda x: len(set(x['user_id']))).reset_index(name='nusers')

# number of users that reorderd the item
y = train.groupby('product_id').apply(
    lambda x: x['reordered'].sum()).reset_index(name='reordered')

reorder_prob = x.merge(y, on='product_id', how='left').fillna(0)
reorder_prob['reorder_prob'] = reorder_prob['reordered'] / reorder_prob['nusers']
del x, y, reorder_prob['reordered'], reorder_prob['nusers']

#
# FIXME: try inter-purchase time?
#

print "done."

Computing product features ...  done.


In [111]:
#
# Generate features ...
#

print "Computing user / user-product features ... "

prior_gb_up = prior.groupby(['user_id', 'product_id'])
prior_gb_u = prior.groupby('user_id')
# FIXME is there a way to reuse prior_gb_up here? 

Computing user / user-product features ... 


In [110]:
#   1: Number of times product has been ordered by user
print "\tnorders of product by user ... ",
user_prod_norders = prior_gb_up.apply(len).reset_index(name='user_prod_norders')
print "done."

 Computing user / user-product features ... 
	norders of product by user ...  done.


In [108]:
#   2: Total number of orders by user
print "\tnorders by user ... ",
user_norders = prior_gb_u.apply(
    lambda x: len(set(x['order_id']))).reset_index(name='user_norders')
# FIXME you can also get this from orders table ... probably quicker
print "done."

	norders by user ...  done.


In [134]:
#   3: Fraction of user baskets containing product
print "\torder rate of product by user ... ",
user_prod_rate = user_prod_norders.merge(user_norders, on='user_id', how='left')
user_prod_rate['user_prod_rate'] = user_prod_rate['user_prod_norders'] / 1.0 / user_prod_rate['user_norders']

del user_prod_rate['user_prod_norders'], user_prod_rate['user_norders']
# FIXME I guess you could delete the other tables instead, and this is then a bit
# more handy to carry around, but it doesn't play nice with jupyter, where
# I want cell execution independence ...

print "done."

	order rate of product by user ...  done.


In [112]:
#   4: Number of baskets since user last ordered this item
print "\tbaskets since last order of product ... ",
baskets_since = prior_gb_up.apply(
    lambda x: max(x['order_number'])).reset_index(name='last_basket')

baskets_since = baskets_since.merge(user_norders, on='user_id', how='left')
baskets_since['baskets_since'] = baskets_since['user_norders'] - baskets_since['last_basket']
del baskets_since['user_norders'], baskets_since['last_basket']
print "done."

	baskets since last order of product ...  done.


In [101]:
#   5: Mean basket position. Lower basket position means customer likes product?
print "\tmean basket position ... ",
baskets_pos = prior_gb_up.apply(
    lambda x: x['add_to_cart_order'].mean()).reset_index(name='basket_pos')
print "done."

	mean basket position ...  done.


In [102]:
#   6/7/8: Time of day / dow / days since prior order
print "\torder features ... ",
order_features = orders[orders['eval_set'] != 'prior'][[
    'user_id',
    'order_hour_of_day',
    'order_dow',
    'days_since_prior_order']]

assert len(order_features) == len(orders['user_id'].unique())
del prior_gb_up
print "done."

	order features ...  done.


In [103]:
#    9: Does user reorder in general?
print "\tuser reorder rate ... ",
user_reorder_rate = prior_gb_u.apply(
    lambda x: x['reordered'].sum() / 1.0 / len(set(x['order_id']))
    ).reset_index(name='user_reorder_rate')
print "done."

	user reorder rate ...  done.


In [138]:
#   10: User order rate
print "\tuser general order rate (days) ... ",
user_order_dt = prior_gb_u.apply(
    lambda x: x['days_since_prior_order'].mean()
    ).reset_index(name='user_order_dt')
print "done."

	user general order rate (days) ...  done.


In [155]:
#
# Assemble all features together in one table
#
features = user_prod_norders.merge(
    user_norders, on='user_id', how='left'
    ).merge(user_prod_rate, on=['user_id', 'product_id'], how='left'
    ).merge(baskets_since, on=['user_id', 'product_id'], how='left'
    ).merge(baskets_pos, on=['user_id', 'product_id'], how='left'            
    ).merge(reorder_prob, on='product_id', how='left'
    ).merge(order_features, on='user_id', how='left'
    ).merge(user_reorder_rate, on='user_id', how='left'
    ).merge(user_order_dt, on='user_id', how='left')
# need to keep "product features" for submission stage

# Add training data, i.e., whether product was reordered by user
features = features.merge(
    train[['user_id','product_id','reordered']],
    on=['user_id','product_id'], how='left').fillna(0)

features.head(10)

Unnamed: 0,user_id,product_id,user_prod_norders,user_norders,user_prod_rate,baskets_since,basket_pos,reorder_prob,order_hour_of_day,order_dow,days_since_prior_order,user_reorder_rate,user_order_dt,reordered
0,1,196,10,10,1.0,0,1.4,0.152625,8,4,14.0,4.1,20.259259,1.0
1,1,10258,9,10,0.9,0,3.333333,0.140036,8,4,14.0,4.1,20.259259,1.0
2,1,10326,1,10,0.1,5,5.0,0.147166,8,4,14.0,4.1,20.259259,0.0
3,1,12427,10,10,1.0,0,3.3,0.095295,8,4,14.0,4.1,20.259259,0.0
4,1,13032,3,10,0.3,0,6.333333,0.090202,8,4,14.0,4.1,20.259259,1.0
5,1,13176,2,10,0.2,5,6.0,0.210303,8,4,14.0,4.1,20.259259,0.0
6,1,14084,1,10,0.1,9,2.0,0.148406,8,4,14.0,4.1,20.259259,0.0
7,1,17122,1,10,0.1,5,6.0,0.104819,8,4,14.0,4.1,20.259259,0.0
8,1,25133,8,10,0.8,0,4.0,0.086335,8,4,14.0,4.1,20.259259,1.0
9,1,26088,2,10,0.2,8,4.5,0.066208,8,4,14.0,4.1,20.259259,1.0


In [157]:
#
# Establish separate training and validation data
#

Nval = 50000 # seems like a good number
uids = train['user_id'].unique()
uids_train = uids[:-Nval]
uids_val = uids[-Nval:]

feature_list = [
    'user_prod_norders',
    'user_norders',
    'user_prod_rate',
    'baskets_since',
    'basket_pos',
    'reorder_prob',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
    'user_reorder_rate',
    'user_order_dt']

#
# training
#
where = features['user_id'].isin(uids_train)
X_train = features[where][feature_list].as_matrix()
y_train = features[where]['reordered'].as_matrix()

#
# validation
#
where = features['user_id'].isin(uids_val)
X_val = features[where][feature_list].as_matrix()
y_val = features[where]['reordered'].as_matrix()

#
# test / submission
#
uids_test = orders[orders['eval_set'] == 'test']['user_id'].unique()
where = features['user_id'].isin(uids_test)
X_test = features[where][feature_list].as_matrix()

print len(uids_val), len(uids_train), len(uids_test), len(orders['user_id'].unique())

50000 81209 75000 206209


In [159]:
#
# Train the decision tree model(s)
#

#
# Apparently my solution was submitted with md = 12 on accident
#

trees = []
for mss in [50, 500, 5000]:
    for md in [12, None]:
        for mf in [4, 6]:
            print mss, md, mf
            clf = RandomForestClassifier(
                n_estimators=100, 
                random_state=0, 
                min_samples_split=mss,
                max_features=mf,
                max_depth=md,
                n_jobs=4,
                verbose=5)
            clf.fit(X_train, y_train)
            trees.append(clf)
            #export_graphviz(clf, out_file='tree.dot')
            print clf.feature_importances_

50 12 4
building tree 1 of 100
building tree 2 of 100building tree 3 of 100building tree 4 of 100


building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  2.1min


building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55

[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 10.6min


building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 16.3min finished


[ 0.12919287  0.02680279  0.38718137  0.28206054  0.00887533  0.10074782
  0.00180616  0.00370254  0.02784887  0.01818769  0.01359403]
50 12 6
building tree 1 of 100
building tree 2 of 100building tree 4 of 100building tree 3 of 100


building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  2.7min


building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 19 of 100building tree 18 of 100

building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56

[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 14.6min


building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 22.5min finished


[ 0.11424764  0.01605164  0.4191254   0.28218635  0.00738658  0.09576013
  0.00171718  0.00378355  0.03000439  0.01675919  0.01297795]
50 None 4
building tree 2 of 100building tree 1 of 100 building tree 4 of 100
building tree 3 of 100


building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  2.8min


building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56

[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 15.4min


building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 24.1min finished


[ 0.07906365  0.03530213  0.23368996  0.1513633   0.06063451  0.14184713
  0.02117745  0.04417509  0.05227964  0.08449257  0.09597457]
50 None 6
building tree 1 of 100building tree 2 of 100 building tree 4 of 100

building tree 3 of 100

building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  4.3min


building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55

[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 23.5min


building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 36.5min finished


[ 0.065313    0.03024887  0.24656317  0.14709097  0.0618758   0.14004905
  0.02065692  0.04456546  0.05145966  0.08883011  0.10334699]
500 12 4
building tree 2 of 100building tree 1 of 100building tree 3 of 100

 building tree 4 of 100

building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  2.0min


building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55

[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 10.9min


building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 16.7min finished


[ 0.1239648   0.02834273  0.40761617  0.27897693  0.00642513  0.10064019
  0.00092859  0.0019738   0.02661312  0.01468128  0.00983726]
500 12 6
building tree 1 of 100building tree 3 of 100building tree 2 of 100


building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  2.8min


building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100building tree 38 of 100

building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55

[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 15.2min


building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 23.5min finished


[ 0.11433702  0.01619295  0.43190204  0.28224086  0.00513427  0.09552426
  0.00079058  0.00183559  0.02958027  0.01335633  0.00910584]
500 None 4
building tree 3 of 100building tree 2 of 100building tree 1 of 100
building tree 4 of 100


building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  2.7min


building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55

[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 15.1min


building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 23.4min finished


[ 0.11607947  0.02467049  0.35931972  0.25479638  0.01909415  0.11111304
  0.00469568  0.01051726  0.03463754  0.03361225  0.03146402]
500 None 6
building tree 1 of 100building tree 3 of 100building tree 2 of 100
 
building tree 4 of 100

building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  4.1min


building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55

[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 22.1min


building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 34.3min finished


[ 0.10387886  0.01859397  0.38427822  0.24630599  0.01915283  0.10723546
  0.00457629  0.01102286  0.0357928   0.03432062  0.03484209]
5000 12 4
building tree 1 of 100building tree 2 of 100

building tree 3 of 100
 building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  1.9min


building tree 14 of 100building tree 15 of 100

building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55

[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 10.3min


building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 15.9min finished


[  1.32884146e-01   2.54406472e-02   4.22473019e-01   2.87793978e-01
   3.38550480e-03   9.30411224e-02   1.67764725e-04   2.68333116e-04
   1.97463922e-02   1.03315137e-02   4.46757862e-03]
5000 12 6
building tree 1 of 100
building tree 2 of 100
building tree 3 of 100building tree 4 of 100

building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  2.7min


building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55

[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 14.6min


building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 22.5min finished


[  1.20936650e-01   1.47291951e-02   4.51443184e-01   2.87562839e-01
   1.38614192e-03   9.02938071e-02   1.08556196e-04   1.95844546e-04
   2.20902526e-02   8.28707559e-03   2.96645414e-03]
5000 None 4
building tree 1 of 100
building tree 2 of 100 
building tree 4 of 100
building tree 3 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  2.2min


building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55

[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 12.1min


building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 18.7min finished


[  1.25186013e-01   2.46596814e-02   4.22157377e-01   2.91267188e-01
   3.71796169e-03   9.49761049e-02   2.94862750e-04   5.34818635e-04
   2.05602019e-02   1.13523541e-02   5.29343659e-03]
5000 None 6
building tree 4 of 100
building tree 3 of 100building tree 1 of 100building tree 2 of 100


building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  3.2min


building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55

[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 17.6min


building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 27.1min finished


[  1.16970661e-01   1.42823410e-02   4.44566696e-01   2.93409776e-01
   2.21490613e-03   9.13913065e-02   2.39609153e-04   4.70489548e-04
   2.24684769e-02   9.81356863e-03   4.17216927e-03]


In [161]:
clf.feature_importances_

array([  1.16970661e-01,   1.42823410e-02,   4.44566696e-01,
         2.93409776e-01,   2.21490613e-03,   9.13913065e-02,
         2.39609153e-04,   4.70489548e-04,   2.24684769e-02,
         9.81356863e-03,   4.17216927e-03])

In [160]:
#
# Let's see how we covered the training set
#
print "Performance on training set ... "
this_target = target[target['user_id'].isin(uids_train)]
this_features = features[features['user_id'].isin(uids_train)]
for clf in trees:
    pred = clf.predict_proba(X_train)[:, 1]
    for p in [0.195]: # time and time again, this value wins
        this_features['prediction'] = pred > p
        out = this_features.groupby('user_id').apply(
            lambda x: set(x[x['prediction'] == True]['product_id'])
            ).reset_index(name='prediction')

        out = out.merge(this_target, on='user_id', how='left')
        print p, f1score(out['prediction'], out['target'])
        del this_features['prediction']
    
#
# Now, how do we perform out of sample?
#
print "Performance on validation set ... "
this_target = target[target['user_id'].isin(uids_val)]
this_features = features[features['user_id'].isin(uids_val)]
for clf in trees:
    pred = clf.predict_proba(X_val)[:, 1]
    for p in [0.195]:
        
        this_features['prediction'] = pred > p
        out = this_features.groupby('user_id').apply(
            lambda x: set(x[x['prediction'] == True]['product_id'])
            ).reset_index(name='prediction')

        out = out.merge(this_target, on='user_id', how='left')
        print p, f1score(out['prediction'], out['target'])
        del this_features['prediction']

Performance on training set ... 


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    4.7s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   29.6s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   47.6s finished
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


0.195 0.388467974111


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    3.5s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   26.9s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   45.6s finished


0.195 0.388522909115


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   10.3s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  1.0min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  1.7min finished


0.195 0.549161874823


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   10.9s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  1.7min finished


0.195 0.564119586729


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    3.4s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   25.7s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   42.4s finished


0.195 0.386366775228


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    3.6s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   27.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   46.2s finished


0.195 0.386500302265


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    6.2s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   42.7s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  1.2min finished


0.195 0.401955406092


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    6.8s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   44.8s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  1.2min finished


0.195 

[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    3.2s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   24.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   41.4s finished


0.404761028802
0.195 0.381601178402

[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    3.3s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   24.5s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   42.2s finished



0.195 0.382120796345


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    3.8s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   29.6s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   48.4s finished


0.195 0.382199472717


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    4.2s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   31.8s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   50.5s finished


0.195 0.382448853712
Performance on validation set ... 


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   16.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   26.6s finished
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.195 0.381724278316


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   13.6s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   26.6s finished


0.195 0.381603490798


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    6.4s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   36.7s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   59.3s finished


0.195 0.375994591223


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    6.9s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   38.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  1.0min finished


0.195 0.374776859015


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   14.5s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   24.9s finished


0.195 0.381584697913


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    2.1s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   11.9s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   24.2s finished


0.195 0.381180183036


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    3.7s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   21.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   37.6s finished


0.195 0.381290996513


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    4.2s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   22.5s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   39.7s finished


0.195 0.380858086626


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   10.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   20.7s finished


0.195 0.380255053615


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    2.0s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   10.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   21.4s finished


0.195 0.381088283796


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   12.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   23.9s finished


0.195 0.380691221314


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    2.4s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   12.8s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   25.7s finished


0.195 0.381236119921


In [144]:
#
# Optional. Retrain the model with the winning hyperparameters on the entire 
# training data set. I think this is always a good idea? Can it go wrong?
#
clf = RandomForestClassifier(
    n_estimators=100, 
    random_state=0, 
    min_samples_split=1000,
    n_jobs=3,
    verbose=3)
X_all = np.vstack((X_train, X_val))
y_all = np.concatenate((y_train, y_val))
clf.fit(X_all, y_all)

building tree 2 of 100
building tree 1 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100


[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  7.5min


building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70

[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed: 28.7min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=1000, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=3, oob_score=False, random_state=0,
            verbose=3, warm_start=False)

In [153]:
# check the model hasn't changed too much

print clf.feature_importances_ 
print trees[2].feature_importances_

pred = trees[2].predict_proba(X_all)[:, 1]
uids = np.concatenate((uids_train, uids_val))
this_features = features[features['user_id'].isin(uids)]
this_features['prediction'] = pred > 0.195
this_target = target
out = this_features.groupby('user_id').apply(
    lambda x: set(x[x['prediction'] == True]['product_id'])
    ).reset_index(name='prediction')

out = out.merge(this_target, on='user_id', how='left')

print out.head()
print f1score(out['prediction'], out['target'])
#del this_features['prediction']

[ 0.297212    0.0758229   0.38044754  0.15007322  0.00612828  0.01259436
  0.03979964  0.03792204]
[ 0.297212    0.0758229   0.38044754  0.15007322  0.00612828  0.01259436
  0.03979964  0.03792204]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


   user_id                                         prediction  \
0        1  {196, 46149, 13032, 39657, 12427, 25133, 35951...   
1        2  {47209, 7781, 22825, 32139, 22124, 16589, 2170...   
2        5                        {6808, 40706, 22475, 15349}   
3        7  {13249, 39275, 45537, 15592, 29993, 27690, 190...   
4        8  {18531, 24838, 33640, 6473, 651, 18479, 14992,...   

                                              target  
0  {196, 26405, 13032, 39657, 25133, 38928, 26088...  
1  {41787, 33957, 22825, 45066, 16589, 45613, 229...  
2                       {21616, 40706, 15349, 21413}  
3  {29894, 17638, 47272, 45066, 13198, 37999, 408...  
4                       {15937, 41540, 23165, 21903}  
0.113657192067


In [162]:
trees[4]

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features=4, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=500, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=4, oob_score=False, random_state=0,
            verbose=5, warm_start=False)

In [163]:
#
# Let's make a submission!!
#

clf = trees[4]

#
# generate predictions for test set from model
#
oid_uid_test = orders[orders['eval_set'] == 'test'][['order_id', 'user_id']]
pred = clf.predict_proba(X_test)[:, 1] > 0.195
this_features = features[features['user_id'].isin(uids_test)]
this_features['prediction'] = pred
this_features = this_features.merge(oid_uid_test, on='user_id', how='left')
out = this_features.groupby('order_id').apply(
    lambda x: set(x[x['prediction'] == True]['product_id'])
    ).reset_index(name='prediction')

out.head()

[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    3.4s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   23.7s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   38.0s finished
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


Unnamed: 0,order_id,prediction
0,17,"{13107, 21709, 47766, 21463}"
1,34,"{47792, 44632, 39180, 43504, 21137, 16083, 477..."
2,137,"{38689, 25890, 5134, 23794, 24852, 2326, 29594..."
3,182,"{11520, 5479, 33000, 47209, 39275, 47672, 9337..."
4,257,"{27104, 49235, 24838, 39475, 29837, 13870, 211..."


In [164]:
#
# Write predictions to disk for submission.
#

fd = open('submission.csv', 'w')
fd.write('order_id,products\n')

for oid, pr in zip(out['order_id'], out['prediction']):
    fd.write('%d,' % oid)

    if pr:
        fd.write(' '.join(map(str, pr)))
    else:
        fd.write('None')
    fd.write('\n')

fd.close()