In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from collections import Counter
from evaluate import f1score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
import pylab
# Default figure size for every plot drawn in this notebook.
pylab.rcParams.update({'figure.figsize': (10, 6)})

In [3]:
#
# Read in training (and validation) data
# 

print "Reading and merging orders tables ... ",

train = pd.read_csv('order_products__train.csv')
prior = pd.read_csv('order_products__prior.csv')
orders = pd.read_csv('orders.csv')
products = pd.read_csv('products.csv')

train = train.merge(
    orders, on='order_id', how='left').merge(
    products, on='product_id', how='left')
prior = prior.merge(
    orders, on='order_id', how='left').merge(
    products, on='product_id', how='left')

# compute training target
target = train.groupby('user_id'
    ).apply(lambda x: set(x[x['reordered'] == True]['product_id'])
    ).reset_index(name='target')

print "done."

 Reading and merging orders tables ...  done.


In [31]:
#
# "Product features" depend only on product_id, so their values are the same
# for training, validation and testing. Build them before the validation
# split, starting from a one-column table of the distinct product ids.
#
unique_product_ids = products['product_id'].unique()
nprods = len(unique_product_ids)
prod_features = pd.DataFrame(
    {'product_id': unique_product_ids},
    index=range(nprods))
prod_features.head()

Unnamed: 0,product_id
0,1
1,2
2,3
3,4
4,5


In [32]:
#
#   2: Department reorder probability
#
# Fraction of prior order lines in each department that are flagged as
# reordered. The grouped sum()/size() pair computes the same ratio as the
# former per-group lambda without running Python code once per department
# (identical whenever `reordered` has no missing values).
dept_lines = prior.groupby('department_id')['reordered']
dept_reorder = (dept_lines.sum() / 1.0 / dept_lines.size()
    ).reset_index(name='dept_reorder_prob')

# Spread the department-level value over every product of that department.
dept_reorder = dept_reorder.merge(products, on='department_id', how='left')

prod_features = prod_features.merge(
    dept_reorder[['product_id', 'dept_reorder_prob']], on='product_id', how='left')

prod_features.head()
#
# CHECKME: Why do these numbers seem so different from (1)??
#

#
# ANSWER: Because you are computing slightly different things. The
# query below gives a number more similar to the category reorder
# rate. See also below where I now compute prod prob differently.
#

#   number of users who ever had this product in their basket
# x = prior.groupby('product_id').apply(
#    lambda x: sum(x['reordered']) / 1.0 / len(x)).reset_index(name='reorder_prob2')

# prod_features = prod_features.merge(
#    x[['product_id', 'reorder_prob2']], on='product_id', how='left')
# prod_features.head()

Unnamed: 0,product_id,dept_reorder_prob
0,1,0.57418
1,2,0.346721
2,3,0.65346
3,4,0.541885
4,5,0.346721


In [33]:
#
#   3: Department ID heuristic split
#
# Replace the raw department id by a binary flag: 1 for departments 4 and
# 16, 0 for everything else.
prod_features = prod_features.merge(
    products[['product_id', 'department_id']], on='product_id', how='left')
# Vectorised membership test instead of a row-wise apply(axis=1). A missing
# department id yields 0, just as the former `in [4, 16]` test did.
prod_features['department_id'] = prod_features['department_id'].isin(
    [4, 16]).astype(int)
prod_features.head()

# sanity check
# prod_features[prod_features['department_id'] == 1].merge(
#    products, on='product_id', how='left')['department_id_y'].unique()

Unnamed: 0,product_id,dept_reorder_prob,department_id
0,1,0.57418,0
1,2,0.346721,0
2,3,0.65346,0
3,4,0.541885,0
4,5,0.346721,0


In [34]:
#
#   4: People love to reorder organic
#
# is_organic is 1 when the lower-cased product name contains the literal
# substring 'organic', else 0.
prod_features = prod_features.merge(
    products[['product_id', 'product_name']], on='product_id', how='left')
# Vectorised string ops instead of a row-wise apply(axis=1). regex=False
# keeps this a plain substring test; na=False maps a missing name to 0
# (the row-wise version would have raised on a missing name).
prod_features['is_organic'] = prod_features['product_name'].str.lower(
    ).str.contains('organic', regex=False, na=False).astype(int)
# The name was only needed to derive the flag.
del prod_features['product_name']
prod_features.head()

Unnamed: 0,product_id,dept_reorder_prob,department_id,is_organic
0,1,0.57418,0,0
1,2,0.346721,0,0
2,3,0.65346,0,0
3,4,0.541885,0,0
4,5,0.346721,0,0


In [53]:
#
#   5: Some items are ordered at certain times?
#
# Mean order hour over the *reordered* prior order lines of each product.
# Filtering once and using the built-in grouped mean avoids a Python lambda
# per product. Products without any reordered line are absent from the
# aggregate and therefore still end up as NaN after the left merge.
reordered_lines = prior[prior['reordered'] == 1]
prod_features = prod_features.merge(
    reordered_lines.groupby('product_id')['order_hour_of_day'].mean(
        ).reset_index(name='mean_hod'),
    on='product_id', how='left')
prod_features.head()

Unnamed: 0,product_id,dept_reorder_prob,department_id,is_organic,reorder_prob,mean_hod
0,1,0.57418,0,0,0.185367,13.25
1,2,0.346721,0,0,0.086042,11.916667
2,3,0.65346,0,0,0.167777,11.773399
3,4,0.541885,0,0,0.195368,13.952381
4,5,0.346721,0,0,0.220588,9.222222


In [54]:
#
#   5: Some items are ordered at certain times? (day of week)
#
# Mean order_dow over the *reordered* prior order lines of each product.
# Filtering once and using the built-in grouped mean avoids a Python lambda
# per product. Products without any reordered line are absent from the
# aggregate and therefore still end up as NaN after the left merge.
reordered_lines = prior[prior['reordered'] == 1]
prod_features = prod_features.merge(
    reordered_lines.groupby('product_id')['order_dow'].mean(
        ).reset_index(name='mean_dow'),
    on='product_id', how='left')
prod_features.head()

Unnamed: 0,product_id,dept_reorder_prob,department_id,is_organic,reorder_prob,mean_hod,mean_dow
0,1,0.57418,0,0,0.185367,13.25,2.72007
1,2,0.346721,0,0,0.086042,11.916667,2.916667
2,3,0.65346,0,0,0.167777,11.773399,2.753695
3,4,0.541885,0,0,0.195368,13.952381,2.605442
4,5,0.346721,0,0,0.220588,9.222222,3.111111


In [64]:
#
#   6: Aisles --> product category? Can the DT handle a "category" feature?
#
# The raw aisle id is attached unchanged; no encoding is applied here.
product_aisles = products[['product_id', 'aisle_id']]
prod_features = prod_features.merge(product_aisles, on='product_id', how='left')
prod_features.head()

Unnamed: 0,product_id,dept_reorder_prob,department_id,is_organic,reorder_prob,mean_hod,mean_dow,aisle_id
0,1,0.57418,0,0,0.185367,13.25,2.72007,61
1,2,0.346721,0,0,0.086042,11.916667,2.916667,104
2,3,0.65346,0,0,0.167777,11.773399,2.753695,94
3,4,0.541885,0,0,0.195368,13.952381,2.605442,38
4,5,0.346721,0,0,0.220588,9.222222,3.111111,5


In [22]:
#
# Generate user features ... 
# FIXME make pretty and organized like prod_features
#

print "Computing user / user-product features ... "

prior_gb_up = prior.groupby(['user_id', 'product_id'])
prior_gb_u = prior.groupby('user_id')
# FIXME is there a way to reuse prior_gb_up here? 

 Computing user / user-product features ... 


In [12]:
#   1: Number of times product has been ordered by user
print "\tnorders of product by user ... ",
user_prod_norders = prior_gb_up.apply(len).reset_index(name='user_prod_norders')
print "done."

	norders of product by user ...  done.


In [13]:
#   2: Total number of orders by user
print "\tnorders by user ... ",
user_norders = prior_gb_u.apply(
    lambda x: len(set(x['order_id']))).reset_index(name='user_norders')
# FIXME you can also get this from orders table ... probably quicker
print "done."

	norders by user ...  done.


In [20]:
#   3: Fraction of user baskets containing product
print "\torder rate of product by user ... ",
user_prod_rate = user_prod_norders.merge(user_norders, on='user_id', how='left')
user_prod_rate['user_prod_rate'] = user_prod_rate['user_prod_norders'] / 1.0 / user_prod_rate['user_norders']

del user_prod_rate['user_prod_norders'], user_prod_rate['user_norders']
# FIXME I guess you could delete the other tables instead, and this is then a bit
# more handy to carry around, but it doesn't play nice with jupyter, where
# I want cell execution independence ...

print "done."

	order rate of product by user ...  done.


In [14]:
#   4: Number of baskets since user last ordered this item
# FIXME why is this so slow?!
print "\tbaskets since last order of product ... ",
baskets_since = prior_gb_up.apply(
    lambda x: max(x['order_number'])).reset_index(name='last_basket')

baskets_since = baskets_since.merge(user_norders, on='user_id', how='left')
baskets_since['baskets_since'] = baskets_since['user_norders'] - baskets_since['last_basket']
del baskets_since['user_norders'], baskets_since['last_basket']
print "done."

	baskets since last order of product ...  done.


In [15]:
#   5: Number of baskets since user first ordered this item
# FIXME why is this so slow?!
print "\tbaskets since first order of product ... ",
baskets_since_1st = prior_gb_up.apply(
    lambda x: min(x['order_number'])).reset_index(name='last_basket')

baskets_since_1st = baskets_since_1st.merge(user_norders, on='user_id', how='left')
baskets_since_1st['baskets_since_1st'] = baskets_since_1st['user_norders'] - baskets_since_1st['last_basket']
del baskets_since_1st['user_norders'], baskets_since_1st['last_basket']
print "done."

	baskets since first order of product ...  done.


In [35]:
#
#   1: Product reorder probability
#

# The way I was originally computing this may have been biased.
# Below is an unbiased (i.e. without peeking at the training
# set) way to estimate the probability a product is ordered in
# a basket given that it ever appeared in a basket.

# Per product: total number of times it was ordered, divided by one plus the
# total number of baskets users placed since first ordering it. Both totals
# come from one grouped sum() instead of two Python sum() calls per product.
up_counts = user_prod_norders.merge(
    baskets_since_1st, on=['user_id', 'product_id'], how='left')
prod_totals = up_counts.groupby('product_id')[
    ['user_prod_norders', 'baskets_since_1st']].sum()
reorder_prob = (
    prod_totals['user_prod_norders'] / 1.0 / (1 + prod_totals['baskets_since_1st'])
    ).reset_index(name='reorder_prob')

prod_features = prod_features.merge(
    reorder_prob[['product_id', 'reorder_prob']], on='product_id', how='left')

In [16]:
#   6: Mean basket position. Lower basket position means customer likes product?
print "\tmean basket position ... ",
baskets_pos = prior_gb_up.apply(
    lambda x: x['add_to_cart_order'].mean()).reset_index(name='basket_pos')
print "done."

	mean basket position ...  done.


In [17]:
#   7/8/9: Time of day / dow / days since prior order
print "\torder features ... ",
order_features = orders[orders['eval_set'] != 'prior'][[
    'user_id',
    'order_hour_of_day',
    'order_dow',
    'days_since_prior_order']]

assert len(order_features) == len(orders['user_id'].unique())
print "done."

	order features ...  done.


In [18]:
#    10: Does user reorder in general?
print "\tuser reorder rate ... ",
user_reorder_rate = prior_gb_u.apply(
    lambda x: x['reordered'].sum() / 1.0 / len(x)
    ).reset_index(name='user_reorder_rate')
print "done."

	user reorder rate ...  done.


In [19]:
#   11: User order rate
print "\tuser general order rate (days) ... ",
user_order_dt = prior_gb_u.apply(
    lambda x: x['days_since_prior_order'].mean()
    ).reset_index(name='user_order_dt')
print "done."

	user general order rate (days) ...  done.


In [55]:
#   12: User product order rate.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt,
# and the resulting feature is commented out of both the feature assembly
# and feature_list further down.
print "\tuser product order rate (days) ... ",
def up_order_rate(x):
    """Days per order between the first and last order in a group of rows.

    x is a DataFrame with 'order_number' and 'days_since_prior_order'
    columns (here: the prior order lines of one user/product pair).

    Returns 0 when the smallest and largest order number coincide; otherwise
    the builtin sum of 'days_since_prior_order' over the rows whose order
    number is >= the group's own minimum, divided by (max - min). Because the
    builtin sum is used, a NaN gap among those rows makes the result NaN.
    """
    order_numbers = x['order_number']
    first = min(order_numbers)
    last = max(order_numbers) # we actually want max over all products
                              # this will have to do for now
    if first == last:
        return 0

    span = last - first
    gaps = x.loc[order_numbers >= first, 'days_since_prior_order']
    return sum(gaps) / 1.0 / span

user_prod_order_dt = prior_gb_up.apply(up_order_rate
    ).reset_index(name='user_prod_order_dt')
print "done."

	user product order rate (days) ... 

KeyboardInterrupt: 

In [65]:
#
# Assemble all features together in one table
#
# Start from one row per (user, product) pair seen in the prior orders and
# left-join every feature table on the key(s) it is indexed by.
up_keys = ['user_id', 'product_id']
feature_tables = [
    (user_norders, 'user_id'),
    (user_prod_rate, up_keys),
    (baskets_since, up_keys),
    (baskets_since_1st, up_keys),
    (baskets_pos, up_keys),
    (prod_features, 'product_id'),
    (order_features, 'user_id'),
    (user_reorder_rate, 'user_id'),
    (user_order_dt, 'user_id'),
    #(user_prod_order_dt, 'user_id'),
    (pd.read_csv('order_streaks.csv'), up_keys),
]
features = user_prod_norders
for table, keys in feature_tables:
    features = features.merge(table, on=keys, how='left')
# need to keep "product features" for submission stage

# Add training data, i.e., whether product was reordered by user.
# fillna(0) turns every remaining NaN in the table into 0, including the
# label of pairs that have no row in `train`.
labels = train[['user_id', 'product_id', 'reordered']]
features = features.merge(labels, on=up_keys, how='left').fillna(0)

features.head(10)

Unnamed: 0,user_id,product_id,user_prod_norders,user_norders,user_prod_rate,baskets_since,baskets_since_1st,basket_pos,dept_reorder_prob,department_id,...,mean_hod,mean_dow,aisle_id,order_hour_of_day,order_dow,days_since_prior_order,user_reorder_rate,user_order_dt,order_streak,reordered
0,1,196,10,10,1.0,0,9,1.4,0.65346,0,...,12.409989,2.868555,77,8,4,14.0,0.694915,20.259259,5.0,1.0
1,1,10258,9,10,0.9,0,8,3.333333,0.57418,0,...,12.027358,2.715623,117,8,4,14.0,0.694915,20.259259,5.0,1.0
2,1,10326,1,10,0.1,5,5,5.0,0.649913,1,...,12.829031,2.729115,24,8,4,14.0,0.694915,20.259259,0.0,0.0
3,1,12427,10,10,1.0,0,9,3.3,0.57418,0,...,12.098812,2.643319,23,8,4,14.0,0.694915,20.259259,5.0,0.0
4,1,13032,3,10,0.3,0,8,6.333333,0.560922,0,...,12.451521,2.754564,121,8,4,14.0,0.694915,20.259259,1.0,1.0
5,1,13176,2,10,0.2,5,8,6.0,0.649913,1,...,13.342547,2.69918,24,8,4,14.0,0.694915,20.259259,0.0,0.0
6,1,14084,1,10,0.1,9,9,2.0,0.669969,1,...,13.372127,2.695272,91,8,4,14.0,0.694915,20.259259,0.0,0.0
7,1,17122,1,10,0.1,5,5,6.0,0.649913,1,...,12.629092,2.6228,24,8,4,14.0,0.694915,20.259259,0.0,0.0
8,1,25133,8,10,0.8,0,7,4.0,0.669969,1,...,12.87113,2.648495,21,8,4,14.0,0.694915,20.259259,5.0,1.0
9,1,26088,2,10,0.2,8,9,4.5,0.57418,0,...,13.042647,2.918382,23,8,4,14.0,0.694915,20.259259,0.0,1.0


In [66]:
#
# Establish separate training and validation data
#

Nval = 50000 # seems like a good number
uids = train['user_id'].unique()
uids_train = uids[:-Nval]
uids_val = uids[-Nval:]

feature_list = [
    'reorder_prob', # product features
    'dept_reorder_prob',
    'department_id',
    'is_organic',
    'aisle_id',
    'mean_hod',
    'mean_dow',
    'user_prod_norders', # user-product features
    'user_prod_rate',
    'baskets_since',
    'baskets_since_1st',
    'basket_pos',
    'order_dow', # order features
    'order_hour_of_day',
    'days_since_prior_order',
    'user_norders', # user features
    'user_reorder_rate',
    'user_order_dt',
    #'user_prod_order_dt',
    'order_streak']

#
# training
#
where = features['user_id'].isin(uids_train)
X_train = features[where][feature_list].as_matrix()
y_train = features[where]['reordered'].as_matrix()

#
# validation
#
where = features['user_id'].isin(uids_val)
X_val = features[where][feature_list].as_matrix()
y_val = features[where]['reordered'].as_matrix()

#
# test / submission
#
uids_test = orders[orders['eval_set'] == 'test']['user_id'].unique()
where = features['user_id'].isin(uids_test)
X_test = features[where][feature_list].as_matrix()

print len(uids_val), len(uids_train), len(uids_test), len(orders['user_id'].unique())

50000 81209 75000 206209


In [None]:
#
# Compute training target(s)
#
# Per user, the set of product ids flagged as reordered in `train`.
target = train.groupby('user_id'
    ).apply(lambda x: set(x[x['reordered'] == True]['product_id'])
    ).reset_index(name='target')
train_target = target[target['user_id'].isin(uids_train)]
val_target = target[target['user_id'].isin(uids_val)]

# Later cells add a 'prediction' column to these two frames. Take explicit
# copies so those writes go to independent frames rather than to slices of
# `features` (which triggers pandas' SettingWithCopyWarning).
train_features = features[features['user_id'].isin(uids_train)].copy()
val_features = features[features['user_id'].isin(uids_val)].copy()

In [61]:
#
# All the cool kids are using gradient boosting ...
#
from sklearn.ensemble import GradientBoostingClassifier
# Settings collected in one place; verbose=5 makes fit() print its progress.
gb_params = dict(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=3,
    min_samples_split=500,
    random_state=0,
    verbose=5)
clf = GradientBoostingClassifier(**gb_params).fit(X_train, y_train)


      Iter       Train Loss   Remaining Time 
         1           0.5915           24.67m
         2           0.5321           25.03m
         3           0.5147           25.42m
         4           0.5090           25.03m
         5           0.5070           24.47m
         6           0.5057           23.72m
         7           0.5046           23.51m
         8           0.5030           23.21m
         9           0.5009           22.87m
        10           0.5002           22.35m
        11           0.4996           22.14m
        12           0.4990           21.96m
        13           0.4984           21.66m
        14           0.4981           21.29m
        15           0.4975           21.04m
        16           0.4969           20.80m
        17           0.4967           20.57m
        18           0.4961           20.32m
        19           0.4960           20.11m
        20           0.4957           19.82m
        21           0.4954           19.53m
        2

In [None]:
print "Performance on training set ... "
pred = clf.predict_proba(X_train)[:, 1]
for p in [0.195, 0.2]: # time and time again, this value wins
    train_features['prediction'] = pred > p
    out = train_features.groupby('user_id').apply(
        lambda x: set(x[x['prediction'] == True]['product_id'])
        ).reset_index(name='prediction')
    out = out.merge(train_target, on='user_id', how='left')
    f1 = f1score(out['prediction'], out['target'])
    print p, f1

# 0.195 0.376389954484
# 0.2 0.376154126071    

pred = clf.predict_proba(X_val)[:, 1]
for p in [0.195, 0.2]: # time and time again, this value wins
    val_features['prediction'] = pred > p
    out = val_features.groupby('user_id').apply(
        lambda x: set(x[x['prediction'] == True]['product_id'])
        ).reset_index(name='prediction')
    out = out.merge(val_target, on='user_id', how='left')
    f1 = f1score(out['prediction'], out['target'])
    print p, f1    

In [None]:
#
# try a bunch of hyperparameters
#
trees = []
import itertools
for mss, md, mf in itertools.product(
    [1000, 1500], 
    [10, 15, 18], 
    [4, 6]):

    clf = RandomForestClassifier(
        criterion='gini',
        n_estimators=15, 
        random_state=0, 
        min_samples_split=mss,
        max_features=mf,
        max_depth=md,
        n_jobs=4,
        verbose=0)
    print clf
    clf.fit(X_train, y_train)
    trees.append(clf)
    #export_graphviz(clf, out_file='tree.dot')
    print clf.feature_importances_
               
    print "Performance on training set ... "
    pred = clf.predict_proba(X_train)[:, 1]
    for p in [0.195, 0.2, 0.21]: # time and time again, this value wins
        train_features['prediction'] = pred > p
        out = train_features.groupby('user_id').apply(
            lambda x: set(x[x['prediction'] == True]['product_id'])
            ).reset_index(name='prediction')
        out = out.merge(train_target, on='user_id', how='left')
        f1 = f1score(out['prediction'], out['target'])
        print p, f1
    
    if f1 < 0.38: continue
    print "Performance on validation set ... "
    pred = clf.predict_proba(X_val)[:, 1]
    for p in [0.195, 0.2, 0.21]:
        
        val_features['prediction'] = pred > p
        out = val_features.groupby('user_id').apply(
            lambda x: set(x[x['prediction'] == True]['product_id'])
            ).reset_index(name='prediction')
        out = out.merge(val_target, on='user_id', how='left')
        print p, f1score(out['prediction'], out['target'])
        del val_features['prediction']

In [None]:
#
# Optional. Retrain the model with the winning hyperparameters on the entire
# training data set. I think this is always a good idea? Can it go wrong?

#
# This model was somewhat over-fitted on the training data.
# But it had a decent validation score. let's give it more data
# and hope for the best :/. Nothing else is working ...
#

# Training rows stacked on top of validation rows.
X_all = np.vstack((X_train, X_val))
y_all = np.concatenate((y_train, y_val))

rf_params = dict(
    n_estimators=50,
    random_state=0,
    min_samples_split=500,
    max_features=6,
    max_depth=15,
    n_jobs=4,
    verbose=15)
clf = RandomForestClassifier(**rf_params)
clf.fit(X_all, y_all)

In [None]:
# check the model hasn't changed too much

print clf.feature_importances_ 
#print trees[2].feature_importances_

uids = np.concatenate((uids_train, uids_val))

where = features['user_id'].isin(uids)
X_all = features[where][feature_list].as_matrix()
pred = clf.predict_proba(X_all)[:, 1]

this_features = features[where]
this_features['prediction'] = pred > 0.195

out = this_features.groupby('user_id').apply(
    lambda x: set(x[x['prediction'] == True]['product_id'])
    ).reset_index(name='prediction')
out = out.merge(target, on='user_id', how='left')

print f1score(out['target'], out['prediction'])
#del this_features['prediction']

In [None]:
# Display the forest stored at index 4 of the hyper-parameter sweep list.
trees[4]

In [None]:
#
# Let's make a submission!!
#
print clf
#clf = trees[4]

#
# generate predictions for test set from model
#
oid_uid_test = orders[orders['eval_set'] == 'test'][['order_id', 'user_id']]
where = features['user_id'].isin(uids_test)
test_features = features[where]

X_test = test_features[feature_list].as_matrix()
pred = clf.predict_proba(X_test)[:, 1] > 0.195

test_features.loc[:, 'prediction'] = pred
test_features = test_features.merge(oid_uid_test, on='user_id', how='left')
out = this_features.groupby('order_id').apply(
    lambda x: set(x[x['prediction'] == True]['product_id'])
    ).reset_index(name='prediction')

out.head()

In [None]:
#
# Write predictions to disk for submission.
#
# One line per order: "<order_id>,<space separated product ids>", or the
# literal 'None' when nothing is predicted for the order.

# The with-block closes the file even if a write fails part-way through.
with open('submission.csv', 'w') as fd:
    fd.write('order_id,products\n')

    for oid, pr in zip(out['order_id'], out['prediction']):
        # `pr` is a set; sorting makes the file identical from run to run.
        items = ' '.join(map(str, sorted(pr))) if pr else 'None'
        fd.write('%d,%s\n' % (oid, items))

In [None]:
#
#
# POST MORTEM: WHAT WENT WRONG?
#   what does the misclassified data look like?
#   what features could we add to better classify them?
#

#pred = clf.predict_proba(X_train)[:, 1]
#train_features['prediction'] = pred > 0.195
#train_features[train_features['prediction'] != train_features['reordered']].head(25)
def pd_f1score(x):
    """F1 score between predicted and observed products for a group of rows.

    x is a DataFrame with 'product_id', 'prediction' and 'reordered' columns.
    The predicted set holds the product ids where prediction == True, the
    observed set those where reordered == 1. An empty set is replaced by
    {None}, so two empty baskets count as a perfect match.

    Returns 0 when the two sets share no element, otherwise the harmonic
    mean of precision and recall.
    """
    predicted = set(x.loc[x['prediction'] == True, 'product_id'])
    observed = set(x.loc[x['reordered'] == 1, 'product_id'])

    # Placeholder for "nothing predicted" / "nothing reordered".
    predicted = predicted or set([None])
    observed = observed or set([None])

    n_hits = len(predicted & observed)
    if n_hits == 0:
        # precision and recall are both zero
        return 0

    precision = n_hits / float(len(predicted))
    recall = n_hits / float(len(observed))
    return 2 * precision * recall / (precision + recall)
    
# Per-user F1 on the training users; the 25 lowest scores are displayed.
per_user_f1 = train_features.groupby('user_id').apply(pd_f1score)
f1scores = per_user_f1.reset_index(name='f1score')
f1scores.sort_values('f1score').head(25)

In [None]:


# One point per user: overall prior reorder rate (x) against per-user F1 (y).
#print f1scores['f1score'].mean()
#tmp = train_features[train_features['user_id'] == 30542].head(25)
rate_vs_f1 = user_reorder_rate.merge(f1scores, on='user_id')
pylab.scatter(rate_vs_f1['user_reorder_rate'], rate_vs_f1['f1score'], s=5)
# Label the axes so the figure can be read on its own.
pylab.xlabel('user_reorder_rate')
pylab.ylabel('f1score')
pylab.show()
#tmp[tmp['prediction'] != tmp['reordered']].head(25)

In [None]:
# Confusion counts per product on the training users. Each row is classified
# once with vectorised comparisons and the 0/1 indicators are summed per
# product, instead of four Python sum() calls inside a per-product apply().
predicted_pos = train_features['prediction'] == 1
predicted_neg = train_features['prediction'] == 0
observed_pos = train_features['reordered'] == 1
observed_neg = train_features['reordered'] == 0
outcomes = pd.DataFrame({
    'product_id': train_features['product_id'],
    'false_pos': (predicted_pos & observed_neg).astype(int),
    'false_neg': (predicted_neg & observed_pos).astype(int),
    'true_pos': (predicted_pos & observed_pos).astype(int),
    'true_neg': (predicted_neg & observed_neg).astype(int)})
# Product metadata comes from the already-loaded `products` table rather
# than a second read of products.csv.
errors = outcomes.groupby('product_id').sum().reset_index().merge(
    products, on='product_id')

In [None]:
# Per-product recall and precision (NaN where the denominator is zero).
errors['recall'] = errors['true_pos'] / 1.0 / (errors['false_neg'] + errors['true_pos'])
errors['precision'] = errors['true_pos'] / 1.0 / (errors['false_pos'] + errors['true_pos'])
# F1 computed from the counts: 2*TP / (2*TP + FP + FN). This equals
# 2*P*R / (P + R) wherever that is defined, but it gives 0 rather than NaN
# for a product with no true positives and at least one error. With the P/R
# form those products were NaN and so never satisfied a `f1score < ...`
# filter. It is still NaN when TP, FP and FN are all zero.
errors['f1score'] = 2.0 * errors['true_pos'] / (
    2 * errors['true_pos'] + errors['false_pos'] + errors['false_neg'])

# Only products predicted more than 50 times, worst F1 first.
errors[errors['false_pos'] + errors['true_pos'] > 50].sort_values('f1score')


In [None]:
# Number of prior order lines per product, next to the per-product error
# counts, most frequently ordered products first. size() replaces apply(len).
product_norders = prior.groupby('product_id').size().reset_index(name='norders')
product_norders = product_norders.merge(errors, on='product_id', how='inner')
product_norders.sort_values('norders', ascending=False)

In [None]:
# Histogram of prior order lines per product, restricted to the 1..100 range.
# `t` is defined here; it used to exist only in a commented-out line, so the
# cell raised a NameError on a fresh kernel.
t = prior.groupby('product_id').size()
pylab.hist(t, bins=np.linspace(1, 100, 100))
pylab.show()

In [None]:
# the product f1 score is not quite the same as the user f1score
print errors['f1score'].mean()

# I don't know if one tracks the other ... hopefully

# Let's retrain part of the classifier on products we performed poorly
# on. Kind of like making a hyper-tree where this is the first split
bad_products = errors[errors['f1score'] < 0.4]['product_id']
print len(bad_products)

# Are there a lot of these? Enough to think it will matter?
print len( features[features['product_id'].isin(bad_products)] )
print len(features)

In [None]:
# Although these are only 1/10th of the product selection, they amount to
# almost 1/3rd of the orders ...
# Training rows of the poorly predicted products, as model inputs and labels.
is_bad_product = train_features['product_id'].isin(bad_products)
retrain_features = train_features[is_bad_product]
X_retrain = retrain_features[feature_list].as_matrix()
y_retrain = retrain_features['reordered'].as_matrix()
# Keep a handle on the current model before second-stage models are trained.
old_clf = clf

In [None]:
retrees = []
for mss in [500, 1000, 1500]:
    for md in [6, 10, 15]:
        for mf in [4, 6]:
            print mss, md, mf
            reclf = RandomForestClassifier(
                n_estimators=15, 
                random_state=0, 
                min_samples_split=mss,
                max_features=mf,
                max_depth=md,
                n_jobs=4,
                verbose=0)
            reclf.fit(X_retrain, y_retrain)
            retrees.append(reclf)
            #export_graphviz(clf, out_file='tree.dot')
            print reclf.feature_importances_

In [None]:
# Baseline predictions of the current `clf` ("first, deep tree") at the
# 0.195 threshold, stored on the training and then the validation frame.
for X_part, part_features in [(X_train, train_features),
                              (X_val, val_features)]:
    pred = clf.predict_proba(X_part)[:, 1]
    part_features['prediction'] = pred > 0.195

In [None]:
where_t = train_features['product_id'].isin(bad_products)
X_retrain = train_features[where_t][feature_list].as_matrix()
where_v = val_features['product_id'].isin(bad_products)
X_reval = val_features[where_v][feature_list].as_matrix()

print "Original model:",
f1scores = train_features.groupby('user_id').apply(
    pd_f1score).reset_index(name='f1score')
print f1scores['f1score'].mean(), "(training)"
f1scores = val_features.groupby('user_id').apply(
    pd_f1score).reset_index(name='f1score')
print f1scores['f1score'].mean(), "(validation)"
print

print "Boosted model:"

for _clf in retrees:
    print _clf

    _pred = _clf.predict_proba(X_retrain)[:, 1]
    for p in [0.195]: # time and time again, this value wins
        
        train_features.loc[where_t, 'prediction'] = _pred > p
        f1scores = train_features.groupby('user_id').apply(
            pd_f1score).reset_index(name='f1score')
        print f1scores['f1score'].mean(), "(training)", "(training)"        
        
        #f1scores = train_features.groupby('user_id').apply(
        #    pd_f1score).reset_index(name='f1score')
        #print p, f1scores['f1score'].mean()
    
    print "Performance on validation set ... "
    _pred = _clf.predict_proba(X_reval)[:, 1]
    for p in [0.195]:
        
        val_features.loc[where_v, 'prediction'] = _pred > p
        f1scores = val_features.groupby('user_id').apply(
            pd_f1score).reset_index(name='f1score')
        print p, f1scores['f1score'].mean()

In [None]:
#
# Let's make a submission!!
#
print clf
_clf = trees[1]
print _clf

#
# generate predictions for test set from model
#
oid_uid_test = orders[orders['eval_set'] == 'test'][['order_id', 'user_id']]
where = features['user_id'].isin(uids_test)
test_features = features[where]
where_p = test_features['product_id'].isin(bad_products)

X_test = test_features[feature_list].as_matrix()
pred = clf.predict_proba(X_test)[:, 1] > 0.195

test_features.loc[:, 'prediction'] = pred

X_retest = test_features[where_p][feature_list].as_matrix()
_pred = _clf.predict_proba(X_retest)[:, 1] > 0.195
test_features.loc[where_p, 'prediction'] = _pred

test_features = test_features.merge(oid_uid_test, on='user_id', how='left')

In [None]:
# Per test order, the set of product ids predicted to be reordered.
predicted_sets = test_features.groupby('order_id').apply(
    lambda grp: set(grp.loc[grp['prediction'] == True, 'product_id']))
out = predicted_sets.reset_index(name='prediction')

out.head()

In [None]:
#
# Write predictions to disk for submission.
#
# One line per order: "<order_id>,<space separated product ids>", or the
# literal 'None' when nothing is predicted for the order.

# The with-block closes the file even if a write fails part-way through.
with open('submission.csv', 'w') as fd:
    fd.write('order_id,products\n')

    for oid, pr in zip(out['order_id'], out['prediction']):
        # `pr` is a set; sorting makes the file identical from run to run.
        items = ' '.join(map(str, sorted(pr))) if pr else 'None'
        fd.write('%d,%s\n' % (oid, items))