In [191]:
import pandas as pd
import numpy as np
from scipy import stats
from collections import Counter
from evaluate import f1score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier

In [192]:
#
# Read in training (and validation) data
# 

print "Reading and merging orders tables ... ",

train = pd.read_csv('order_products__train.csv')
prior = pd.read_csv('order_products__prior.csv')
orders = pd.read_csv('orders.csv')
products = pd.read_csv('products.csv')

train = train.merge(
    orders, on='order_id', how='left').merge(
    products, on='product_id', how='left')
prior = prior.merge(orders, on='order_id', how='left').merge(
    products, on='product_id', how='left')

# compute training target
target = train.groupby('user_id'
    ).apply(lambda x: set(x[x['reordered'] == True]['product_id'])
    ).reset_index(name='target')

print "done."

Reading and merging orders tables ...  done.


In [193]:
#
# Before splitting off the validation set, compute "product features".
# These are features that depend only on product_id; their values are the
# same for training, validation and testing.
#

print "Computing product features ... ",

#   number of users who ever had this product in their basket
x = prior.groupby('product_id').apply(
    lambda x: len(set(x['user_id']))).reset_index(name='nusers')

# number of users that reorderd the item
y = train.groupby('product_id').apply(
    lambda x: x['reordered'].sum()).reset_index(name='reordered')

reorder_prob = x.merge(y, on='product_id', how='left').fillna(0)
reorder_prob['reorder_prob'] = reorder_prob['reordered'] / reorder_prob['nusers']
del x, y, reorder_prob['reordered'], reorder_prob['nusers']

#
# FIXME: try inter-purchase time?
#

print "done."

Computing product features ...  done.


In [194]:
#
# Generate features ...
#

print "Computing user / user-product features ... "

prior_gb_up = prior.groupby(['user_id', 'product_id'])
prior_gb_u = prior.groupby('user_id')
# FIXME is there a way to reuse prior_gb_up here? 

Computing user / user-product features ... 


In [195]:
#   1: Number of times product has been ordered by user
print "\tnorders of product by user ... ",
user_prod_norders = prior_gb_up.apply(len).reset_index(name='user_prod_norders')
print "done."

	norders of product by user ...  done.


In [196]:
#   2: Total number of orders by user
print "\tnorders by user ... ",
user_norders = prior_gb_u.apply(
    lambda x: len(set(x['order_id']))).reset_index(name='user_norders')
# FIXME you can also get this from orders table ... probably quicker
print "done."

	norders by user ...  done.


In [197]:
#   3: Fraction of user baskets containing product
print "\torder rate of product by user ... ",
user_prod_rate = user_prod_norders.merge(user_norders, on='user_id', how='left')
user_prod_rate['user_prod_rate'] = user_prod_rate['user_prod_norders'] / 1.0 / user_prod_rate['user_norders']

del user_prod_rate['user_prod_norders'], user_prod_rate['user_norders']
# FIXME I guess you could delete the other tables instead, and this is then a bit
# more handy to carry around, but it doesn't play nice with jupyter, where
# I want cell execution independence ...

print "done."

	order rate of product by user ...  done.


In [198]:
#   4: Number of baskets since user last ordered this item
print "\tbaskets since last order of product ... ",
baskets_since = prior_gb_up.apply(
    lambda x: max(x['order_number'])).reset_index(name='last_basket')

baskets_since = baskets_since.merge(user_norders, on='user_id', how='left')
baskets_since['baskets_since'] = baskets_since['user_norders'] - baskets_since['last_basket']
del baskets_since['user_norders'], baskets_since['last_basket']
print "done."

	baskets since last order of product ...  done.


In [199]:
#   5: Mean basket position. Lower basket position means customer likes product?
print "\tmean basket position ... ",
baskets_pos = prior_gb_up.apply(
    lambda x: x['add_to_cart_order'].mean()).reset_index(name='basket_pos')
print "done."

	mean basket position ...  done.


In [200]:
#   6/7/8: Time of day / dow / days since prior order
print "\torder features ... ",
order_features = orders[orders['eval_set'] != 'prior'][[
    'user_id',
    'order_hour_of_day',
    'order_dow',
    'days_since_prior_order']]

assert len(order_features) == len(orders['user_id'].unique())
del prior_gb_up
print "done."

	order features ...  done.


In [201]:
#    9: Does user reorder in general?
print "\tuser reorder rate ... ",
user_reorder_rate = prior_gb_u.apply(
    lambda x: x['reordered'].sum() / 1.0 / len(set(x['order_id']))
    ).reset_index(name='user_reorder_rate')
print "done."

	user reorder rate ...  done.


In [202]:
#   10: User order rate
print "\tuser general order rate (days) ... ",
user_order_dt = prior_gb_u.apply(
    lambda x: x['days_since_prior_order'].mean()
    ).reset_index(name='user_order_dt')
print "done."

	user general order rate (days) ...  done.


In [203]:
#
# Assemble all features together in one table
#
# (table, join key(s)) pairs, listed in the order their columns are appended
# to the per-(user, product) base table.
feature_tables = [
    (user_norders, 'user_id'),
    (user_prod_rate, ['user_id', 'product_id']),
    (baskets_since, ['user_id', 'product_id']),
    (baskets_pos, ['user_id', 'product_id']),
    (reorder_prob, 'product_id'),
    (order_features, 'user_id'),
    (user_reorder_rate, 'user_id'),
    (user_order_dt, 'user_id'),
]
features = user_prod_norders
for table, keys in feature_tables:
    features = features.merge(table, on=keys, how='left')
# need to keep "product features" for submission stage

# Add training data, i.e., whether product was reordered by user.
# Pairs without a train line get reordered = 0; note that fillna(0) is
# applied to every column of the table, not only to 'reordered'.
labels = train[['user_id', 'product_id', 'reordered']]
features = features.merge(labels, on=['user_id', 'product_id'], how='left')
features = features.fillna(0)

features.head(10)

Unnamed: 0,user_id,product_id,user_prod_norders,user_norders,user_prod_rate,baskets_since,basket_pos,reorder_prob,order_hour_of_day,order_dow,days_since_prior_order,user_reorder_rate,user_order_dt,reordered
0,1,196,10,10,1.0,0,1.4,0.152625,8,4,14.0,4.1,20.259259,1.0
1,1,10258,9,10,0.9,0,3.333333,0.140036,8,4,14.0,4.1,20.259259,1.0
2,1,10326,1,10,0.1,5,5.0,0.147166,8,4,14.0,4.1,20.259259,0.0
3,1,12427,10,10,1.0,0,3.3,0.095295,8,4,14.0,4.1,20.259259,0.0
4,1,13032,3,10,0.3,0,6.333333,0.090202,8,4,14.0,4.1,20.259259,1.0
5,1,13176,2,10,0.2,5,6.0,0.210303,8,4,14.0,4.1,20.259259,0.0
6,1,14084,1,10,0.1,9,2.0,0.148406,8,4,14.0,4.1,20.259259,0.0
7,1,17122,1,10,0.1,5,6.0,0.104819,8,4,14.0,4.1,20.259259,0.0
8,1,25133,8,10,0.8,0,4.0,0.086335,8,4,14.0,4.1,20.259259,1.0
9,1,26088,2,10,0.2,8,4.5,0.066208,8,4,14.0,4.1,20.259259,1.0


In [204]:
#
# Establish separate training and validation data
#

Nval = 50000 # seems like a good number
uids = train['user_id'].unique()
uids_train = uids[:-Nval]
uids_val = uids[-Nval:]

feature_list = [
    'user_prod_norders',
    'user_norders',
    'user_prod_rate',
    'baskets_since',
    'basket_pos',
    'reorder_prob',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
    'user_reorder_rate',
    'user_order_dt']

#
# training
#
where = features['user_id'].isin(uids_train)
X_train = features[where][feature_list].as_matrix()
y_train = features[where]['reordered'].as_matrix()

#
# validation
#
where = features['user_id'].isin(uids_val)
X_val = features[where][feature_list].as_matrix()
y_val = features[where]['reordered'].as_matrix()

#
# test / submission
#
uids_test = orders[orders['eval_set'] == 'test']['user_id'].unique()
where = features['user_id'].isin(uids_test)
X_test = features[where][feature_list].as_matrix()

print len(uids_val), len(uids_train), len(uids_test), len(orders['user_id'].unique())

50000 81209 75000 206209


In [216]:
#
# Train the decision tree model(s)
#

#
# Apparently my solution was submitted with md = 12 on accident
#

trees = []
for mss in [500, 1000, 1500]:
    for md in [12, 15, 20]:
        for mf in [4, 6]:
            print mss, md, mf
            clf = RandomForestClassifier(
                n_estimators=10, 
                random_state=0, 
                min_samples_split=mss,
                max_features=mf,
                max_depth=md,
                n_jobs=4,
                class_weight={0:1.0, 1:4.0},
                verbose=0)
            clf.fit(X_train, y_train)
            trees.append(clf)
            #export_graphviz(clf, out_file='tree.dot')
            print clf.feature_importances_

500 12 4
[ 0.12046192  0.03096418  0.31571444  0.40639964  0.00324772  0.08775508
  0.000771    0.0014761   0.0156764   0.00978474  0.00774877]
500 12 6
[ 0.14690395  0.01861904  0.1985425   0.50573722  0.00307652  0.09114301
  0.00071677  0.00148692  0.01713427  0.01021767  0.00642212]
500 15 4
[ 0.14350424  0.03198534  0.27441664  0.38693112  0.00809859  0.09705579
  0.00237265  0.00470723  0.01917868  0.01706109  0.01468862]
500 15 6
[ 0.14311303  0.01073528  0.19049643  0.48859906  0.00833519  0.0986176
  0.00223062  0.00496033  0.02081556  0.01757146  0.01452546]
500 20 4
[ 0.1198622   0.03080946  0.25169073  0.39611477  0.01365086  0.10287036
  0.00419992  0.00910893  0.02352386  0.02448651  0.0236824 ]
500 20 6
[ 0.14263695  0.0136886   0.17126197  0.46732399  0.01397023  0.10171657
  0.00424285  0.0093701   0.02375863  0.02647399  0.02555612]
1000 12 4
[ 0.14110437  0.03845742  0.25119225  0.44261648  0.00336026  0.09113256
  0.00061531  0.00122192  0.01437618  0.00923172  0.00

In [None]:
#
# Let's see how we covered the training set
#
print "Performance on training set ... "
this_target = target[target['user_id'].isin(uids_train)]
this_features = features[features['user_id'].isin(uids_train)]
for clf in trees:
    print clf
    pred = clf.predict_proba(X_train)[:, 1]
    for p in [0.48, 0.49, 0.5, 0.51]: # time and time again, this value wins
        this_features['prediction'] = pred > p
        out = this_features.groupby('user_id').apply(
            lambda x: set(x[x['prediction'] == True]['product_id'])
            ).reset_index(name='prediction')

        out = out.merge(this_target, on='user_id', how='left')
        print p, f1score(out['prediction'], out['target'])
        del this_features['prediction']
    
#
# Now, how do we perform out of sample?
#
print "Performance on validation set ... "
this_target = target[target['user_id'].isin(uids_val)]
this_features = features[features['user_id'].isin(uids_val)]
for clf in trees:
    pred = clf.predict_proba(X_val)[:, 1]
    for p in [0.48, 0.49, 0.5, 0.51]:
        
        this_features['prediction'] = pred > p
        out = this_features.groupby('user_id').apply(
            lambda x: set(x[x['prediction'] == True]['product_id'])
            ).reset_index(name='prediction')

        out = out.merge(this_target, on='user_id', how='left')
        print p, f1score(out['prediction'], out['target'])
        del this_features['prediction']

Performance on training set ... 
RandomForestClassifier(bootstrap=True, class_weight={0: 1.0, 1: 4.0},
            criterion='gini', max_depth=12, max_features=4,
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=500,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
            oob_score=False, random_state=0, verbose=0, warm_start=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


0.48 0.385586719135
0.49 0.385678167567
0.5 0.38565647369
0.51 0.385031728105
RandomForestClassifier(bootstrap=True, class_weight={0: 1.0, 1: 4.0},
            criterion='gini', max_depth=12, max_features=6,
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=500,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
0.48 0.385455232891
0.49 0.385903780716
0.5 0.385682917775
0.51 0.385122380995
RandomForestClassifier(bootstrap=True, class_weight={0: 1.0, 1: 4.0},
            criterion='gini', max_depth=15, max_features=4,
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=500,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
0.48 0.391201562597
0.49 0.391000128794
0.5 0.39061954762
0.51 0.3898

In [None]:
#
# Optional. Retrain the model with the winning hyperparameters on the entire
# training data set. I think this is always a good idea? Can it go wrong?
#   (This actually went horribly wrong, giving an *in sample* F1 score of 0.1,
#    so I'm not really sure what to make of that!)
#
# class_weight is the one setting shared by every model of the grid search, so
# it is repeated here; without it the predicted probabilities are not
# comparable with the thresholds tuned on the grid models.
# TODO: max_depth / max_features are still left at the sklearn defaults --
# set them to the winning values once those are settled.
#
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
    min_samples_split=1000,
    class_weight={0: 1.0, 1: 4.0},
    n_jobs=4,
    verbose=3)
# train rows first, validation rows second (X and y stacked the same way)
X_all = np.vstack((X_train, X_val))
y_all = np.concatenate((y_train, y_val))
clf.fit(X_all, y_all)

In [None]:
# check the model hasn't changed too much

print clf.feature_importances_ 
print trees[2].feature_importances_

pred = trees[2].predict_proba(X_all)[:, 1]
uids = np.concatenate((uids_train, uids_val))
this_features = features[features['user_id'].isin(uids)]
this_features['prediction'] = pred > 0.195
this_target = target
out = this_features.groupby('user_id').apply(
    lambda x: set(x[x['prediction'] == True]['product_id'])
    ).reset_index(name='prediction')

out = out.merge(this_target, on='user_id', how='left')

print out.head()
print f1score(out['prediction'], out['target'])
#del this_features['prediction']

In [162]:
# Show the repr (hyperparameters) of the fifth fitted model in `trees`.
# NOTE(review): the saved output of this cell lists class_weight=None and
# n_estimators=100, which the training loop in this notebook does not use, so
# it appears to come from an earlier run -- re-run before relying on it.
trees[4]

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features=4, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=500, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=4, oob_score=False, random_state=0,
            verbose=5, warm_start=False)

In [214]:
#
# Let's make a submission!!
#

clf = trees[0]

#
# generate predictions for test set from model
#
oid_uid_test = orders[orders['eval_set'] == 'test'][['order_id', 'user_id']]
# X_test was built from features[user_id in uids_test], i.e. from exactly the
# rows selected below, so `pred` lines up with this_features row by row.
pred = clf.predict_proba(X_test)[:, 1] > 0.41
# .copy() so the new column is not written to a slice of `features`
# (avoids SettingWithCopyWarning)
this_features = features[features['user_id'].isin(uids_test)].copy()
this_features['prediction'] = pred
# attach the order_id of each user's test order
this_features = this_features.merge(oid_uid_test, on='user_id', how='left')
# per test order: the set of products predicted to be reordered
out = this_features.groupby('order_id').apply(
    lambda x: set(x[x['prediction'] == True]['product_id'])
    ).reset_index(name='prediction')

out.head()

[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:    2.4s
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:    2.5s
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:    3.6s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    3.7s
[Parallel(n_jobs=4)]: Done  11 tasks      | elapsed:    3.7s
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:    3.8s
[Parallel(n_jobs=4)]: Done  13 tasks      | elapsed:    4.8s
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed:    4.8s
[Parallel(n_jobs=4)]: Done  15 tasks      | elapsed:    4.8s
[Parallel(n_jobs=4)]: Done  16 tasks      | elapsed:    5.0s
[Parallel(n_jobs=4)]: Do

Unnamed: 0,order_id,prediction
0,17,"{13107, 21709, 47766, 21463}"
1,34,"{47792, 2596, 44632, 39180, 43504, 21137, 1608..."
2,137,"{38689, 25890, 44422, 5134, 23794, 24852, 2326..."
3,182,"{11520, 5479, 33000, 47209, 39275, 41149, 4767..."
4,257,"{27104, 49235, 24838, 39475, 29837, 13870, 211..."


In [215]:
#
# Write predictions to disk for submission.
#

# "with" closes the file even if a write fails part-way through
with open('submission.csv', 'w') as fd:
    fd.write('order_id,products\n')

    for oid, pr in zip(out['order_id'], out['prediction']):
        if pr:
            # sorted so the file content is reproducible from run to run
            items = ' '.join(map(str, sorted(pr)))
        else:
            # an order with no predicted product is written as the literal None
            items = 'None'
        fd.write('%d,%s\n' % (oid, items))

In [168]:
#
#
# POST MORTEM: WHAT WENT WRONG?
#   what does the misclassified data look like?
#   what features could we add to better classify them?
#
# .copy() so that 'prediction' is added to a private frame rather than to a
# slice of `features` (this is what triggered SettingWithCopyWarning).
this_features = features[features['user_id'].isin(uids_train)].copy()
clf = trees[2]
# X_train holds the same rows, in the same order, as this_features
pred = clf.predict_proba(X_train)[:, 1]
this_features['prediction'] = pred > 0.195
this_features.head()

Performance on training set ... 
   user_id  product_id  user_prod_norders  user_norders  user_prod_rate  \
0        1         196                 10            10             1.0   
1        1       10258                  9            10             0.9   
2        1       10326                  1            10             0.1   
3        1       12427                 10            10             1.0   
4        1       13032                  3            10             0.3   

   baskets_since  basket_pos  reorder_prob  order_hour_of_day  order_dow  \
0              0    1.400000      0.152625                  8          4   
1              0    3.333333      0.140036                  8          4   
2              5    5.000000      0.147166                  8          4   
3              0    3.300000      0.095295                  8          4   
4              0    6.333333      0.090202                  8          4   

   days_since_prior_order  user_reorder_rate  user_order_dt

[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   11.5s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  1.8min finished
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,user_id,product_id,user_prod_norders,user_norders,user_prod_rate,baskets_since,basket_pos,reorder_prob,order_hour_of_day,order_dow,days_since_prior_order,user_reorder_rate,user_order_dt,reordered,prediction
0,1,196,10,10,1.0,0,1.4,0.152625,8,4,14.0,4.1,20.259259,1.0,True
1,1,10258,9,10,0.9,0,3.333333,0.140036,8,4,14.0,4.1,20.259259,1.0,True
2,1,10326,1,10,0.1,5,5.0,0.147166,8,4,14.0,4.1,20.259259,0.0,False
3,1,12427,10,10,1.0,0,3.3,0.095295,8,4,14.0,4.1,20.259259,0.0,True
4,1,13032,3,10,0.3,0,6.333333,0.090202,8,4,14.0,4.1,20.259259,1.0,True


In [190]:
# Confusion counts per (product, user) for the predictions in this_features,
# joined with the product descriptions.
#
# The four outcomes are flagged for all rows at once and then summed per
# group; evaluating a Python lambda for every group was slow enough that the
# cell had to be interrupted.
# NOTE(review): `features` holds one row per (user, product), so each group
# here is a single row. To rank products by their error counts, grouping by
# 'product_id' alone is probably what is wanted -- confirm.
predicted = this_features['prediction'] == 1
not_predicted = this_features['prediction'] == 0
was_reordered = this_features['reordered'] == 1
not_reordered = this_features['reordered'] == 0

error_flags = pd.DataFrame(
    {'product_id': this_features['product_id'],
     'user_id': this_features['user_id'],
     'false_neg': (not_predicted & was_reordered).astype(int),
     'false_pos': (predicted & not_reordered).astype(int),
     'true_neg': (not_predicted & not_reordered).astype(int),
     'true_pos': (predicted & was_reordered).astype(int)},
    columns=['product_id', 'user_id',
             'false_neg', 'false_pos', 'true_neg', 'true_pos'])

# `products` is the table already read from products.csv when loading the data
errors = error_flags.groupby(['product_id', 'user_id']).sum(
    ).reset_index().merge(products, on='product_id')
del error_flags

KeyboardInterrupt: 

In [None]:
# rows with the most false negatives first
errors.sort_values(by='false_neg', ascending=False)


In [187]:
# Feature importances of whichever model is currently bound to `clf`.
# The entries presumably follow the column order of feature_list, the order
# used to build the design matrices -- verify `clf` was fitted on those.
clf.feature_importances_

array([ 0.07906365,  0.03530213,  0.23368996,  0.1513633 ,  0.06063451,
        0.14184713,  0.02117745,  0.04417509,  0.05227964,  0.08449257,
        0.09597457])

In [None]:
# Columns of `features` used as model inputs, in design-matrix column order.
# This repeats, name for name and in the same order, the list defined in the
# cell that builds the train / validation / test matrices.
feature_list = [
    'user_prod_norders', 'user_norders', 'user_prod_rate',
    'baskets_since', 'basket_pos',
    'reorder_prob',
    'order_dow', 'order_hour_of_day', 'days_since_prior_order',
    'user_reorder_rate', 'user_order_dt',
]