In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from collections import Counter
from evaluate import f1score
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [2]:
#
# Read in and down-select training data
# 

print "Reading and merging orders tables ... ",

train = pd.read_csv('order_products__train.csv')
prior = pd.read_csv('order_products__prior.csv')
orders = pd.read_csv('orders.csv')
products = pd.read_csv('products.csv')

# sample just a subset of training set for speed
N = 50000
oids = train['order_id'].unique()[:N]
train = train[train['order_id'].isin(oids)]
train = train.merge(
    orders, on='order_id', how='left').merge(
    products, on='product_id', how='left')

# take only prior data corresponding to users left in training set
uids = train['user_id'].unique()
prior = prior.merge(orders, on='order_id', how='left')
prior = prior[prior['user_id'].isin(uids)]
prior = prior.merge(products, on='product_id', how='left')

print "done."

Reading and merging orders tables ...  done.


In [3]:
# Generate the feature table: one row per (user_id, product_id) pair that
# occurs in the prior orders.
#
# Taking motivation from this kernel:
# https://www.kaggle.com/fabienvs/instacart-xgboost-starter-lb-0-3791/code
# We will use as features:
#
# All aggregations below use pandas' built-in groupby reductions
# (size / nunique / max) rather than `.apply` with a Python callable, which
# produces the same numbers without running Python code once per group.
#
#   1: Number of times product has been ordered by user
user_prod_norders = prior.groupby(['user_id', 'product_id']
    ).size().reset_index(name='user_prod_norders')

#   2: Total number of orders by user (distinct order ids in prior)
user_norders = prior.groupby('user_id'
    )['order_id'].nunique().reset_index(name='user_norders')

#   3: Number of baskets since user last ordered this item
#      (user's order count minus the highest order_number containing it)
baskets_since = prior.groupby(['user_id', 'product_id']
    )['order_number'].max().reset_index(name='last_basket')

baskets_since = baskets_since.merge(user_norders, on='user_id', how='left')
# NOTE: the column name is misspelled ('basktets_since'), but other cells
# select it by this exact name, so it is kept as is.
baskets_since['basktets_since'] = baskets_since['user_norders'] - baskets_since['last_basket']
del baskets_since['user_norders'], baskets_since['last_basket']

#   4: Number of times product has been ordered / number of users that
#      have ordered the product
product_users = prior.groupby('product_id')['user_id']
# astype(float) keeps this a true division regardless of interpreter version
reorder_rate = (product_users.size().astype(float) / product_users.nunique()
    ).reset_index(name='reorder_rate')

training_data = user_prod_norders.merge(
    user_norders, on='user_id', how='left'
    ).merge(baskets_since, on=['user_id', 'product_id'], how='left'
    ).merge(reorder_rate, on='product_id', how='left')

# Add the training label, i.e., whether the product was reordered by the user.
# Pairs that do not appear in `train` get NaN from the left merge, which
# fillna(0) turns into "not reordered".
training_data = training_data.merge(
    train[['user_id','product_id','reordered']], on=['user_id','product_id'], how='left').fillna(0)

training_data.head(15)

Unnamed: 0,user_id,product_id,user_prod_norders,user_norders,basktets_since,reorder_rate,reordered
0,1,196,10,10,0,4.717192,1.0
1,1,10258,9,10,0,3.25,1.0
2,1,10326,1,10,5,2.938034,0.0
3,1,12427,10,10,0,4.179104,0.0
4,1,13032,3,10,0,2.845481,1.0
5,1,13176,2,10,5,5.933454,0.0
6,1,14084,1,10,9,5.200528,0.0
7,1,17122,1,10,5,3.184484,0.0
8,1,25133,8,10,0,3.719895,1.0
9,1,26088,2,10,8,2.350181,1.0


In [6]:
# input / target
# `.values` replaces the deprecated `DataFrame.as_matrix()` / `Series.as_matrix()`
# and returns the same NumPy array.
feature_cols = ['user_prod_norders', 'user_norders', 'basktets_since', 'reorder_rate']
X = training_data[feature_cols].values
y = training_data['reordered'].values

# Shallow tree with a large min_samples_split; random_state is fixed so the
# fit is reproducible.
clf = DecisionTreeClassifier(random_state=0, min_samples_split=10000, max_depth=10)
clf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=10000, min_weight_fraction_leaf=0.0,
            presort=False, random_state=0, splitter='best')

In [8]:
pred = clf.predict_proba(X)[:, 1]
target = train.groupby('user_id'
    ).apply(lambda x: set(x[x['reordered'] == True]['product_id'])
    ).reset_index(name='target')

for p in np.linspace(0.17, 0.21, 15):
    training_data['prediction'] = pred > p
    out = training_data.groupby('user_id').apply(
        lambda x: set(x[x['prediction'] == True]['product_id'])
        ).reset_index(name='prediction')

    out = out.merge(target, on='user_id', how='left')
    print p, f1score(out['prediction'], out['target'])

0.17 0.370158535267
0.172857142857 0.370550031905
0.175714285714 0.370550031905
0.178571428571 0.370754182251
0.181428571429 0.370754182251
0.184285714286 0.37075314554
0.187142857143 0.371138112535
0.19 0.371373395781
0.192857142857 0.372230226219
0.195714285714 0.372230226219
0.198571428571 0.37249573149
0.201428571429 0.37249573149
0.204285714286 0.372317902245


KeyboardInterrupt: 

In [9]:
#
# Let's make a submission!!
#

#
# Load prior data from the "test" set
#
print "Reading prior orders data from test set ... ",
prior = pd.read_csv('order_products__prior.csv')
print "done."

# down-select to only those appearing in the training set to save memory
eval_set = 'test'

# get the order_id / user_id pairs for the given evaluation set
oid_uid_test = orders[orders['eval_set'] == eval_set][['order_id', 'user_id']]

# add user_id information to order_prior
prior = prior.merge(orders, on='order_id', how='left')

# down-select
prior = prior[prior['user_id'].isin(oid_uid_test['user_id'])]

# Generate np array of features ...
#
# Taking motivation from this kernel:
# https://www.kaggle.com/fabienvs/instacart-xgboost-starter-lb-0-3791/code
# We will use as features:
#
#   1: Number of times product has been ordered by user
user_prod_norders = prior.groupby(['user_id', 'product_id']
    ).apply(len).reset_index(name='user_prod_norders')

#   2: Total number of orders by user
user_norders = prior.groupby('user_id'
    ).apply(lambda x: len(set(x['order_id']))).reset_index(name='user_norders')

#   3: Number of baskets since user last ordered this item
baskets_since = prior.groupby(['user_id', 'product_id']
    ).apply(lambda x: max(x['order_number'])
    ).reset_index(name='last_basket')

baskets_since = baskets_since.merge(user_norders, on='user_id', how='left')
baskets_since['basktets_since'] = baskets_since['user_norders'] - baskets_since['last_basket']
del baskets_since['user_norders'], baskets_since['last_basket']

#   4: Number of times product has been ordered / number of users that
#      that have ordered the product
reorder_rate = prior.groupby('product_id').apply(
    lambda x: float(len(x)) / float(len(set(x['user_id'])))
    ).reset_index(name='reorder_rate')



Reading prior orders data from test set ...  done.


In [10]:
#
# generate predictions for test set from model
#

# Assemble one row per (user_id, product_id) with the four model features.
eval_data = user_prod_norders.merge(
    user_norders, on='user_id', how='left'
    ).merge(baskets_since, on=['user_id', 'product_id'], how='left'
    ).merge(reorder_rate, on='product_id', how='left')

# `.values` replaces the deprecated `DataFrame.as_matrix()`; the column order
# is the one the features are listed in throughout this notebook.
X = eval_data[['user_prod_norders','user_norders','basktets_since','reorder_rate']].values

# Probability cut-off for predicting a reorder; 0.2 sits in the range where
# the threshold sweep above printed its highest scores.
threshold = 0.2
pred = clf.predict_proba(X)[:, 1] > threshold
eval_data['prediction'] = pred

# Attach each test user's order_id, then collapse to one row per order
# holding the set of predicted product ids (an empty set when nothing
# clears the threshold).
eval_data = eval_data.merge(oid_uid_test, on='user_id', how='left')
eval_data = eval_data.groupby('order_id').apply(
    lambda x: set(x[x['prediction'] == True]['product_id'])).reset_index(name='prediction')

eval_data.head()

Unnamed: 0,order_id,prediction
0,17,"{38777, 13107, 21463}"
1,34,"{47792, 2596, 39180, 43504, 21137, 16083, 4776..."
2,137,"{38689, 25890, 44422, 5134, 23794, 24852, 2326..."
3,182,"{5479, 33000, 47209, 39275, 47672, 9337, 13629}"
4,257,"{27104, 1025, 49235, 11140, 39475, 29837, 1387..."


In [11]:
#
# Write predictions to disk for submission.
#

# One line per order: "<order_id>,<space-separated product ids>", or the
# literal "None" when the predicted set is empty. The `with` block closes
# the file even if a write raises part-way through.
with open('submission.csv', 'w') as fd:
    fd.write('order_id,products\n')

    for oid, pr in zip(eval_data['order_id'], eval_data['prediction']):
        fd.write('%d,' % oid)

        if pr:
            fd.write(' '.join(map(str, pr)))
        else:
            fd.write('None')
        fd.write('\n')

In [12]:
# Dump the fitted tree in Graphviz dot format. feature_names follows the
# column order used to build X, so the split nodes show the feature names
# instead of positional labels such as X[0].
export_graphviz(
    clf, out_file='tree.dot',
    feature_names=['user_prod_norders', 'user_norders', 'basktets_since', 'reorder_rate'])