In [125]:
import pandas as pd
import numpy as np
from scipy import stats
from collections import Counter
from evaluate import f1score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier

In [45]:
#
# Read in training (and validation) data
# 

print "Reading and merging orders tables ... ",

train = pd.read_csv('order_products__train.csv')
prior = pd.read_csv('order_products__prior.csv')
orders = pd.read_csv('orders.csv')
products = pd.read_csv('products.csv')

train = train.merge(
    orders, on='order_id', how='left').merge(
    products, on='product_id', how='left')
prior = prior.merge(orders, on='order_id', how='left').merge(
    products, on='product_id', how='left')

# compute training target
target = train.groupby('user_id'
    ).apply(lambda x: set(x[x['reordered'] == True]['product_id'])
    ).reset_index(name='target')

print "done."

Reading and merging orders tables ...  done.


In [47]:
#
# Before splitting off the validation set, compute "product features".
# These are features that depend only on product_id; their values are the
# same for training, validation and testing.
#

print "Computing product features ... ",

#   number of users who ever had this product in their basket
x = prior.groupby('product_id').apply(
    lambda x: len(set(x['user_id']))).reset_index(name='nusers')

# number of users that reorderd the item
y = train.groupby('product_id').apply(
    lambda x: x['reordered'].sum()).reset_index(name='reordered')

reorder_prob = x.merge(y, on='product_id', how='left').fillna(0)
reorder_prob['reorder_prob'] = reorder_prob['reordered'] / reorder_prob['nusers']
del x, y, reorder_prob['reordered'], reorder_prob['nusers']

#
# FIXME: try inter-purchase time?
#

print "done."

Computing product features ...  done.


In [111]:
#
# Generate features ...
#

print "Computing user / user-product features ... "

prior_gb_up = prior.groupby(['user_id', 'product_id'])
prior_gb_u = prior.groupby('user_id')
# FIXME is there a way to reuse prior_gb_up here? 

Computing user / user-product features ... 


In [110]:
#   1: Number of times product has been ordered by user
print "\tnorders of product by user ... ",
user_prod_norders = prior_gb_up.apply(len).reset_index(name='user_prod_norders')
print "done."

 Computing user / user-product features ... 
	norders of product by user ...  done.


In [108]:
#   2: Total number of orders by user
print "\tnorders by user ... ",
user_norders = prior_gb_u.apply(
    lambda x: len(set(x['order_id']))).reset_index(name='user_norders')
# FIXME you can also get this from orders table ... probably quicker
print "done."

	norders by user ...  done.


In [112]:
#   3: Number of baskets since user last ordered this item
print "\tbaskets since last order of product ... ",
baskets_since = prior_gb_up.apply(
    lambda x: max(x['order_number'])).reset_index(name='last_basket')

baskets_since = baskets_since.merge(user_norders, on='user_id', how='left')
baskets_since['baskets_since'] = baskets_since['user_norders'] - baskets_since['last_basket']
del baskets_since['user_norders'], baskets_since['last_basket']
print "done."

	baskets since last order of product ...  done.


In [101]:
#   4: Mean basket position. Lower basket position means customer likes product?
print "\tmean basket position ... ",
baskets_pos = prior_gb_up.apply(
    lambda x: x['add_to_cart_order'].mean()).reset_index(name='basket_pos')
print "done."

	mean basket position ...  done.


In [102]:
#   5/6/7: Time of day / dow / days since prior order
print "\torder features ... ",
order_features = orders[orders['eval_set'] != 'prior'][[
    'user_id',
    'order_hour_of_day',
    'order_dow',
    'days_since_prior_order']]

assert len(order_features) == len(orders['user_id'].unique())
del prior_gb_up
print "done."

	order features ...  done.


In [103]:
#    8: Does user reorder in general?
print "\tuser reorder rate ... ",
user_reorder_rate = prior_gb_u.apply(
    lambda x: x['reordered'].sum() / 1.0 / len(set(x['order_id']))
    ).reset_index(name='user_reorder_rate')

#   4: Number of times product has been ordered / number of users that
#      that have ordered the product
#print "\thow often the product appears in the same user's basket ...",
#reorder_rate = prior.groupby('product_id').apply(
#    lambda x: float(len(x)) / float(len(set(x['user_id'])))
#    ).reset_index(name='reorder_rate')

print "done."

	user reorder rate ...  done.


In [115]:
#
# Join every feature table onto the (user, product) pairs from prior orders
#
# NOTE(review): `baskets_pos` (feature 4) is computed in an earlier cell but
# is not joined here -- confirm whether that is intentional.
features = user_prod_norders.merge(user_norders, on='user_id', how='left')
features = features.merge(baskets_since, on=['user_id', 'product_id'], how='left')
features = features.merge(reorder_prob, on='product_id', how='left')
features = features.merge(order_features, on='user_id', how='left')
features = features.merge(user_reorder_rate, on='user_id', how='left')
# need to keep "product features" for submission stage

# Attach the label: the `reordered` flag from the train rows for the same
# (user, product). Pairs with no train row get NaN, which the fillna(0) turns
# into 0 -- along with any other NaN left in the table.
features = features.merge(
    train[['user_id', 'product_id', 'reordered']],
    on=['user_id', 'product_id'],
    how='left')
features = features.fillna(0)

features.head(15)

Unnamed: 0,user_id,product_id,user_prod_norders,user_norders,baskets_since,reorder_prob,order_hour_of_day,order_dow,days_since_prior_order,user_reorder_rate,reordered
0,1,196,10,10,0,0.152625,8,4,14.0,4.1,1.0
1,1,10258,9,10,0,0.140036,8,4,14.0,4.1,1.0
2,1,10326,1,10,5,0.147166,8,4,14.0,4.1,0.0
3,1,12427,10,10,0,0.095295,8,4,14.0,4.1,0.0
4,1,13032,3,10,0,0.090202,8,4,14.0,4.1,1.0
5,1,13176,2,10,5,0.210303,8,4,14.0,4.1,0.0
6,1,14084,1,10,9,0.148406,8,4,14.0,4.1,0.0
7,1,17122,1,10,5,0.104819,8,4,14.0,4.1,0.0
8,1,25133,8,10,0,0.086335,8,4,14.0,4.1,1.0
9,1,26088,2,10,8,0.066208,8,4,14.0,4.1,1.0


In [117]:
#
# Establish separate training and validation data
#

Nval = 50000 # seems like a good number
uids = train['user_id'].unique()
uids_train = uids[:-Nval]
uids_val = uids[-Nval:]

#
# training
#
where = features['user_id'].isin(uids_train)
X_train = features[where][[
    'user_prod_norders',
    'user_norders',
    'baskets_since',
    'reorder_prob',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
    'user_reorder_rate']].as_matrix()
# FIXME: features should contain exactly the columns you need
y_train = features[where]['reordered'].as_matrix()

#
# validation
#
where = features['user_id'].isin(uids_val)
X_val = features[where][[
    'user_prod_norders',
    'user_norders',
    'baskets_since',
    'reorder_prob',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
    'user_reorder_rate']].as_matrix()
y_val = features[where]['reordered'].as_matrix()

#
# test / submission
#
uids_test = orders[orders['eval_set'] == 'test']['user_id'].unique()
where = features['user_id'].isin(uids_test)
X_test = features[where][[
    'user_prod_norders',
    'user_norders',
    'baskets_since',
    'reorder_prob',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
    'user_reorder_rate']].as_matrix()

print len(uids_val), len(uids_train), len(uids_test), len(orders['user_id'].unique())

50000 81209 75000 206209


In [127]:
#
# Fit one random forest per candidate min_samples_split value.
# `trees` keeps the fitted models in the same order as the candidates.
#
# NOTE(review): `md` (passed as max_depth) is not defined anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel unless it
# is defined elsewhere -- confirm the intended value and define it explicitly.
#

trees = []
for mss in (5, 50, 500, 5000):
    clf = RandomForestClassifier(
        n_estimators=100,
        random_state=0,
        min_samples_split=mss,
        max_depth=md)
    clf.fit(X_train, y_train)
    trees.append(clf)
    #export_graphviz(clf, out_file='tree.dot')
    #print clf.feature_importances_

In [128]:
#
# Let's see how we covered the training set
#
print "Performance on training set ... "
this_target = target[target['user_id'].isin(uids_train)]
this_features = features[features['user_id'].isin(uids_train)]
for clf in trees:
    pred = clf.predict_proba(X_train)[:, 1]
    for p in [0.195]: # time and time again, this value wins
        this_features['prediction'] = pred > p
        out = this_features.groupby('user_id').apply(
            lambda x: set(x[x['prediction'] == True]['product_id'])
            ).reset_index(name='prediction')

        out = out.merge(this_target, on='user_id', how='left')
        print p, f1score(out['prediction'], out['target'])
        del this_features['prediction']
    
#
# Now, how do we perform out of sample?
#
print "Performance on validation set ... "
this_target = target[target['user_id'].isin(uids_val)]
this_features = features[features['user_id'].isin(uids_val)]
for clf in trees:
    pred = clf.predict_proba(X_val)[:, 1]
    for p in [0.195]:
        
        this_features['prediction'] = pred > p
        out = this_features.groupby('user_id').apply(
            lambda x: set(x[x['prediction'] == True]['product_id'])
            ).reset_index(name='prediction')

        out = out.merge(this_target, on='user_id', how='left')
        print p, f1score(out['prediction'], out['target'])
        del this_features['prediction']

Performance on training set ... 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


0.195 0.386698245983
0.195 0.38614322631
0.195 0.3842598784
0.195 0.380578694124
Performance on validation set ... 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.195 0.380932125186
0.195 0.381393967778
0.195 0.380982490375
0.195 0.380449195775


In [129]:
#
# Let's make a submission!!
#
# Second forest in `trees` (min_samples_split=50 with the grid used in the
# training cell).
clf = trees[1]

#
# generate predictions for test set from model
#
oid_uid_test = orders[orders['eval_set'] == 'test'][['order_id', 'user_id']]
pred = clf.predict_proba(X_test)[:, 1] > 0.195
# .copy() gives an independent frame, so adding the prediction column below
# does not write into a slice of `features` (no SettingWithCopyWarning).
# The row selection is the same one used to build X_test, so `pred` lines up
# row for row.
this_features = features[features['user_id'].isin(uids_test)].copy()
this_features['prediction'] = pred
# Map each user to their test order id, then collect the predicted product
# ids per order (an empty set when nothing clears the threshold).
this_features = this_features.merge(oid_uid_test, on='user_id', how='left')
out = this_features.groupby('order_id').apply(
    lambda x: set(x[x['prediction'] == True]['product_id'])
    ).reset_index(name='prediction')

out.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,order_id,prediction
0,17,"{13107, 21463}"
1,34,"{47792, 2596, 44632, 39180, 43504, 21137, 1608..."
2,137,"{38689, 25890, 5134, 23794, 24852, 2326, 29594..."
3,182,"{11520, 5479, 33000, 10025, 39275, 41149, 4720..."
4,257,"{27104, 49235, 24838, 29837, 13870, 21137, 394..."


In [130]:
#
# Write predictions to disk for submission.
#
# One line per order: `order_id,<space-separated product ids>`, with the
# literal `None` when the predicted set is empty. The `with` block guarantees
# the file is closed even if a write fails part-way through.
#

with open('submission.csv', 'w') as fd:
    fd.write('order_id,products\n')

    for oid, pr in zip(out['order_id'], out['prediction']):
        products_field = ' '.join(map(str, pr)) if pr else 'None'
        fd.write('%d,%s\n' % (oid, products_field))

In [None]:
#
# How much do predictions change if we retrain the winning tree on the whole
# training set??
#
# NOTE(review): `md` is not defined anywhere in the visible notebook; this
# cell raises NameError on a fresh kernel unless it is defined elsewhere.
# NOTE(review): min_samples_split=500 here, whereas the submission cell picks
# trees[1] (min_samples_split=50) -- confirm which one is meant by "winning".
#
# Stack training rows first, validation rows second; anything scored against
# X_all must use the same row order.
X_all = np.concatenate((X_train, X_val), axis=0)
y_all = np.concatenate((y_train, y_val))
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
    min_samples_split=500,
    max_depth=md)
clf.fit(X_all, y_all)

In [94]:
pred = clf.predict_proba(X_all)[:, 1]
this_features = features[features['user_id'].isin(uids_train) & features['user_id'].isin(uids_val)]
this_features['prediction'] = pred > 0.195
out = this_features.groupby('user_id').apply(
    lambda x: set(x[x['prediction'] == True]['product_id'])
    ).reset_index(name='prediction')

out = out.merge(this_target, on='user_id', how='left')
print f1score(out['prediction'], out['target'])
del this_features['prediction']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


TypeError: reset_index() got an unexpected keyword argument 'name'