In [99]:
import pandas as pd
import numpy as np
from scipy import stats
from collections import Counter
from evaluate import f1score

In [None]:
#
#
# This script builds on naive_baseline.ipynb by trying to include product category
# information.
#
# The upshot is that this didn't show much promise, and I decided to abandon this
# approach (for now) in favor of a real sklearn decision tree. 
#
# See dt_sklearn_4features.ipynb.
#
#

In [40]:
#
# Load prior data from the "train" set
#

print "Reading prior orders data from train set ... ",
order_prior = pd.read_csv('../order_products__prior.csv')
orders = pd.read_csv('../orders.csv')
print "done."

# down-select to only those appearing in the training set to save memory
eval_set = 'train'

# get the order_id / user_id pairs for the given evaluation set
oid_uid_test = orders[orders['eval_set'] == eval_set][['order_id', 'user_id']]

# add user_id information to order_prior
test = order_prior.merge(orders, on='order_id', how='left')

# down-select
test = test[test['user_id'].isin(oid_uid_test['user_id'])]

# free memory
del order_prior
test.head(100)

Reading prior orders data from train set ...  done.


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0
5,2,17794,6,1,202279,prior,3,5,9,8.0
6,2,40141,7,1,202279,prior,3,5,9,8.0
7,2,1819,8,1,202279,prior,3,5,9,8.0
8,2,43668,9,0,202279,prior,3,5,9,8.0
9,3,33754,1,1,205970,prior,16,5,17,12.0


In [39]:
#
# Load the training data for evaluating the predictive models
#

# Load the training data and map each user_id to the set of products they reordered
def products_reordered(x):
    """Return the set of product_ids in `x` whose `reordered` flag equals 1.

    This is the "correct" answer a model is scored against: the products
    that had been ordered before and show up again in the next basket.

    `x` is a DataFrame with `reordered` and `product_id` columns.

    NOTE: the author expects one training order per user, so collapsing to a
    set() should lose nothing -- CHECKME, this is still worth a sanity check.
    """
    was_reordered = x['reordered'] == 1
    return set(x.loc[was_reordered, 'product_id'])

# One row per user: user_id plus the set of products reordered in the
# training order(s) of that user.
train = (
    pd.read_csv('../order_products__train.csv')
    .merge(orders, how='left', on='order_id')
    .groupby('user_id')
    .apply(products_reordered)
    .reset_index(name='products_reordered')
)
train.head(100)

Unnamed: 0,user_id,products_reordered
0,1,"{196, 26405, 13032, 39657, 25133, 38928, 26088..."
1,2,"{41787, 33957, 22825, 45066, 16589, 45613, 229..."
2,5,"{21616, 40706, 15349, 21413}"
3,7,"{29894, 17638, 47272, 45066, 13198, 37999, 408..."
4,8,"{15937, 41540, 23165, 21903}"
5,9,"{8834, 8467, 10132, 1559, 29594, 27555, 12075,..."
6,10,{}
7,13,"{4210, 27435, 19934, 27086}"
8,14,"{3808, 15172, 29509, 8744, 32115, 28601, 37434..."
9,17,"{12720, 16797, 18534}"


In [23]:
# some heuristic models for making predictions of future reorders
# no ML yet!! just establishing a baseline
def products_p(x, prob):
    """Return the products bought in more than a fraction `prob` of the orders in `x`.

    `x` is a DataFrame with `product_id` and `order_id` columns. A product's
    rate is its number of rows divided by the number of distinct order_ids;
    the comparison against `prob` is strict.
    """
    counts = Counter(x['product_id'])
    norders = float(len(set(x['order_id'])))
    return {pid for pid, cnt in counts.items() if cnt / norders > prob}

def products_n(x, n):
    """Return the set of product_ids that occur more than `n` times in `x`.

    `x` is a DataFrame with a `product_id` column, `n` a count threshold
    (strict comparison). The number of orders plays no part in this
    heuristic, so it is not computed.
    """
    counts = Counter(x['product_id'])
    return {pid for pid, cnt in counts.items() if cnt > n}

for p in [0.5, 0.8]:
    print "p model", p, "...",
    heuristic = lambda x: products_p(x, p)
    predict = test.groupby(['user_id']).apply(heuristic)
    predict = predict.reset_index(name='products_predicted')
    print "F1:", f1score(predict['products_predicted'], train['products_reordered'])
    
for n in [0, 1, 2, 3]:
    print "n model", n, "...",
    heuristic = lambda x: products_n(x, n)
    predict = test.groupby(['user_id']).apply(heuristic)
    predict = predict.reset_index(name='products_predicted')
    print "F1:", f1score(predict['products_predicted'], train['products_reordered'])

 p model 0.5 ... F1: 0.265367442142
p model 0.8 ... F1: 0.14699473447
n model 0 ... F1: 0.215400579483
n model 1 ... F1: 0.302637573062
n model 2 ... F1: 0.307927178382
n model 3 ... F1: 0.276981840147


In [25]:
#
# Load prior data from the "test" set
#

print "Reading prior orders data from test set ... ",
order_prior = pd.read_csv('../order_products__prior.csv')
print "done."

# down-select to only those appearing in the training set to save memory
eval_set = 'test'

# get the order_id / user_id pairs for the given evaluation set
oid_uid_test = orders[orders['eval_set'] == eval_set][['order_id', 'user_id']]

# add user_id information to order_prior
test = order_prior.merge(orders, on='order_id', how='left')

# down-select
test = test[test['user_id'].isin(oid_uid_test['user_id'])]

# free memory
del order_prior
test.head(25)

 Reading prior orders data from test set ...  done.


In [30]:
#
# Write out the winning model to disk for submission
#

# The count heuristic with n=2 had the best F1 among the models scored above.
winner = lambda x: products_n(x, 2)
predict = test.groupby(['user_id']).apply(winner).reset_index(name='products_predicted')
# attach the order_id of each user's test-set order
predict = predict.merge(oid_uid_test, on='user_id')

# One line per order: "<order_id>,<space separated product ids>", or the
# literal "None" when nothing is predicted. Both columns are taken from
# `predict` so every order_id is paired with the prediction on the same row.
# The `with` block closes the file even if a write fails part-way.
with open('submission.csv', 'w') as fd:
    fd.write('order_id,products\n')

    for oid, pr in zip(predict['order_id'], predict['products_predicted']):
        fd.write('%d,' % oid)

        if pr:
            fd.write(' '.join(map(str, pr)))
        else:
            fd.write('None')
        fd.write('\n')

predict.head()

Unnamed: 0,user_id,products_predicted,order_id
0,3,"{17668, 39190, 18599, 24810, 9387, 21903, 3240...",2774568
1,4,{},329954
2,6,{},1528013
3,11,"{34658, 14947, 35948, 28465, 8309, 27959, 3564...",1376945
4,12,"{14992, 13176, 7076, 10863}",1356845


In [98]:
from evaluate import f1score_global
N = 5000 # sample just a subset of training set for speed
train = pd.read_csv('../order_products__train.csv')
oids = train['order_id'].unique()[:N]
train = train[train['order_id'].isin(oids)]
train = train.merge(orders, on='order_id', how='left').merge(
    products, on='product_id', how='left')
uids = train['user_id'].unique()

print "Reading prior orders data from test set ... ",
prior = pd.read_csv('../order_products__prior.csv').merge(orders, on='order_id', how='left')
prior = prior[prior['user_id'].isin(uids)]
print "done."

products = pd.read_csv('../products.csv')
dids = products['department_id'].unique()
test = prior.merge(products, on='product_id', how='left')

# Ground-truth and heuristic helpers, restricted to a single department
def products_reordered(x, did=None):
    """Return the set of reordered product_ids in `x`, optionally for one department.

    This is the "correct" answer a model is scored against: the products
    that had been ordered before and show up again in the next basket.

    x   -- DataFrame with `reordered` and `product_id` columns (and a
           `department_id` column when `did` is given).
    did -- department_id to restrict to. With the default None no department
           filter is applied, so the function can also be called with `x`
           alone, as the one-argument definition earlier in the notebook is.

    NOTE: the author expects one training order per user, so collapsing to a
    set() should lose nothing -- CHECKME, this is still worth a sanity check.
    """
    keep = x['reordered'] == 1
    if did is not None:
        keep = keep & (x['department_id'] == did)
    return set(x.loc[keep, 'product_id'])

def products_n(x, n, did=None):
    """Return the product_ids occurring more than `n` times in `x`, optionally for one department.

    x   -- DataFrame with a `product_id` column (and a `department_id`
           column when `did` is given).
    n   -- count threshold; the comparison is strict.
    did -- department_id to restrict to. With the default None every row is
           counted, so the two-argument call form used earlier in the
           notebook keeps working.
    """
    rows = x if did is None else x[x['department_id'] == did]
    counts = Counter(rows['product_id'])
    return {pid for pid, cnt in counts.items() if cnt > n}

def products_p(x, prob, did=None):
    """Return the products bought in more than a fraction `prob` of the orders, optionally for one department.

    x    -- DataFrame with `product_id` and `order_id` columns (and a
            `department_id` column when `did` is given).
    prob -- rate threshold; the comparison is strict.
    did  -- department_id to restrict to, passed explicitly instead of being
            picked up from a global loop variable. With the default None no
            department filter is applied.

    A product's rate is its number of rows divided by the number of distinct
    order_ids, both taken after the optional department filter.
    """
    rows = x if did is None else x[x['department_id'] == did]
    counts = Counter(rows['product_id'])
    norders = float(len(set(rows['order_id'])))
    return {pid for pid, cnt in counts.items() if cnt / norders > prob}

for did in dids:
    print "DID:", did

    # need to filter training set down to only those users that ever
    # orderd in this category before; otherwise, you get a large number
    # of entries that predict {}, which trivially == the observed set
    # of reorders in that category, giving an F1 score of 1 and massively
    # biasing the estimate of how well the model is doing in this category
    uids = test.groupby('user_id').apply(lambda x: len(x[x['department_id'] == did]) > 0)
    uids = uids[uids].index.values
    test_d = test[test['user_id'].isin(uids)]
    train_d = train[train['user_id'].isin(uids)]
    #assert len(train_d) == len(uids)
    
    train_d = train_d.groupby('user_id').apply(lambda x: products_reordered(x, did))
    train_d = train_d.reset_index(name='products_reordered')

    print len(train_d), "samples"
    
    for n in [0, 1, 2, 3, 4]:
        print "n model", n, "...",
        heuristic = lambda x: products_n(x, n, did)
        predict = test_d.groupby(['user_id']).apply(heuristic)
        predict = predict.reset_index(name='products_predicted')
        print "\tF1:", f1score_global(predict['products_predicted'], train_d['products_reordered'])
    print



 Reading prior orders data from test set ...  done.
DID: 19
4165 samples
n model 0 ... 	F1: 0.161179208689
n model 1 ... 	F1: 0.25489923558
n model 2 ... 	F1: 0.2827758155
n model 3 ... 	F1: 0.283582089552
n model 4 ... 	F1: 0.270661504797

DID: 13
4163 samples
n model 0 ... 	F1: 0.0687307505268
n model 1 ... 	F1: 0.148718596137
n model 2 ... 	F1: 0.18922852984
n model 3 ... 	F1: 0.195461200586
n model 4 ... 	F1: 0.200662565073

DID: 7
4182 samples
n model 0 ... 	F1: 0.222531958106
n model 1 ... 	F1: 0.338025902984
n model 2 ... 	F1: 0.375452079566
n model 3 ... 	F1: 0.37483990323
n model 4 ... 	F1: 0.361023947151

DID: 1
3919 samples
n model 0 ... 	F1: 0.158183868738
n model 1 ... 	F1: 0.250149202831
n model 2 ... 	F1: 0.277180406213
n model 3 ... 	F1: 0.273822174991
n model 4 ... 	F1: 0.258453586044

DID: 11
2299 samples
n model 0 ... 	F1: 0.0722122990559
n model 1 ... 	F1: 0.146981627297
n model 2 ... 	F1: 0.181626187962
n model 3 ... 	F1: 0.200956937799
n model 4 ... 	F1: 0.1951219