In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from collections import Counter
from evaluate import f1score

In [None]:
#
#
# This script takes a stab at establishing a baseline prediction. We will make the
# most naive model we can think of, and then systematically tear apart its underlying
# assumptions until we have a model that actually works ;).
#
# The idea is simple: given a customer, predict that his/her next basket will contain
# all products which ... 
#  (i) appear in > 80% of his/her previous baskets, or
#  (ii) appear more than once in his/her previous baskets, or
#  (iii) appear at all in any previous baskets
#
# We will test these three ideas on the training set, and submit the one which gives 
# the best F1 score.
#
#

In [9]:
#
# Load prior data from the "train" set
#

print "Reading prior orders data from train set ... ",
order_prior = pd.read_csv('../order_products__prior.csv')
orders = pd.read_csv('../orders.csv')
print "done."

# down-select to only those appearing in the training set to save memory
eval_set = 'train'

# get the order_id / user_id pairs for the given evaluation set
oid_uid_test = orders[orders['eval_set'] == eval_set][['order_id', 'user_id']]

# add user_id information to order_prior
test = order_prior.merge(orders, on='order_id', how='left')

# down-select
test = test[test['user_id'].isin(oid_uid_test['user_id'])]

# free memory
del order_prior

 Reading prior orders data from train set ...  done.


In [18]:
#
# Load the training data for evaluating the predictive models
#

# Load the training data mapping order_id to the products ordered
def products_reordered(x):
    """Return the set of product ids flagged as reordered in ``x``.

    ``x`` is a frame (here: one user's rows of the train order lines) with
    at least the columns 'reordered' and 'product_id'. A row counts when its
    'reordered' value equals 1. This set is the "correct" answer the
    heuristics are scored against.

    NB: this assumes one train order per user, in which case collapsing to a
    set loses nothing -- still worth a sanity check (CHECKME).
    """
    was_reordered = x['reordered'] == 1
    return set(x.loc[was_reordered, 'product_id'])

# One row per user: user_id plus the set of products reordered in the train order.
train = (
    pd.read_csv('../order_products__train.csv')
    .merge(orders, on='order_id', how='left')   # brings in user_id
    .groupby('user_id')
    .apply(products_reordered)
    .reset_index(name='products_reordered')
)
train.head()

Unnamed: 0,user_id,products_reordered
0,1,"{196, 26405, 13032, 39657, 25133, 38928, 26088..."
1,2,"{41787, 33957, 22825, 45066, 16589, 45613, 229..."
2,5,"{21616, 40706, 15349, 21413}"
3,7,"{29894, 17638, 47272, 45066, 13198, 37999, 408..."
4,8,"{15937, 41540, 23165, 21903}"


In [23]:
# some heuristic models for making predictions of future reorders
# no ML yet!! just establishing a baseline
def products_p(x, prob):
    """Frequency heuristic: products bought in a large share of past orders.

    ``x`` holds one user's prior order lines with columns 'product_id' and
    'order_id'. For each product, the number of rows listing it is divided by
    the number of distinct order ids in ``x``; the product is kept when that
    ratio is strictly greater than ``prob``. Returns a set of product ids.
    """
    row_counts = Counter(x['product_id'])
    n_baskets = len(set(x['order_id']))
    selected = set()
    for product, hits in row_counts.items():
        if hits / float(n_baskets) > prob:
            selected.add(product)
    return selected

def products_n(x, n):
    """Count heuristic: products that occur more than ``n`` times in ``x``.

    ``x`` holds one user's prior order lines and must have a 'product_id'
    column. A product is kept when the number of rows listing it is strictly
    greater than ``n`` (so ``n=0`` keeps everything ever bought). Returns a
    set of product ids.

    The number of distinct orders is not needed for this rule, so it is no
    longer computed here.
    """
    C = Counter(x['product_id'])
    return set(p for p, count in C.items() if count > n)

for p in [0.5, 0.8]:
    print "p model", p, "...",
    heuristic = lambda x: products_p(x, p)
    predict = test.groupby(['user_id']).apply(heuristic)
    predict = predict.reset_index(name='products_predicted')
    print "F1:", f1score(predict['products_predicted'], train['products_reordered'])
    
for n in [0, 1, 2, 3]:
    print "n model", n, "...",
    heuristic = lambda x: products_n(x, n)
    predict = test.groupby(['user_id']).apply(heuristic)
    predict = predict.reset_index(name='products_predicted')
    print "F1:", f1score(predict['products_predicted'], train['products_reordered'])

 p model 0.5 ... F1: 0.265367442142
p model 0.8 ... F1: 0.14699473447
n model 0 ... F1: 0.215400579483
n model 1 ... F1: 0.302637573062
n model 2 ... F1: 0.307927178382
n model 3 ... F1: 0.276981840147


In [25]:
#
# Load prior data from the "test" set
#

print "Reading prior orders data from test set ... ",
order_prior = pd.read_csv('../order_products__prior.csv')
print "done."

# down-select to only those appearing in the training set to save memory
eval_set = 'test'

# get the order_id / user_id pairs for the given evaluation set
oid_uid_test = orders[orders['eval_set'] == eval_set][['order_id', 'user_id']]

# add user_id information to order_prior
test = order_prior.merge(orders, on='order_id', how='left')

# down-select
test = test[test['user_id'].isin(oid_uid_test['user_id'])]

# free memory
del order_prior

 Reading prior orders data from test set ...  done.


In [30]:
#
# Write out the winning model to disk for submission
#

# Write out a submission file.
# n = 2 gave the best F1 among the heuristics evaluated above.
winner = lambda x: products_n(x, 2)
predict = test.groupby(['user_id']).apply(winner).reset_index(name='products_predicted')

# Attach the order_id each user's prediction must be submitted under.
predict = predict.merge(oid_uid_test, on='user_id')

# Take order ids and predictions from the same frame so they stay paired
# row for row. The `with` block closes the file even if a write fails.
with open('submission.csv', 'w') as fd:
    fd.write('order_id,products\n')

    for oid, pr in zip(predict['order_id'], predict['products_predicted']):
        fd.write('%d,' % oid)

        # An empty prediction is written as the literal string 'None'.
        if pr:
            fd.write(' '.join(map(str, pr)))
        else:
            fd.write('None')
        fd.write('\n')

predict.head()

Unnamed: 0,user_id,products_predicted,order_id
0,3,"{17668, 39190, 18599, 24810, 9387, 21903, 3240...",2774568
1,4,{},329954
2,6,{},1528013
3,11,"{34658, 14947, 35948, 28465, 8309, 27959, 3564...",1376945
4,12,"{14992, 13176, 7076, 10863}",1356845
