In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from collections import Counter

In [2]:
#
#
# This script takes a stab at establishing a baseline prediction. We will make the
# most naive model we can think of, and then systematically tear apart its underlying
# assumptions until we have a model that actually works ;).
#
# The idea is simple: given a customer, predict that his/her next basket will contain
# all products which ... 
#  (i) appear in > 80% of his/her previous baskets, or
#  (ii) appear more than once in his/her previous baskets, or
#  (iii) appear at all in any previous baskets
#
# We will test these three ideas on the training set, and submit the one which gives 
# the best F1 score.
#
#

In [3]:
#
# Load prior data from the "train" set
#

print "Reading prior orders data from train set ... ",
order_prior = pd.read_csv('order_products__prior.csv')
orders = pd.read_csv('orders.csv')
print "done."

# down-select to only those appearing in the training set to save memory
eval_set = 'train'

# get the order_id / user_id pairs for the given evaluation set
oid_uid_test = orders[orders['eval_set'] == eval_set][['order_id', 'user_id']]

# add user_id information to order_prior
test = order_prior.merge(orders, on='order_id', how='left')

# down-select
test = test[test['user_id'].isin(oid_uid_test['user_id'])]

# free memory
del order_prior

Reading prior orders data from train set ...  done.


In [4]:
#
# Load the training data for evaluating the predictive models
#

# Load the training data mapping order_id to the products ordered
def products_observed(x):
    """Return the "correct" answer for one group of training rows.

    The answer is the set of product_ids whose row is flagged ``reordered == 1``,
    i.e. products that were ordered before and show up again in the next basket.

    x: DataFrame with at least the columns 'reordered' and 'product_id'
       (one group of the per-user groupby).
    Returns a one-element Series keyed by 'products' holding that set.

    NB: there is expected to be a single order per user in the training set, so
    set() vs list() should not matter. This is worth a sanity check (CHECKME).
    """
    was_reordered = x['reordered'] == 1
    reordered_ids = x.loc[was_reordered, 'product_id']
    return pd.Series({'products': set(reordered_ids)})

train = pd.read_csv('order_products__train.csv').merge(orders, on='order_id', how='left')
train = train.groupby('user_id').apply(products_observed)
print train.head()

from evaluate import f1score

                                                  products
user_id                                                   
1        {196, 26405, 13032, 39657, 25133, 38928, 26088...
2        {41787, 33957, 22825, 45066, 16589, 45613, 229...
5                             {21616, 40706, 15349, 21413}
7        {29894, 17638, 47272, 45066, 13198, 37999, 408...
8                             {15937, 41540, 23165, 21903}


In [5]:
# some heuristic models for making predictions of future reorders
# no ML yet!! just establishing a baseline
def products1(x):
    """Heuristic (i): products present in more than 80% of the previous baskets.

    x: DataFrame with columns 'product_id' and 'order_id' (the prior-order
       lines of one user when called through groupby('user_id').apply).
    Returns a one-element Series keyed by 'products' holding the selected set.
    """
    tally = Counter(x['product_id'])
    n_baskets = len(set(x['order_id']))

    frequent = set()
    for pid, n_lines in tally.items():
        # share of the user's distinct orders that contain this product
        if n_lines / float(n_baskets) > 0.8:
            frequent.add(pid)
    return pd.Series({'products': frequent})

def products2(x):
    """Heuristic (ii): products that appear more than once in the previous baskets.

    x: DataFrame with a 'product_id' column (the prior-order lines of one user
       when called through groupby('user_id').apply).
    Returns a one-element Series keyed by 'products' holding the selected set.

    The number of distinct orders is irrelevant for this rule, so it is no
    longer computed (it used to be calculated and then discarded).
    """
    counts = Counter(x['product_id'])
    return pd.Series({'products': set(p for p, n in counts.items() if n > 1)})

def products3(x):
    """Heuristic (iii): every product that appears at all in the previous baskets.

    x: DataFrame with a 'product_id' column (the prior-order lines of one user
       when called through groupby('user_id').apply).
    Returns a one-element Series keyed by 'products' holding the selected set.

    Every key of a Counter built from the column has a count of at least one,
    so the former `count > 0` filter kept everything; the distinct product_ids
    are taken directly. The unused order count is no longer computed.
    """
    return pd.Series({'products': set(x['product_id'])})

# Score each heuristic in turn: build the per-user predictions from the
# prior-order rows in `test`, then print the value returned by f1score for those
# predictions and the observed reorders in `train`.
# (presumably f1score takes (predicted, observed) -- confirm against evaluate.f1score)
# NOTE: `heuristic` and `product_counts` stay bound after the loop, so
# `product_counts` ends up holding the products3 predictions (the last
# iteration), not those of the best-scoring heuristic.
for heuristic in [products1, products2, products3]:
    product_counts = test.groupby(['user_id']).apply(heuristic)
    print f1score(product_counts['products'], train['products'])

0.14699473447
0.302637573062
0.215400579483


In [8]:
#
# Load prior data from the "test" set
#

print "Reading prior orders data from test set ... ",
order_prior = pd.read_csv('order_products__prior.csv')
print "done."

# down-select to only those appearing in the training set to save memory
eval_set = 'test'

# get the order_id / user_id pairs for the given evaluation set
oid_uid_test = orders[orders['eval_set'] == eval_set][['order_id', 'user_id']]

# add user_id information to order_prior
test = order_prior.merge(orders, on='order_id', how='left')

# down-select
test = test[test['user_id'].isin(oid_uid_test['user_id'])]

# free memory
del order_prior

Reading prior orders data from test set ...  done.


In [10]:
#
# Write out the winning model to disk for submission
#

# Write out a submission file.
winner = products2
oid_uid_test = orders[orders['eval_set'] == eval_set][['order_id', 'user_id']]

#oid_uid_test.set_index('user_id')
#product_counts = test.groupby(['user_id']).apply(winner)

print oid_uid_test.head()
print product_counts.head()

oid_uid_test = oid_uid_test.set_index(['user_id'])
product_counts = product_counts.set_index(['user_id'])
product_counts2 = oid_uid_test.merge(product_counts, on='user_id', how='left')

print oid_uid_test.head()
print product_counts.head()

fd = open('submission.csv', 'w')
fd.write('order_id,products\n')

for oid, pr in zip(product_counts['order_id'], product_counts['products']):
    fd.write('%d,' % oid)

    if pr:
        fd.write(' '.join(map(str, pr)))
    else:
        fd.write('None')
    fd.write('\n')

fd.close()

     order_id  user_id
38    2774568        3
44     329954        4
53    1528013        6
96    1376945       11
102   1356845       12
                                                  products
user_id                                                   
3        {23650, 9387, 17668, 47766, 18599, 24010, 4468...
4                                                  {35469}
6                                           {38293, 21903}
11       {34658, 33731, 33572, 16869, 36070, 38456, 446...
12       {21616, 7076, 28134, 8239, 14992, 5746, 13176,...


KeyError: 'user_id'