In [2]:
import pandas as pd
import numpy as np
from scipy import stats
from collections import Counter
from evaluate import f1score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [3]:
#
# Read in training (and validation) data
# 

print "Reading and merging orders tables ... ",

train = pd.read_csv('order_products__train.csv')
prior = pd.read_csv('order_products__prior.csv')
orders = pd.read_csv('orders.csv')
products = pd.read_csv('products.csv')

train = train.merge(
    orders, on='order_id', how='left').merge(
    products, on='product_id', how='left')
prior = prior.merge(orders, on='order_id', how='left').merge(
    products, on='product_id', how='left')

# compute training target
target = train.groupby('user_id'
    ).apply(lambda x: set(x[x['reordered'] == True]['product_id'])
    ).reset_index(name='target')

print "done."

Reading and merging orders tables ...  done.


In [4]:
#
# Before splitting off the validation set, compute "product features".
# These are features that depend only on product_id; their values are the
# same for training, validation and testing.
#

print "Computing product features ... ",

#   number of users who ever had this product in their basket
x = prior.groupby('product_id').apply(
    lambda x: len(set(x['user_id']))).reset_index(name='nusers')

# number of users that reorderd the item
y = train.groupby('product_id').apply(
    lambda x: x['reordered'].sum()).reset_index(name='reordered')

reorder_prob = x.merge(y, on='product_id', how='left').fillna(0)
reorder_prob['reorder_prob'] = reorder_prob['reordered'] / reorder_prob['nusers']
del x, y, reorder_prob['reordered'], reorder_prob['nusers']

print "done."

Computing product features ...  done.


In [5]:
# Generate np array of features ...
#
# We will use as features:
#

print "Computing user / user-product features ... "

prior_gb_up = prior.groupby(['user_id', 'product_id'])

#   1: Number of times product has been ordered by user
print "\tnorders of product by user ... "
user_prod_norders = prior_gb_up.apply(len).reset_index(name='user_prod_norders')

#   2: Total number of orders by user
print "\tnorders by user"
user_norders = prior.groupby('user_id'
    ).apply(lambda x: len(set(x['order_id']))).reset_index(name='user_norders')
# FIXME is there a way to reuse prior_gb_up here? 

#   3: Number of baskets since user last ordered this item
print "\tbaskets since last order of product ..."
baskets_since = prior_gb_up.apply(
    lambda x: max(x['order_number'])).reset_index(name='last_basket')

baskets_since = baskets_since.merge(user_norders, on='user_id', how='left')
baskets_since['basktets_since'] = baskets_since['user_norders'] - baskets_since['last_basket']
del baskets_since['user_norders'], baskets_since['last_basket']
del prior_gb_up

#   4: Number of times product has been ordered / number of users that
#      that have ordered the product
print "\thow often the product appears in the same user's basket ...",
reorder_rate = prior.groupby('product_id').apply(
    lambda x: float(len(x)) / float(len(set(x['user_id'])))
    ).reset_index(name='reorder_rate')

print "done."

Computing user / user-product features ... 
	norders of product by user ... 
	norders by user
	baskets since last order of product ...
	how often the product appears in the same user's basket ... done.


In [6]:
#
# Join every feature table into a single frame keyed by (user_id, product_id)
#
features = user_prod_norders.merge(user_norders, on='user_id', how='left')
features = features.merge(
    baskets_since, on=['user_id', 'product_id'], how='left')
features = features.merge(reorder_rate, on='product_id', how='left')
features = features.merge(reorder_prob, on='product_id', how='left')

# the per-feature tables are no longer needed; `reorder_prob` (a "product
# feature") is deliberately kept for the submission stage
del user_norders, baskets_since, reorder_rate

# Attach the label: whether the product was reordered by the user in `train`.
# Pairs that do not appear in `train` have no match and are filled with 0.
labels = train[['user_id', 'product_id', 'reordered']]
features = features.merge(
    labels, on=['user_id', 'product_id'], how='left').fillna(0)

features.head(15)

Unnamed: 0,user_id,product_id,user_prod_norders,user_norders,basktets_since,reorder_rate,reorder_prob,reordered
0,1,196,10,10,0,4.473875,0.152625,1.0
1,1,10258,9,10,0,3.493716,0.140036,1.0
2,1,10326,1,10,5,2.873635,0.147166,0.0
3,1,12427,10,10,0,3.857058,0.095295,0.0
4,1,13032,3,10,0,2.916796,0.090202,1.0
5,1,13176,2,10,5,5.972111,0.210303,0.0
6,1,14084,1,10,9,5.290505,0.148406,0.0
7,1,17122,1,10,5,3.08239,0.104819,0.0
8,1,25133,8,10,0,3.848447,0.086335,1.0
9,1,26088,2,10,8,2.16939,0.066208,1.0


In [9]:
#
# Establish separate training and validation data
#

Nval = 50000 # seems like a good number
uids = train['user_id'].unique()
uids_train = uids[:-Nval]
uids_val = uids[-Nval:]

#
# training
#
where = features['user_id'].isin(uids_train)
X_train = features[where][[
    'user_prod_norders',
    'user_norders',
    'basktets_since',
    'reorder_rate',
    'reorder_prob']].as_matrix()
y_train = features[where]['reordered'].as_matrix()

#
# validation
#
where = features['user_id'].isin(uids_val)
X_val = features[where][[
    'user_prod_norders',
    'user_norders',
    'basktets_since',
    'reorder_rate',
    'reorder_prob']].as_matrix()
y_val = features[where]['reordered'].as_matrix()

#
# test / submission
#
uids_test = orders[orders['eval_set'] == 'test']['user_id'].unique()
where = features['user_id'].isin(uids_test)
X_test = features[where][[
    'user_prod_norders',
    'user_norders',
    'basktets_since',
    'reorder_rate',
    'reorder_prob']].as_matrix()

print len(uids_val), len(uids_train), len(uids_test), len(orders['user_id'].unique())

50000 81209 75000 206209


In [18]:
#
# Train the decision tree model(s)
#
# `trees` collects every fitted model so later cells can iterate over them.
# A fixed random_state makes the fit repeatable from one run to the next.
#

trees = []
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
trees.append(clf)


In [19]:
#
# Let's see how we covered the training set
#
print "Performance on training set ... "
this_target = target[target['user_id'].isin(uids_train)]
this_features = features[features['user_id'].isin(uids_train)]
for clf in trees:
    pred = clf.predict_proba(X_train)[:, 1]
    for p in [0.195]: # time and time again, this value wins
        this_features['prediction'] = pred > p
        out = this_features.groupby('user_id').apply(
            lambda x: set(x[x['prediction'] == True]['product_id'])
            ).reset_index(name='prediction')

        out = out.merge(this_target, on='user_id', how='left')
        print p, f1score(out['prediction'], out['target'])
        del this_features['prediction']
    
#
# Now, how do we perform out of sample?
#
print "Performance on validation set ... "
this_target = target[target['user_id'].isin(uids_val)]
this_features = features[features['user_id'].isin(uids_val)]
for clf in trees:
    pred = clf.predict_proba(X_val)[:, 1]
    for p in [0.195]:
        
        this_features['prediction'] = pred > p
        out = this_features.groupby('user_id').apply(
            lambda x: set(x[x['prediction'] == True]['product_id'])
            ).reset_index(name='prediction')

        out = out.merge(this_target, on='user_id', how='left')
        print p, f1score(out['prediction'], out['target'])
        del this_features['prediction']

Performance on training set ... 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


0.195 0.215111613238
Performance on validation set ... 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.195 0.215869912678


In [15]:
#
# Let's make a submission!!
#
# Use the most recently trained model. Indexing a fixed position such as
# trees[2] fails with IndexError when fewer models have been appended.
#
clf = trees[-1]

#
# generate predictions for test set from model
#
oid_uid_test = orders[orders['eval_set'] == 'test'][['order_id', 'user_id']]
pred = clf.predict_proba(X_test)[:, 1] > 0.195

# explicit copy so that adding the prediction column does not write into a
# slice of `features` (the source of pandas' SettingWithCopyWarning)
this_features = features[features['user_id'].isin(uids_test)].copy()
this_features['prediction'] = pred

# attach each test user's order_id, then collect the predicted products
# per order
this_features = this_features.merge(oid_uid_test, on='user_id', how='left')
out = this_features.groupby('order_id').apply(
    lambda x: set(x[x['prediction'] == True]['product_id'])
    ).reset_index(name='prediction')

out.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,order_id,prediction
0,17,"{47766, 13107, 21709, 39275, 21463}"
1,34,"{47792, 44632, 39180, 43504, 21137, 16083, 477..."
2,137,"{38689, 25890, 44422, 5134, 23794, 24852, 2326..."
3,182,"{11520, 5479, 33000, 47209, 39275, 41149, 4767..."
4,257,"{27104, 49235, 24838, 39475, 29837, 13870, 211..."


In [16]:
#
# Write predictions to disk for submission.
#
# One line per order: "<order_id>,<space-separated product ids>", or the
# literal "None" when no product is predicted for the order.
#

# `with` closes the file even if a write raises part-way through
with open('submission.csv', 'w') as fd:
    fd.write('order_id,products\n')

    for oid, pr in zip(out['order_id'], out['prediction']):
        if pr:
            # sets have no defined order; sort so the file is reproducible
            products_field = ' '.join(map(str, sorted(pr)))
        else:
            products_field = 'None'
        fd.write('%d,%s\n' % (oid, products_field))