### forked from https://www.kaggle.com/znielsen/test-instacart-analysis/code

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk # text preprocessing & manipulation
import matplotlib.pyplot as plt # plotting
import seaborn as sns # plotting

from functools import partial # to reduce df memory consumption by applying to_numeric

color = sns.color_palette() # adjusting plotting style
import warnings
warnings.filterwarnings('ignore') # silence annoying warnings
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))

In [2]:
DATA_ROOT = "/media/felipe/ssd_vol/instacart/"
PICKLE_ROOT = DATA_ROOT+"pickles/"

# Any results you write to the current directory are saved as output.

# Import all the data
aisles = pd.read_csv(DATA_ROOT+'aisles.csv', engine='c')
departments = pd.read_csv(DATA_ROOT+'departments.csv', engine='c')
products = pd.read_csv(DATA_ROOT+'products.csv', engine='c')

op_prior = pd.read_csv(DATA_ROOT+'order_products__prior.csv', engine='c', 
                       dtype={'order_id': np.int32, 
                              'product_id': np.int32, 
                              'add_to_cart_order': np.int16, 
                              'reordered': np.int8})
                              
op_train = pd.read_csv(DATA_ROOT+'order_products__train.csv', engine='c', 
                       dtype={'order_id': np.int32, 'product_id': np.int32, 
                              'add_to_cart_order': np.int16, 'reordered': np.int8})

# orders
orders = pd.read_csv(DATA_ROOT+'orders.csv', engine='c', dtype={'order_id': np.int32, 
                                                           'user_id': np.int32, 
                                                           'order_number': np.int32, 
                                                           'order_dow': np.int8, 
                                                           'order_hour_of_day': np.int8, 
                                                           'days_since_prior_order': np.float16})

# test dataset (submission)
test = pd.read_csv(DATA_ROOT+'sample_submission.csv', engine='c')

### this is just like my own product_reorder_factor, but why use only the train set?

In [3]:
#Add a field to calculate the sum of times an item was reordered
products['rsum']=op_train.groupby('product_id')['reordered'].sum()
#Add a field to calculate the total times the item could have been reordered
products['rtotal']=op_train.groupby('product_id')['reordered'].count()
#Add a field to calculate the probability that the item was reordered
products['prob']=products['rsum']/products['rtotal']

In [4]:
products.head(10)

Unnamed: 0,product_id,product_name,aisle_id,department_id,rsum,rtotal,prob
0,1,Chocolate Sandwich Cookies,61,19,,,
1,2,All-Seasons Salt,104,13,49.0,76.0,0.644737
2,3,Robust Golden Unsweetened Oolong Tea,94,7,1.0,4.0,0.25
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,6.0,6.0,1.0
4,5,Green Chile Anytime Sauce,5,13,14.0,22.0,0.636364
5,6,Dry Nose Oil,11,11,1.0,1.0,1.0
6,7,Pure Coconut Water With Orange,98,7,,,
7,8,Cut Russet Potatoes Steam N' Mash,116,1,1.0,1.0,1.0
8,9,Light Strawberry Blueberry Yogurt,120,16,7.0,13.0,0.538462
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,2.0,5.0,0.4


In [5]:
#Merge all of the details into a goods page
goods = pd.merge(left=pd.merge(left=products, right=departments, how='left'), right=aisles, how='left')
# to retain '-' and make product names more "standard"
goods.product_name = goods.product_name.str.replace(' ', '_').str.lower()

In [6]:
goods.head(10)

Unnamed: 0,product_id,product_name,aisle_id,department_id,rsum,rtotal,prob,department,aisle
0,1,chocolate_sandwich_cookies,61,19,,,,snacks,cookies cakes
1,2,all-seasons_salt,104,13,49.0,76.0,0.644737,pantry,spices seasonings
2,3,robust_golden_unsweetened_oolong_tea,94,7,1.0,4.0,0.25,beverages,tea
3,4,smart_ones_classic_favorites_mini_rigatoni_wit...,38,1,6.0,6.0,1.0,frozen,frozen meals
4,5,green_chile_anytime_sauce,5,13,14.0,22.0,0.636364,pantry,marinades meat preparation
5,6,dry_nose_oil,11,11,1.0,1.0,1.0,personal care,cold flu allergy
6,7,pure_coconut_water_with_orange,98,7,,,,beverages,juice nectars
7,8,cut_russet_potatoes_steam_n'_mash,116,1,1.0,1.0,1.0,frozen,frozen produce
8,9,light_strawberry_blueberry_yogurt,120,16,7.0,13.0,0.538462,dairy eggs,yogurt
9,10,sparkling_orange_juice_&_prickly_pear_beverage,115,7,2.0,5.0,0.4,beverages,water seltzer sparkling water


In [7]:
# merge train and prior together iteratively, to fit into 8GB kernel RAM

from functools import partial

# initialize it with train dataset
order_details = pd.merge(
                left=op_train,
                 right=orders, 
                 how='left', 
                 on='order_id'
        ).apply(partial(pd.to_numeric, errors='ignore', downcast='integer'))

# add order hierarchy
order_details = pd.merge(
                left=order_details,
                right=goods[['product_id', 
                             'aisle_id', 
                             'department_id',
                             'prob']].apply(partial(pd.to_numeric, ##Added the 'prob'
                                                             errors='ignore', 
                                                             downcast='integer')),
                how='left',
                on='product_id'
)



In [8]:
order_details.head(20)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle_id,department_id,prob
0,1,49302,1,1,112108,train,4,4,10,9,120,16,0.333333
1,1,11109,2,1,112108,train,4,4,10,9,108,16,
2,1,10246,3,0,112108,train,4,4,10,9,83,4,
3,1,49683,4,0,112108,train,4,4,10,9,83,4,0.0
4,1,43633,5,1,112108,train,4,4,10,9,95,15,0.46875
5,1,13176,6,0,112108,train,4,4,10,9,24,4,0.5
6,1,47209,7,0,112108,train,4,4,10,9,24,4,
7,1,22035,8,1,112108,train,4,4,10,9,21,16,0.115942
8,36,39612,1,0,79431,train,23,6,18,30,2,16,
9,36,19660,2,1,79431,train,23,6,18,30,115,7,0.45


In [9]:
print(order_details.shape, op_train.shape)

(1384617, 13) (1384617, 4)


In [10]:
# delete (redundant now) dataframes
del op_train

order_details.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle_id,department_id,prob
0,1,49302,1,1,112108,train,4,4,10,9,120,16,0.333333
1,1,11109,2,1,112108,train,4,4,10,9,108,16,
2,1,10246,3,0,112108,train,4,4,10,9,83,4,
3,1,49683,4,0,112108,train,4,4,10,9,83,4,0.0
4,1,43633,5,1,112108,train,4,4,10,9,95,15,0.46875


In [11]:
# split df indexes into parts
indexes = np.linspace(0, len(op_prior), num=10, dtype=np.int32)

# update by small portions
for i in range(len(indexes)-1):
    order_details = pd.concat(
        [   
            order_details,
            pd.merge(left=pd.merge(
                            left=op_prior.loc[indexes[i]:indexes[i+1], :],
                            right=goods[['product_id', 
                                         'aisle_id', 
                                         'department_id' ]].apply(partial(pd.to_numeric, 
                                                                          errors='ignore', 
                                                                          downcast='integer')),
                            how='left',
                            on='product_id'
                            ),
                     right=orders, 
                     how='left', 
                     on='order_id'
                ) #.apply(partial(pd.to_numeric, errors='ignore', downcast='integer'))
        ]
    )
        
print('Datafame length: {}'.format(order_details.shape[0]))
print('Memory consumption: {:.2f} Mb'.format(sum(order_details.memory_usage(index=True, 
                                                                         deep=True) / 2**20)))
# check dtypes to see if we use memory effectively
print(order_details.dtypes)

# make sure we didn't forget to retain test dataset :D
test_orders = orders[orders.eval_set == 'test']

Datafame length: 33819114
Memory consumption: 3354.25 Mb
add_to_cart_order           int16
aisle_id                    int16
days_since_prior_order    float16
department_id                int8
eval_set                   object
order_dow                    int8
order_hour_of_day            int8
order_id                    int32
order_number                int32
prob                      float64
product_id                  int32
reordered                    int8
user_id                     int32
dtype: object


In [12]:
order_details.head(10)

Unnamed: 0,add_to_cart_order,aisle_id,days_since_prior_order,department_id,eval_set,order_dow,order_hour_of_day,order_id,order_number,prob,product_id,reordered,user_id
0,1,120,9.0,16,train,4,10,1,4,0.333333,49302,1,112108
1,2,108,9.0,16,train,4,10,1,4,,11109,1,112108
2,3,83,9.0,4,train,4,10,1,4,,10246,0,112108
3,4,83,9.0,4,train,4,10,1,4,0.0,49683,0,112108
4,5,95,9.0,15,train,4,10,1,4,0.46875,43633,1,112108
5,6,24,9.0,4,train,4,10,1,4,0.5,13176,0,112108
6,7,24,9.0,4,train,4,10,1,4,,47209,0,112108
7,8,21,9.0,16,train,4,10,1,4,0.115942,22035,1,112108
8,1,2,30.0,16,train,6,18,36,23,,39612,0,79431
9,2,115,30.0,7,train,6,18,36,23,0.45,19660,1,79431


In [13]:
# delete (redundant now) dataframes
del op_prior, orders

test_history = order_details[(order_details.user_id.isin(test_orders.user_id))]
last_orders = test_history.groupby('user_id')['order_number'].max()

In [16]:
test_history.head(10)

Unnamed: 0,add_to_cart_order,aisle_id,days_since_prior_order,department_id,eval_set,order_dow,order_hour_of_day,order_id,order_number,prob,product_id,reordered,user_id
112,1,27,1.0,5,prior,6,17,13,2,,17330,0,45082
113,2,51,1.0,13,prior,6,17,13,2,,27407,0,45082
114,3,124,1.0,5,prior,6,17,13,2,,35419,0,45082
115,4,77,1.0,7,prior,6,17,13,2,,196,0,45082
116,5,51,1.0,13,prior,6,17,13,2,,44635,0,45082
117,6,64,1.0,7,prior,6,17,13,2,,26878,0,45082
118,7,64,1.0,7,prior,6,17,13,2,,25783,0,45082
119,8,31,1.0,7,prior,6,17,13,2,,41290,0,45082
120,9,115,1.0,7,prior,6,17,13,2,,33198,0,45082
121,10,77,1.0,7,prior,6,17,13,2,,23020,0,45082


In [23]:
last_orders.reset_index().head()

Unnamed: 0,user_id,order_number
0,3,12
1,4,5
2,6,3
3,11,7
4,12,5


In [20]:
def get_last_orders_reordered():
    t = pd.merge(
            left=pd.merge(
                    left=last_orders.reset_index(),
                    right=test_history[test_history.reordered == 1],
                    how='left',
                    on=['user_id', 'order_number']
                )[['user_id', 'product_id']],
            right=test_orders[['user_id', 'order_id']],
            how='left',
            on='user_id'
        ).fillna(-1).groupby('order_id')['product_id'].apply(lambda x: ' '.join([str(int(e)) for e in set(x)]) 
                                                  ).reset_index().replace(to_replace='-1', 
                                                                          value='None')
    t.columns = ['order_id', 'products']
    return t

In [21]:
submission = get_last_orders_reordered()

In [22]:
submission.head(100)

Unnamed: 0,order_id,products
0,17,13107 21463
1,34,2596 47792 39475 16083 47766 44663 13176
2,137,25890 44422 5134 23794 24852 2326 29594 41787
3,182,5479 33000 39275 32109 47672 9337
4,257,27104 29837 37646 49235 24852 39475 35734 3023...
5,313,12779 13198 21903 45007 28535 30391 46906
6,353,40688 35561
7,386,15872 21479 28985 38281 45066 25513 39180 4075...
8,414,27705 20564 31215
9,418,13702 40268 5262 45364 47766 30489


In [None]:
# save submission
submission.to_csv('last_order_reordered_only.csv', 
                        encoding='utf-8', 
                        index=False)