In [None]:
import numpy as np
import pandas as pd
import pickle

import gc

import xgboost

from tqdm import *

In [None]:
# Root folder of the CSV inputs. The trailing slash matters: file paths in the
# cells below are built by plain string concatenation onto this value.
DATA_ROOT = "/media/felipe/SSD_VOLUME//instacart/"
# "pickles/" sub-folder located directly under the data root.
PICKLE_ROOT = "".join((DATA_ROOT, "pickles/"))

In [None]:
# Load aisles.csv from the data root.
aisles_df = pd.read_csv(filepath_or_buffer=DATA_ROOT + "aisles.csv")

In [None]:
# Order lines from order_products__train.csv, read with compact integer dtypes
# to keep the frame small in memory.
last_items_ordered_df = pd.read_csv(
    DATA_ROOT + '/order_products__train.csv',
    engine='c',
    dtype=dict(
        order_id=np.int32,
        product_id=np.int32,
        add_to_cart_order=np.uint8,
        reordered=np.uint8,
    ),
)

In [None]:
# Order headers from orders.csv. Integer columns get the smallest dtype used in
# this notebook; days_since_prior_order is read as float16.
orders_df = pd.read_csv(
    DATA_ROOT + '/orders.csv',
    engine='c',
    dtype=dict(
        order_id=np.int32,
        user_id=np.int32,
        order_number=np.int32,
        order_dow=np.uint8,
        order_hour_of_day=np.uint8,
        days_since_prior_order=np.float16,
    ),
)

In [None]:
# Store days_since_prior_order as uint8.
# The column is loaded as float16, so it can hold NaN. Casting NaN straight to
# an unsigned integer is undefined in NumPy (platform-dependent value, and a
# RuntimeWarning on recent versions), so missing values are mapped to 0
# explicitly before the cast.
# NOTE(review): after this step 0 also stands for "value was missing" — confirm
# that downstream features are fine with that encoding.
days_since_prior = orders_df['days_since_prior_order'].values
orders_df['days_since_prior_order'] = np.where(
    np.isnan(days_since_prior), 0, days_since_prior
).astype(np.uint8)

In [None]:
# Load departments.csv from the data root.
departments_df = pd.read_csv(filepath_or_buffer=DATA_ROOT + "/departments.csv")

In [None]:
# Load products.csv from the data root.
products_df = pd.read_csv(filepath_or_buffer=DATA_ROOT + "/products.csv")

In [None]:
# Hold product names as a categorical column instead of plain Python strings.
products_df['product_name'] = products_df.product_name.astype('category')

In [None]:
# Order lines from order_products__prior.csv, read with compact integer dtypes
# to keep the frame small in memory.
prior_items_ordered_df = pd.read_csv(
    DATA_ROOT + '/order_products__prior.csv',
    engine='c',
    dtype=dict(
        order_id=np.int32,
        product_id=np.int32,
        add_to_cart_order=np.uint8,
        reordered=np.uint8,
    ),
)

In [None]:
# Eyeball ten randomly drawn orders (unseeded, so the rows change on every run).
orders_df.sample(n=10)

In [None]:
# One row per distinct user_id, re-indexed from 0.
users_df = (
    orders_df[["user_id"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
users_df.head()

## What is the static reorder factor of each product, for every product ever reordered by each user in the test set?

In [None]:
# Value of orders_df.eval_set used to select the users for this section.
EVAL_SET = 'test'

In [None]:
# Orders whose eval_set equals EVAL_SET, left-joined to the user table;
# only the user_id column is kept.
selected_orders = orders_df.query("eval_set == '{}'".format(EVAL_SET))
users_in_test_set_df = selected_orders.merge(
    users_df,
    on='user_id',
    how='left',
)[["user_id"]]
users_in_test_set_df.head()

In [None]:
# Per-product reorder statistics computed from the train order lines:
#   sum   -> number of times the product was reordered
#   count -> number of times it could have been reordered (appeared in an order)
reorder_stats = last_items_ordered_df.groupby('product_id')['reordered'].agg(['sum', 'count'])

# The grouped result is indexed by product_id, whereas products_df carries its
# own row index. A plain column assignment aligns on that row index, not on
# product_id, and would attach the statistics to the wrong products. Look the
# values up by product_id instead; products absent from the train order lines
# get NaN.
products_df['rsum'] = products_df['product_id'].map(reorder_stats['sum'])
#Add a field to calculate the total times the item could have been reordered
products_df['rtotal'] = products_df['product_id'].map(reorder_stats['count'])
#Add a field to calculate the probability that the item was reordered
products_df['prob'] = products_df['rsum'] / products_df['rtotal']

In [None]:
# Preview the first rows of the product table with the new columns.
products_df.head(5)

In [None]:
# Build one "goods" table: products left-joined with departments, then with
# aisles (each join uses the columns the two frames have in common).
goods = (
    products_df
    .merge(departments_df, how='left')
    .merge(aisles_df, how='left')
)
# Normalise product names: spaces become underscores ('-' is kept), then lowercase.
goods['product_name'] = goods['product_name'].str.replace(' ', '_').str.lower()

In [None]:
# Preview the first ten rows of the merged goods table.
goods.head(n=10)

In [None]:
# Build order_details: every prior order line joined to its order header and
# to the product hierarchy (aisle, department) plus the reorder probability.

from functools import partial

# Column-wise downcast helper: numeric columns shrink to the smallest integer
# dtype that fits, non-numeric columns are returned unchanged.
shrink_ints = partial(pd.to_numeric, errors='ignore', downcast='integer')

# Small per-product lookup: hierarchy ids and 'prob'
product_hierarchy = goods[['product_id',
                           'aisle_id',
                           'department_id',
                           'prob']].apply(shrink_ints)

order_details = (
    prior_items_ordered_df
    # attach the order header (user, order number, timing, eval_set)
    .merge(orders_df, how='left', on='order_id')
    .apply(shrink_ints)
    # add order hierarchy
    .merge(product_hierarchy, how='left', on='product_id')
)

In [None]:
# Preview the first ten joined order lines.
order_details.head(n=10)

In [None]:
# Append the prior order lines to order_details in ten roughly equal chunks,
# each joined to the product hierarchy and to the order headers.
#
# NOTE(review): order_details is already built from prior_items_ordered_df in
# an earlier cell, so this appends every prior row a second time, and the
# appended rows carry no 'prob' value (NaN after the concat). Confirm this
# cell is still wanted.

# split df row positions into parts
indexes = np.linspace(0, len(prior_items_ordered_df), num=10, dtype=np.int32)

# The hierarchy lookup does not depend on the chunk, so build it once.
product_hierarchy = goods[['product_id',
                           'aisle_id',
                           'department_id']].apply(partial(pd.to_numeric,
                                                           errors='ignore',
                                                           downcast='integer'))

# Collect the pieces and concatenate once instead of re-copying the growing
# frame on every iteration.
pieces = [order_details]
for start, stop in zip(indexes[:-1], indexes[1:]):
    # Half-open positional slice. A label slice with .loc includes BOTH end
    # points, which put each boundary row into two consecutive chunks.
    chunk = prior_items_ordered_df.iloc[start:stop]
    pieces.append(
        pd.merge(
            left=pd.merge(left=chunk,
                          right=product_hierarchy,
                          how='left',
                          on='product_id'),
            right=orders_df,
            how='left',
            on='order_id'
        )
    )

order_details = pd.concat(pieces)
        
# Report row count and deep memory footprint (in MiB) of order_details.
row_count = order_details.shape[0]
footprint_mb = sum(order_details.memory_usage(index=True, deep=True) / 2**20)
print('Datafame length: {}'.format(row_count))
print('Memory consumption: {:.2f} Mb'.format(footprint_mb))

# Column dtypes, to verify that the compact types survived the merges.
print(order_details.dtypes)

# Keep the order headers flagged as 'test' for later use.
test_orders = orders_df.loc[orders_df['eval_set'] == 'test']

In [None]:
# Restrict order_details to the selected users: a right join keeps one group
# of rows per user_id present in users_in_test_set_df.
order_details = order_details.merge(
    users_in_test_set_df,
    on='user_id',
    how='right',
)

In [None]:
# Report row count and deep memory footprint (in MiB) of order_details.
row_count = order_details.shape[0]
footprint_mb = sum(order_details.memory_usage(index=True, deep=True) / 2**20)
print('Datafame length: {}'.format(row_count))
print('Memory consumption: {:.2f} Mb'.format(footprint_mb))

# Column dtypes, to verify that the compact types survived the merge.
print(order_details.dtypes)

# Keep the order headers flagged as 'test' for later use.
test_orders = orders_df.loc[orders_df['eval_set'] == 'test']

In [None]:
# Preview the first twenty rows after the user filter.
order_details.head(n=20)

In [None]:
# Twenty smallest distinct order ids remaining in order_details.
sorted(set(order_details['order_id']))[:20]