# Package Import - Review of tables

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc

![CSV_NAMES](https://kaggle2.blob.core.windows.net/forum-message-attachments/183176/6539/instacartFiles.png)

# Import of Datasets & Merging

In [None]:
# Read the three lookup tables; engine='c' selects the C parser for speed.
aisles = pd.read_csv('../input/aisles.csv', engine='c')
departments = pd.read_csv('../input/departments.csv', engine='c')
products = pd.read_csv('../input/products.csv', engine='c')

# Enrich every product with its department and aisle in one chained request.
# No join key is given, so each merge uses the columns both frames share.
goods = products.merge(departments, how='left').merge(aisles, how='left')

# Normalise the text columns: spaces become underscores, then lower-case.
goods['product_name'] = goods['product_name'].str.replace(' ', '_').str.lower()
goods['department'] = goods['department'].str.replace(' ', '_').str.lower()
goods['aisle'] = goods['aisle'].str.replace(' ', '_').str.lower()

goods.head()

In [None]:
# Read the orders table with the C parser and preview the first rows.
orders = pd.read_csv(filepath_or_buffer='../input/orders.csv', engine='c')
orders.head(5)

In [None]:
# Load the products of the "prior" orders.
op_prior = pd.read_csv('../input/order_products__prior.csv', engine='c')
# Load the products of the "train" orders.
op_train = pd.read_csv('../input/order_products__train.csv', engine='c')

# Stack the train rows below the prior rows. ignore_index=True (a real
# boolean rather than the integer 1) discards the two original indexes and
# numbers the combined rows 0..n-1.
log = pd.concat([op_prior, op_train], ignore_index=True)
log.tail()

In [None]:
# !--! runtime: 1m:14s
# Sort rows by order and by the position at which each product entered the
# cart, then rebuild a clean 0..n-1 index (both in place to avoid a copy).
log.sort_values(['order_id', 'add_to_cart_order'], inplace=True)
log.reset_index(drop=True, inplace=True)

# Attach the product attributes and the order attributes to every row.
log = pd.merge(log, goods, on='product_id', how='left')
log = pd.merge(log, orders, on='order_id', how='left')

# Reverse order number per user: 0 for the user's highest order_number found
# in log, 1 for the one before it, and so on. The 'max' string alias is used
# instead of np.max so pandas dispatches to its built-in group-wise maximum.
log['order_number_rev'] = log.groupby('user_id').order_number.transform('max') - log.order_number

# Release the intermediate frames created by the merges.
gc.collect()
log.head()

In [None]:
# Count the rows of log whose eval_set is 'test'. log was built only from
# op_prior and op_train, so this is presumably 0 -- confirm when run.
log.loc[log['eval_set'] == 'test', 'order_id'].count()

> The `log` DataFrame will be the main table used for creating our new features.

# Feature Engineering

In [None]:
# Group the log by order; the group size is the number of products in the order.
gro = log.groupby('order_id')
order_size = gro.size().reset_index(name='total_products_of_order')
order_size.head()

In [None]:
# Add the basket size of its order to every row of the log.
log = log.merge(order_size, how='left', on='order_id')
log.head()

## Item Features  - groupby('product_id')

### The general groupby statement

In [None]:
# One group per product: the product-level features in the following cells
# are all aggregated from this grouping.
gr = log.groupby(by='product_id')

### Total times a product was bought

In [None]:
# `pro` (for products) is the master DataFrame collecting every product feature.
# The per-product row count is turned into a one-column frame named
# 'total_purchases', indexed by product_id.
pro = gr['product_id'].count().to_frame(name='total_purchases')

pro.head()

### Cart order of a product

In [None]:
# Statistics of the add-to-cart position of every product.
# Each aggregate is a Series indexed by product_id, so assigning it to `pro`
# (also indexed by product_id at this point) aligns the values per product.
cart_pos = gr['add_to_cart_order']

# average position in the cart
pro['item_mean_pos_cart'] = cart_pos.mean()
# sum of all positions
pro['item_sum_pos_cart'] = cart_pos.sum()
# smallest position (earliest the product was ever added to a cart)
pro['item_min_pos_cart'] = cart_pos.min()
# largest position (latest the product was ever added to a cart)
pro['item_max_pos_cart'] = cart_pos.max()
# median position
pro['item_median_pos_cart'] = cart_pos.median()
# standard deviation - how dispersed the position is
pro['item_std_pos_cart'] = cart_pos.std()

pro.head(10)



In [None]:
# Move product_id out of the index into a regular column so it can be used
# as a merge key.
pro = pro.reset_index(level=0)
pro.head()

### Days since last order for an item

In [None]:
# Per-product statistics of days_since_prior_order.
#
# Rows with a missing days_since_prior_order (a user's first order, per the
# original note) are dropped first. Only that column is checked: a blanket
# dropna() would also discard rows that merely have a NaN in some unrelated
# column. The filtering and grouping are done once and all six statistics
# come from a single agg() call, instead of repeating the dropna/groupby on
# the full log for every statistic.
dslo = (
    log[['product_id', 'days_since_prior_order']]
    .dropna(subset=['days_since_prior_order'])
    .groupby('product_id')
    .days_since_prior_order
    .agg(['mean', 'max', 'min', 'sum', 'median', 'std'])
)
# Rename mean/max/min/sum/median/std to the final feature names.
dslo.columns = ['days_since_last_order_product_' + stat for stat in dslo.columns]
# product_id becomes a regular column so it can serve as a merge key.
dslo.reset_index(level=0, inplace=True)
dslo.head()

In [None]:
# Attach the days-since-prior-order statistics to the master product frame.
pro = pro.merge(dslo, how='left', on='product_id')
pro.head()

### Calculate the bought-only-once ratio of a product

In [None]:
#runtime : 35s
# Number of log rows for every (product, user) pair.
item_users = log.groupby(['product_id', 'user_id']).size().reset_index(name='total')
# Preview the pairs where the user has the product exactly once.
item_users.loc[item_users['total'] == 1].head()

In [None]:
# For every product, count the users who have it exactly once in the log.
item_one = (
    item_users.loc[item_users['total'] == 1]
    .groupby('product_id')
    .size()
    .reset_index(name='item_only_one_user_total')
)
item_one.head()

In [None]:
# Ratio = users who bought the product exactly once / total purchases of it
# [total purchases were already calculated in the pro master frame].
#
# item_one and pro both carry a plain positional index, and item_one only
# contains products that have at least one single-purchase user. Dividing the
# two columns directly would therefore pair rows by position, not by product.
# The totals are looked up through product_id instead.
item_one['ratio_firstorder_to_all'] = (
    item_one['item_only_one_user_total']
    / item_one['product_id'].map(pro.set_index('product_id')['total_purchases'])
)
item_one.head()

>  *We can follow the same procedure (cumulative or single) for products ordered a 2nd to 5th time.*

In [None]:
# Merge the single-purchase features into the master product frame.
# The join key is stated explicitly (as in the other merges on pro) so that a
# column name accidentally shared by both frames can never change the join.
pro = pd.merge(pro, item_one, on='product_id', how='left')
pro.head()

## Product features by order_hour_of_day

### Total orders of a product for a given hour

In [None]:
# Number of log rows per product and hour of day.
product_hour1 = (
    log.groupby(['product_id', 'order_hour_of_day'])
    .size()
    .reset_index(name='item_hour_cnt')
)
product_hour1.head(25)

In [None]:
# Share of a product's purchases that fall into each hour of the day.
# Only the count column is transformed (rather than summing every column and
# then picking one), and the 'sum' string alias is used instead of np.sum so
# pandas dispatches to its built-in group-wise sum.
product_hour1['item_hour_ratio'] = (
    product_hour1.item_hour_cnt
    / product_hour1.groupby('product_id').item_hour_cnt.transform('sum')
)
product_hour1.head()

In [None]:
### Total unique orders of a product for a given hour (repeat purchases by the same user in the same hour are counted once)

In [None]:
# Same per-hour count, but each (user, product, hour) combination is counted
# only once, so repeat purchases by the same user in the same hour collapse.
product_hour2 = (
    log.drop_duplicates(['user_id', 'product_id', 'order_hour_of_day'])
    .groupby(['product_id', 'order_hour_of_day'])
    .size()
    .reset_index(name='item_hour_cnt_unq')
)
# Share of the product's de-duplicated purchases falling into each hour.
# Only the count column is transformed, using the 'sum' alias instead of
# np.sum applied to every column.
product_hour2['item_hour_ratio_unq'] = (
    product_hour2.item_hour_cnt_unq
    / product_hour2.groupby('product_id').item_hour_cnt_unq.transform('sum')
)
product_hour2.head()

In [None]:
# Combine the raw and the de-duplicated hourly features of each product.
# The join keys are the two columns the frames share; they are spelled out so
# the join cannot silently change if another common column is ever added.
product_hour = pd.merge(product_hour1, product_hour2, on=['product_id', 'order_hour_of_day'])
product_hour.head()

## Product features by order_dow (day of week)

### Total orders of a product for a given day

In [None]:
# Number of log rows per product and day of week.
product_day1 = (
    log.groupby(['product_id', 'order_dow'])
    .size()
    .reset_index(name='item_dow_cnt')
)
# Share of a product's purchases that fall on each day of the week.
# Only the count column is transformed, using the 'sum' alias instead of
# np.sum applied to every column.
product_day1['item_dow_ratio'] = (
    product_day1.item_dow_cnt
    / product_day1.groupby('product_id').item_dow_cnt.transform('sum')
)

In [None]:
# Same per-day count, but each (user, product, day-of-week) combination is
# counted only once.
product_day2 = (
    log.drop_duplicates(['user_id', 'product_id', 'order_dow'])
    .groupby(['product_id', 'order_dow'])
    .size()
    .reset_index(name='item_dow_cnt_unq')
)
# Share of the product's de-duplicated purchases falling on each day.
# Only the count column is transformed, using the 'sum' alias instead of
# np.sum applied to every column.
product_day2['item_dow_ratio_unq'] = (
    product_day2.item_dow_cnt_unq
    / product_day2.groupby('product_id').item_dow_cnt_unq.transform('sum')
)

In [None]:
# Combine the raw and the de-duplicated day-of-week features of each product.
# The join keys are the two columns the frames share; they are spelled out so
# the join cannot silently change if another common column is ever added.
product_day = pd.merge(product_day1, product_day2, on=['product_id', 'order_dow'])
product_day.head()

# More to explore:
## Order Features  - groupby('order_id')

In [None]:
# Number of products in every order (the same computation already appears in
# the Feature Engineering section above).
gro = log.groupby('order_id')
order_size = gro.size().reset_index(name='total_products_of_order')
order_size.head()

In [None]:
# How many rows of the log each user has for each product.
order_product = (
    log.groupby(['user_id', 'product_id'])
    .size()
    .reset_index(name='times')
)

In [None]:
# Preview the per-(user, product) purchase counts.
order_product.head()

In [None]:
# For every product, the largest number of times any single user has it.
order_product_choice = order_product.groupby('product_id')['times'].max().reset_index()
order_product_choice.head()

# Merge all features to order

In [None]:
#final = log.copy()
#final = pd.merge(final, pro, how='left', on='product_id')
#final = pd.merge(final, product_hour, how='left', on=['product_id', 'order_hour_of_day'])
#final = pd.merge(final, product_day, how='left', on=['product_id', 'order_dow'])

#final = pd.merge(final, order_size, how='left', on='order_id')
#final = pd.merge(final, order_product, how='left', on=['user_id', 'product_id'])
#final = pd.merge(final, order_product_choice, how='left', on='product_id')

#final.head(10)