In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Folder that holds the raw CSV files, relative to this notebook.
data_path = '../input'

# Dimension tables: aisles, departments, products.
# The generator is consumed left to right, so the files are read in the listed order.
df_aisles, df_departments, df_products = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('/aisles.csv', '/departments.csv', '/products.csv')
)

# Order headers, followed by the two order -> product line-item tables.
df_orders, df_ord_prod_train, df_ord_prod_prior = (
    pd.read_csv(data_path + csv_name)
    for csv_name in (
        '/orders.csv',
        '/order_products__train.csv',
        '/order_products__prior.csv',
    )
)

# Orders


*orders.csv* contains 3,421,083 rows, with one order per row (without products).

The csv is sorted by (user_id, order_number).

Important columns:
- **user_id**: Each user has at least 4 orders, and the data is capped at a maximum of 100 orders per user
- **order_number**: 1st order for each user has order_number == 1, 2nd == 2, etc. 
- **eval_set**: prior (3,214,874), train (131,209), test (75,000)
  - *prior*: all orders except the last one
  - *train*, *test*: the last order for a user. For each user, the last order is flagged as either train or test.

In [None]:
# Overall size of the orders table, then the number of orders in each eval_set.
print(
    "df_orders shape: {}".format(df_orders.shape),
    df_orders.groupby('eval_set').size(),
    sep='\n',
)

# Show the first 15 orders as the cell output.
df_orders.iloc[:15]

In [None]:
# For user_id <= 10: count orders per (user_id, eval_set) and pivot eval_set
# into columns, filling combinations that never occur with 0.
(
    df_orders
    .loc[df_orders['user_id'] <= 10]
    .groupby(['user_id', 'eval_set'])
    .size()
    .unstack(fill_value=0)
)

In [None]:
# Distribution of the number of orders per user:
# value_counts() gives one entry per user (its number of orders), and
# countplot then counts how many users share each order count.
plt.figure(figsize=(20, 8))
# Pass the Series as `x=`: since seaborn 0.12 the first positional parameter of
# countplot is `data` and x/y are keyword-only, so a positional Series is no
# longer interpreted as the variable to count.
ax = sns.countplot(x=df_orders['user_id'].value_counts())
# Label explicitly; the default label would be taken from the Series name.
ax.set_xlabel('number of orders per user')
ax.set_ylabel('number of users');

# Order Products

- prior orders: total of 32,434,489 ordered products
- train orders: total of 1,384,617 ordered products

For working with order products, it is probably more convenient to concatenate both DataFrames into a single one and add an `eval_set` column that records which file each row came from.

In [None]:
print("df_ord_prod_train shape: {}".format(df_ord_prod_train.shape))
print("df_ord_prod_prior shape: {}".format(df_ord_prod_prior.shape))

In [None]:
df_ord_prod_prior.head(3)

In [None]:
# Tag each line-item table with the eval_set it belongs to, so the rows stay
# distinguishable after the two tables are stacked.
df_ord_prod_prior['eval_set'] = 'prior'
df_ord_prod_train['eval_set'] = 'train'

# Stack prior + train into a single frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
df_order_products = pd.concat(
    [df_ord_prod_prior, df_ord_prod_train],
    ignore_index=True,
)

In [None]:
# Number of products in each order (one entry per order_id).
cnt_products_per_order = df_order_products.groupby('order_id').size()

# Distribution of basket sizes: how many orders contain 1, 2, 3, ... products.
plt.figure(figsize=(20, 8))
# Pass the Series as `x=`: since seaborn 0.12 the first positional parameter of
# countplot is `data` and x/y are keyword-only.
sns.countplot(x=cnt_products_per_order)
plt.xlabel('number of products in order')
plt.ylabel('number of orders')
# Rotate the tick labels so the many distinct basket sizes stay readable;
# assigning the result keeps the tick objects out of the cell output.
xt = plt.xticks(rotation='vertical')

# Aisles, Departments and Products

- products, aisles, and departments are classic dimensional entities
- there are 49,688 products, 134 aisles and 21 departments
- since products.csv contains aisle and department ids, they can be joined in one products dataframe

In [None]:
# Report (rows, columns) for each of the three dimension tables, one per line.
print(
    "df_aisles shape: {}".format(df_aisles.shape),
    "df_departments shape: {}".format(df_departments.shape),
    "df_products shape: {}".format(df_products.shape),
    sep='\n',
)

In [None]:
# Fold the aisle and department tables into the products table.
# No `on=` is given, so each merge is an inner join on whatever columns the
# two frames have in common.
df_products = pd.merge(df_products, df_aisles)
df_products = pd.merge(df_products, df_departments)

# Preview the first five rows of the combined products table.
df_products.head(5)