In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1. Problem Statement

**Objective**: "Use anonymized data on customer orders over time to predict which previously purchased products will be in a user's next order."

In [None]:
import os
import warnings
import zipfile

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

## 2. Load and View the Data

In [None]:
# Unzip every archive shipped with the competition dataset, then read each
# extracted CSV into a dict keyed by the file name up to its first dot.
data_path = "/kaggle/input/instacart-market-basket-analysis/"
data_folder = os.listdir(data_path)
# Extra tokens that pandas should treat as NaN while parsing the CSVs.
missing_value_formats = ["n.a.","?","NA","n/a", "na", "--","-"]

# Relative path: archives are extracted below the current working directory.
extract_dir = "kaggle/working/"

for archive_name in (entry for entry in data_folder if entry.endswith('.zip')):
    with zipfile.ZipFile(data_path + archive_name, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

data = {
    csv_name[:csv_name.find('.')]: pd.read_csv(extract_dir + csv_name, na_values=missing_value_formats)
    for csv_name in os.listdir(extract_dir)
    if csv_name.endswith('.csv')
}

data.keys()
# dict_keys(['orders', 'order_products__prior', 'sample_submission', 'products', 
# 'departments', 'aisles', 'order_products__train'])

In [None]:
# have a look at all the files

def summarize_data(df):
    """Show a quick profile of ``df``.

    Prints a heading and displays, in this order: the first rows
    (``df.head()``), ``df.describe(include='all')``, and the fraction of null
    values per column. Returns nothing.
    """
    # Each section is built lazily so headings and tables alternate in the output.
    sections = (
        ("\nOverview", df.head),
        ("\nSummary", lambda: df.describe(include='all')),
        ("\nNull Values", lambda: df.isnull().sum() / len(df)),
    )
    for heading, build_view in sections:
        print(heading)
        display(build_view())

# Profile every loaded table, printing its name before the summary.
for table_name, table in data.items():
    print("\n\n", table_name)
    summarize_data(table)

<br>

**Overview of Tables**

| Table Name | Column Names | Description | Hypothesis for Data Exploration  |
| --- | --- | --- | --- | 
| aisles | aisle_id, aisle | 134 unique IDs, names for different aisles at Instacart | aisle which generates highest revenue, most frequently used aisle, distribution of<br> aisle usage based on demographics and other customer info, is there a trend of<br> aisle popularity with time of day, day of week, or any specific month of year? |
| departments | department_id, department | 21 unique IDs, names for different departments including 'missing'<br> - looks like the rolled-up metric for aisles e.g. one department might <br> have multiple aisles | revenue by dept, frequency of use by dept, distribution of dept usage based<br> on demographics and other available customer info, dept popularity by time <br>of day, day of week, or any specific month of year  |
| order-products-prior | order_id, product_id, add_to_cart_order, reordered | 32,434,489 rows at order-product level with 3,214,874 unique orders <br>for 49,677 unique products. 'add_to_cart_order' shows the order in which<br> they were added to the cart and 'reordered' flags whether the user has ordered the product before | most ordered products, most frequent re-ordered products, products which <br>are only ordered once and not reordered, Number of products in one order,<br> are the products that are ordered together from the same aisle/dept? |
| order-products-train | order_id, product_id, add_to_cart_order, reordered | Similar to order-products-prior but only has latest order information. 1,384,617 rows<br> with 131,209 unique order IDs and 39,123 unique products | most ordered products, most frequent re-ordered products, products <br>which are only ordered once and not reordered, Number of products in <br>one order, are the products that are ordered together from the same aisle/dept? |
| orders | order_id, user_id, eval_set,<br> order_number, order_dow,<br> order_hour_of_day, days_since_prior_order | 3,421,083 orders showing information on order ID, user ID, which evaluation<br> dataset the order is in (prior, train, test), day of week, hour of day,<br> days since prior order | Most popular day & time for placing an order, trend of day & time by products |
| products | product_id, product_name, aisle_id, department_id | 49,688 rows mapping products to aisles and departments |	covered in above metrics |

<br>

**Missing Values**

* 6% values are missing for 'days since prior order' in 'orders' df --> Given these are only for Order Number 1, we can replace missing values with '0' days
* No other missing values found in the data
* FYI - We could use imputers from sklearn if we couldn't replace our missing data with 0



## 3. Master Dataset

In [None]:
# Prior and Train order datasets have the same columns but hold orders from
# different time frames, so they are stacked into one order-product table.
order_products = pd.concat(
    [data['order_products__prior'], data['order_products__train']],
    ignore_index=True,
)
n_order_products = len(order_products)

# Attach the lookup tables one by one. validate='many_to_one' makes pandas raise
# if a lookup has duplicate keys, which would otherwise silently duplicate rows.
master_df = order_products
for lookup_table, key in [
    (data['products'], 'product_id'),
    (data['aisles'], 'aisle_id'),
    (data['departments'], 'department_id'),
    (data['orders'], 'order_id'),
]:
    master_df = master_df.merge(lookup_table, on=key, validate='many_to_one')

# The merges are inner joins, so an unmatched key would silently drop rows.
assert len(master_df) == n_order_products, "merging the lookup tables dropped order-product rows"

col_order = ['user_id','order_id','product_id','aisle_id','department_id','add_to_cart_order',
 'reordered','product_name','aisle','department','eval_set','order_number','order_dow','order_hour_of_day',
 'days_since_prior_order']

# Sort once, after all merges, instead of re-sorting the full frame after each one.
master_df = master_df[col_order].sort_values(by=['order_id']).reset_index(drop=True)

# Free the raw tables; note that this cell cannot be re-run without reloading them.
del data

summarize_data(master_df)

## 4. EDA

In [None]:
# Identify Primary Key: number of distinct (user, order, product) combinations
primary_key = ['user_id', 'order_id', 'product_id']
master_df.groupby(primary_key).ngroups

In [None]:
# Identify unique values in columns: distinct count and its share of all rows
n_rows = len(master_df)
print ("\nNumber of Rows : ", n_rows)

report_columns = (
    'user_id', 'order_id', 'product_id', 'aisle_id', 'department_id',
    'add_to_cart_order', 'reordered', 'product_name', 'aisle', 'department',
    'eval_set', 'order_number', 'order_dow', 'order_hour_of_day',
    'days_since_prior_order',
)
for column in report_columns:
    n_unique = master_df[column].nunique()
    print (f"Unique {column}: ", n_unique, ", % :", f"{n_unique / n_rows: .2%}")

In [None]:
# look at the hierarchy of products, aisles and departments
# (one row per department / aisle / product combination with its row count)
hierarchy_levels = ['department', 'aisle', 'product_name']
master_df.groupby(hierarchy_levels, as_index=False).size()

In [None]:
############################################ Helper Function 3 ############################################################
############################################ VISUALIZE DATA ##############################################################

def CreateCharts (ax, data, x, y, chart_type, legend = False, size = 5, hue = None, palette=None):
    """Draw one seaborn chart on ``ax`` and return the resulting plot object.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    data : DataFrame holding the columns named by ``x``, ``y`` and ``hue``.
    x, y : column names for the axes (``y`` is ignored by "density" and "hist").
    chart_type : one of "scatter", "bar", "density", "swarm", "hist".
    legend, size : forwarded to ``sns.scatterplot`` only.
    hue : forwarded to the scatter, bar and swarm charts.
    palette : forwarded to every chart except "density".

    Returns
    -------
    Whatever the underlying seaborn function returns.

    Raises
    ------
    ValueError
        If ``chart_type`` is not one of the supported names.
    """
    if chart_type == "scatter":
        plot = sns.scatterplot(data=data, x=x, y=y, size=size, legend=legend,
          hue=hue, sizes=(20, 200), palette = palette, ax=ax)
    elif chart_type == "bar":
        # ci="sd": error bars show the standard deviation of the observations.
        plot = sns.barplot(x=x, y=y, data=data,
          hue=hue, palette = palette, ax=ax, ci="sd")
    elif chart_type == "density":
        # fill= is the non-deprecated spelling of the old shade= argument.
        plot = sns.kdeplot(x=x, data=data,
          fill=True, alpha=0.5, ax=ax)
    elif chart_type == "swarm":
        plot = sns.swarmplot(x=x, y=y, hue=hue, data=data,
                palette=palette, ax=ax)
    elif chart_type =='hist':
        plot = sns.histplot(data=data, x=x, kde=True, ax = ax, palette = palette )
    else:
        # Fail with a clear message instead of an UnboundLocalError on `plot`.
        raise ValueError(
            f"Unsupported chart_type {chart_type!r}; expected one of "
            "'scatter', 'bar', 'density', 'swarm', 'hist'"
        )
    return plot

In [None]:

def _plot_histogram(ax, frame, column, title, palette):
    """Draw a histogram (with KDE) of ``frame[column]`` on ``ax`` and set its title."""
    CreateCharts(ax, frame, x=column, y=None, chart_type="hist", palette=palette)
    ax.set_title(title, fontsize=10)


def _plot_total_vs_reordered(ax, frame, column, total_color, reordered_color, title):
    """Overlay row counts per ``column`` value for all rows and for reordered rows only."""
    total_counts = frame.groupby(column, as_index=False).size()
    reordered_counts = frame[frame['reordered'] == 1].groupby(column, as_index=False).size()
    # The reordered bars are drawn second, on top of the total bars.
    sns.barplot(x=column, y="size", data=total_counts, color=total_color, ax=ax)
    sns.barplot(x=column, y="size", data=reordered_counts, color=reordered_color, ax=ax)
    ax.legend(handles=[
        mpatches.Patch(color=total_color, label='total orders'),
        mpatches.Patch(color=reordered_color, label='reordered'),
    ])
    ax.set_title(title, fontsize=10)
    ax.set_ylabel('count')


def _plot_top_categories(ax, frame, column, value_col, how, palette, title, ylabel=None, top_n=25):
    """Bar chart of the ``top_n`` values of ``column`` ranked by ``value_col`` aggregated with ``how``."""
    ranked = (
        frame.groupby(column, as_index=False)[value_col]
        .agg(how)
        .sort_values(by=value_col, ascending=False)
        .head(top_n)
    )
    CreateCharts(ax, ranked, column, value_col, "bar", palette=palette)
    ax.set_xticklabels(ranked[column], rotation=90)
    ax.set_title(title, fontsize=10)
    ax.set_xlabel('')
    if ylabel is not None:
        ax.set_ylabel(ylabel)


sns.set(style="darkgrid")
fig, ax = plt.subplots(4,3, figsize=(25,15))

# Plot 1: orders per user
orders_per_user = (
    master_df[['user_id','order_id']]
    .drop_duplicates()
    .groupby('user_id').size()
    .reset_index(name='Number of Orders')
    .sort_values(by='Number of Orders', ascending=False)
)
_plot_histogram(ax[0,0], orders_per_user, "Number of Orders",
                'How many orders are made by a single user?', palette="bright")

# Plot 2: products per order
products_per_order = (
    master_df.groupby('order_id').size()
    .reset_index(name='Number of Products')
    .sort_values(by='Number of Products', ascending=False)
)
_plot_histogram(ax[0,1], products_per_order, "Number of Products",
                'How many products are in a single order?', palette="dark")

# Plots 3-5: total vs. reordered row counts by timing variable
_plot_total_vs_reordered(ax[0,2], master_df, 'days_since_prior_order', 'darkblue', 'lightblue',
                         'How many days have passed since the last order?')
_plot_total_vs_reordered(ax[1,0], master_df, 'order_dow', 'maroon', 'yellow',
                         'Which days have the most orders/reorders?')
_plot_total_vs_reordered(ax[1,1], master_df, 'order_hour_of_day', 'purple', 'lightgreen',
                         'Which hour of day has the most orders/reorders?')

# Plot 6: cart size per order (highest add_to_cart_order position in the order)
cart_size_per_order = master_df.groupby('order_id', as_index = False)['add_to_cart_order'].max()
_plot_histogram(ax[1,2], cart_size_per_order, "add_to_cart_order",
                'What is the cart size across orders?', palette="bright")

# Plots 7-9: most ordered products / aisles / departments (row counts)
_plot_top_categories(ax[2,0], master_df, 'product_name', 'product_id', 'count', "Set2",
                     'What are the most ordered products?', ylabel='count')
_plot_top_categories(ax[2,1], master_df, 'aisle', 'aisle_id', 'count', "Set2",
                     'Which aisle do users order the most from?', ylabel='count')
_plot_top_categories(ax[2,2], master_df, 'department', 'department_id', 'count', "Set2",
                     'Which department do users order the most from?', ylabel='count')

# Plots 10-12: most reordered products / aisles / departments (sum of the reordered flag)
_plot_top_categories(ax[3,0], master_df, 'product_name', 'reordered', 'sum', "hls",
                     'What are the most reordered products?')
_plot_top_categories(ax[3,1], master_df, 'aisle', 'reordered', 'sum', "hls",
                     'Which aisle has the most reorders?')
_plot_top_categories(ax[3,2], master_df, 'department', 'reordered', 'sum', "hls",
                     'Which department has the most reorders?')

# Large hspace leaves room for the rotated category labels under each row.
plt.subplots_adjust(left=0.125,
                    bottom=0.1,
                    right=0.9,
                    top=0.9,
                    wspace=0.2,
                    hspace=1.5)

plt.show()

In [None]:
# Correlation
# Restrict to numeric columns explicitly: DataFrame.corr() in recent pandas
# no longer drops text columns (product_name, aisle, ...) on its own and raises.
numeric_corr = master_df.select_dtypes(include='number').corr()

plt.figure(figsize=(10,8))
plt.title('Pearson Correlation of Features', size = 15)
colormap = sns.diverging_palette(10, 220, as_cmap = True)
sns.heatmap(numeric_corr,
            cmap = colormap,
            square = True,
            annot = True,
            linewidths=0.1,vmax=1.0, linecolor='white',
            annot_kws={'fontsize':12 })
plt.show()

## 5. Feature Engineering

Create 100 features using feature-engineering techniques.

In [None]:
############################################ Helper Function 4 ############################################################
################################################# RUN PCA #################################################################

def run_PCA(data, features, name):
    """Project ``data`` onto its first ``features`` principal components.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric table, one row per observation.
    features : int
        Number of principal components to keep.
    name : str
        Prefix for the component labels (``name0``, ``name1``, ...).

    Returns
    -------
    pandas.DataFrame
        Component scores, indexed like ``data``, one column per component.

    Side effects: plots the cumulative explained-variance ratio on the current
    matplotlib axes and prints the component loadings.

    NOTE(review): relies on ``PCA`` and ``scale`` being in scope; no import for
    them is visible in this notebook (presumably sklearn.decomposition.PCA and
    sklearn.preprocessing.scale) - confirm and import them in the import cell.
    """
    component_names = [name + str(i) for i in range(features)]
    pca = PCA(n_components=features)
    # Standardise first and fit on the standardised copy, so columns with
    # large counts do not dominate the components.
    scale_data = pd.DataFrame(scale(data), columns = data.columns, index = data.index)
    pca_output = pca.fit_transform(scale_data)
    df_pca = pd.DataFrame(data = pca_output, columns = component_names, index = data.index)
    plt.plot(pca.explained_variance_ratio_.cumsum(), linewidth=2)
    print("\nImportance of Components")
    # Label the loadings with the same component names as the returned columns.
    print(pd.DataFrame(data = pca.components_, columns = data.columns, index = component_names))
    return df_pca

In [None]:
# PCA to find product preference
# Count the rows each user has per aisle, spread the aisles into columns
# (0 where a user has no rows for an aisle), then reduce to 10 components.
aisle_counts = master_df.groupby(['user_id','aisle'], as_index=False).size()
aisle_matrix = aisle_counts.pivot_table(index='user_id', columns=['aisle'], values='size',
                                        aggfunc=np.sum, fill_value=0)
prod_pref = run_PCA(aisle_matrix, 10, "prod_pref")

# PCA to find day preference
# day_pref = pd.pivot_table(master_df, values = 'order_dow', index = 'user_id',
#                columns = ['department'], aggfunc=np.median, fill_value=-1)

# day_pref = run_PCA(day_pref, 10, "day_pref")

In [None]:
# Per-user row counts by department (rows: users, columns: departments).
dept_counts = master_df.groupby(['user_id','department'], as_index=False).size()
dept_pref = pd.pivot_table(dept_counts, values='size', index='user_id',
                    columns=['department'], aggfunc=np.sum, fill_value=0)

# Remember the department columns before adding 'total', so the share
# calculation below does not also produce a constant 'total_perc' column.
department_columns = list(dept_pref.columns)
dept_pref['total'] = dept_pref[department_columns].sum(axis = 1)

# Share of each user's rows that falls in each department ('<department>_perc').
dept_share = dept_pref[department_columns].div(dept_pref['total'], axis=0).add_suffix('_perc')
dept_pref = pd.concat([dept_pref, dept_share], axis=1)

dept_pref

In [None]:
# Create features to demonstrate trend of products preferred by users
model_input = prod_pref.merge(dept_pref, how='left', on='user_id')

# Create features to demonstrate trends of orders made by users
user_order_stats = (
    master_df.groupby('user_id', as_index = False)
    .agg({
        # master_df has one row per ordered product, so distinct orders must be
        # counted with nunique; 'count' would just repeat total_products.
        'order_id': 'nunique',
        'product_id': 'count',
        # NOTE(review): averaged over product rows, so bigger baskets weigh more - confirm intent.
        'days_since_prior_order': 'mean',
        # NOTE(review): median add-to-cart position over product rows, not the
        # median number of items per order - confirm that this is the intended feature.
        'add_to_cart_order': 'median',
    })
    .rename(columns={'order_id':'total_orders',
                     'product_id':'total_products',
                     'days_since_prior_order':'mean_days_since_prior_order',
                     'add_to_cart_order':'median_cart_size'})
)
model_input = model_input.merge(user_order_stats, how='left', on='user_id')
model_input = model_input.set_index('user_id')
model_input

In [None]:
## Make data suitable for chosen model
# Blanket warning suppression is kept so later cells behave as before.
warnings.filterwarnings('ignore')

# check for outliers: cap every feature at +/- (mean + 3 standard deviations)
# Skip columns created by a previous run of this cell so that re-running it
# does not pile up '*_capped_capped' columns.
base_columns = [col for col in model_input.columns if not col.endswith("_capped")]
for col in base_columns:
    values = model_input[col]
    cap_val = (np.mean(values) + 3*np.std(values))
    non_negative = values >= 0
    negative = values < 0
    # Build the capped column in one go and assign it once. Chained assignment
    # (df[col][mask] = ...) can write to a temporary copy and is a no-op under
    # pandas Copy-on-Write. Rows matching neither mask (NaN) get 0.
    model_input[col+"_capped"] = np.select(
        [non_negative, negative],
        [np.minimum(cap_val, values), np.maximum((-1)*cap_val, values)],
        default=0.0,
    )
    print("\nOutliers", col)
    print ("cap: ", cap_val)
    print ("capped positive:", int((non_negative & (values > cap_val)).sum()))
    print ("capped negative:", int((negative & (values < (-1)*cap_val)).sum()))


# normalize data
# NOTE(review): no import of StandardScaler is visible in this notebook
# (presumably sklearn.preprocessing.StandardScaler) - confirm and import it.
scaler = StandardScaler()
model_input_scaled = pd.DataFrame(scaler.fit_transform(model_input),columns = model_input.columns, index = model_input.index)

In [None]:
# final dataset
# Capped PCA components, raw department shares, and capped per-user order statistics.
pca_features = [f"prod_pref{i}_capped" for i in range(10)]

department_names = (
    'alcohol', 'babies', 'bakery', 'beverages', 'breakfast', 'bulk',
    'canned goods', 'dairy eggs', 'deli', 'dry goods pasta', 'frozen',
    'household', 'international', 'meat seafood', 'missing', 'other',
    'pantry', 'personal care', 'pets', 'produce', 'snacks',
)
department_share_features = [f"{department}_perc" for department in department_names]

order_features = [
    'total_orders_capped',
    'total_products_capped',
    'mean_days_since_prior_order_capped',
    'median_cart_size_capped',
]

X = model_input_scaled[pca_features + department_share_features + order_features]

# sns.pairplot(X)