In [None]:
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os
sns.set()
%matplotlib inline

In [None]:
# Load every CSV in ../input/ into a notebook-level DataFrame named after the
# file stem (orders.csv -> `orders`) and keep the list of loaded names in `df`.
INPUT_DIR = '../input/'
df = []
for file in sorted(os.listdir(INPUT_DIR)):
    name, extension = os.path.splitext(file)
    # Skip non-CSV entries instead of blindly stripping the last four characters.
    if extension.lower() != '.csv':
        continue
    # Bind the frame by name directly; exec-ing generated source breaks (or runs
    # unintended code) when a file name contains quotes or other odd characters.
    globals()[name] = pd.read_csv(os.path.join(INPUT_DIR, file))
    df.append(name)
print(df)

In [None]:
# Copy the two order-product frames under single-underscore names; the loaded
# originals stay bound to their double-underscore names.
order_products_prior, order_products_train = (
    raw_frame.copy()
    for raw_frame in (order_products__prior, order_products__train)
)

In [None]:
# Preview the first rows of the orders table.
orders.head()

In [None]:
# Summary statistics for the columns of the orders table.
orders.describe()

orders.csv has information about orders: user_id of purchaser, order day, order time, days since prior order.


In [None]:
# Number of order rows for each distinct eval_set value.
orders['eval_set'].value_counts()

The 'eval_set' column in orders.csv has three unique values. It indicates whether the row belongs to the train, test or prior set of orders.

In [None]:
# Number of distinct customers in each eval_set split.
# nunique() is the built-in groupby reduction for this; it avoids a Python-level
# lambda per group.
orders.groupby('eval_set')['user_id'].nunique()

There are a total of 206209 customers and their orders are split as 131209 for 
training and 75000 for testing

In [None]:
# Orders placed per user, then the number of users at each order count.
orders_per_user = orders.groupby('user_id')['order_number'].count()
order_counts_by_user = orders_per_user.value_counts()

plt.figure(figsize=(24,12))
# seaborn >= 0.12 no longer accepts x/y positionally, so pass them by keyword.
sns.barplot(x=order_counts_by_user.index, y=order_counts_by_user.values)

plt.ylabel('Counts', fontsize=20)
plt.xlabel('Number of Orders', fontsize=20)
plt.xticks(rotation=90, fontsize=16)
plt.show()

In [None]:
# Add a new column with weekday labels for plots.
# NOTE(review): the mapping treats order_dow 0 as Saturday; this is carried over
# unchanged from the original cell - confirm against the dataset documentation.
weekday_labels = {0:'Sat', 1:'Sun', 2:'Mon', 3:'Tue', 4:'Wed', 5:'Thu', 6:'Fri'}
orders['Weekday'] = orders['order_dow'].map(weekday_labels)

# Plot to show how orders are distributed over the days of week.
plt.figure(figsize=(12,8))
# Pass x by keyword (seaborn >= 0.12 rejects positional data arguments) and fix
# the bar order with `order=` instead of sorting the entire orders frame.
sns.countplot(
    x='Weekday',
    data=orders,
    order=[weekday_labels[dow] for dow in sorted(weekday_labels)],
    color='green',
)
plt.ylabel('Counts', fontsize=22)
plt.xlabel('Day of Week', fontsize=22)
plt.title('Number of Orders vs Day of Week', fontsize=30)
plt.yticks(fontsize=16)
plt.xticks(rotation=90, fontsize=16)
plt.show()

Saturdays and Sundays have the most orders
Tuesdays and Wednesdays have the least orders

In [None]:
# Plot to show how orders are distributed over the hours of day.
plt.figure(figsize=(12,10))
# x must be passed by keyword: seaborn >= 0.12 rejects positional data arguments.
sns.countplot(x='order_hour_of_day', data=orders, color='blue', alpha=0.5)
plt.ylabel('Counts', fontsize=22)
plt.xlabel('Hour of Day', fontsize=22)
plt.title('Number of Orders vs Hour of Day', fontsize=30)
plt.yticks(fontsize=16)
plt.xticks(rotation=90, fontsize=16)
# Red guides bracketing the busiest window. countplot draws categories at
# integer positions, so 10 and 16 line up with those hours assuming every hour
# from 0 upward is present in the data - TODO confirm.
plt.axvline(10, color='red', linewidth=3)
plt.axvline(16, color='red', linewidth=3)
plt.show()

Most orders are placed between 10 AM and 4 PM.

In [None]:
# Order counts for every (weekday, hour) combination, reshaped to a
# weekday x hour grid for the heatmap.
weekday_order = ['Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri']
grouped = orders.groupby(['Weekday', 'order_hour_of_day'])['order_number'].aggregate('count').reset_index()
# pivot() takes keyword-only arguments from pandas 2.0 onwards.
grouped = grouped.pivot(index='Weekday', columns='order_hour_of_day', values='order_number')
# Rows come out of pivot() sorted alphabetically; put them back in the same
# weekday order the 'Weekday' labels were assigned in (order_dow 0..6).
grouped = grouped.reindex(weekday_order)
plt.figure(figsize=(12,10))
sns.heatmap(grouped, cmap='BuGn' )
plt.xlabel('Hour of Day', fontsize=16)
plt.ylabel('Day of Week', fontsize=16)
plt.yticks(rotation=0)
plt.show()

Saturday afternoons and Sunday mornings see a large number of orders.

In [None]:
# Distribution of the gap (in days) between an order and the previous one.
plt.figure(figsize=(12,8))
# Pass the column by keyword; seaborn >= 0.12 rejects positional data arguments.
sns.countplot(x='days_since_prior_order', data=orders, color='purple', alpha=0.3)
plt.ylabel('Counts', fontsize=16)
plt.xlabel('Days Since Previous Order', fontsize=16)
plt.xticks(rotation=90)
plt.show()

The most common gap between consecutive orders is 30 days.  
The next most common gap is 7 days.

The dataframe order_products_prior should be merged with aisles, products and departments

In [None]:
# Preview the first rows of the aisles lookup table.
aisles.head()

In [None]:
# Preview the first rows of the products lookup table.
products.head()

In [None]:
# Preview the first rows of the departments lookup table.
departments.head()

In [None]:
# Preview the first rows of the prior order-product table before merging.
order_products_prior.head()

In [None]:
# Attach product, aisle and department details to the order-product tables.
# The three left joins (on product_id, aisle_id, department_id) live in the
# merge() helper so the prior and train frames are processed identically.


def merge(df, products_df=None, aisles_df=None, departments_df=None):
    """Attach product, aisle and department columns to an order-product frame.

    Performs three successive left joins: `df` with the products table on
    'product_id', then with the aisles table on 'aisle_id', then with the
    departments table on 'department_id'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'product_id' column.
    products_df, aisles_df, departments_df : pandas.DataFrame, optional
        Lookup tables to join. When omitted, the notebook-level `products`,
        `aisles` and `departments` frames are used, as before.

    Returns
    -------
    pandas.DataFrame
        A new frame with the lookup columns appended; `df` is not modified.

    Raises
    ------
    pandas.errors.MergeError
        If a lookup table has duplicate join keys, which would otherwise
        silently duplicate rows of `df`.
    """
    # Resolve the defaults lazily so the globals are only touched when needed.
    products_df = products if products_df is None else products_df
    aisles_df = aisles if aisles_df is None else aisles_df
    departments_df = departments if departments_df is None else departments_df

    # validate='many_to_one' makes each join fail loudly on a non-unique key
    # in the lookup table instead of inflating the row count.
    df = pd.merge(df, products_df, on='product_id', how='left', validate='many_to_one')
    df = pd.merge(df, aisles_df, on='aisle_id', how='left', validate='many_to_one')
    df = pd.merge(df, departments_df, on='department_id', how='left', validate='many_to_one')
    return df

# Always merge from the raw loaded frames so this cell can be re-run safely.
# Merging an already-merged frame again would produce suffixed duplicate
# columns (aisle_id_x / aisle_id_y) and then fail on the 'aisle_id' join.
order_products_prior = merge(order_products__prior)
order_products_train = merge(order_products__train)

In [None]:
def high_frequency_plot(col, data=None, top_n=25):
    """Draw a bar chart of the most frequent values in a column.

    Parameters
    ----------
    col : str
        Name of the column whose values are counted.
    data : pandas.DataFrame, optional
        Frame to read `col` from. Defaults to the notebook-level
        `order_products_prior` frame, matching the original behaviour.
    top_n : int, optional
        Number of most frequent values to show (default 25).

    Returns
    -------
    None
        The figure is displayed with plt.show().
    """
    frame = order_products_prior if data is None else data
    plt.figure(figsize=(14,10))
    # value_counts() already returns counts in descending order, so no extra
    # sort is needed before taking the head.
    frame[col].value_counts().head(top_n).plot(kind='bar')
    plt.title('Frequency distribution', fontsize=20)
    plt.xticks(fontsize=18)
    plt.ylabel('Counts', fontsize=18)
    plt.show()

In [None]:
# Bar chart of the 25 most frequent product_name values in order_products_prior.
high_frequency_plot('product_name')

## Organic fruits and vegetables are the most ordered items

In [None]:
# Bar chart of the 25 most frequent aisle values in order_products_prior.
high_frequency_plot('aisle')

## Fresh fruits and vegetables aisles have the highest frequency just as was observed in the 'product_name' plot

In [None]:
# Bar chart of the most frequent department values (at most 25) in order_products_prior.
high_frequency_plot('department')

## Produce department had the highest orders, followed by dairy & eggs department.  Dairy & Eggs were consolidated into one department.  "Missing" is also listed among the top 20 departments.  It will need to be addressed.

In [None]:
# Share of order lines for the ten most frequent departments.
department_counts = order_products_prior['department'].value_counts().head(10)
# Name the count column explicitly: wrapping value_counts() in pd.DataFrame
# yields a column called 'department' on older pandas but 'count' on
# pandas >= 2.0, so looking it up by the old name raises KeyError there.
temp_df = pd.DataFrame({'department': department_counts.values}, index=department_counts.index)
temp_df['fraction_of_total'] = temp_df['department']/temp_df['department'].sum()*100
plt.figure(figsize=(10,10))
plt.pie(temp_df['fraction_of_total'], labels=temp_df.index, autopct='%1.1f%%')
plt.title('Department Distribution', fontsize=18)
plt.show()

Top 10 Department distribution shown as a pie chart

## Let's explore how products, aisles, and departments affect the reorder ratio

In [None]:
# Mean of the 'reordered' flag per product (the reorder ratio), highest first.
temp_df = order_products_prior.groupby('product_name')['reordered'].mean().sort_values(ascending=False)
top_products = temp_df.head(20)

plt.figure(figsize=(12,10))
# seaborn >= 0.12 no longer accepts x/y positionally, so pass them by keyword.
sns.pointplot(x=top_products.index, y=top_products.values)
plt.xlabel('Products', fontsize=18)
plt.ylabel('Reorder Ratio', fontsize=18)
plt.xticks(rotation=90,fontsize=12)
plt.show()

# Products whose reorder ratio is exactly zero.
print('Number of products never reordered = ', (temp_df==0.0).sum())

## Raw veggie wrappers, Overnight pads, Energy Shots, Chocolate Love Bars and Soy Protein Infant formula are the five products with the highest reorder ratio
## 4,372 products were never reordered

In [None]:
# Mean of the 'reordered' flag per aisle (the reorder ratio), highest first.
temp_df = order_products_prior.groupby('aisle')['reordered'].mean().sort_values(ascending=False)
top_aisles = temp_df.head(50)

plt.figure(figsize=(12,10))
# seaborn >= 0.12 no longer accepts x/y positionally, so pass them by keyword.
sns.pointplot(x=top_aisles.index, y=top_aisles.values)
plt.xlabel('Aisles', fontsize=18)
plt.ylabel('Reorder Ratio', fontsize=18)
plt.xticks(rotation=90,fontsize=12)
plt.show()

In [None]:
# Mean of the 'reordered' flag per department (the reorder ratio), highest first.
temp_df = order_products_prior.groupby('department')['reordered'].mean().sort_values(ascending=False)

plt.figure(figsize=(12,10))
# seaborn >= 0.12 no longer accepts x/y positionally, so pass them by keyword.
sns.pointplot(x=temp_df.index, y=temp_df.values)
plt.xlabel('department', fontsize=18)
plt.ylabel('Reorder Ratio', fontsize=18)
plt.xticks(rotation=90,fontsize=12)
plt.show()