In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Hi folks, welcome to another Kaggle notebook. Today I will be doing another extensive exploratory data analysis project from scratch. I found this lovely Instacart dataset just a few moments before launching this kernel. As with all my EDA projects, my goal is to figure out a way to tell some sort of story from the data. The data is extensive, so I think we can have a lot of fun with it.

*Disclaimer: I try to write in a humorous way for myself, so please don't take my writing style too seriously.*

In [None]:
#Importing the holy trinity of data science packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Other Visualization Packages
import seaborn as sns

In [None]:
# Load the Instacart CSV files from the Kaggle input directory.
# The directory is kept in one place so every read below stays consistent.
DATA_DIR = "/kaggle/input/instacart-market-basket-analysis"

# Orders plus the product / aisle / department lookup tables.
df_orders = pd.read_csv(f"{DATA_DIR}/orders.csv")
df_products = pd.read_csv(f"{DATA_DIR}/products.csv")
df_aisles = pd.read_csv(f"{DATA_DIR}/aisles.csv")
df_dept = pd.read_csv(f"{DATA_DIR}/departments.csv")

# Order-to-product line items for the train and prior splits.
# The train frame must come from order_products__train.csv; reading
# departments.csv here would just duplicate df_dept.
df_ord_pro_train = pd.read_csv(f"{DATA_DIR}/order_products__train.csv")
df_ord_pro_prior = pd.read_csv(f"{DATA_DIR}/order_products__prior.csv")

Great, now that we have loaded our data, let's take a look and see what we could do with these datasets.

In [None]:
# Preview the first five rows of the orders table.
df_orders.head(n=5)

This looks useful. We can ask the following questions:
1. What are the peak hours of the day for orders?
2. How often do people order again on Instacart?
3. On which days do people most often order their food?

In [None]:
# Preview the first five rows of the products table.
df_products.head(n=5)

In [None]:
# Preview the first five rows of the aisles table.
df_aisles.head(n=5)

In [None]:
# Preview the first five rows of the departments table.
df_dept.head(n=5)

For the last three datasets, we could amend `df_products` so that it shows the aisle and department name for each product. That would be easier to read than bare ID numbers. However, when modeling, numeric IDs are better to work with than strings.

In [None]:
# Preview the first five rows of the train order-products frame.
df_ord_pro_train.head(n=5)

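Seems like a repeated data set? Possibly. If this preview matches the departments table shown above, the file path used to load `df_ord_pro_train` is worth double-checking.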

In [None]:
# Preview the first five rows of the prior order-products frame.
df_ord_pro_prior.head(n=5)

Great, we can create a basket dataset from this. 

# Part 1 - Exploratory Data Analysis (EDA)

Let's start by answering these questions. 
1. What are the peak hours of the day for orders?
2. How often do people order again on Instacart? 
3. Which days do people most often order their food? 

In [None]:
# Bar chart of how many orders fall into each hour of the day.
fig, ax = plt.subplots(figsize=(14, 7))
sns.countplot(data=df_orders, x='order_hour_of_day', ax=ax)
ax.set_title('Number of Orders Taken by Hour of the Day.')
ax.set_ylabel('Count')
ax.set_xlabel('Hour')
plt.show()

**Interesting to note:**
* It seems most Instacart orders happen between 9am and 5pm, i.e. normal working hours. I'm shocked; I expected more folks to order in the evenings after work, because I normally place my Instacart orders in the evening, usually between 9pm and 11pm.

In [None]:
# Bar chart of how many orders fall on each day-of-week code.
fig, ax = plt.subplots(figsize=(14, 7))
sns.countplot(data=df_orders, x='order_dow', ax=ax)
ax.set_title('Number of Orders Taken by Day of the Week.')
ax.set_ylabel('Count')
ax.set_xlabel('Day')
plt.show()

**Interesting to Note:**
* Assuming 0 is Sunday. 
* Makes sense that most of the orders are placed on Sunday and Monday: it's the end/beginning of one's week and your food may have run out. Time to order. 
    * After all, people who use Instacart are most likely placing an order when the food in their fridge runs out. Well, I'm basing this insight on myself: I just order when I run out of food, which is usually at the end of the week. I open my fridge, realize I have no food, place that Instacart order, and then order Uber Eats.

In [None]:
# Count orders for every (hour of day, day of week) pair, then reshape so that
# hours are rows and day-of-week codes are columns.
# pivot() is called with keyword arguments: pandas 2.0 made index/columns/values
# keyword-only, so the positional form raises TypeError there.
agg_dow_hour = (
    df_orders
    .groupby(['order_hour_of_day', 'order_dow'])['order_number']
    .count()
    .reset_index()
    .pivot(index='order_hour_of_day', columns='order_dow', values='order_number')
)

# Heatmap of the hour x day-of-week order counts.
fig, ax = plt.subplots(figsize=(14, 7))
sns.heatmap(agg_dow_hour, ax=ax)
ax.set_title('Heatmap of orders for Hour of the day Vs. Day of the Week')
plt.show()

**Interesting to Note**
* We can see most orders happen on Sunday afternoons and Monday mornings, assuming Day 0 is Sunday and Day 1 is Monday.

In [None]:
# Bar chart of how many orders were placed N days after the previous order.
fig, ax = plt.subplots(figsize=(14, 7))
sns.countplot(data=df_orders, x='days_since_prior_order', ax=ax)
ax.set_title('Days Since Prior Instacart Order')
ax.set_ylabel('Count')
ax.set_xlabel('Days')
plt.show()

**Interesting to Note** 
* I'm shocked that there were quite a few orders within the first 3 days. Did people forget to add something to their Instacart order? Or do they just have smaller baskets instead of larger ones?
* The 7-day mark makes the most sense, since I usually place my Instacart order every 7 days.
* Also note the mini weekly peaks on days 14, 21, and 28, which is interesting to me.
* Let's be aware that day 30 has a spike; this is probably a limitation of the data, with all gaps longer than 30 days lumped into day 30.

# Part 2 - Data Manipulation

Now let us join some of the different dataframes together to make them easier to work with.

What are we going to do? 
1. Merge df_products, df_aisles, and df_dept into df_ord_pro_prior

In [None]:
# Left-join the product, aisle, and department tables onto every prior order
# line, then drop the three ID columns that were only needed as join keys.
# NOTE: this overwrites df_ord_pro_prior and removes product_id, so re-running
# the cell without reloading the data fails on the first merge.
df_ord_pro_prior = (
    df_ord_pro_prior
    .merge(df_products, how='left', on='product_id')
    .merge(df_aisles, how='left', on='aisle_id')
    .merge(df_dept, how='left', on='department_id')
    .drop(columns=['product_id', 'aisle_id', 'department_id'])
)

# Work on an independent copy from here on.
df_2 = df_ord_pro_prior.copy()

df_2.head(10)

This is more like it! :) 

Now let us look into: 
1. Top 15 products that people are ordering
2. Top aisles people are shopping
3. Top departments too

In [None]:
# Report how many distinct product names appear in the merged order lines.
print(f'There are {df_2.product_name.nunique()} unique products sold on Instacart! Wow!')

In [None]:
# value_counts() sorts by frequency (descending), so the first 15 entries are
# the 15 most frequently ordered product names.
top15_products = df_2['product_name'].value_counts().head(15)

In [None]:
# Bar chart of the 15 most frequently ordered products.
fig, ax = plt.subplots(figsize=(14, 7))
top15_products.plot.bar(ax=ax, color='limegreen')
ax.set_title('Top 15 Products sold on Instacart', fontsize=20)
ax.set_ylabel('Count')
ax.set_xlabel('Product Name')
# Tilt the product names so the long labels stay readable.
plt.xticks(rotation=30)
plt.show()

**Interesting to Note:**
* This is really cool, folks love their bananas. 
* Lots of fruit, lots of organic food buyers
* Maybe those folks who use Instacart are generally more health-conscious. Or maybe people in general just consume lots of fruit.
* Also, folks generally buy multiple fruits at one time; for example, I would buy 6 bananas, which eventually amounts to a bunch of bananas.

In [None]:
# value_counts() sorts by frequency (descending), so the first 15 entries are
# the 15 aisles that appear most often in the order lines.
top15_aisles = df_2['aisle'].value_counts().head(15)

In [None]:
# Bar chart of the 15 most frequently shopped aisles.
fig, ax = plt.subplots(figsize=(14, 6))
# x and y are passed by keyword: seaborn 0.12+ made them keyword-only, so the
# positional form raises TypeError there.
sns.barplot(x=top15_aisles.index, y=top15_aisles.values, color='deepskyblue', ax=ax)
# Turn off scientific notation on the y-axis so the counts are shown in full.
ax.yaxis.get_major_formatter().set_scientific(False)
ax.set_title('Top 15 Aisles Shopped on Instacart', fontsize=20)
ax.set_ylabel('Count')
ax.set_xlabel('Aisle Name')
# Tilt the aisle names so the long labels stay readable.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Number of order lines per department, most frequent first.
dept_fre_count = df_2['department'].value_counts()
# Each department's share of all order lines, expressed as a percentage.
dept_percentage = (dept_fre_count / dept_fre_count.sum()).to_numpy() * 100
# Department labels, in the same order as the percentages.
dept_name = dept_fre_count.index.to_numpy()


In [None]:
# Pie chart of each department's share of order lines, labelled with the
# department name and its percentage to one decimal place.
fig, ax = plt.subplots(figsize=(12, 12))
ax.pie(dept_percentage, labels=dept_name, autopct='%1.1f%%')
ax.set_title('Pie Chart for the various departments')
plt.show()