 # Instacart Market Basket Analysis

Let's Start with Importing the Modules 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import os
color = sns.color_palette()

%matplotlib inline

List the files present in the input directory along with their sizes.

In [None]:
# Folder that holds the Instacart data files.
INPUT_FOLDER='/Users/pd186040/Documents/Kaggle/Instacart/'
print ('File Sizes:')
# Sort the listing so the report order is stable (os.listdir order is arbitrary).
for f in sorted(os.listdir(INPUT_FOLDER)):
    # Skip the zipped archives; only the extracted files are reported.
    if 'zip' not in f:
        # os.path.join works whether or not INPUT_FOLDER ends with a separator.
        size_kb = round(os.path.getsize(os.path.join(INPUT_FOLDER, f)) / 1000, 2)
        print (f.ljust(30) + str(size_kb) + ' KB')

Let us first read all the files as dataframe objects and then look at the top few rows.

In [None]:
# Load every table from INPUT_FOLDER so the data location is configured in one place
# instead of being repeated as an absolute path on each line.
order_products_train_df = pd.read_csv(os.path.join(INPUT_FOLDER, "order_products__train.csv"))
order_products_prior_df = pd.read_csv(os.path.join(INPUT_FOLDER, "order_products__prior.csv"))
orders_df = pd.read_csv(os.path.join(INPUT_FOLDER, "orders.csv"))
products_df = pd.read_csv(os.path.join(INPUT_FOLDER, "products.csv"))
aisles_df = pd.read_csv(os.path.join(INPUT_FOLDER, "aisles.csv"))
departments_df = pd.read_csv(os.path.join(INPUT_FOLDER, "departments.csv"))

In [None]:
# Report the (rows, columns) shape of the orders table.
print("The orders_df size is :", orders_df.shape)

In [None]:
# Preview the first 20 rows of the orders table.
orders_df.head(20)

As we can see, orders.csv has all the information about a given order id, such as the user who placed the order, when it was placed, the number of days since the prior order, and so on.

We can also note that orders.csv has a column called eval_set, which tells us which of the three datasets (prior, train or test) the given row belongs to.

In [None]:
# Report the (rows, columns) shape of the prior order-products table.
print("The order_products_prior_df size is : ", order_products_prior_df.shape)

In [None]:
# Preview the first rows of the prior order-products table.
order_products_prior_df.head()

In [None]:
# Report the (rows, columns) shape of the train order-products table.
print("The order_products_train_df size is : ", order_products_train_df.shape)

In [None]:
# Preview the first rows of the train order-products table.
order_products_train_df.head()

The columns present in order_products_train and order_products_prior are the same. So what is the difference between these files?

As mentioned earlier, in this dataset, 4 to 100 orders of each customer are given (we will verify this later) and we need to predict the products that will be re-ordered. So the last order of each user has been taken out and divided into train and test sets. All the prior order information of the customer is present in the order_products_prior file.

In [None]:
# Report the (rows, columns) shape of the products table.
print("The products_df size is :", products_df.shape)

In [None]:
# Preview the first rows of the products table.
products_df.head()

In [None]:
# Report the (rows, columns) shape of the aisles table.
print("The aisles_df size is :", aisles_df.shape)

In [None]:
# Preview the first rows of the aisles table.
aisles_df.head()

In [None]:
# Report the (rows, columns) shape of the departments table.
print("The departments_df size is :", departments_df.shape)

In [None]:
# Preview the first rows of the departments table.
departments_df.head()

# Data Cleaning

In [None]:
#checking for missing values
total=orders_df.isnull().sum()
total

In [None]:
#checking for the percentage
percentage=total/orders_df.isnull().count()
percentage

In [None]:
missing_value_table_orders = pd.concat([total,percentage],keys=['Total','Percentage'],axis=1)
missing_value_table_orders

We can see that only about 6% of the values in the days_since_prior_order column are null, so we can exclude those rows and use the rest of the data.

In [None]:
# Keep only the orders that have a days_since_prior_order value.
orders_df_new = orders_df.dropna(subset=['days_since_prior_order'])
orders_df_new.head()

Similarly, we check for missing values for all the other 5 data sets to clean the data.

In [None]:
#aisles
total_a=aisles_df.isnull().count()
total_a

In [None]:
percentage_a=total_a/aisles_df.isnull().count()
percentage_a

In [None]:
missing_value_table_aisles = pd.concat([total_a, percentage_a],keys=['Total','Percentage'],axis=1)
missing_value_table_aisles

In [None]:
#departments
total_d=departments_df.isnull().count()
total_d

In [None]:
percentage_d=total_d/departments_df.isnull().count()
percentage_d

In [None]:
missing_value_table_departments = pd.concat([total_d,percentage_d],keys=['Total','Percentage'],axis=1)
missing_value_table_departments

In [None]:
#orders_prior
total_order_p_p=order_products_prior_df.isnull().sum()
total_order_p_p

In [None]:
percentage_order_p_p=total_order_p_p/order_products_prior_df.isnull().count()
percentage_order_p_p

In [None]:
missing_value_table_order_p_p = pd.concat([total_order_p_p,percentage_order_p_p],keys=['Total','Percentage'],axis=1)
missing_value_table_order_p_p

In [None]:
#order_train
total_order_train=order_products_train_df.isnull().sum()
total_order_train

In [None]:
percentage_order_train=total_order_train/order_products_train_df.isnull().count()
percentage_order_train

In [None]:
missing_value_table_order_train = pd.concat([total_order_train,percentage_order_train],keys=['Total','Percentage'],axis=1)
missing_value_table_order_train

In [None]:
#products
total_products=products_df.isnull().sum()
total_products

In [None]:
percentage_products=total_products/products_df.isnull().count()
percentage_products

In [None]:
missing_value_table_products = pd.concat([total_products,percentage_products],keys=['Total','Percentage'],axis=1)
missing_value_table_products


Looking at the other 5 data sets, we see that there are no missing values, and hence we conclude the data cleaning process.

# Exploratory Data Analysis & Data Visualization



Let us first get the number of unique customers in each of the three sets.

In [None]:
def get_unique_count(x):
    """Return how many distinct values ``x`` contains."""
    distinct_values = np.unique(x)
    return distinct_values.size


# Number of distinct user_id values within each eval_set group.
users_by_eval_set = orders_df.groupby("eval_set")["user_id"]
cnt_eval = users_by_eval_set.aggregate(get_unique_count)
cnt_eval

In [None]:
# Bar chart of the number of unique customers in each eval_set.
plt.figure(figsize=(12,8))
# seaborn 0.12+ accepts x and y only as keyword arguments.
sns.barplot(x=cnt_eval.index, y=cnt_eval.values, alpha=0.8, color=color[1])
# cnt_eval holds unique user_id counts, so label the chart as customers, not rows.
plt.ylabel('Number of unique customers', fontsize=12)
plt.xlabel('Eval set type', fontsize=12)
plt.title('Count of unique customers in each dataset', fontsize=15)
plt.xticks(rotation='vertical')
plt.show()

So there are 206,209 customers in total. Of these, the last purchase of 131,209 customers is given as the train set, and we need to predict for the remaining 75,000 customers.

In [None]:
count=orders_df['eval_set'].value_counts()
count

In [None]:
plt.figure(figsize=(12,8))
sns.barplot(count.index, count.values)
plt.ylabel('Number of Occurrences in the dataset', fontsize=14)
plt.xlabel('Evaluation set type', fontsize=14)
plt.title('Eval_set breakdown in orders dataset', fontsize=16)

Now let us validate the claim that 4 to 100 orders of a customer are given.

In [None]:
cnt_orders = orders_df.groupby("user_id")["order_number"].aggregate(np.max).reset_index()
cnt_orders = cnt_orders.order_number.value_counts()


In [None]:
plt.figure(figsize=(12,8))
sns.barplot(cnt_orders.index, cnt_orders.values, color=color[4])
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Maximum order number', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()

So no customer has fewer than 4 orders, and the maximum is 100, as stated on the data page.

# Time of Order

Time at which people usually order products.

# Days of Orders in a week:

In [None]:
# Distribution of orders over the order_dow values.
plt.figure(figsize=(12,8))
dow_axes = sns.countplot(data=orders_df, x="order_dow", color=color[3])
dow_axes.set_ylabel('Count', fontsize=12)
dow_axes.set_xlabel('Day of week', fontsize=12)
dow_axes.set_title("Frequency of order by week day", fontsize=15)
plt.show()

It seems that 0 and 1 are Saturday and Sunday, when the number of orders is high.

# Hour of Order in a Day:

In [None]:
# Distribution of orders over the order_hour_of_day values.
plt.figure(figsize=(12,8))
hour_axes = sns.countplot(data=orders_df, x="order_hour_of_day", color=color[5])
hour_axes.set_ylabel('Count', fontsize=12)
hour_axes.set_xlabel('Hour of day', fontsize=12)
hour_axes.set_title("Frequency of order by hour of day", fontsize=15)
plt.show()

People mostly order between 8 and 19 (probably between 8 a.m and 7 p.m.)

 Now let us combine the day of week and hour of day to see the distribution.

In [None]:
grouped_df = orders_df.groupby(["order_dow", "order_hour_of_day"])["order_number"].aggregate("count").reset_index()
grouped_df.head()

In [None]:
grouped_df = grouped_df.pivot('order_dow', 'order_hour_of_day', 'order_number')
grouped_df

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(grouped_df)
plt.title("Frequency of Day of week Vs Hour of day")
plt.show()

It seems Saturday evenings and Sunday mornings are the prime time for orders.

In [None]:
# Distribution of orders over the days_since_prior_order values.
plt.figure(figsize=(12,8))
gap_axes = sns.countplot(data=orders_df, x="days_since_prior_order", color=color[2])
gap_axes.set_ylabel('Count', fontsize=12)
gap_axes.set_xlabel('Days since prior order', fontsize=12)
gap_axes.set_title("Frequency distribution by days since prior order", fontsize=15)
plt.show()

From this plot we can see a spike at day 7, followed by relatively small peaks at days 14, 21 and 28, which indicates that many customers order weekly. There is also a large peak at the end of the month, indicating a monthly ordering pattern.

Since our objective is to figure out the re-orders, let us check out the re-order percentage in prior set and train set.

In [None]:
# percentage of re-orders in orders_products_prior
print("Percent of reorders in prior set:") 
print(order_products_prior_df.reordered.sum() / len(order_products_prior_df))

In [None]:
# percentage of re-orders in orders_products_train
print("Percent of reorders in train set:") 
print(order_products_train_df.reordered.sum() / len(order_products_train_df))

On an average, about 59% of the products in an order are re-ordered products

Now let us merge these product details with the order_prior details.

In [None]:
#merging order_products_prior and products
order_products_prior_df_merged = pd.merge(order_products_prior_df, products_df, on='product_id', how='left')

#merging op_merged with aisles
order_products_prior_df_merged = pd.merge(order_products_prior_df_merged, aisles_df, on='aisle_id', how='left')

#merging the new op_prior_merged with departments
order_products_prior_df_merged = pd.merge(order_products_prior_df_merged, departments_df, on='department_id', how='left')

In [None]:
order_products_prior_df_merged.head()

# Most ordered Products
Now let's identify which products are ordered the most.

In [None]:
cnt_srs = order_products_prior_df_merged['product_name'].value_counts().reset_index().head(10)
cnt_srs.columns = ['product_name', 'frequency_count']
cnt_srs

In [None]:
cnt_srs = cnt_srs.groupby(['product_name']).sum()['frequency_count'].sort_values(ascending=False)
sns.set_style('darkgrid')
f, ax = plt.subplots(figsize=(12, 10))
sns.barplot(cnt_srs.index, cnt_srs.values)
plt.xticks(rotation='vertical')
plt.ylabel('Number of Reorders', fontsize=13)
plt.xlabel('Most ordered Products', fontsize=13)
plt.show()

# Aisles:

Now let us look at the important aisles.

In [None]:
cnt_aisle = order_products_prior_df_merged['aisle'].value_counts().head(20)
cnt_aisle

In [None]:
plt.figure(figsize=(12,8))
sns.barplot(cnt_aisle.index, cnt_aisle.values, alpha=0.8, color=color[5])
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Aisle', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()

From this graph we can see that the fresh food and fresh vegetables aisles are the most frequently visited. We can do the same analysis for department

# Department Distribution:

Let us now check the department wise distribution.

In [None]:
cnt_aisle = order_products_prior_df_merged['department'].value_counts().head(20)
cnt_aisle

In [None]:
plt.figure(figsize=(12,8))
sns.barplot(cnt_aisle.index, cnt_aisle.values, alpha=0.8, color=color[2])
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Departments', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()

From the graph we can see that the department wise frequency is more for produce which aligns with the aisles frequency and then for dairy eggs.

# Most important Aisles in each Department

In [None]:
# Number of ordered items per (department, aisle) pair.
# Passing a {'new_name': func} dict to a column groupby raises SpecificationError in
# modern pandas, so count first and name the result column while resetting the index.
grouped = (
    order_products_prior_df_merged
    .groupby(["department", "aisle"])["product_id"]
    .count()
    .reset_index(name='Total_products')
)
grouped.sort_values(by='Total_products', ascending=False, inplace=True)
grouped.head()

In [None]:
# One bar chart per department, showing the item count of each of its aisles.
fig, axes = plt.subplots(7,3, figsize=(20,45), gridspec_kw =  dict(hspace=1.4))
# Group by the bare column name: with a one-element list, newer pandas yields 1-tuples
# as group keys, which would end up in the subplot titles.
for (department, group), ax in zip(grouped.groupby("department"), axes.flatten()):
    # seaborn 0.12+ accepts x and y only as keyword arguments.
    sns.barplot(x=group.aisle, y=group.Total_products, ax=ax)
    ax.set(xlabel = "Aisles", ylabel=" Number of products")
    # Rotate the existing tick labels rather than re-assigning them.
    ax.tick_params(axis='x', labelrotation=90, labelsize=12)
    ax.set_title(department, fontsize=15)

# Reorders:

In [None]:
# Attach the order-level columns from orders_df to every prior order line.
merged_reorders = order_products_prior_df.merge(orders_df, how='left', on='order_id')
merged_reorders.head()

In [None]:
count_reordered = merged_reorders['reordered'].value_counts()
count_reordered

In [None]:
plt.figure(figsize=(6,12))
sns.barplot(count_reordered.index, count_reordered.values)
plt.ylabel('Frequencies', fontsize=14)
plt.xlabel('Reordered', fontsize=4)
plt.show()

In [None]:
#finding reorders against day of the week
grouped_reorders_dow = merged_reorders.groupby(["order_dow"])["reordered"].aggregate("count").reset_index()
grouped_reorders_dow

In [None]:
plt.figure(figsize=(6,12))
sns.barplot(grouped_reorders_dow.order_dow, grouped_reorders_dow.reordered)
plt.ylabel('Total number of reordered products', fontsize=14)
plt.xlabel('order_day_of_week', fontsize=14)
plt.show()

From this graph, we can see that most products are reordered on Saturday, followed by Sunday and Friday, which follows the same trend as orders placed over the week.

In [None]:
#finding reorders against hour of the day
grouped_reorders = merged_reorders.groupby(["order_hour_of_day"])["reordered"].aggregate("count").reset_index()
grouped_reorders

In [None]:
plt.figure(figsize=(12,12))
sns.barplot(grouped_reorders.order_hour_of_day, grouped_reorders.reordered)
plt.ylabel('Total number of reordered products', fontsize=14)
plt.xlabel('order_hour_of_day', fontsize=14)
plt.show()

This graph shows that most products are reordered from 9am-5pm. This aligns with the number of products ordered during the week and the weekends.

In [None]:
merged1 = pd.merge(order_products_train_df, orders_df, on='order_id', how='left')
merged1.head()

In [None]:
df_merged1 = pd.merge(merged1, products_df, on='product_id', how='left')
df_merged1.head()

In [None]:
#merging all the datasets to get a final train dataset
df = pd.merge(df_merged1, departments_df, on='department_id', how='left')
df.head()

In [None]:
df_new = df.copy()
df_new.head()

In [None]:
del df['eval_set']

In [None]:
del df['add_to_cart_order']

In [None]:
df.head()

# Algo

In [None]:

#Variable to be predicted
y=df['reordered']

In [None]:
del df['reordered']
del df['product_name']
del df['department']

In [None]:
df.head()

In [None]:

from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; random_state=5 makes the split reproducible.
Xtr, Xtest, ytr, ytest = train_test_split(df, y, test_size=0.30, random_state=5)


In [None]:
# Shape of the training feature matrix.
Xtr.shape


In [None]:
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

In [None]:

#Logistic Regression model
clf=(LogisticRegression(C=0.02))

In [None]:
#fitting the model
clf.fit(Xtr, ytr)

In [None]:
#predictions
pred=clf.predict(Xtest)

In [None]:
pred

In [None]:
#accuracy score of Logistic Regression Model
print(accuracy_score(clf.predict(Xtest), ytest))