# Exploratory Data Analysis and Market Basket Analysis on Instacart Dataset



Importing necessary libraries and modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
pd.options.mode.chained_assignment = None  # default='warn'
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Data Exploration

Importing all files as Dataframe objects

In [None]:
# Load every CSV of the dataset into its own DataFrame.
aisles = pd.read_csv("../input/dataset/aisles.csv")
departments = pd.read_csv("../input/dataset/departments.csv")
orders = pd.read_csv("../input/dataset/orders.csv")
products = pd.read_csv("../input/dataset/products.csv")

# NOTE(review): the variable names are shifted relative to the file names:
# `order_products_train` holds order_products__prior.csv and
# `order_products_test` holds order_products__train.csv. The names are kept
# because later cells refer to them.
order_products_train = pd.read_csv("../input/dataset/order_products__prior.csv")
order_products_test = pd.read_csv("../input/dataset/order_products__train.csv")

# Same file as `order_products_train`: copy the parsed frame instead of
# parsing the CSV a second time. A copy (not an alias) keeps the two names
# independent.
order_products_prior = order_products_train.copy()

Now we will explore each dataframe further for better understanding.

**Aisles**

In [None]:
aisles.head()

In [None]:
aisles.info()

In [None]:
aisles.describe()

In [None]:
aisles["aisle"]

**Departments**

In [None]:
departments.head()

In [None]:
departments.info()

In [None]:
departments.describe()

**order_products_train**

In [None]:
order_products_train.head()

In [None]:
order_products_train.info()

In [None]:
order_products_train.describe()

In [None]:
order_products_train.isnull().sum()

**Orders**

In [None]:
orders.head()

In [None]:
orders.info()

In [None]:
orders['days_since_prior_order'].unique()

In [None]:
orders.describe()

In [None]:
orders.isnull().sum()

In [None]:
orders.fillna(0,inplace=True)

In [None]:
# Sanity check after the fillna above: list rows whose value is still missing.
# read_csv parses blank/"NA" fields as NaN, so comparing against the string
# "NA" could never match; test for NaN instead.
orders[orders["days_since_prior_order"].isna()]

**Products**

In [None]:
products.head()

In [None]:
products.info()

In [None]:
products.describe()

# Data Visualisation

**The above textual data can be understood better with the help of graphs.**

Now let us see how the ordering habit changes with each day of the week.

In [None]:
# Bar chart: number of orders placed on each day of the week.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x="order_dow", data=orders, color=color[0], ax=ax)
ax.set_ylabel('Count', fontsize=12)
ax.set_xlabel('Day of week', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation='vertical')
ax.set_title("Frequency of order by week day", fontsize=15)
plt.show()

From the above graph we can see that the number of orders rises on the weekend, i.e. Saturday and Sunday (assuming `order_dow` values 0 and 1 correspond to Saturday and Sunday), while the fewest orders were placed on Wednesday.

Now we shall see how the distribution is with respect to time of the day.

In [None]:
# Bar chart: number of orders placed in each hour of the day.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x="order_hour_of_day", data=orders, color=color[0], ax=ax)
ax.set_ylabel('Count', fontsize=12)
ax.set_xlabel('Hour of day', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation='vertical')
ax.set_title("Frequency of order by hour of day", fontsize=15)
plt.show()

Now we will combine the day of week and hour of day to see the distribution with the help of a heatmap

In [None]:
# Count orders for every (day of week, hour of day) pair.
grouped_df = orders.groupby(["order_dow", "order_hour_of_day"])["order_number"].aggregate("count").reset_index()
# Reshape to a day x hour grid. pivot() arguments are passed by keyword:
# positional use is rejected by pandas >= 2.0.
grouped_df = grouped_df.pivot(index='order_dow', columns='order_hour_of_day', values='order_number')

plt.figure(figsize=(15,5))
sns.heatmap(grouped_df)
plt.title("Frequency of Day of week Vs Hour of day")
plt.show()

From the heatmap it's evident that most orders are placed on Saturday and Sunday during the daytime (9am to 5pm).

Now let us check the time interval between orders.

In [None]:
# Bar chart: how many orders fall on each "days since prior order" value.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x="days_since_prior_order", data=orders, color=color[0], ax=ax)
ax.set_ylabel('Count', fontsize=12)
ax.set_xlabel('Days since prior order', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation='vertical')
ax.set_title("Frequency distribution by days since prior order", fontsize=15)
plt.show()

From the graph we see a spike every 7 days and at 30 days. Thus we can conclude that most customers order on a weekly or monthly basis. We can also see smaller peaks at 14, 21 and 28 days (weekly intervals). Another explanation for the spike at 30 days is that any order made more than 30 days after the previous order is recorded as 30 days. Note that each customer's first order, whose missing value was filled with 0 above, is counted in the 0-day bar.



Since our objective is to figure out the re-orders, let us check out the re-order percentage in prior set and train set

In [None]:
# percentage of re-orders in prior set #
order_products_prior.reordered.sum() / order_products_prior.shape[0]

In [None]:
# percentage of re-orders in train set #
# The train CSV (order_products__train.csv) is loaded into `order_products_test`;
# `order_products_train` holds the prior file at this point, so using it here
# would just repeat the prior-set figure from the cell above.
order_products_test.reordered.sum() / order_products_test.shape[0]

About 59% of the ordered products are reorders in both the train and prior sets.

 There will also be situations when none of the products are re-ordered. Let us check that now in prior and test sets.

In [None]:
# Per order: number of re-ordered items, collapsed to a 0/1 flag
# (1 = the order contains at least one re-ordered item).
grouped_df = order_products_prior.groupby("order_id")["reordered"].aggregate("sum").reset_index()
# clip() caps the count at 1 without chained-indexing assignment, which is
# not guaranteed to write through to the frame.
grouped_df["reordered"] = grouped_df["reordered"].clip(upper=1)
# Share of orders with / without any re-ordered item.
grouped_df.reordered.value_counts() / grouped_df.shape[0]

In [None]:
# Same check on the train CSV, which is loaded into `order_products_test`
# (`order_products_train` holds the prior file at this point and would only
# reproduce the previous cell's result).
grouped_df = order_products_test.groupby("order_id")["reordered"].aggregate("sum").reset_index()
# clip() caps the count at 1 without chained-indexing assignment.
grouped_df["reordered"] = grouped_df["reordered"].clip(upper=1)
# Share of orders with / without any re-ordered item.
grouped_df.reordered.value_counts() / grouped_df.shape[0]

About 12% of the orders in the prior set have no re-ordered items, while in the train set it is 6.5%.

Now let us see the number of products bought in each order.

In [None]:
# The highest add_to_cart_order within an order is taken as the basket size.
grouped_df = order_products_train.groupby("order_id")["add_to_cart_order"].aggregate("max").reset_index()
# Number of orders for each basket size.
cnt_srs = grouped_df.add_to_cart_order.value_counts()

plt.figure(figsize=(20,10))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=cnt_srs.index, y=cnt_srs.values, alpha=0.8, color=color[0])
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Number of products in the given order', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()

The highest number of products ordered peaks at 5 and gradually reduces to the right.

For a more meaningful understanding of the data at hand, we will merge the aisles, products and departments dataframes into the order data and explore the merged dataframe further.

In [None]:
# Attach product, aisle and department details to every prior order line.
# Left joins keep every order line even if a lookup is missing.
order_products_prior = (
    order_products_prior
    .merge(products, on='product_id', how='left')
    .merge(aisles, on='aisle_id', how='left')
    .merge(departments, on='department_id', how='left')
)
order_products_prior.head()

Now lets see the most bought items.

In [None]:
# Twenty most frequent product names and how often each occurs.
top_products = order_products_prior['product_name'].value_counts().head(20)
cnt_srs = pd.DataFrame({
    'product_name': top_products.index,
    'frequency_count': top_products.values,
})
cnt_srs

The most frequently bought items are organic fruits and vegetables. 

Now lets explore the most frequently bought aisles section.

In [None]:
# Twenty aisles with the most ordered items.
cnt_srs = order_products_prior['aisle'].value_counts().head(20)
plt.figure(figsize=(15,5))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=cnt_srs.index, y=cnt_srs.values, alpha=1.0, color=color[0])
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Aisle', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()

This data is in tandem with the frequently bought items table.

Now let us check the Department distribution with the help of a piechart.

In [None]:
# Share of ordered items per department, drawn as a pie chart.
temp_series = order_products_prior['department'].value_counts()
labels = temp_series.index.to_numpy()
# Convert counts to percentages of all ordered items.
sizes = ((temp_series / temp_series.sum()) * 100).to_numpy()

plt.figure(figsize=(15, 10))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=200)
plt.title("Departments distribution", fontsize=15)
plt.show()

Produce, dairy eggs and snacks account for the majority of ordered items.

Now let us check the reordered percentage of each department with the help of pointplot.

In [None]:
# Mean of the 0/1 `reordered` flag per department = department reorder ratio.
grouped_df = order_products_prior.groupby(["department"])["reordered"].aggregate("mean").reset_index()

plt.figure(figsize=(15,5))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.pointplot(x=grouped_df['department'].values, y=grouped_df['reordered'].values, alpha=1.0, color=color[0])
plt.ylabel('Reorder ratio', fontsize=12)
plt.xlabel('Department', fontsize=12)
plt.title("Department wise reorder ratio", fontsize=15)
plt.xticks(rotation='vertical')
plt.show()

Personal care has lowest reorder ratio and dairy eggs have highest reorder ratio.

Now let us check the reordered percentage of each aisle.

In [None]:
# Reorder ratio per aisle, keeping department_id so aisles can be laid out
# by department on the y axis.
grouped_df = order_products_prior.groupby(["department_id", "aisle"])["reordered"].aggregate("mean").reset_index()

fig, ax = plt.subplots(figsize=(15,15))
ratios = grouped_df.reordered.values
dept_ids = grouped_df.department_id.values
ax.scatter(ratios, dept_ids)
# Label every point with its aisle name.
for aisle_name, ratio, dept_id in zip(grouped_df.aisle.values, ratios, dept_ids):
    ax.annotate(aisle_name, (ratio, dept_id), rotation=45, ha='center', va='center', color='blue')
ax.set_xlabel('Reorder Ratio')
ax.set_ylabel('department_id')
ax.set_title("Reorder ratio of different aisles", fontsize=15)
plt.show()

Let us now explore how the position at which an item is added to the cart affects the reorder ratio.

In [None]:
# Cap the cart position at 70 so the sparse long tail falls into one bucket.
# clip() replaces the chained-indexing assignment, which is not guaranteed
# to write through to the frame.
order_products_prior["add_to_cart_order_mod"] = order_products_prior["add_to_cart_order"].clip(upper=70)
# Mean of the 0/1 `reordered` flag per (capped) cart position.
grouped_df = order_products_prior.groupby(["add_to_cart_order_mod"])["reordered"].aggregate("mean").reset_index()

plt.figure(figsize=(15,10))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.pointplot(x=grouped_df['add_to_cart_order_mod'].values, y=grouped_df['reordered'].values, alpha=1.0, color=color[0])
plt.ylabel('Reorder ratio', fontsize=12)
plt.xlabel('Add to cart order', fontsize=12)
plt.title("Add to cart order - Reorder ratio", fontsize=15)
plt.xticks(rotation='vertical')
plt.show()

From the graph we see that the items added to the cart first have a higher reorder ratio than the items added later on.

Now let us check the relation between reorder ratio and days of the week.

In [None]:
# Attach order-level columns (day of week, hour, ...) to every order line.
# NOTE: this rebinds `order_products_train`; re-running the cell merges the
# order columns in again and suffixes them, so run it once per kernel.
order_products_train = pd.merge(order_products_train, orders, on='order_id', how='left')
# Mean of the 0/1 `reordered` flag per day of week.
grouped_df = order_products_train.groupby(["order_dow"])["reordered"].aggregate("mean").reset_index()

plt.figure(figsize=(15,5))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=grouped_df['order_dow'].values, y=grouped_df['reordered'].values, alpha=0.8, color=color[0])
plt.ylabel('Reorder ratio', fontsize=12)
plt.xlabel('Day of week', fontsize=12)
plt.title("Reorder ratio across day of week", fontsize=15)
plt.xticks(rotation='vertical')
# Zoom the y axis so the small day-to-day differences are visible.
plt.ylim(0.5, 0.7)
plt.show()

We can see that reorders are high on Sundays and Thursdays.

Now let us check the relation between reorder ratio and hours of the day.

In [None]:
# Mean of the 0/1 `reordered` flag per hour of day.
grouped_df = order_products_train.groupby(["order_hour_of_day"])["reordered"].aggregate("mean").reset_index()

plt.figure(figsize=(15,5))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=grouped_df['order_hour_of_day'].values, y=grouped_df['reordered'].values, alpha=1.0, color=color[0])
plt.ylabel('Reorder ratio', fontsize=12)
plt.xlabel('Hour of day', fontsize=12)
plt.title("Reorder ratio across hour of day", fontsize=15)
plt.xticks(rotation='vertical')
# Zoom the y axis so the small hour-to-hour differences are visible.
plt.ylim(0.5, 0.7)
plt.show()

Finally, let's combine the day of week and hour of day to see the reorder distribution with the help of a heatmap.

In [None]:
# Reorder ratio for every (day of week, hour of day) pair.
grouped_df = order_products_train.groupby(["order_dow", "order_hour_of_day"])["reordered"].aggregate("mean").reset_index()
# Reshape to a day x hour grid. pivot() arguments are passed by keyword:
# positional use is rejected by pandas >= 2.0.
grouped_df = grouped_df.pivot(index='order_dow', columns='order_hour_of_day', values='reordered')

plt.figure(figsize=(15,5))
sns.heatmap(grouped_df)
plt.title("Reorder ratio of Day of week Vs Hour of day")
plt.show()

From the heatmap we can conclude that the reorder ratio is highest in the early mornings.

In [None]:
# Product catalogue enriched with department and aisle names.
# No `on=` is given, so each join uses the columns the two frames share.
goods = products.merge(departments, how='left').merge(aisles, how='left')
goods.head()

In [None]:
goods.info()

In [None]:
# Left-join every order with its order lines (orders without lines are kept).
# NOTE(review): `order_products_train` was already left-joined with `orders`
# in an earlier cell, so the order-level columns presumably exist on both
# sides here and come out with _x/_y suffixes -- confirm this is intended.
customer_orders =pd.merge(left=orders,right=order_products_train,on="order_id",how='left')


In [None]:
customer_orders.isnull().sum()

In [None]:
len(orders['user_id'])
orders['user_id'][191]

In [None]:
'''customer_entered =[]
i = 1
j = 0
sum = 0
user_id = orders['user_id'][j]
while i<len(orders['user_id']):
    if orders['user_id'][j+i] == orders['user_id'][j]:
        sum += 1
    else:
        customer_entered.append(sum)
        sum = 0
    j +=1
    i +=1
print(customer_entered)   
'''

In [None]:
order = orders[["order_id","user_id","order_number","days_since_prior_order"]]
order.head()

In [None]:
# Row count and distinct order_id count for the orders subset and the
# train order lines, followed by the distinct order_id count of the test lines.
for frame in (order, order_products_train):
    order_ids = frame['order_id']
    print(len(order_ids))
    print(len(order_ids.unique()))
print(len(order_products_test['order_id'].unique()))

In [None]:
orders1= order.groupby(['user_id']).sum()
orders1.head()

In [None]:
orders2 = order.groupby(['user_id']).count()
orders2.head()

In [None]:
# Per-user sum divided by per-user count gives per-user averages; only the
# days_since_prior_order average is kept.
final_orders = (orders1 / orders2).drop(columns=["order_id", "order_number"])
final_orders.head()

In [None]:
final1 = pd.DataFrame(orders2['order_number'])
final1.head()

In [None]:
frames = [final_orders,final1]
final = pd.concat(frames,axis =1)
final.head()

In [None]:
final.columns = ['average_days_since_prior_order','number_of_visits']
final.reset_index(level=0, inplace=True)
final.head()

In [None]:
# Per-order totals of the order-line columns only.
# `order_products_train` was joined with `orders` in an earlier cell, so it
# also carries order-level columns (user_id, order_dow, the string eval_set,
# ...). Summing those is meaningless and would make a later natural merge
# with `orders` match on summed values, so they are left out here.
line_cols = ["product_id", "add_to_cart_order", "reordered"]
otrain1 = order_products_train.groupby("order_id")[line_cols].sum().reset_index()
otrain1.head()

In [None]:
# Per-order column totals for the test order lines, with order_id restored
# as a regular column.
otest1 = order_products_test.groupby('order_id').sum().reset_index()
otest1.head()

In [None]:
print(len(orders['order_id'])-(len(otrain1['order_id'])+len(otest1['order_id'])))

In [None]:
frames =[otrain1,otest1]
train_test = pd.concat(frames,axis = 0)
train_test.head()

In [None]:
# Attach the per-order totals to every order.
# The join key is given explicitly and the right side is restricted to the
# order-line totals: a natural merge would also key on every other column
# the two frames happen to share (user_id, order_number, ...), which hold
# per-order sums on the right side and therefore would not match.
totals_cols = ["order_id", "product_id", "add_to_cart_order", "reordered"]
tables = pd.merge(left=orders, right=train_test[totals_cols], on="order_id", how="left")
tables.head()

In [None]:
tables[tables["user_id"]==4]

In [None]:
tables.isna().sum()
#tables.to_csv("tables.csv")

In [None]:
actual_test = orders[orders['eval_set']== 'test']
actual_test
#user = actual_test["order_id"]
#user

In [None]:
# Keep every row whose eval_set is not 'test'.
tables = tables.loc[tables['eval_set'] != 'test']
tables

In [None]:
final

In [None]:
tables

In [None]:
# Per-user totals of the numeric columns.
# numeric_only=True keeps string columns such as eval_set out of the sum
# (pandas >= 2.0 would otherwise try to add the strings together).
semi_final1 = tables.groupby('user_id').sum(numeric_only=True).reset_index()
semi_final1.head()

In [None]:
semi_final2 = tables.groupby(['user_id']).count()
semi_final2.reset_index(level=0, inplace=True)
semi_final2

In [None]:
'''edit1 = pd.DataFrame(orders1/(orders2))
final_orders = final_orders.drop(["order_id","order_number"],axis = 1)
final_orders'''

In [None]:
#frames =  []
#last_final = 

# NATURAL LANGUAGE PROCESSING


In [None]:
# Read the aisles CSV and keep only its last column as a NumPy array.
import nltk
import re
import numpy as np
import pandas as pd

aisles_frame = pd.read_csv("../input/dataset/aisles.csv")
dataset = aisles_frame.iloc[:, -1].to_numpy()
dataset

In [None]:
# Count how often each token occurs across all entries of `dataset`.
word2count = {}
for data in dataset:
    for word in nltk.word_tokenize(data):
        # get() supplies 0 the first time a token is seen
        word2count[word] = word2count.get(word, 0) + 1
word2count

In [None]:
#Heap of 50 frequently used words
import heapq
freq_words = heapq.nlargest(50, word2count, key=word2count.get)
freq_words

In [None]:
# Build the binary Bag of Words matrix: one row per entry of `dataset`,
# one column per word in `freq_words`, 1 if the word occurs in the entry.
x = []
for data in dataset:
    # Tokenize each entry once (not once per vocabulary word); a set makes
    # the membership tests cheap.
    tokens = set(nltk.word_tokenize(data))
    x.append([1 if word in tokens else 0 for word in freq_words])
x = np.asarray(x)
x

In [None]:
#Tokenizing all the words present 
all_words = [nltk.word_tokenize(data) for data in dataset]

all_words    

In [None]:
# Remove English stop words from every tokenized entry.
from nltk.corpus import stopwords
# Load the stop-word list once; the original looked it up again for every
# single token.
english_stopwords = set(stopwords.words('english'))
all_words = [[w for w in words if w not in english_stopwords] for words in all_words]

In [None]:
#Word Embedding using Word2Vec
from gensim.models import Word2Vec
word2vec = Word2Vec(all_words, min_count = 1)


In [None]:
vocabulary = word2vec.wv.index_to_key
print(vocabulary)

In [None]:
#Representation of the vector 'fruits'
v1 = word2vec.wv['fruits']
v1

In [None]:
#Obtaining the word vectors with the meaning similar to the word vector'food'
sim_words= word2vec.wv.most_similar('food')
sim_words

In [None]:
# Build a TF-IDF matrix for `dataset` and show it as a DataFrame with one
# column per vocabulary term.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(dataset)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
df

# **MARKET BASKET ANALYSIS**

In [None]:
merged_df=order_products_prior.merge(orders,on="order_id")
merged_df=merged_df.merge(products,on="product_id")
merged_df=merged_df.merge(aisles,on="aisle_id")
merged_df=merged_df.merge(departments,on="department_id")

In [None]:
merged_df.head()

In [None]:
merged_df=merged_df.iloc[:350000]

In [None]:
merged_df["product_id_str"]=merged_df["product_id"].astype(str)

In [None]:
merged_df.head()

In [None]:
g=merged_df.groupby("order_id")["product_name_x"]

In [None]:
g1=g.apply(lambda x: ','.join(x.str.replace(","," ")))

In [None]:
# **OLD TRY**

In [None]:
order_products_train = pd.read_csv("../input/dataset/order_products__train.csv")
products = pd.read_csv("../input/dataset/products.csv")

In [None]:
group_df=pd.merge(left=products, right=order_products_train, on="product_id", how="left")

In [None]:
group_df

In [None]:
#Data Cleaning
group_df['product_name'] = group_df['product_name'].str.strip() # strip leading/trailing whitespace from product names
group_df.dropna(axis=0, subset=['product_id'], inplace=True) # drop rows whose product_id is missing (this does not remove duplicates)
group_df['product_id'] = group_df['product_id'].astype('str') # store product_id as a string
group_df.head()

In [None]:
group_df['product_name'].value_counts()


In [None]:
group_df.info()

In [None]:
group_df=group_df.iloc[:3500000]

In [None]:
group_df

In [None]:
def split_dataframe(df, chunk_size=30000):
    """Split ``df`` into consecutive row chunks of at most ``chunk_size`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; rows are taken by position, in order.
    chunk_size : int, default 30000
        Maximum number of rows per chunk. Must be positive.

    Returns
    -------
    list of pandas.DataFrame
        The chunks in original row order. Every chunk is non-empty; only the
        last one may hold fewer than ``chunk_size`` rows. An empty ``df``
        yields an empty list.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # Stepping by chunk_size stops exactly at the end of the frame, so no
    # empty trailing chunk is produced when len(df) is a multiple of
    # chunk_size (which `len(df) // chunk_size + 1` did).
    return [df.iloc[start:start + chunk_size] for start in range(0, len(df), chunk_size)]

In [None]:
group_df_split = split_dataframe(group_df, 12000)

In [None]:
group_df_split[0]

In [None]:
group_df_split[1]

In [None]:
group_df_split[2]

In [None]:
# How often each product_id occurs in the first chunk.
count_df = (
    group_df_split[0]["product_id"]
    .value_counts()
    .rename_axis("product_id")
    .reset_index(name="count")
)
# product_id was cast to str during cleaning, while `products` was read
# straight from the CSV; align the key dtype so the merge does not fail on
# mismatched key types.
count_df["product_id"] = count_df["product_id"].astype(products["product_id"].dtype)
count_products = count_df.merge(products, on="product_id")
count_products.head()

In [None]:
# Basket matrix for the first chunk: one row per order, one column per
# product name, 0 where the product is not in the order.
# Rows are keyed by order_id: keying by product_id (which maps 1:1 to
# product_name) would give every row exactly one non-zero column, so no two
# products could ever co-occur and no association rules could be found.
basket_0 = (group_df_split[0]
          .groupby(['order_id', 'product_name'])['add_to_cart_order']
          .sum().unstack().reset_index().fillna(0)
          .set_index('order_id'))


In [None]:
basket_0

In [None]:
#converting all values >= 1 to 1 and everything else to 0
def my_encode_units(x):
    """Return 1 if ``x`` is at least 1, otherwise 0.

    Values between 0 and 1 and NaN also map to 0, so the function always
    returns an int (the original fell through and returned None for them).
    """
    return 1 if x >= 1 else 0
# 1 where the basket value is at least 1, otherwise 0. The vectorized
# comparison replaces a per-cell applymap call (deprecated in recent pandas).
basket_sets_0 = (basket_0 >= 1).astype(int)

In [None]:
basket_sets_0

In [None]:
#Generatig frequent itemsets
frequent_itemsets = apriori(basket_sets_0, min_support=0.0001, use_colnames=True)

In [None]:
frequent_itemsets

In [None]:
from mlxtend.frequent_patterns import association_rules

In [None]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

In [None]:
rules.head()

In [None]:
rules[ (rules['lift'] >= 3) &
       (rules['confidence'] >= 0.3) ]