# Article: https://towardsdatascience.com/neural-networks-for-your-groceries-f0a643eb411
#
# Notebook: https://nbviewer.jupyter.org/github/stevhliu/instacart-neural-network/blob/master/instacart%20neural%20network.ipynb

In [36]:
import pandas as pd
import numpy as np

%matplotlib inline
plt.style.use('seaborn')

warnings.filterwarnings(action='ignore')

In [37]:
# load the aisle table and preview its first five rows
a = pd.read_csv(filepath_or_buffer="Data/instacart_2017_05_01/aisles.csv")
a.head(n=5)

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [38]:
# load the product table and preview its first five rows
p = pd.read_csv(filepath_or_buffer="Data/instacart_2017_05_01/products.csv")
p.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [39]:
# load the department table and preview its first five rows
d = pd.read_csv(filepath_or_buffer="Data/instacart_2017_05_01/departments.csv")
d.head(n=5)

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [40]:
# attach the aisle name to each product; the right join keeps every row of
# the aisle table `a`
catalog = p.merge(a, how="right", on="aisle_id")
catalog.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes
1,78,Nutter Butter Cookie Bites Go-Pak,61,19,cookies cakes
2,102,Danish Butter Cookies,61,19,cookies cakes
3,172,Gluten Free All Natural Chocolate Chip Cookies,61,19,cookies cakes
4,285,Mini Nilla Wafers Munch Pack,61,19,cookies cakes


In [41]:
# attach the department name as well; the right join keeps every row of the
# department table `d`
catalog = catalog.merge(d, how="right", on="department_id")
catalog.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks
1,78,Nutter Butter Cookie Bites Go-Pak,61,19,cookies cakes,snacks
2,102,Danish Butter Cookies,61,19,cookies cakes,snacks
3,172,Gluten Free All Natural Chocolate Chip Cookies,61,19,cookies cakes,snacks
4,285,Mini Nilla Wafers Munch Pack,61,19,cookies cakes,snacks


In [42]:
# read the orders table (used below to build the prior-order data set)
o = pd.read_csv(filepath_or_buffer="Data/instacart_2017_05_01/orders.csv")
o.head(n=5)


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [43]:
# read the products contained in each prior order and preview five rows
X = pd.read_csv(
    filepath_or_buffer="Data/instacart_2017_05_01/order_products__prior.csv"
)
X.head(n=5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [44]:
# add the order-level columns to every order/product row; the right join
# keeps every row of the order-products table `X`
prior_order = o.merge(X, how="right", on="order_id")
prior_order.head(n=5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,2539329,1,prior,1,2,8,,196,1,0
1,2539329,1,prior,1,2,8,,14084,2,0
2,2539329,1,prior,1,2,8,,12427,3,0
3,2539329,1,prior,1,2,8,,26088,4,0
4,2539329,1,prior,1,2,8,,26405,5,0


In [45]:
# (rows, columns) of the merged orders / order-products table
prior_order.shape

(32434489, 10)

In [46]:
# add the catalog columns (product, aisle and department names) to every
# prior-order row; the right join keeps every row of `prior_order`
X = catalog.merge(prior_order, how="right", on="product_id")
X.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks,3139998,138,prior,28,6,11,3.0,5,0
1,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks,1977647,138,prior,30,6,17,20.0,1,1
2,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks,389851,709,prior,2,0,21,6.0,20,0
3,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks,652770,764,prior,1,3,13,,10,0
4,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks,1813452,764,prior,3,4,17,9.0,11,1


In [47]:
# reduce the data set to its first 1,000,000 rows (a positional slice, not a
# random sample); .copy() gives X its own data so the in-place edits made in
# later cells do not operate on a slice of the full frame
X = X.iloc[:1000000].copy()


# Data Wrangling


In [48]:
# count the missing values in each column
X.isna().sum()

product_id                    0
product_name                  0
aisle_id                      0
department_id                 0
aisle                         0
department                    0
order_id                      0
user_id                       0
eval_set                      0
order_number                  0
order_dow                     0
order_hour_of_day             0
days_since_prior_order    63073
add_to_cart_order             0
reordered                     0
dtype: int64

In [49]:
# replace missing values with 0; per the isna() summary above they occur only
# in days_since_prior_order. fillna avoids the `np.NaN` alias, which no longer
# exists in NumPy 2.x, and rebinding X avoids an in-place edit of a slice.
X = X.fillna(0)

In [50]:
# print column dtypes, non-null counts and memory usage
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Data columns (total 15 columns):
product_id                1000000 non-null int64
product_name              1000000 non-null object
aisle_id                  1000000 non-null int64
department_id             1000000 non-null int64
aisle                     1000000 non-null object
department                1000000 non-null object
order_id                  1000000 non-null int64
user_id                   1000000 non-null int64
eval_set                  1000000 non-null object
order_number              1000000 non-null int64
order_dow                 1000000 non-null int64
order_hour_of_day         1000000 non-null int64
days_since_prior_order    1000000 non-null float64
add_to_cart_order         1000000 non-null int64
reordered                 1000000 non-null int64
dtypes: float64(1), int64(10), object(4)
memory usage: 122.1+ MB


In [51]:
# cast the day counts from float to int (the missing values were already
# replaced with 0 in an earlier cell)
X = X.astype({'days_since_prior_order': int})



In [52]:
# treat these integer-coded columns as categorical by casting them to strings
int_to_cat = [
    'product_id', 'reordered', 'order_dow',
    'order_hour_of_day', 'aisle_id', 'department_id',
    'days_since_prior_order',
]

# one astype call with a column -> dtype mapping instead of a per-column loop
X = X.astype(dict.fromkeys(int_to_cat, str))
    

In [53]:
# remove the columns that are not used as features
X.drop(labels=['user_id', 'eval_set'], axis='columns', inplace=True)

In [54]:
# Treemap of the catalog: one square per department, sized by its number of
# products. squarify is a third-party package and must be installed first
# (the recorded run failed with ModuleNotFoundError).
import squarify
import matplotlib
import matplotlib.pyplot as plt  # plt is not imported by any earlier cell

# get number of products in each department
x1 = (catalog.groupby(['department'])
      .count()
      .reset_index())

# generate labels of the form "<department>\n (<number of products>)";
# columns are addressed by name rather than by position so the labels always
# match the `sizes` passed to squarify below
labels = (x1['department'].astype(str) + '\n ('
          + x1['product_id'].astype(str) + ')')

# create plot
plt.figure(figsize=(15, 8), dpi=200)

# plot data and add padding around squares for better visualization
squarify.plot(label=labels, sizes=x1.product_id, color='#43B02A',
              text_kwargs={'fontsize':'large', 'color':'white'},
              bar_kwargs={'edgecolor': 'w', 'linewidth':5})
plt.title('Instacart catalog', fontsize=16)

# remove axes and display plot
plt.axis('off')
plt.show()

ModuleNotFoundError: No module named 'squarify'

In [57]:
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# create dataframe for number of orders per hour of day for each department
x2 = (X.groupby(['department', 'order_hour_of_day'])
      .count()
      .reset_index())

# manipulate dataframe to keep only department name, order hour of day and number of orders
x2 = (x2.loc[:,'department':'order_id']
     .rename(columns={'order_id':'number_of_orders'}))

x2.order_hour_of_day = x2.order_hour_of_day.astype('int')

# create plot
sns.set_style('white')
plt.figure(figsize=(15, 8), dpi=200)

# plot data
sns.lineplot(x='order_hour_of_day', y='number_of_orders',  
             hue='department', linewidth=1.5, data=x2)

# set legend outside the plot             
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, fontsize=12)
plt.title('Purchasing traffic', fontsize=16)
plt.xlabel('Hour of day', fontsize=12)
plt.ylabel('Number of purchases', fontsize=12)


AttributeError: module 'seaborn' has no attribute 'lineplot'

<Figure size 3000x1600 with 0 Axes>

In [58]:
# Build the model inputs: X becomes a one-hot encoded feature matrix and y a
# one-hot encoded 'reordered' target.

# drop redundant variables in a single call (no throwaway list built only
# for its side effects)
drop = ['product_name', 'department', 'aisle', 'order_id']
X = X.drop(drop, axis=1)

# create feature and target variables
y = X['reordered']
X = X.drop('reordered', axis=1)

# encode categorical features
from sklearn.preprocessing import OneHotEncoder
from keras.utils import to_categorical

enc = OneHotEncoder()
X = enc.fit_transform(X)

# 'reordered' was cast to str in an earlier cell; hand integer class labels
# to to_categorical instead of relying on an implicit string -> int conversion
y = to_categorical(y.astype(int))

Using TensorFlow backend.


In [None]:
# Fit a truncated SVD on the encoded features and plot how much variance the
# first k components explain together.
from sklearn.decomposition import TruncatedSVD

# start with 600 components; a fixed random_state makes the fit repeatable
svd600 = TruncatedSVD(n_components=600, random_state=42).fit(X)

# cumulative share of variance explained by the first k components
cumulative_variance = np.cumsum(svd600.explained_variance_ratio_)
# k starts at 1 so the x axis really is the number of components (plotting
# against the default index would start at 0)
component_counts = np.arange(1, len(cumulative_variance) + 1)

# plot number of components vs explained variance
plt.figure(figsize=(8,5), dpi=100)
sns.set_style('whitegrid')

plt.plot(component_counts, cumulative_variance, color='#ff8200')
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')