In [1]:
#importing required libraries
import pandas as pd
import numpy as np
import glob
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.proportion import proportions_ztest

In this section, we will be applying statistical tools to gain some inferences and insights into the kind of data we are dealing with and disovering relationships between various features of our dataset.

- As per the data storytelling report, it was noticed that items that the reorder rate was higher for the items that were added to the cart first upto the item 16.
- It was also noticed that organic, fat-free, gluten-free items had more reorder rate than other items

To begin, let us check if there is any relationship between items added to the cart sequence and reorder rate. As the reordered is a binary variable, we will use z test to test the significance.

In [2]:
import os

In [3]:
os.getcwd()

'/data/7102'

In [4]:
os.chdir('/data/7102/datasets')

In [5]:
#merged dataset
df = pd.read_csv('total1.csv')

In [6]:
df.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,2539329,1,prior,1,2,8,0,196,1,0,Soda,77,7,soft drinks,beverages
1,2539329,1,prior,1,2,8,0,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,soy lactosefree,dairy eggs
2,2539329,1,prior,1,2,8,0,12427,3,0,Original Beef Jerky,23,19,popcorn jerky,snacks
3,2539329,1,prior,1,2,8,0,26088,4,0,Aged White Cheddar Popcorn,23,19,popcorn jerky,snacks
4,2539329,1,prior,1,2,8,0,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,paper goods,household


In [7]:
df_reorder = df[['order_id', 'add_to_cart_order', 'reordered']]

In [8]:
df_reorder.head()

Unnamed: 0,order_id,add_to_cart_order,reordered
0,2539329,1,0
1,2539329,2,0
2,2539329,3,0
3,2539329,4,0
4,2539329,5,0


In [9]:
df_reorder['add_to_cart_order'].value_counts()

1      3346083
2      3182490
3      2988129
4      2773069
5      2542770
6      2305545
7      2069162
8      1840615
9      1629258
10     1437694
11     1265591
12     1112096
13      975133
14      853419
15      745342
16      649867
17      565153
18      490685
19      425337
20      368047
21      317849
22      274129
23      236080
24      202946
25      174589
26      149897
27      128628
28      110280
29       94499
30       80832
        ...   
116          5
117          4
118          4
119          4
120          4
121          4
124          3
125          3
127          3
126          3
123          3
122          3
128          2
129          2
130          2
131          2
132          2
133          2
134          2
135          2
136          2
137          2
143          1
141          1
142          1
144          1
140          1
139          1
138          1
145          1
Name: add_to_cart_order, dtype: int64

In [10]:
df_reorder['reordered'].value_counts()

1    19955360
0    13863746
Name: reordered, dtype: int64

##### Hypothesis Testing

- Null Hypothesis: There is no relationship between added to the cart order and reorder rate.
- Alternative Hypothesis: There is relationship between added to the cart order and reorder rate.

Significance Level \alpha = 0.05

In [11]:
reordered = df_reorder.loc[df_reorder['reordered']==1,]
not_reordered = df_reorder.loc[df_reorder['reordered']==0,]

In [12]:
reordered_mean = reordered.groupby('add_to_cart_order').size().to_frame('counts').reset_index()['counts'].mean()
not_reordered_mean = not_reordered.groupby('add_to_cart_order').size().to_frame('counts').reset_index()['counts'].mean()
reordered_mean, not_reordered_mean

(148920.59701492538, 95612.041379310351)

In [13]:
reordered_std = reordered.groupby('add_to_cart_order').size().to_frame('counts').reset_index()['counts'].std()
not_reordered_std = not_reordered.groupby('add_to_cart_order').size().to_frame('counts').reset_index()['counts'].std()
reordered_std, not_reordered_std

(425584.67718641658, 244003.89654313799)

In [14]:
mean_diff = reordered_mean-not_reordered_mean
sigma_diff = np.sqrt((reordered_std**2)/len(reordered)  + (not_reordered_std**2)/len(not_reordered))
mean_diff, sigma_diff

(53308.555635615034, 115.6325107310156)

In [15]:
z = (mean_diff ) / sigma_diff
z

461.0170210661725

In [16]:
p = (1-stats.norm.cdf(z))*2
p

0.0

The p-value obtained is lower than our chosen significance level. Therefore, we reject the null hypothesis. **It means that there is a relationship between added to the cart order and reorder rate**.

### Organic Vs reordered

Now lets check the relation ship between oeganic food and  reorder rate. As both the is_organic and reorderd are binary variables, we will use  Fisher exact test.

##### Hypothesis Testing

- Null Hypothesis: There is no relationship between organic food and reorder rate.
- Alternative Hypothesis: There is relationship between organic food and reorder rate.

Significance Level \alpha = 0.05

In [17]:
df_orgainc = df[['order_id', 'product_name', 'reordered']]

In [18]:
df_orgainc['is_organic']=df_orgainc['product_name'].str.contains('Organic')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
df_orgainc.head(5)

Unnamed: 0,order_id,product_name,reordered,is_organic
0,2539329,Soda,0,False
1,2539329,Organic Unsweetened Vanilla Almond Milk,0,True
2,2539329,Original Beef Jerky,0,False
3,2539329,Aged White Cheddar Popcorn,0,False
4,2539329,XL Pick-A-Size Paper Towel Rolls,0,False


In [20]:
df_orgainc['is_organic'].value_counts()

False    23163118
True     10655988
Name: is_organic, dtype: int64

In [21]:
organic = df_orgainc.pivot_table(values = 'order_id', index='reordered', columns = 'is_organic', aggfunc='count')
organic

is_organic,False,True
reordered,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9978458,3885288
1,13184660,6770700


In [22]:
from scipy.stats import fisher_exact

In [23]:
oddsratio, p_value = fisher_exact(organic)
oddsratio, p_value

(1.3188786966375019, 0.0)

The p-value obtained is lower than our chosen significance level. Therefore, we reject the null hypothesis. **It means that there is a relationship between organic food and reorder rate**. 

### Gluten free Vs reordered

Now lets check the relation ship between Gluten free food and  reorder rate. As both the is_glutanfree and reorderd are binary variables, we will use  Fisher exact test.

##### Hypothesis Testing

- Null Hypothesis: There is no relationship between Gluten free food and reorder rate.
- Alternative Hypothesis: There is relationship between Gluten free food and reorder rate.

Significance Level \alpha = 0.05

In [24]:
df_glut = df[['order_id', 'product_name', 'reordered']]

In [25]:
df_glut['is_glutenfree']=df_orgainc['product_name'].str.contains('gluten' and 'free')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [26]:
df_glut['is_glutenfree'].value_counts()

False    33810428
True         8678
Name: is_glutenfree, dtype: int64

In [27]:
glutanfree = df_glut.pivot_table(values = 'order_id', index='reordered', columns = 'is_glutenfree', aggfunc='count')
glutanfree

is_glutenfree,False,True
reordered,Unnamed: 1_level_1,Unnamed: 2_level_1
0,13860335,3411
1,19950093,5267


In [29]:
oddsratio, p_value = fisher_exact(glutanfree)

In [30]:
oddsratio, p_value

(1.0727793411220674, 0.001382951603232037)

The p-value obtained is lower than our chosen significance level. Therefore, we reject the null hypothesis. **It means that there is a relationship between gluten free food and reorder rate**.

### Low Fat food Vs reordered

Now lets check the relation ship between Gluten free food and  reorder rate. As both the low fat food and reorderd are binary variables, we will use  Fisher exact test.

##### Hypothesis Testing

- Null Hypothesis: There is no relationship between low fat food and reorder rate.
- Alternative Hypothesis: There is relationship between low fat food and reorder rate.

Significance Level \alpha = 0.05

In [31]:
df_low_fat = df[['order_id', 'product_name', 'reordered']]

In [32]:
df_low_fat['is_lowfat']=df_low_fat['product_name'].str.contains('low' and 'fat')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [33]:
df_low_fat['is_lowfat'].value_counts()

False    33245961
True       573145
Name: is_lowfat, dtype: int64

In [34]:
lowfat = df_low_fat.pivot_table(values = 'order_id', index='reordered', columns = 'is_lowfat', aggfunc='count')
lowfat

is_lowfat,False,True
reordered,Unnamed: 1_level_1,Unnamed: 2_level_1
0,13689203,174543
1,19556758,398602


In [35]:
oddsratio, p_value = fisher_exact(lowfat)

In [36]:
oddsratio, p_value

(1.5985209751203833, 0.0)

The p-value obtained is lower than our chosen significance level. Therefore, we reject the null hypothesis. **It means that there is a relationship between low fat free food and reorder rate**.