In [None]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy import stats
from scipy.stats import norm, skew #for some statistics
import re

# Show every row and column, and never truncate cell contents, when printing frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# None (not -1) means "no limit": negative widths were deprecated in pandas 1.0
# and are rejected by newer releases.
pd.set_option('display.max_colwidth', None)

params = {'legend.fontsize': '15',
         'axes.labelsize': 'x-large',
         'axes.titlesize':'x-large',
         'xtick.labelsize':'x-large',
         'ytick.labelsize':'x-large',
        #  'axes.prop_cycle': plt.cycler(color = plt.cm.Set2.colors),
        #  'image.cmap': 'Set2',
         'figure.figsize': (18, 7)}
plt.rcParams.update(params)

# plt.rcParams["image.cmap"] = "Set1"

# to change default color cycle
# plt.rcParams['axes.prop_cycle'] = plt.cycler(color=plt.cm.Set1.colors)
# plt.rcParams['figure.figsize'] = 18, 7

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
plt.style.use('dark_background')  # to change the default values of plt to our interest.

Read the data.

In [None]:
df = pd.read_csv('../input/quantium-cleaned-dataset-part2/df1.csv', parse_dates= ['date'])
df.head()

Inserted a column 'year_month' in the dataframe.

In [None]:
# Monthly period derived from the transaction date, placed right after the first column.
# Guarded so that re-running this cell does not fail: DataFrame.insert raises a
# ValueError when the column already exists.
if 'year_month' not in df.columns:
    df.insert(1, 'year_month', df['date'].dt.to_period('M'))
df.head()

We have changed dtypes for simplicity  and easier operations.

In [None]:
cols_with_changed_dtype = {'prod_name': 'category', 'prod_qty': 'category',
                           'lifestage': 'category', 'premium_customer': 'category',
                           'prod_comp': 'category'}
df = df.astype(cols_with_changed_dtype)
df.info()

We'll remove the stores that do not have transaction data for all 12 months.

In [None]:
check = df.groupby('store_nbr')['year_month'].nunique()
check = check[check != 12]
print('Stores with less than 12 month transaction data:')
display(check)
stores_with_less_than_12_months = check.index.to_list()
del check

In [None]:
# Rows that belong to a store without a full 12 months of transactions.
rows_of_incomplete_stores = df['store_nbr'].isin(stores_with_less_than_12_months)
indices_to_drop = df.index[rows_of_incomplete_stores]

print(f'Shape before dropping: {df.shape}\n')
df = df.drop(indices_to_drop)
print(f'Shape after dropping: {df.shape}\n')
print(f'Number of samples that are dropped: {len(indices_to_drop)}')

In [None]:
# Total sales per (store, month) for the three trial stores, drawn as one bar each.
check = (
    df[df['store_nbr'].isin([77, 86, 88])]
    .groupby(['store_nbr', 'year_month'])['tot_sales']
    .sum()
)

# One colour per store: 77 -> blue, 86 -> green, anything else (i.e. 88) -> red.
colour_by_store = {77: 'b', 86: 'g'}
colors = [colour_by_store.get(store, 'r') for store, _ in check.index]

check.plot(kind = 'bar', color = colors, figsize = (20, 9))
plt.show()
del check, colors

In [None]:
# Broadcast store-level and store/month-level aggregates onto every transaction row
# (transform keeps the original row count, so each row carries its group's value).
# The "yearly" figures are computed over every row currently in df, i.e. before the
# pre-trial / trial split that happens further down.
# Total sales per store.
df['yearly_sale'] = df.groupby('store_nbr')['tot_sales'].transform('sum')
# Number of distinct loyalty cards per store.
df['yearly_custs'] = df.groupby('store_nbr')['lylty_card_nbr'].transform('nunique')
# Total sales per store and month.
df['monthly_sale'] = df.groupby(['store_nbr', 'year_month'])['tot_sales'].transform('sum')
# Number of distinct loyalty cards per store and month.
df['monthly_custs'] = df.groupby(['store_nbr', 'year_month'])['lylty_card_nbr'].transform('nunique')
# df['monthly_txn'] = df.groupby(['store_nbr', 'year_month'])['txn_id'].agg({'txn_id': 'nunique'})

In [None]:
# df1 = df.copy()
# data = df.groupby(['store_nbr', 'month']).apply(lambda subdf: subdf['lylty_card_nbr'].count())
# # data['count'] = df.groupby(['store_nbr', 'month']).apply(lambda subdf: subdf['lylty_card_nbr'].count())
# data.head(25)

I need to add another column: the average number of transactions per customer in each store.

In [None]:
# Average number of distinct transactions per distinct loyalty-card customer,
# computed once per store and then mapped back onto every row of that store.
# Dividing two per-store aggregations yields a plain float Series; the previous
# groupby.apply returned a one-element array per store (from .unique()), which
# left an object-dtype Series that had to be coerced to float afterwards.
store_groups = df.groupby('store_nbr')
avg_trans = store_groups['txn_id'].nunique() / store_groups['lylty_card_nbr'].nunique()
avg_trans = avg_trans.astype('float64')
df['avg_txn_per_cust'] = df['store_nbr'].map(avg_trans)

In [None]:
# store_272 = df1.groupby('store_nbr').get_group(272)
# no_of_customers = store_272['no_of_custs'].unique()
# no_of_transactions = store_272['txn_id'].nunique()
# print(f'no of customers: {no_of_customers}')
# print(f'no of transactions: {no_of_transactions}')
# print((no_of_transactions / no_of_customers).round(4))

In [None]:
df.head()

"pre_df" is the dataset which contains samples only before the trial period.

"trial_df" is the dataset which contains samples of the trial period.

In [None]:
# Trial period: February 2019 up to (but excluding) May 2019; everything earlier is pre-trial.
# Both masks share the same start boundary so that no row can fall into both frames,
# even if 'date' ever carries a time-of-day component.
trial_start, trial_end = "2019-02-01", "2019-05-01"
pre_df = df[df['date'] < trial_start]
trial_df = df[(df['date'] >= trial_start) & (df['date'] < trial_end)]

In [None]:
# Sanity check of the split. Series.min()/.max() are vectorised, whereas the Python
# built-ins min()/max() iterate over the column element by element.
min_date_in_trial_df, max_date_in_trial_df = trial_df['date'].min(), trial_df['date'].max()
min_date_in_pre_df, max_date_in_pre_df = pre_df['date'].min(), pre_df['date'].max()
print(f'the trial_df dataframe consists of samples between {min_date_in_trial_df}, {max_date_in_trial_df}')
print(f'the pre_df dataframe consists of samples between {min_date_in_pre_df}, {max_date_in_pre_df}')

In [None]:
# Correlate the numeric columns only: pre_df also holds datetime, period and
# categorical columns, which DataFrame.corr() no longer drops silently in pandas >= 2.0.
corrmat = pre_df.select_dtypes(include = ['number', 'bool']).corr()
# Hide the upper triangle (including the diagonal), since the matrix is symmetric.
mask = np.triu(np.ones_like(corrmat, dtype=bool))
sns.heatmap(corrmat, mask = mask, cmap = 'coolwarm', annot = True)
plt.xticks(rotation = 30)
plt.show()

In [None]:
# grp = pre_df.groupby('store_nbr')
# for name, subdf in grp:
#     if name not in [77, 86, 88]:
#         #it is a control store
#         subdf
#     else:
#         # it is a trial store
#         pass

'metrics_cols' are the features for correlation and ranking between trial stores and control stores.

In [None]:
metrics_cols = ['store_nbr', 'year_month', 'yearly_sale',
                'yearly_custs','monthly_sale', 'monthly_custs', 'avg_txn_per_cust']
# metrics_data = pre_trial_data.loc[:, metrics_cols]

In [None]:
def extract_metrics(df, cols=None):
    '''
    Build the per-store, per-month metrics table from transaction-level rows.

    input: df   - dataframe that contains the metric columns.
           cols - columns to keep; must include 'store_nbr' and 'year_month'.
                  Defaults to the notebook-level metrics_cols.
    output: New dataframe indexed by ('store_nbr', 'year_month'), sorted by that
            index, holding one row per store and month.
    '''
    if cols is None:
        cols = metrics_cols
    subdf = df.loc[:, list(cols)].set_index(['store_nbr', 'year_month']).sort_index()
    # De-duplicate on the (store, month) key. DataFrame.drop_duplicates() ignores the
    # index and compares column values only, so it could discard a genuine month whose
    # metric values happen to equal those of another month of the same store.
    return subdf[~subdf.index.duplicated(keep = 'first')]

metrics_df = extract_metrics(pre_df)
metrics_df.head()

In [None]:
# metrics_df.xs('2018-09', level=1)

In [None]:
# metrics_df.index.get_level_values('year_month').nunique()

Function to find correlation between trial stores and control stores one by one.

In [None]:
def calc_corr(trial_store, metrics=None):
    '''
    input: It takes one trial store to compare other stores with.
           metrics (optional) - dataframe indexed by (store_nbr, year_month) that holds
           'monthly_sale' and 'monthly_custs'; defaults to the notebook-level metrics_df.
    output: New dataframe indexed by store_nbr with the absolute correlation of each
            metric against the trial store and their mean ('mean_corr'), sorted by
            'mean_corr' in descending order. The trial store itself is included.
    '''
    cols = ['monthly_sale', 'monthly_custs']
    if metrics is None:
        metrics = metrics_df
    metrics = metrics[cols]

    trial = metrics.loc[trial_store]
    # One correlation per store. Iterating over every (store, month) row and then
    # de-duplicating by value repeats the same work for each month and can drop
    # distinct stores that happen to have identical correlation values.
    stores = metrics.index.get_level_values(0).unique()
    subdf = pd.DataFrame([trial.corrwith(metrics.loc[store]) for store in stores])
    subdf.index = stores
    subdf.index.name = "store_nbr"
    subdf = subdf[cols]

    # NOTE(review): taking the absolute value ranks strongly negatively correlated
    # stores as high as positively correlated ones - confirm this is intended.
    subdf = subdf.abs()
    subdf['mean_corr'] = subdf.mean(axis=1)
    return subdf.sort_values(by = 'mean_corr', ascending = False)

## Correlation with trial store: 77

In [None]:
corr_77 = calc_corr(77).drop(77)
# corr_77 = corr_77.drop(77)
corr_77.head(5)

In [None]:
corr_77[corr_77['mean_corr'].abs() > 0.7].plot(kind = 'bar', rot = 0, figsize = (18, 8))
plt.title('Correlation of trial store 77 with other stores')
plt.xlabel('store Number')
plt.ylabel('Correlation Co-efficient')
plt.show()

The store '233' with the highest score is selected as the control store for trial store '77'.

Now let's quantify how related it is to the trial store by using plots and some stats. Since monthly sales and monthly customers are the only parameters we can monitor, we'll look at just these two parameters.

In [None]:
fig, ax = plt.subplots()
sns.distplot(metrics_df.loc[77]['monthly_sale'], color = 'r', ax = ax)
sns.distplot(metrics_df.loc[233]['monthly_sale'], color = 'g', ax = ax)
plt.legend(labels = ['77', '233'])
plt.show()

From the above plot we can see that there is difference in monthly sale in both the stores.

In [None]:
# Overlay the pre-trial monthly sales of trial store 77 (green) and control store 233 (red).
metrics_df.loc[77]['monthly_sale'].plot(kind = 'bar', color = 'g')
metrics_df.loc[233]['monthly_sale'].plot(kind = 'bar', color = 'r', alpha = 0.5)
plt.xticks(rotation = 0)
plt.xlabel('Month')
plt.ylabel('Monthly Sale')
# The plotted control store is 233 (the legend previously said 232).
plt.legend(labels = (77, 233))
plt.show()

Even though the monthly sale values are different, we see a similar trend in the sales throughout the period.

In [None]:
fig, ax = plt.subplots()
sns.distplot(metrics_df.loc[77]['monthly_custs'], color = 'r', ax = ax)
sns.distplot(metrics_df.loc[233]['monthly_custs'], color = 'g', ax = ax)
plt.legend(labels = ['77', '233'])
plt.show()

We can see that monthly customers are similar in both the stores.

In [None]:
# Overlay the pre-trial monthly customer counts of trial store 77 (green) and control store 233 (red).
metrics_df.loc[77]['monthly_custs'].plot(kind = 'bar', color = 'g')
metrics_df.loc[233]['monthly_custs'].plot(kind = 'bar', color = 'r', alpha = 0.5)
plt.xticks(rotation = 0)
plt.xlabel('Month')
# This chart shows customers, not sales.
plt.ylabel('Monthly Customers')
# The plotted control store is 233 (the legend previously said 232).
plt.legend(labels = (77, 233))
plt.show()

Even the trend in the number of customers every month follows a similar trend between the stores.

Let our null hypothesis be that both the trial store and our selected control store are similar. Now if we want to reject the null hypothesis then we must have pvalue close to zero.

In [None]:
from scipy.stats import ks_2samp,ttest_ind,t

In [None]:
metrics_df.head(2)

In [None]:
cols_under_consideration = ['monthly_sale', 'monthly_custs']
a=[]
for x in metrics_df[cols_under_consideration]:
    a.append(ks_2samp(metrics_df.loc[77][x], metrics_df.loc[233][x]))
a=pd.DataFrame(a, index = cols_under_consideration)
a.head()

From the dataframe above we can say that both are similar (pvalues are high close to 1). Hence we cannot reject our null hypothesis.

Assessment of Trial.

Now we'll compare the trial store with the control store in the trial period, i.e. from February 2019 to April 2019.

In [None]:
trial_metrics_df = extract_metrics(trial_df)
trial_metrics_df.head()

In [None]:
b = []
for x in trial_metrics_df[cols_under_consideration]:
    b.append(ks_2samp(trial_metrics_df.loc[77][x], trial_metrics_df.loc[233][x]))
b = pd.DataFrame(b, index = cols_under_consideration)
b.head()

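Recall that the null hypothesis is rejected when the p-values are below the 0.05 significance level (not above it). The stores were similar in the pre-trial period; if the p-values above fall below that threshold, the stores are no longer similar in the trial period and we reject the null hypothesis.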

We compare each t-value with the critical t-value at the 95% confidence level for 6 degrees of freedom (7 monthly samples − 1).

In [None]:
print('critical t-value for 95% confidence level:')
t.ppf(0.95, 6)

We can see that the t-value is greater than the 95th percentile for February to April.

Therefore we can say that there was increase in sale in trial store than the control store during the trial period.

Let's plot the distributions for both the stores in the trial period.

In [None]:
sns.distplot(trial_metrics_df.loc[77]['monthly_sale'])
sns.distplot(trial_metrics_df.loc[233]['monthly_sale'])
plt.legend(labels=['77','233'])

In [None]:
sns.distplot(trial_metrics_df.loc[77]['monthly_custs'])
sns.distplot(trial_metrics_df.loc[233]['monthly_custs'])
plt.legend(labels=['77','233'])

We can see that the distribution of monthly sale and monthly customers of both the stores in the trial period is much different than the distribution of monthly sale and monthly customers in pre-trial period.

The results show that the trial store 77 is significantly different to its control store in the trial period as the trial store performance lies outside the 5% to 95% confidence interval of the control store in two of the three trial months.

We can also see that there is significant increase in sales of chips in trial stores in the trial period.

## Correlation with trial store: 86

In [None]:
corr_86 = calc_corr(86).drop(86)
corr_86.head()

In [None]:
corr_86[corr_86['mean_corr'].abs() > 0.7].plot(kind = 'bar', rot = 0, figsize = (18, 8))
plt.title('Correlation of trial store 86 with other stores')
plt.xlabel('store Number')
plt.ylabel('Correlation Co-efficient')
plt.show()

The store '155' with the highest score is selected as the control store for trial store '86'.

Now let's quantify how related it is to the trial store by using plots and some stats. Since monthly sales and monthly customers are the only parameters we can monitor, we'll look at just these two parameters.

In [None]:
fig, ax = plt.subplots()
sns.distplot(metrics_df.loc[86]['monthly_sale'], color = 'r', ax = ax)
sns.distplot(metrics_df.loc[155]['monthly_sale'], color = 'g', ax = ax)
plt.legend(labels = ['86', '155'])
plt.show()

From the above plot we can see that there is a difference in monthly sale between the two stores. But on average both the stores are similar.

In [None]:
metrics_df.loc[86]['monthly_sale'].plot(kind = 'bar', color = 'g')
metrics_df.loc[155]['monthly_sale'].plot(kind = 'bar', color = 'r', alpha = 0.5)
plt.xticks(rotation = 0)
plt.xlabel('Month')
plt.ylabel('Monthly Sale')
plt.legend(labels = (86, 155))
plt.show()

Even though the monthly sale values are different, we see a similar trend in the sales throughout the period.

In [None]:
fig, ax = plt.subplots()
sns.distplot(metrics_df.loc[86]['monthly_custs'], color = 'r', ax = ax)
sns.distplot(metrics_df.loc[155]['monthly_custs'], color = 'g', ax = ax)
plt.legend(labels = ['86', '155'])
plt.show()

We can see that monthly customers are similar in both the stores.

In [None]:
# Overlay the pre-trial monthly customer counts of trial store 86 (green) and control store 155 (red).
metrics_df.loc[86]['monthly_custs'].plot(kind = 'bar', color = 'g')
metrics_df.loc[155]['monthly_custs'].plot(kind = 'bar', color = 'r', alpha = 0.5)
plt.xticks(rotation = 0)
plt.xlabel('Month')
# This chart shows customers, not sales.
plt.ylabel('Monthly Customers')
plt.legend(labels = (86, 155))
plt.show()

Even the trend in the number of customers every month follows a similar trend between the stores.

Let our null hypothesis be that both the trial store and our selected control store are similar. Now if we want to reject the null hypothesis then we must have pvalue close to zero.

In [None]:
cols_under_consideration = ['monthly_sale', 'monthly_custs']
a=[]
for x in metrics_df[cols_under_consideration]:
    a.append(ks_2samp(metrics_df.loc[86][x], metrics_df.loc[155][x]))
a=pd.DataFrame(a, index = cols_under_consideration)
a.head()

From the dataframe above we can say that both are similar (pvalues are high close to 1). Hence we cannot reject our null hypothesis.

Assessment of Trial.

Now we'll compare the trial store with the control store in the trial period, i.e. from February 2019 to April 2019.

In [None]:
b = []
for x in trial_metrics_df[cols_under_consideration]:
    b.append(ks_2samp(trial_metrics_df.loc[86][x], trial_metrics_df.loc[155][x]))
b = pd.DataFrame(b, index = cols_under_consideration)
b.head()


If the p-values are low (say less than 0.05), we reject the null hypothesis, i.e. the two stores are significantly different in the trial period; p-values above 0.05 mean that the null hypothesis cannot be rejected.

We compare each t-value with the critical t-value at the 95% confidence level for 6 degrees of freedom (7 monthly samples − 1).

In [None]:
print('critical t-value for 95% confidence level:')
t.ppf(0.95, 6)

We can see that the t-value is greater than the 95th percentile for February to April.

The results show that the trial in store 86 is significantly different to its control store in the trial period as the trial store performance lies outside of the 5% to 95% confidence interval of the control store in two of the three trial months.

Let's plot the distributions for both the stores in the trial period.

In [None]:
sns.distplot(trial_metrics_df.loc[86]['monthly_sale'])
sns.distplot(trial_metrics_df.loc[155]['monthly_sale'])
plt.legend(labels=['86','155'])

In [None]:
sns.distplot(trial_metrics_df.loc[86]['monthly_custs'])
sns.distplot(trial_metrics_df.loc[155]['monthly_custs'])
plt.legend(labels=['86','155'])

We can see that the distribution of monthly sale and monthly customers of both the stores in the trial period is much different than the distribution of monthly sale and monthly customers in pre-trial period.

The results show that the trial store 86 is significantly different to its control store in the trial period as the trial store performance lies outside the 5% to 95% confidence interval of the control store in two of the three trial months.

We can also see that there is significant increase in sales of chips in trial stores in the trial period.

## Correlation with trial store: 88

In [None]:
corr_88 = calc_corr(88).drop(88)
corr_88.head()

In [None]:
corr_88[corr_88['mean_corr'].abs() > 0.55].plot(kind = 'bar', rot = 0, figsize = (18, 8))
plt.title('Correlation of trial store 88 with other stores')
plt.xlabel('store Number')
plt.ylabel('Correlation Co-efficient')
plt.show()

The store '14' has the highest score but we'll consider store '237' as the control store since the monthly sales is much correlated with it. Therefore store '237' is selected as the control store for trial store '88'.

Now let's quantify how related it is to the trial store by using plots and some stats. Since monthly sales and monthly customers are the only parameters we can monitor, we'll look at just these two parameters.

In [None]:
fig, ax = plt.subplots()
sns.distplot(metrics_df.loc[88]['monthly_sale'], color = 'r', ax = ax)
sns.distplot(metrics_df.loc[237]['monthly_sale'], color = 'g', ax = ax)
plt.legend(labels = ['88', '237'])
plt.show()

From the above plot we can see that there is a difference in monthly sale between the two stores. But on average both the stores are similar.

In [None]:
metrics_df.loc[88]['monthly_sale'].plot(kind = 'bar', color = 'g')
metrics_df.loc[237]['monthly_sale'].plot(kind = 'bar', color = 'r', alpha = 0.5)
plt.xticks(rotation = 0)
plt.xlabel('Month')
plt.ylabel('Monthly Sale')
plt.legend(labels = (88, 237))
plt.show()

Even though the monthly sale values are different, we see a similar trend in the sales throughout the period.

In [None]:
fig, ax = plt.subplots()
sns.distplot(metrics_df.loc[88]['monthly_custs'], color = 'r', ax = ax)
sns.distplot(metrics_df.loc[237]['monthly_custs'], color = 'g', ax = ax)
plt.legend(labels = ['88', '237'])
plt.show()

We can see that monthly customers are similar in both the stores.

In [None]:
# Overlay the pre-trial monthly customer counts of trial store 88 (green) and control store 237 (red).
metrics_df.loc[88]['monthly_custs'].plot(kind = 'bar', color = 'g')
metrics_df.loc[237]['monthly_custs'].plot(kind = 'bar', color = 'r', alpha = 0.5)
plt.xticks(rotation = 0)
plt.xlabel('Month')
# This chart shows customers, not sales.
plt.ylabel('Monthly Customers')
plt.legend(labels = (88, 237))
plt.show()

Even the trend in the number of customers every month follows a similar trend between the stores.

Let our null hypothesis be that both the trial store and our selected control store are similar. Now if we want to reject the null hypothesis then we must have pvalue close to zero.

In [None]:
cols_under_consideration = ['monthly_sale', 'monthly_custs']
a=[]
for x in metrics_df[cols_under_consideration]:
    a.append(ks_2samp(metrics_df.loc[88][x], metrics_df.loc[237][x]))
a=pd.DataFrame(a, index = cols_under_consideration)
a.head()

From the dataframe above we can say that both are similar (pvalues are high close to 1). Hence we cannot reject our null hypothesis.

Assessment of Trial.

Now we'll compare the trial store with the control store in the trial period, i.e. from February 2019 to April 2019.

In [None]:
b = []
for x in trial_metrics_df[cols_under_consideration]:
    b.append(ks_2samp(trial_metrics_df.loc[88][x], trial_metrics_df.loc[237][x]))
b = pd.DataFrame(b, index = cols_under_consideration)
b.head()


If the p-values are low (say less than 0.05), we reject the null hypothesis, i.e. the two stores are significantly different in the trial period; p-values above 0.05 mean that the null hypothesis cannot be rejected.

We compare each t-value with the critical t-value at the 95% confidence level for 6 degrees of freedom (7 monthly samples − 1).

In [None]:
print('critical t-value for 95% confidence level:')
t.ppf(0.95, 6)

We can see that the t-value is greater than the 95th percentile for February to April.

The results show that the trial in store 88 is significantly different to its control store in the trial period as the trial store performance lies outside of the 5% to 95% confidence interval of the control store in two of the three trial months.

Let's plot the distributions for both the stores in the trial period.

In [None]:
sns.distplot(trial_metrics_df.loc[88]['monthly_sale'])
sns.distplot(trial_metrics_df.loc[237]['monthly_sale'])
plt.legend(labels=['88','237'])

In [None]:
sns.distplot(trial_metrics_df.loc[88]['monthly_custs'])
sns.distplot(trial_metrics_df.loc[237]['monthly_custs'])
plt.legend(labels=['88','237'])

We can see that the distribution of monthly sale and monthly customers of both the stores in the trial period is much different than the distribution of monthly sale and monthly customers in pre-trial period.

The results show that the trial store 88 is significantly different to its control store in the trial period as the trial store performance lies outside the 5% to 95% confidence interval of the control store in two of the three trial months.

We can also see that there is significant increase in sales of chips in trial stores in the trial period.

Conclusion

The results for trial stores 77 and 88 during the trial period show a significant difference in at least two of the three trial months but this is not the case for trial store 86. We can check with the client if the implementation of the trial was different in trial store 86 but overall, the trial shows a significant increase in sales.