In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import plotly.express as px

# Render floats with thousands separators (e.g. 1234567.5 -> "1,234,567.5") in pandas output.
pd.set_option('display.float_format', '{:,}'.format)

In [None]:
# Load the transaction, product and household-demographic tables.
df_t = pd.read_csv("../input/dunnhumby-the-complete-journey/transaction_data.csv")
df_p = pd.read_csv("../input/dunnhumby-the-complete-journey/product.csv")
df_d = pd.read_csv("../input/dunnhumby-the-complete-journey/hh_demographic.csv")

# Inner join: keep only transactions whose PRODUCT_ID exists in the product table.
# Left join: keep every remaining row even when the household has no demographic record
# (those rows get NaN in the demographic columns).
df = (
    df_t
    .merge(df_p, on='PRODUCT_ID', how='inner')
    .merge(df_d, on='household_key', how='left')
)

In [None]:
# Convert STORE_ID to strings — presumably so store IDs are treated as labels
# rather than numeric quantities in later grouping/plotting (confirm intent).
df['STORE_ID'] = df['STORE_ID'].astype(str)

* Most of the data contains no null values.
* The categorical data added from hh_demographic shows 45% missing.

In [None]:
# Fraction of missing values per column: null count divided by the total number of rows.
df.isna().sum().div(len(df))

* Even though we have more than 52 weeks of data, we see hardly any seasonality. We observe strong growth up until week 20 and then a relatively flat trend as we move forward. 
* I suspect this has to do with increasing the number of stores.

In [None]:
# Total sales per week, ordered by week number.
df_sales_by_week = (
    df.groupby(['WEEK_NO'])['SALES_VALUE']
    .sum()
    .reset_index()
    .sort_values(by='WEEK_NO', ascending=True)
)

# 4-week trailing moving average; the first three weeks are NaN because the window is incomplete.
df_sales_by_week['SALES_VALUE_4_WEEK_MA'] = (
    df_sales_by_week['SALES_VALUE'].rolling(window=4).mean()
)

# Plot the raw weekly series together with the smoothed series.
px.line(
    df_sales_by_week,
    x='WEEK_NO',
    y=['SALES_VALUE', 'SALES_VALUE_4_WEEK_MA'],
    title='Sales by week number',
    color_discrete_sequence=['#3969b1', '#7a797a'],
)

* As I suspected, we see a large week-over-week increase in the number of stores until week 16, when the number of stores begins to decline.

In [None]:
# Number of distinct stores that appear in the transactions of each week.
df_store_cnt = (
    df.groupby(['WEEK_NO'])['STORE_ID']
    .nunique()
    .reset_index(name='UNIQUE_COUNT')
)

fig = px.line(
    df_store_cnt,
    x='WEEK_NO',
    y='UNIQUE_COUNT',
    title='Stores with sales per week',
    width=1350,
    color_discrete_sequence=['#3969b1'],
)

# Overlay a small red circle as a visual marker. The coordinates are hard-coded
# (x 15.9-16.2, y 158-159) — presumably the week-16 peak; confirm against the data.
fig.add_shape(
    type="circle",
    x0=15.9,
    y0=158,
    x1=16.2,
    y1=159,
    line={"color": "Red", "width": 4},
)

Next, I'm interested to see sales / store by week to look for any obvious trends. Stores flatlined and our efficiency of sales per week held fairly flat. I would argue this is a great metric to track for red flags in the future.

In [None]:
# Attach the weekly store count to the weekly sales totals.
df_sales_per_store = df_sales_by_week.merge(df_store_cnt, on='WEEK_NO', how='left')

# Average sales per active store for each week, rounded to one decimal place.
df_sales_per_store['SALES_PER_STORE'] = (
    df_sales_per_store['SALES_VALUE']
    .div(df_sales_per_store['UNIQUE_COUNT'])
    .round(1)
)

px.line(
    df_sales_per_store,
    x='WEEK_NO',
    y='SALES_PER_STORE',
    title='Sales per store by week',
    width=1350,
    color_discrete_sequence=['#3969b1'],
)

### What demographics are driving sales?

Breaking down the top 100 commodities by household size and composition. At first glance, **2 adults no kids** make up the majority of revenue. Scaling this down to a "standardized family size", Females and couples seem to be the driver.

*Please note: I did not attempt to account for children as it is difficult to come to a fair multiplier*


In [None]:
# Sales by household composition, shown raw and scaled, one facet per variant.
#
# The frame is built inside this cell so it runs on a fresh kernel: the original
# cell read a variable that is only created further down the notebook, was missing
# a comma after the `title` argument (SyntaxError), and faceted on a 'SCALED'
# column that was never created.
df_hh_comp_preview = (
    df.groupby('HH_COMP_DESC')['SALES_VALUE']
    .sum()
    .reset_index()
    .sort_values(by='SALES_VALUE', ascending=False)
)

# Halve the sales of compositions whose label starts with "2"; leave the others unchanged.
df_hh_comp_preview['SALES_VALUE_SCALED'] = np.where(
    df_hh_comp_preview['HH_COMP_DESC'].str[0] == '2',
    df_hh_comp_preview['SALES_VALUE'] / 2,
    df_hh_comp_preview['SALES_VALUE'],
)

# Long format: one row per (composition, raw-or-scaled) pair, flagged in the SCALED column.
df_hh_comp_preview = df_hh_comp_preview.melt(
    id_vars=['HH_COMP_DESC'],
    var_name='SCALED',
    value_name='SALES_VALUE',
)
df_hh_comp_preview['SCALED'] = df_hh_comp_preview['SCALED'].map(
    {'SALES_VALUE': 'FALSE', 'SALES_VALUE_SCALED': 'TRUE'}
)

px.bar(df_hh_comp_preview,
       x = 'SALES_VALUE',
       y = 'HH_COMP_DESC',
       orientation = 'h',
       width=600,
       height=400,
       title = 'Sales by Household Comp',
       facet_col = 'SCALED')



In [None]:
# Top 100 (commodity, household size, household composition) combinations by total sales.
df_commodity = (
    df.groupby(['COMMODITY_DESC', 'HOUSEHOLD_SIZE_DESC', 'HH_COMP_DESC'])['SALES_VALUE']
    .sum()
    .reset_index()
    .sort_values(by='SALES_VALUE', ascending=False)
    .iloc[:100, :]
)

# Total sales per household composition, largest first.
df_hh_comp_size = (
    df.groupby('HH_COMP_DESC')['SALES_VALUE']
    .sum()
    .reset_index()
    .sort_values(by='SALES_VALUE', ascending=False)
)

# Calculate scaled values: compositions whose label starts with "2" are halved,
# all other compositions keep their raw total.
df_hh_comp_size['SALES_VALUE_SCALED'] = np.where(
    df_hh_comp_size['HH_COMP_DESC'].str[0] == '2',
    df_hh_comp_size['SALES_VALUE'] / 2,
    df_hh_comp_size['SALES_VALUE'],
)

# Long format: one row per (composition, raw-or-scaled) pair.
df_hh_comp_size = df_hh_comp_size.melt(
    id_vars=['HH_COMP_DESC'],
    var_name='IS_SCALED',
    value_name='SALES_VALUE',
)

# Clean up scaled rows: turn the melted column names into TRUE/FALSE flags.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# column selection, which does not update the frame under pandas Copy-on-Write.
df_hh_comp_size['IS_SCALED'] = df_hh_comp_size['IS_SCALED'].replace(
    {'SALES_VALUE_SCALED': 'TRUE', 'SALES_VALUE': 'FALSE'}
)

px.bar(df_hh_comp_size,
       x = 'SALES_VALUE',
       y = 'HH_COMP_DESC',
       orientation = 'h',
       width=800,
       height=400,
       title = 'Sales by Household Comp',
       facet_col = 'IS_SCALED')

Looking at the purchase descriptions, we can observe that "coupon/misc" is driving the majority of sales. I wonder what this is?

In [None]:
# Bar chart of the top commodity rows: one panel per household size
# (panels ordered 1 through 5+), coloured by household composition.
px.bar(
    df_commodity,
    x='SALES_VALUE',
    y='COMMODITY_DESC',
    color='HH_COMP_DESC',
    facet_col='HOUSEHOLD_SIZE_DESC',
    category_orders={'HOUSEHOLD_SIZE_DESC': ['1', '2', '3', '4', '5+']},
)

* A majority of the **misc spend** is on GASOLINE-REG UNLEADED.

In [None]:
# Sales of COUPON/MISC ITEMS for households of size 2, broken down by sub-commodity.
# Sorting happens on the unrounded totals; rounding to whole units is applied afterwards.
df_misc = (
    df.query("HOUSEHOLD_SIZE_DESC == '2' & COMMODITY_DESC == 'COUPON/MISC ITEMS'")
    .groupby('SUB_COMMODITY_DESC')['SALES_VALUE']
    .sum()
    .reset_index()
    .sort_values(by='SALES_VALUE', ascending=False)
    .assign(SALES_VALUE=lambda totals: totals['SALES_VALUE'].round(0))
)

In [None]:
# Show the ten largest sub-commodities (the frame is already sorted by sales, descending).
df_misc.iloc[:10]

Once again, the driving factor for sales is gas.

In [None]:
# Sales of COUPON/MISC ITEMS per household composition and sub-commodity, largest first.
df_hh_gas = (
    df.query("COMMODITY_DESC == 'COUPON/MISC ITEMS'")
    .groupby(['HH_COMP_DESC', 'SUB_COMMODITY_DESC'])['SALES_VALUE']
    .sum()
    .reset_index()
    .sort_values(by='SALES_VALUE', ascending=False)
)

In [None]:
# Sales per COUPON/MISC sub-commodity, coloured by household composition.
px.bar(
    df_hh_gas,
    x='SALES_VALUE',
    y='SUB_COMMODITY_DESC',
    color='HH_COMP_DESC',
    width=1000,
)

### Discounts

What is the most discounted item? 

1. Soft Drinks
2. Milk
3. Meat
4. Cheese
5. Gas
6. Frozen Dinners

In [None]:
# Total retail discount per sub-commodity.
#
# `ascending` must be a real boolean. The original passed the string 'FALSE', which
# is truthy (so older pandas sorted ascending) and is rejected with a ValueError by
# recent pandas. The effective ascending order is kept here.
# NOTE(review): ascending order only puts the most-discounted products first if
# RETAIL_DISC is stored as negative amounts — confirm against the data.
df_discount = (
    df.groupby(['SUB_COMMODITY_DESC'])['RETAIL_DISC']
    .sum()
    .reset_index()
    .sort_values(by='RETAIL_DISC', ascending=True)
)

# Plot the first ten rows of the sorted frame.
px.bar(df_discount.iloc[:10,:],
           x = 'RETAIL_DISC',
           y = 'SUB_COMMODITY_DESC',
           title = 'What are the top 10 discounted products?'
    )