In [None]:
# pip install needed packages
!pip install chart_studio

# Import the needed packages
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import chart_studio.plotly as py
from plotly.subplots import make_subplots
import cufflinks as cf
%matplotlib inline

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Initialise Plotly's notebook mode, then switch cufflinks to offline mode.
# NOTE(review): presumably connected=True loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the Plotly offline docs.
init_notebook_mode(connected=True)
cf.go_offline()

In [None]:
# Read the two CSV inputs of the analysis: the product table and the
# transaction table.
product_csv_path = '/kaggle/input/dunnhumby-the-complete-journey/product.csv'
transactions_csv_path = '/kaggle/input/dunnhumby-the-complete-journey/transaction_data.csv'

product = pd.read_csv(product_csv_path)
transaction_data = pd.read_csv(transactions_csv_path)

## I. Data Transformations and Cleaning

First, we examine the dataset. The transactions dataset contains 2.6M transactions, with 12 fields.

In [None]:
# Preview the first rows of the raw transactions table.
transaction_data.head()

In [None]:
# (number of rows, number of columns) of the raw transactions table.
transaction_data.shape

The next step is to perform data transformations to prepare for all the subsequent plotting. 

In [None]:
# Create daily aggregate data for product trends.
# Keep only the columns needed downstream and attach each product's commodity
# description. The join is an explicit inner join, so transactions whose
# PRODUCT_ID has no row in `product` are dropped.
transaction_data = transaction_data[
    ['household_key', 'DAY', 'PRODUCT_ID', 'QUANTITY', 'SALES_VALUE']
].merge(product[['PRODUCT_ID', 'COMMODITY_DESC']], on='PRODUCT_ID', how='inner')

# Remove blank commodities as well as 'COUPON/MISC ITEMS' and '(CORP USE ONLY)',
# since they are not helpful in the analysis.
# Empty CSV cells are loaded as NaN by read_csv (not as ''), and blanks may be
# any run of whitespace, so the column is normalised (NaN -> '', whitespace
# stripped) before the comparison instead of matching '' / ' ' literally.
transaction_data = transaction_data[
    ~transaction_data['COMMODITY_DESC']
        .fillna('')
        .str.strip()
        .isin(['', 'COUPON/MISC ITEMS', '(CORP USE ONLY)'])
]

# Daily summary per commodity: total sales, total quantity, and the number of
# distinct households buying. The string 'nunique' selects the built-in groupby
# reduction rather than calling a Python function once per group.
daily_sales = (
    transaction_data
    .groupby(['COMMODITY_DESC', 'DAY'])
    .agg({'SALES_VALUE': 'sum', 'QUANTITY': 'sum', 'household_key': 'nunique'})
    .rename(columns={'household_key': 'HOUSEHOLDS'})
    .reset_index()
)

daily_sales.head()

## II. Plotly Charts

### Chart 1: BAR CHART (Top 5 Commodities in terms of Sales)

In [None]:
# Total sales per commodity over the whole period, reduced to the five largest
# and finally ordered from smallest to largest.
df_top5 = (
    daily_sales
    .groupby('COMMODITY_DESC')[['SALES_VALUE']]
    .sum()
    .reset_index()
    .loc[lambda totals: totals['COMMODITY_DESC'] != '']
    .sort_values(by='SALES_VALUE', ascending=False)
    .head(5)
    .sort_values(by='SALES_VALUE')
)

In [None]:
# Bar chart with Plotly Express: sales on the x-axis, commodity on the y-axis.
# The frame and its column names are passed (instead of bare Series) so that
# the `labels` mapping can be keyed by column name, which is the form Plotly
# Express documents; keys such as 'x'/'y' only work through its fallback naming
# of array-like inputs.
fig = px.bar(
    df_top5,
    x='SALES_VALUE',
    y='COMMODITY_DESC',
    labels={
        'COMMODITY_DESC': 'Commodities',
        'SALES_VALUE': 'Sales',
    },
    title='Top 5 Commodities by Sales',
    template='simple_white',
)
fig.show()

**Interpretation:** Soft drinks are the top commodity in terms of sales, with 328K dollars over the time period covered. Beef comes next, with 312K dollars in sales over the same period.

### Chart 2: SIDE BY SIDE BAR CHARTS (Top 5 Commodities in terms of Sales, and Top 5 in terms of Quantity)

In [None]:
# Total quantity per commodity over the whole period, reduced to the five
# largest (ordered from largest to smallest).
df_top5_quantity = (
    daily_sales
    .groupby('COMMODITY_DESC')[['QUANTITY']]
    .sum()
    .reset_index()
    .loc[lambda totals: totals['COMMODITY_DESC'] != '']
    .sort_values(by='QUANTITY', ascending=False)
    .head(5)
)

# Put the sales top 5 in the same largest-to-smallest order.
df_top5 = df_top5.sort_values(by='SALES_VALUE', ascending=False).head(5)

In [None]:
# One row, two columns: sales in the first panel, quantity in the second.
fig = make_subplots(rows=1, cols=2, subplot_titles=("Sales $", "Quantity"))

# One bar trace per metric, each built from its own top-5 frame.
trace1 = go.Bar(name='Sales $', x=df_top5.COMMODITY_DESC, y=df_top5.SALES_VALUE)
trace2 = go.Bar(name='Quantity', x=df_top5_quantity.COMMODITY_DESC, y=df_top5_quantity.QUANTITY)

# Place each trace in its panel.
fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=1, col=2)

fig.update_layout(
    title_text="Top 5 Commodities in Sales and Quantity",
    template='simple_white',
    width=800,
    height=600,
)

fig.show()

**Interpretation:** Fuels and Breads enter the top 5 when quantity is considered, while Beef and Meat Dinners drop out of it. Meat and Beef are high-value purchases, which makes them top commodities sales-wise. Meanwhile, Soft drinks remain the top commodity whether the value or the volume of purchases is considered.

### Chart 3: LINE CHARTS (Daily Sales for the Top 5 Commodities)

In [None]:
# Prepare dataset: daily sales restricted to the commodities in the sales top 5.
list_top5_sales = df_top5['COMMODITY_DESC']

daily_sales_top5 = (
    daily_sales
    .loc[daily_sales['COMMODITY_DESC'].isin(list_top5_sales),
         ['COMMODITY_DESC', 'SALES_VALUE', 'DAY']]
    .groupby(['COMMODITY_DESC', 'DAY'])
    .sum()
    .reset_index()
)

# Plot one line per commodity. Columns are referenced by name (rather than by
# passing Series taken from the same frame) so the axis and legend labels are
# always resolved from the frame's columns; `labels` then gives them readable
# names.
fig = px.line(
    daily_sales_top5,
    x='DAY',
    y='SALES_VALUE',
    color='COMMODITY_DESC',
    labels={
        'DAY': 'Day',
        'SALES_VALUE': 'Daily Sales',
        'COMMODITY_DESC': 'Commodities',
    },
    title='Daily Sales for the Top 5 Commodities',
    template='simple_white',
)

fig.show()

It's very difficult to see the patterns this way; computing moving averages may help to visualize the data better.

### Chart 4: BOX PLOTS (Distribution of the Top 5 Commodities)

In [None]:
# Box plot of the daily sales values, one box per top-5 commodity.
# Columns are referenced by name (rather than by passing Series taken from the
# same frame) so the axis and legend labels are always resolved from the
# frame's columns; `labels` then gives them readable names.
fig = px.box(
    daily_sales_top5,
    x='COMMODITY_DESC',
    y='SALES_VALUE',
    color='COMMODITY_DESC',
    labels={
        'COMMODITY_DESC': 'Commodities',
        'SALES_VALUE': 'Daily Sales',
    },
    title='Sales Distribution for the Top 5 Commodities',
    template='simple_white',
)
fig.show()

**Interpretation:** Soft drinks appear to have the largest variation, with a number of outliers pulling up the average value. Beef also shows a large variation, with some outliers present.

### Chart 5: BUBBLE CHART (Sales, Quantity, and Number of Households Buying)  

In [None]:
# Per-commodity summary over the whole period: total sales, total quantity,
# and the number of distinct households buying. The string 'nunique' selects
# the built-in groupby reduction rather than calling a Python function once
# per group.
sales_agg = (
    transaction_data
    .groupby(['COMMODITY_DESC'])
    .agg({'SALES_VALUE': 'sum', 'QUANTITY': 'sum', 'household_key': 'nunique'})
    .rename(columns={'household_key': 'HOUSEHOLDS'})
    .reset_index()
)

In [None]:
# Bubble chart: quantity vs. sales value per commodity, with the bubble size
# taken from the number of households.
fig = px.scatter(
    sales_agg,
    x='QUANTITY',
    y='SALES_VALUE',
    color='COMMODITY_DESC',
    size='HOUSEHOLDS',
    size_max=60,
    hover_name='COMMODITY_DESC',
    template='simple_white',
    title='Sales, Volume, and Household Counts for Various Consumer Commodities',
)

# Hide the legend and explain the bubble sizes in a note positioned in paper
# coordinates (no arrow).
fig.update_layout(showlegend=False)
fig.add_annotation(
    text='Sizes of the bubbles represent the number of households buying the product',
    showarrow=False,
    xref='paper',
    yref='paper',
    x=-0.02,
    y=1.11,
)
fig.show()

**Interpretation:** Soft drinks pull away from the other categories in terms of quantity, although they are almost on par with Beef in terms of sales value. Soft drinks are highly volume-driven: they sell well because of the sheer number of purchases, while Beef is clearly a price-driven commodity.