In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from IPython.core.display import display, HTML
import ipywidgets as widgets
from IPython.display import display,clear_output
from ipywidgets import Output
from ipywidgets import TwoByTwoLayout
# Utils widgets
from ipywidgets import Button, Layout, jslink, IntText, IntSlider, Box, VBox

from scipy.stats import pearsonr
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
pio.templates
from PIL import Image
from IPython.display import Image as img
from plotly.offline import plot, iplot, init_notebook_mode
import random
pd.options.mode.chained_assignment = None  # default='warn'

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Load the three competition tables from the read-only Kaggle input directory
INPUT_DIR = "../input/h-and-m-personalized-fashion-recommendations"
article, customer, transaction = (
    pd.read_csv(INPUT_DIR + "/" + file_name)
    for file_name in ("articles.csv", "customers.csv", "transactions_train.csv")
)

# Content

This notebook is built to explore the dataset.

It is a work in progress, so please stay tuned.

* Null values of each dataset
* Article ranking based on different variables
* Histogram of customer status data broken down by age
* Purchase metrics change over the time (number of customers/total volume/number of articles purchased per day)
* How long does it take a customer to make the next purchase, on average?
* Average age of customer changes over the time
* Top 12 articles sold in the week before training dataset ending
* number of articles purchased by customer
* How many (and what %) of users purchased fewer than 12 articles?
* Number of daily transactions per sales channel


# Null values of each dataset

In [None]:
# articles.csv: number of missing values in each column
article.isna().sum(axis=0)

In [None]:
# customers.csv: number of missing values in each column
customer.isna().sum(axis=0)

In [None]:
# transactions_train.csv: number of missing values in each column
transaction.isna().sum(axis=0)

# Article ranking based on different variables
1. Choose a variable you are interested in (e.g. prod_name)
2. Choose number of rows of the result (e.g. 20 rows)
3. Click the button to see the ranking of the variable's values by number of articles

As an example, let's see the top 20 product names in terms of number of articles.

In [None]:
def number_article(v, row, articles=None):
    """Rank the values of one article attribute by their number of distinct articles.

    Parameters
    ----------
    v : str
        Name of the column to group by (e.g. ``'prod_name'``).
    row : int
        Maximum number of rows to return.
    articles : pandas.DataFrame, optional
        Frame containing at least the columns ``v`` and ``article_id``.
        When omitted, the notebook-level ``article`` table is used.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``v`` and ``nr_article`` (distinct ``article_id`` count),
        sorted by ``nr_article`` in descending order and cut to ``row`` rows.
    """
    source = article if articles is None else articles
    # Pick article_id before aggregating so nunique() runs on one column
    # instead of on every column of the frame.
    result = (
        source.groupby(v)['article_id']
        .nunique()
        .reset_index(name='nr_article')
        .sort_values(by='nr_article', ascending=False)
    )
    return result.head(row)
# Example: the 20 product names that cover the most distinct articles
number_article(v='prod_name', row=20)

#### You may need to fork this notebook to interact with the widget from your own end.

In [None]:
# Every article column except the identifier itself can be chosen for the ranking
article_column = [column for column in article.columns if column != 'article_id']

# Widgets: output area, trigger button, column selector and row-count slider
output = Output()
start = Button(description="Click me")
start.style.button_color = 'lightblue'

v_widget = widgets.Dropdown(
    options=article_column,
    value=article_column[1],  # pre-select the second selectable column
    description='variable',
    disabled=False,
)

row_widgets = widgets.IntSlider(
    value=20,
    min=0,
    max=100,
    step=5,
    description='row',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d',
)


def click_start(b):
    """Show the ranking for the currently selected column and row count.

    ``b`` is the argument supplied by ``Button.on_click``; it is not used.
    The previous result in the output area is cleared first.
    """
    with output:
        clear_output()
        # display() renders the DataFrame as a rich table instead of plain text
        display(number_article(v_widget.value,
                               row_widgets.value))


start.on_click(click_start)

display(v_widget,
        row_widgets,
        start,
        output)

# Histogram of customer status data broken down by age

In [None]:
# Give missing customer attributes an explicit value so they form their own
# category in the histograms:
#   club_member_status     -> 'None'
#   Active                 -> 0
#   FN                     -> 0
#   fashion_news_frequency -> 'nan'
customer = customer.fillna({
    'club_member_status': 'None',
    'Active': 0,
    'FN': 0,
    'fashion_news_frequency': 'nan',
})

# Age distribution, one colour per club_member_status value
fig = px.histogram(customer, x="age", color="club_member_status")
fig.update_layout(
    title_text='Histogram of customer age per club_member_status', # title of plot
    xaxis_title_text='Age', # xaxis label
    yaxis_title_text='Count', # yaxis label
    bargap=0.2, # gap between bars of adjacent location coordinates
    bargroupgap=0.1 # gap between bars of the same location coordinates
)
fig.show()

In [None]:
# Age distribution, one colour per Active value
# (missing Active values are filled with 0 in the cell that cleans `customer`)
fig = px.histogram(customer, x="age", color="Active")
fig.update_layout(
    title_text='Histogram of customer age per Active status', # title of plot
    xaxis_title_text='Age', # xaxis label
    yaxis_title_text='Count', # yaxis label
    bargap=0.2, # gap between bars of adjacent location coordinates
    bargroupgap=0.1 # gap between bars of the same location coordinates
)
fig.show()

In [None]:
# Age distribution, one colour per FN value
# (missing FN values are filled with 0 in the cell that cleans `customer`)
fig = px.histogram(customer, x="age", color="FN")
fig.update_layout(
    title_text='Histogram of customer age per FN status', # title of plot
    xaxis_title_text='Age', # xaxis label
    yaxis_title_text='Count', # yaxis label
    bargap=0.2, # gap between bars of adjacent location coordinates
    bargroupgap=0.1 # gap between bars of the same location coordinates
)
fig.show()

In [None]:
# Age distribution, one colour per fashion_news_frequency value
# (missing values are filled with 'nan' in the cell that cleans `customer`)
fig = px.histogram(customer, x="age", color="fashion_news_frequency")
fig.update_layout(
    title_text='Histogram of customer age per fashion_news_frequency', # title of plot
    xaxis_title_text='Age', # xaxis label
    yaxis_title_text='Count', # yaxis label
    bargap=0.2, # gap between bars of adjacent location coordinates
    bargroupgap=0.1 # gap between bars of the same location coordinates
)
fig.show()

# Purchase metrics change over the time

In [None]:
# Daily purchase metrics, one row per transaction date:
#   nr_customer       - distinct customers
#   nr_article        - distinct articles
#   nr_sales_channels - distinct sales channels
#   total_volume      - sum of price over all rows of that day
# Named aggregation touches only the needed columns, so sum() is never
# attempted on columns other than price (pandas >= 2.0 no longer drops
# non-numeric columns from groupby reductions by default).
transaction_aggr = (
    transaction.groupby('t_dat')
    .agg(
        nr_customer=('customer_id', 'nunique'),
        nr_article=('article_id', 'nunique'),
        nr_sales_channels=('sales_channel_id', 'nunique'),
        total_volume=('price', 'sum'),
    )
    .reset_index()
)

# plot the chart: one line per metric on a shared axis
fig = go.Figure()

variables = ['nr_customer','nr_article','nr_sales_channels','total_volume']

for v in variables:
    fig.add_trace(go.Scatter(mode="lines", x=transaction_aggr["t_dat"], y=transaction_aggr[v], name=v))

# range slider plus quick-select buttons for common time windows
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=1, label="1m", step="month", stepmode="backward"),
            dict(count=6, label="6m", step="month", stepmode="backward"),
            dict(count=1, label="YTD", step="year", stepmode="todate"),
            dict(count=1, label="1y", step="year", stepmode="backward"),
            dict(step="all")
        ])
    )
)

fig.update_layout(
    title_text='Purchase metrics change over the time' # title of plot
)

fig.show()

# How long does it take a customer to make the next purchase, on average?

As time goes on, customers take less time to make their next purchase. This trend is especially visible from April 2020 onwards, and I guess it is closely related to the pandemic, when people were more likely to purchase online.

In [None]:
# Work on an explicit copy so that adding columns below never writes into a
# slice of `transaction`.
transaction_date = transaction[['t_dat', 'customer_id']].copy()

# Date found on the same customer's following row.
# NOTE(review): this is the "next purchase" only if the rows of `transaction`
# are in chronological order - confirm.
transaction_date['shift_t_dat'] = transaction_date.groupby('customer_id')['t_dat'].shift(-1)

# Drop rows with nulls; this removes each customer's final row, which has no
# following purchase.
transaction_date = transaction_date.dropna()

# Time between a purchase and the following one
transaction_date['t_dat_dff'] = pd.to_datetime(transaction_date['shift_t_dat']) - pd.to_datetime(transaction_date['t_dat'])

# Zero-length gaps are purchases made on the same day; exclude them
transaction_date_remove_zero = transaction_date.loc[transaction_date['t_dat_dff'] != pd.Timedelta(0)]

# Average gap per purchase date. Only the timedelta column is aggregated so
# that mean() is never attempted on the string columns of the frame.
transaction_date_remove_zero_aggr = (
    transaction_date_remove_zero.groupby('t_dat')['t_dat_dff'].mean().reset_index()
)
# Keep the whole-day component of the average gap
transaction_date_remove_zero_aggr['t_dat_dff'] = pd.to_timedelta(transaction_date_remove_zero_aggr['t_dat_dff'], errors='coerce').dt.days

# plot the chart
fig = go.Figure()

fig.add_trace(go.Scatter(mode="lines", x=transaction_date_remove_zero_aggr["t_dat"], y=transaction_date_remove_zero_aggr['t_dat_dff'], name="Number of days since last purchase"))
fig.update_layout(
    title_text='Average number of days the next order is purchased on the same customer' # title of plot
)
fig.show()

# Average age of customer changes over the time

As time goes on, more younger customers come to buy.

In [None]:
# Attach every transaction to its customer record (customers are the left side)
customer_transaction = customer.merge(transaction, how='left', on='customer_id')

# One row per (day, customer) with that customer's age, so a customer who buys
# several articles on one day is counted once. Only `age` is aggregated:
# the merged frame also holds string columns (e.g. club_member_status) on
# which mean() is not defined.
customer_transaction_aggr = (
    customer_transaction.groupby(['t_dat', 'customer_id'])['age']
    .mean()
    .reset_index()[['age', 'customer_id', 't_dat']]
)

# Mean age of the customers who purchased on each day
age_transaction_aggr = (
    customer_transaction_aggr.groupby('t_dat')['age']
    .mean()
    .reset_index()[['age', 't_dat']]
)
age_transaction_aggr['t_dat'] = pd.to_datetime(age_transaction_aggr['t_dat'])

# plot the chart with an OLS trend line
fig = px.scatter(age_transaction_aggr,x="t_dat", y="age", trendline="ols",title="Average age of customer changes along the time")
fig.show()

# Top 12 articles sold in the week before training dataset ending

In time series prediction, the most recent observations usually have the most influence on the predicted outcomes.

Thus, let's take a look at the top 12 articles in the last week of training dataset.

In [None]:
# Number of trailing calendar days that make up the "last week"
LAST_WEEK_DAYS = 7

# Derive the window from the data instead of hard-coding the dates.
# NOTE(review): assumes t_dat holds ISO 'YYYY-MM-DD' strings, so that max()
# is the latest date and the formatted dates match the column - confirm.
last_day = pd.to_datetime(transaction['t_dat'].max())
last_week_dates = pd.date_range(end=last_day, periods=LAST_WEEK_DAYS).strftime('%Y-%m-%d').tolist()

# select data in the last week
transaction_last_week = transaction.loc[transaction['t_dat'].isin(last_week_dates)]

# get the top 12 articles sold in the last week (ranked by number of rows per article)
top_12_last_week = (
    transaction_last_week.groupby(['article_id'])
    .count()
    .reset_index()
    .sort_values(by='customer_id', ascending=False)
    .head(12)
)

# to get detail of these 12 articles
top_12_last_week_info = top_12_last_week.merge(article, how='inner', on='article_id')

# Select interested columns
top_12_last_week_info[['article_id','prod_name','product_type_name','product_group_name','colour_group_name','perceived_colour_value_name','department_name','index_name']]

# number of articles purchased by customer

In [None]:
# One row per customer; every other column holds its number of distinct values
transaction_customer_aggr = transaction.groupby('customer_id').nunique().reset_index()

# Distinct articles bought by each customer
articles_per_customer = transaction_customer_aggr['article_id']

# Summary statistics to report, as (label, value) pairs
summary_stats = [
    ("mean", articles_per_customer.mean()),
    ("median", articles_per_customer.median()),
    ("75th percentile", articles_per_customer.quantile(.75)),
    ("95th percentile", articles_per_customer.quantile(.95)),
    ("99th percentile", articles_per_customer.quantile(.99)),
]
for label, value in summary_stats:
    print("Number of articles purchased per customer - " + label + " " + str(round(value, 1)))

# How many (and what %) of users purchased fewer than 12 articles?

In [None]:
# Total customers, and those whose distinct-article count is below 12
total_customers = transaction_customer_aggr['customer_id'].nunique()
is_below_12 = transaction_customer_aggr['article_id'] < 12
customers_below_12 = transaction_customer_aggr.loc[is_below_12]['customer_id'].nunique()
share_below_12 = round(customers_below_12 * 100 / total_customers, 1)

print(str(total_customers) + " customers in total")
print("and " + str(customers_below_12) + " customers purchased articles fewer than 12.")
print(str(share_below_12) + " percentage of customers purchased fewer than 12 articles.")

# number of daily transactions per sales channel

In [None]:
# Distinct-value counts per (day, sales channel); the plot uses the
# customer_id column, i.e. the number of distinct customers.
transaction_sales = transaction.groupby(['t_dat','sales_channel_id']).nunique().reset_index()

# The title and axis labels name the plotted quantity (distinct customers),
# not articles.
fig = px.line(
    transaction_sales,
    x='t_dat',
    y='customer_id',
    color='sales_channel_id',
    title="Nr of unique customers per day per sales channel",
    labels={
        't_dat': 'Date',
        'customer_id': 'Unique customers',
        'sales_channel_id': 'Sales channel',
    },
)
fig.show()