In [None]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

# Data Preprocessing
### Articles

In [None]:
# Read the articles table, drop exact duplicate rows and downcast the id
# column to reduce memory usage. Chaining builds the frame in one step, so
# re-running the cell always starts from the raw file.
df_articles = (
    pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv")
    .drop_duplicates()
    .astype({'article_id': 'int32'})
)
# display a few rows
display(df_articles.head())
print()
# display information
# info() prints its report itself and returns None, so wrapping it in
# display() would only append a stray "None" to the cell output
df_articles.info()
print()
# missing information %
print("Missing values (%):")
print(df_articles.isna().sum() * 100 / len(df_articles))

In [None]:
# Report the number of distinct (non-null) values held by each column.
for column_name, n_distinct in df_articles.nunique().items():
    print(f"{column_name} :")
    print(f"  {n_distinct} distinct values")

### Customers

In [None]:
# read data and drop exact duplicate rows
df_customers = pd.read_csv(
    "/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv"
).drop_duplicates()
# reduce memory usage: keep only the last 16 hex characters of the customer id
# string, parsed as an integer, instead of the full string
df_customers = df_customers.assign(
    customer_id=df_customers['customer_id']
    .apply(lambda x: int(x[-16:], 16))
    .astype('int64')
)
# display a few rows
display(df_customers.head())
print()
# display information
# info() prints its report itself and returns None, so wrapping it in
# display() would only append a stray "None" to the cell output
df_customers.info()
print()
# missing information %
print("Missing values (%):")
print(df_customers.isna().sum() * 100 / len(df_customers))

### Transactions

In [None]:
# read data
df_transactions = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")
# reduce memory usage (same id encoding as used for the customers table, so
# the two frames can be matched on customer_id)
df_transactions['article_id'] = df_transactions['article_id'].astype('int32')
df_transactions['customer_id'] = (
    df_transactions['customer_id']
    .apply(lambda x: int(x[-16:], 16))
    .astype('int64')
)
# handle the date column: parse it and derive compact year/month/day columns
# (year is stored as an offset from 2000 so that it fits in int8)
df_transactions['t_dat'] = pd.to_datetime(df_transactions['t_dat'])
df_transactions['year'] = (df_transactions['t_dat'].dt.year - 2000).astype('int8')
df_transactions['month'] = df_transactions['t_dat'].dt.month.astype('int8')
df_transactions['day'] = df_transactions['t_dat'].dt.day.astype('int8')
# display a few rows
display(df_transactions.head())
print()
# display information
# info() prints its report itself and returns None, so wrapping it in
# display() would only append a stray "None" to the cell output
df_transactions.info()
print()
# missing information %
print("Missing values (%):")
print(df_transactions.isna().sum() * 100 / len(df_transactions))

The transaction data from period **20 Sept 2018** till **22 Sept 2020** were given.
Select full year data in **2019** for analysis.

In [None]:
# Keep only the transactions of 2019 (the 'year' column stores year - 2000).
is_2019 = df_transactions['year'].eq(19)
df_transactions = df_transactions[is_2019]

# Data Analysis
### Which articles are the bestsellers?

In [None]:
# Count the transaction rows per article, then order the articles by that
# count (read from the 'customer_id' column), largest first.
rows_per_article = df_transactions.groupby('article_id').count()
bestsellers_ranking = rows_per_article.sort_values(by='customer_id', ascending=False)
bestsellers_ranking.head(5)

The top bestseller article has an average sales of 29869/365 ~= 82 units per day. The bestseller ranked number five has an average sales of 12869/365 ~= 35 units per day, which is less than half of the sales of the top bestseller.

In [None]:
# add 2019 sales data to df_articles
# Number of transaction rows per article in the (2019-filtered) transactions.
article_sales = df_transactions.groupby('article_id').count()
# Map each article to its count in one vectorised lookup. Articles without any
# transaction are absent from article_sales and come back as NaN, which is
# turned into 0 explicitly (instead of hiding every possible error behind a
# bare `except`).
df_articles['total_sales_2019'] = (
    df_articles['article_id']
    .map(article_sales['customer_id'])
    .fillna(0)
    .astype('int64')
)
df_articles.head()

In [None]:
# Numeric code columns to remove from the articles table.
to_drop = ['product_type_no',
 'graphical_appearance_no',
 'colour_group_code',
 'perceived_colour_value_id',
 'perceived_colour_master_id',
 'department_no',
 'index_code',
 'index_group_no',
 'section_no',
 'garment_group_no']
# errors='ignore' keeps the cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df_articles = df_articles.drop(columns=to_drop, errors='ignore')

In [None]:
# Ten articles with the highest 2019 sales counts.
articles_by_sales = df_articles.sort_values(by='total_sales_2019', ascending=False)
articles_by_sales.head(10)

The top two bestsellers are denim `Trousers`, with black colour having more sales than light blue. 8 out of the top 10 bestsellers are `Black`.

From before, there are 105542 distinct `article_id` in the articles dataset. Hence we define the **top 1000** (~1%) articles with most sales as a **bestseller**. 

In [None]:
# Bestsellers: the 1000 articles with the highest 2019 sales counts.
ranked_by_sales = df_articles.sort_values(by='total_sales_2019', ascending=False)
bestsellers = ranked_by_sales.head(1000)

In [None]:
# Number of bestselling articles per colour group.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x="colour_group_name",
              data=bestsellers,
              palette="pastel",
              ax=ax)
ax.bar_label(ax.containers[0])  # annotate each bar with its count
ax.tick_params(axis='x', rotation=90)
ax.set_title('Color Group of Bestsellers on H&M')
# plt.show() takes no Axes argument; passing one only works by accident on
# the inline backend and fails on others
plt.show()

`Black` is indeed the most popular colour.

In [None]:
# Number of articles per garment group: first for the whole catalogue, then
# for the bestsellers only. Both figures are produced by the same code.
for articles, title in ((df_articles, 'All Articles on H&M'),
                        (bestsellers, 'Bestselling Articles on H&M')):
    fig, ax = plt.subplots(figsize=(15, 5))
    sns.countplot(x="garment_group_name",
                  data=articles,
                  palette="pastel",
                  ax=ax)
    ax.bar_label(ax.containers[0])  # annotate each bar with its count
    ax.tick_params(axis='x', rotation=90)
    ax.set_title(title)
    # plt.show() takes no Axes argument; passing one only works by accident
    # on the inline backend and fails on others
    plt.show()

H&M sells many different `Jersey Fancy` articles, but the three garment groups with the most bestsellers are `Swimwear`, `Jersey Basic`, and `Trousers`. Besides that, `Accessories` is the second-largest group among all articles, but it makes up only a small share of the bestsellers.

In [None]:
# Number of articles per index group: first for the whole catalogue, then for
# the bestsellers only. Both figures are produced by the same code.
for articles, title in ((df_articles, 'All Articles on H&M'),
                        (bestsellers, 'Bestselling Articles on H&M')):
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.countplot(x="index_group_name",
                  data=articles,
                  palette="pastel",
                  ax=ax)
    ax.bar_label(ax.containers[0])  # annotate each bar with its count
    ax.tick_params(axis='x', rotation=90)
    ax.set_title(title)
    # plt.show() takes no Axes argument; passing one only works by accident
    # on the inline backend and fails on others
    plt.show()

A quick look at the Divided Collection on the official H&M website shows that it is targeted towards feminine styles. We can see that most bestselling articles were designed for the female demographic. It is inferred that most customers that contribute to the sales of H&M articles belong to the female demographic. Besides that, although H&M produces a lot of Baby/Children articles, none of them made it to the bestsellers.

### Who contributes to the bestsellers? 
Let's see if there is a certain age group that contributes mostly to the sales of the bestselling articles.

In [None]:
# Transactions whose article is one of the bestsellers ...
is_bestseller_row = df_transactions['article_id'].isin(bestsellers['article_id'])
bestsellers_transactions = df_transactions[is_bestseller_row]
# ... and the customers who appear in at least one of those transactions.
bought_bestseller = df_customers['customer_id'].isin(bestsellers_transactions['customer_id'])
bestsellers_contributors = df_customers[bought_bestseller]
bestsellers_contributors.head()

In [None]:
# Age distribution of all customers, then of the customers who bought at
# least one bestseller.
for ages, title in ((df_customers['age'], 'Age of All Customers'),
                    (bestsellers_contributors['age'], 'Age of Customers Who Bought Bestsellers')):
    ax = sns.histplot(ages, kde=False)
    ax.set_title(title)
    # plt.show() takes no Axes argument; passing one only works by accident
    # on the inline backend and fails on others
    plt.show()

Comparing the age distribution of all customers and that of customers who bought the bestsellers, the distributions look similar. The bestsellers were catering to **all ages** of the customer base. This may be the reason why they became bestsellers.

### How are the bestsellers priced?
*NOTE: The price data is transformed from the original currency values, so the values do not mean the actual price in any currency. The data providers did not provide information on how the price data is transformed.*

In [None]:
# drop duplicates such that each article at each price point is only considered once
# A bare drop_duplicates() would only remove rows that are identical in every
# column (same customer, same day, ...), so the columns that define a
# "price point" are named explicitly.
df_transactions_single = df_transactions.drop_duplicates(subset=['article_id', 'price'])

# split the price points by whether the article is a bestseller
is_bestseller = df_transactions_single['article_id'].isin(bestsellers['article_id'])

# price
for prices, title in ((df_transactions_single['price'], 'Price of All Articles'),
                      (df_transactions_single.loc[~is_bestseller, 'price'], 'Price of Non-Bestselling Articles'),
                      (df_transactions_single.loc[is_bestseller, 'price'], 'Price of Bestselling Articles')):
    ax = sns.boxplot(x=prices)
    ax.set_title(title)
    # plt.show() takes no Axes argument; passing one only works by accident
    # on the inline backend and fails on others
    plt.show()


Duplicates of transactions are removed so that every article at each price point is only considered once in the analysis of price distribution. The price distribution of non-bestselling articles is similar to all transactions, that is positively skewed. Most bestselling articles are the relatively **cheaper** offerings. None of the more expensive items (`price` > 0.25) made it to the bestsellers.

### Which articles generate the most revenue? Are they the cheaper bestselling items or more expensive items?

Since the prices are not the actual currency values and no cost data is provided, we cannot calculate and analyse the profits, so we will stick to analysing revenue. The absolute values of the revenues do not mean anything; they are only used for comparison among articles.

In [None]:
# Revenue per article = sum of the (scaled) prices of its transactions.
# Only 'price' is aggregated: summing every column would also try to add up
# the datetime column t_dat (an error on recent pandas versions) and produce
# meaningless sums of ids that then have to be dropped again.
article_revenue = df_transactions.groupby('article_id')[['price']].sum()
article_revenue_ranking = (
    article_revenue
    .sort_values(by='price', ascending=False)
    .rename(columns={'price': 'total_revenue_2019'})
)
article_revenue_ranking.head(10)

The first two articles actually correspond to the top 2 bestsellers discovered earlier.

From before, there are 105542 distinct `article_id` in the articles dataset. Hence we define the **top 1000** (~1%) articles bringing in the most revenue as the **top_performers**. 

In [None]:
# add 2019 revenue data to df_articles
# Sum only the 'price' column; a blanket sum() would also try to add up the
# datetime column t_dat, which recent pandas versions reject.
article_revenue = df_transactions.groupby('article_id')[['price']].sum()
# Map each article to its revenue in one vectorised lookup. Articles without
# any transaction are absent from article_revenue and come back as NaN, which
# is turned into 0 explicitly (instead of hiding every possible error behind
# a bare `except`).
df_articles['total_revenue_2019'] = (
    df_articles['article_id']
    .map(article_revenue['price'])
    .fillna(0)
)
df_articles.head()

In [None]:
# Top performers: the 1000 articles with the highest 2019 revenue.
ranked_by_revenue = df_articles.sort_values(by='total_revenue_2019', ascending=False)
top_performers = ranked_by_revenue.head(1000)
# New column: 1 if article is bestseller AND top_performer, else 0
in_top_performers = df_articles['article_id'].isin(top_performers['article_id'])
in_bestsellers = df_articles['article_id'].isin(bestsellers['article_id'])
df_articles['bestseller_revenue'] = (in_top_performers & in_bestsellers).astype(int)

In [None]:
# Count of articles flagged as both bestseller and top performer.
n_both = df_articles['bestseller_revenue'].sum()
print(f"Number of articles that are both bestselling and top performing: {n_both}")

**58.7%** of the bestsellers are also top performing (i.e. they are among the top ~1% of articles by revenue).

In [None]:
# Number of top-performing (highest-revenue) articles per index group.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x="index_group_name",
              data=top_performers,
              palette="pastel",
              ax=ax)
ax.bar_label(ax.containers[0])  # annotate each bar with its count
ax.tick_params(axis='x', rotation=90)
ax.set_title('Top Performing Articles on H&M')
# plt.show() takes no Axes argument; passing one only works by accident on
# the inline backend and fails on others
plt.show()

Again, similar to the bestsellers, most customers that contribute to the revenue of H&M articles belong to the female demographic. Although H&M produces a lot of Baby/Children articles, none of them made it to the top performers. 

The distribution is similar to the bestsellers. Let's see if the sales and revenues are correlated.

In [None]:
# Correlation (pandas default method) between per-article sales counts and
# per-article revenue for 2019.
sales_2019 = df_articles['total_sales_2019']
revenue_2019 = df_articles['total_revenue_2019']
sales_2019.corr(revenue_2019)

Unsurprisingly, they are strongly correlated.

### How does each group of articles contribute to the revenue of H&M?
Let us look at the breakdown of revenue by `index_group_name`.

In [None]:
# Total 2019 revenue per index group, as a two-column frame.
index_group_revenue = (
    df_articles
    .groupby('index_group_name')['total_revenue_2019']
    .sum()
    .reset_index()
)
index_group_revenue

In [None]:
# Share of the 2019 revenue contributed by each index group.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(x=index_group_revenue['total_revenue_2019'],
       labels=index_group_revenue['index_group_name'],
       colors=sns.color_palette('pastel'),
       autopct='%1.1f%%')
ax.set_title('Revenue Contribution by Index Group')
plt.show()

Ladieswear and Divided, which are both feminine styles, contribute to **88.5%** of revenue in 2019.

### Focusing on the #1 bestseller / top performer, how do its price and sales vary along time?

In [None]:
# Description text of article 706016001 (first matching row).
is_num1_article = df_articles['article_id'] == 706016001
df_articles.loc[is_num1_article, 'detail_desc'].iloc[0]

In [None]:
# All 2019 transactions of article 706016001; show the last few rows.
is_num1_row = df_transactions['article_id'].eq(706016001)
num1_transactions = df_transactions[is_num1_row]
num1_transactions.tail()

Interestingly, the price of the same article on the same day could be different.

We calculate the daily average price for further analysis.

In [None]:
# Group the article's transactions by day once and reuse the grouping:
# column means per day, plus the number of transaction rows per day.
transactions_by_day = num1_transactions.groupby(['t_dat'])
num1_day = transactions_by_day.mean()
num1_day['daily_sales'] = transactions_by_day.count()['customer_id']
num1_day

In [None]:
article_id_of_interest = 706016001

# Widen the default figure size so the year-long x-axis stays readable.
# This changes the matplotlib default for every figure created afterwards.
plt.rc('figure', figsize=(25, 8))

price_colour = '#2ca02c'
sales_colour = '#17becf'
daily_price = num1_day['price']
daily_sales = num1_day['daily_sales']

# left axis: daily average price as a line
fig, ax1 = plt.subplots()
ax1.plot(num1_day.index, daily_price, color=price_colour)
ax1.set_ylabel('Daily Average Price (scaled)', color=price_colour)
ax1.tick_params('y', colors=price_colour)
ax1.set_ylim(bottom=max(daily_price.min() - daily_price.mean(), 0))

# right axis: daily sales as bars; the extra headroom of one mean keeps the
# tallest bar away from the top edge
ax2 = ax1.twinx()
ax2.bar(num1_day.index, daily_sales, color=sales_colour)
ax2.set_ylabel('Daily Sales', color=sales_colour)
ax2.tick_params('y', colors=sales_colour)
ax2.set_ylim(top=daily_sales.max() + daily_sales.mean())

plt.title(f'Daily Sales and Prices of article {article_id_of_interest}')
plt.show()

In [None]:
# The five days with the highest sales of the article.
days_by_sales = num1_day.sort_values(by='daily_sales', ascending=False)
days_by_sales.head()

We can see that price dips in Autumn and Winter result in strong peaks in the daily sales. Price dips in other months were less impactful on the daily sales. The lowest price dip occurred during Black Friday, which had significant sales.

In [None]:
# Daily average price against daily sales, one point per day.
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(data=num1_day, x='price', y='daily_sales', ax=ax)
plt.show()

There are very few data points at the lower prices, but it is clear that lower prices generate more sales when price < 0.03.

### What about other articles? How are their sales and price trends different or similar to the #1 bestseller?

In [None]:
import os
import matplotlib.image as mpimg

def view_article_trend(article_id_of_interest):
    """Plot the price and sales behaviour of one article and print its description.

    Reads the module-level ``df_transactions`` and ``df_articles`` frames and
    produces two figures:

    1. a scatter plot of daily average price against daily sales, next to the
       article's product image when the image file exists;
    2. a dual-axis time series with the daily average price as a line and the
       daily sales as bars.

    Finally the article's ``detail_desc`` text is printed.

    Parameters
    ----------
    article_id_of_interest : int
        ``article_id`` of the article to inspect.

    Returns
    -------
    None
        If the article has no rows in ``df_transactions`` a message is printed
        and nothing is plotted.
    """
    # find image of article
    file_path = '/kaggle/input/h-and-m-personalized-fashion-recommendations/images/0{}/0{}.jpg'.format(str(article_id_of_interest)[:2],article_id_of_interest)
    img = mpimg.imread(file_path) if os.path.exists(file_path) else None

    # calculate the daily sales and average prices
    # Filter to the article first so only its rows are aggregated, rather than
    # grouping the whole transaction table twice and discarding the rest.
    article_transactions = df_transactions[df_transactions['article_id'] == article_id_of_interest]
    if article_transactions.empty:
        # Without data the axis limits below would be NaN and matplotlib
        # would raise, so stop early with a clear message.
        print('No transactions found for article {}.'.format(article_id_of_interest))
        return

    transactions_by_day = article_transactions.groupby('t_dat')
    article_daily_sales = pd.DataFrame({
        'daily_sales': transactions_by_day['article_id'].count(),
        'avg_price': transactions_by_day['price'].mean(),
    }).reset_index()

    # plot daily price vs sales
    fig, (ax_scatter, ax_image) = plt.subplots(1, 2)
    sns.scatterplot(x='avg_price', y='daily_sales', data=article_daily_sales, ax=ax_scatter)
    ax_scatter.title.set_text('Price vs Sales of article {}'.format(article_id_of_interest))
    # display image of article
    if img is not None:
        ax_image.imshow(img)
        ax_image.title.set_text('Image of article {}'.format(article_id_of_interest))
    else:
        ax_image.title.set_text('No image found.')
    # show the first figure in both cases, not only when the image is missing
    plt.show()

    # plot temporal change in price and sales
    fig, ax_price = plt.subplots()
    ax_price.plot(article_daily_sales['t_dat'], article_daily_sales['avg_price'], color='#2ca02c')
    ax_price.set_ylabel('Daily Average Price (scaled)', color='#2ca02c')
    ax_price.tick_params('y', colors='#2ca02c')
    ax_price.set_ylim(bottom=max(article_daily_sales['avg_price'].min()-article_daily_sales['avg_price'].mean(), 0))
    ax_sales = ax_price.twinx()
    ax_sales.bar(article_daily_sales['t_dat'], article_daily_sales['daily_sales'], color='#17becf')
    ax_sales.set_ylabel('Daily Sales', color='#17becf')
    ax_sales.tick_params('y', colors='#17becf')
    # headroom of one mean keeps the tallest bar away from the top edge
    ax_sales.set_ylim(top=article_daily_sales['daily_sales'].max()+article_daily_sales['daily_sales'].mean())
    plt.title('Daily Sales and Prices of article {}'.format(article_id_of_interest))
    plt.show()

    print('Article {}:'.format(article_id_of_interest))
    descriptions = df_articles.loc[df_articles['article_id'] == article_id_of_interest, 'detail_desc']
    if descriptions.empty:
        # the article id is not present in the articles table
        print('No description found.')
    else:
        print(descriptions.iloc[0])

In [None]:
# [Insert article_id that we are interested in exploring]
article_to_explore = 108775015
view_article_trend(article_to_explore)

For article 108775015, 
- From the scatterplot we can see that reducing its price does not increase its sales. 
- Looking at the daily trends, firstly we see that sales were higher in the first half of the year. The price dips in February till May cause spikes in sales. 
- Price dips occurred most significantly in the later part of the year, which is the winter season in the Northern Hemisphere. Looking at the image of the article, it is thus not surprising that sales were lower when the weather is cold.
- One thing unusual is the low sales during summer (mid-May till August). Further data regarding marketing strategy, actual location of buyers (instead of encoded zip codes that do not give any geographical insights) may be helpful.

# Next Steps
Some directions that I hope to work on moving forward:
1. Frequent Pattern Mining - "Customers who bought article A frequently also bought ... "
2. Recommender System - Collaborative Filtering, Content-Based Recommendation