# Overview

This is such a fun dataset for exploration! It has images, product details, and transaction history. I made these visualization snippets to get more familiar with the collection of articles / items from H&M, to get a feel for what the products are, and perhaps to get a glimpse of the customer profile.

Big thanks to the following notebook(s) that gave a lot of ideas and implementation steps for the snippets:
- https://www.kaggle.com/vanguarde/h-m-eda-first-look

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
import plotly.express as px
import matplotlib.image as mpimg

import warnings 
warnings.filterwarnings('ignore')

# Overview of Data

In [None]:
# Read the three competition CSV files into DataFrames, in the order
# articles, customers, transactions.
articles, customers, transactions = (
    pd.read_csv(csvPath)
    for csvPath in (
        "../input/h-and-m-personalized-fashion-recommendations/articles.csv",
        "../input/h-and-m-personalized-fashion-recommendations/customers.csv",
        "../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    )
)

In [None]:
# Print the column names of the articles table, then preview its first rows.
print(articles.columns)
articles.head()

In [None]:
# Print the column names of the customers table, then preview its first rows.
print(customers.columns)
customers.head()

In [None]:
# Print the column names of the transactions table, then preview its first rows.
print(transactions.columns)
transactions.head()

In [None]:
# Draw a random subset of the transactions for the volume statistics below.
# The fixed seed makes the sample (and every chart derived from it) identical
# across notebook runs; the cap avoids an error if the table has fewer rows
# than N_SAMPLE.
N_SAMPLE = 1000000
transactionsSample = transactions.sample(n=min(N_SAMPLE, len(transactions)), random_state=42)

In [None]:
# Count the sampled transactions of each article and order them from the
# most-sold article downwards.
volumeByArticle = transactionsSample.groupby('article_id')['t_dat'].count()
volumeByArticle = volumeByArticle.sort_values(ascending=False)
# Naming the series 'volume' yields the columns ['article_id', 'volume'].
article_volume = volumeByArticle.rename('volume').reset_index()
article_volume.head()

In [None]:
# Left-join the sampled volume onto the article table: every article is kept,
# and articles that never appear in the sample get a NaN volume.
articles_withVolume = articles.merge(article_volume, how='left', on='article_id')
articles_withVolume.head(2)

# Treemap to get better feel of different product categories and how large they are
Compare the treemap sized by the number of articles with the treemaps sized by sales volume.

In [None]:
# Treemap in which every article weighs the same, so the area of a category
# reflects how many articles it contains.
articles['ones'] = 1.0  # constant weight: summing it counts the rows
fig = px.treemap(articles, path=['index_group_name','index_name','product_group_name', 'product_type_name'],
                values='ones', title='Tree Map based on Article ID')
# Bind and show the figure explicitly, like the other treemap cells do.
fig.show()

In [None]:
# Treemap sized by sampled sales volume, nested as
# index group -> index -> product group -> product type.
fig = px.treemap(
    data_frame=articles_withVolume,
    path=['index_group_name', 'index_name', 'product_group_name', 'product_type_name'],
    values='volume',
    title='Tree Map based on Sales Volume',
)
fig.show()

In [None]:
# Same volume-weighted treemap, but with the section name as the second
# level instead of the index name.
fig = px.treemap(
    data_frame=articles_withVolume,
    path=['index_group_name', 'section_name', 'product_group_name', 'product_type_name'],
    values='volume',
    title='Tree Map based on Sales Volume',
)
fig.show()

# Functions

### Function: Given an article_id, get the volume and price history (weekly)

In [None]:
def getArticlePriceHistory(article_id, transactions_df=None):
    """Build the weekly price and volume history of one article.

    Parameters
    ----------
    article_id
        Value compared against the ``article_id`` column.
    transactions_df : pandas.DataFrame, optional
        Transactions with ``article_id``, ``t_dat`` and ``price`` columns.
        When omitted, the notebook-level ``transactions`` frame is used.

    Returns
    -------
    pandas.DataFrame
        One row per weekly bin of ``t_dat`` (``freq='W'``) with the columns
        ``price_avg`` and ``price_std`` (mean / standard deviation of
        ``price * 1000``; the deviation is 0 where it is undefined),
        ``volume`` (number of transactions) and ``lower`` / ``upper``
        (``price_avg`` minus / plus two ``price_std``).
    """
    if transactions_df is None:
        transactions_df = transactions

    # Work on an explicit copy of the matching rows: adding columns to a
    # filtered slice triggers pandas' SettingWithCopyWarning, and the copy
    # guarantees the source table is never modified.
    isArticle = transactions_df.article_id == article_id
    dfTrxArticle = transactions_df.loc[isArticle, ['t_dat', 'price']].copy()
    dfTrxArticle['priceK'] = dfTrxArticle['price'] * 1000
    dfTrxArticle['t_dat'] = pd.to_datetime(dfTrxArticle['t_dat'])

    # One pass over the weekly groups instead of three separate group-bys.
    weekly = (
        dfTrxArticle
        .groupby(pd.Grouper(key='t_dat', freq='W'))['priceK']
        .agg(['mean', 'std', 'count'])
    )

    dfArticlePriceHistory = pd.DataFrame({
        'price_avg': weekly['mean'],
        # std is NaN for weeks with fewer than two transactions.
        'price_std': weekly['std'].fillna(0),
        'volume': weekly['count'],
    })
    dfArticlePriceHistory['lower'] = dfArticlePriceHistory.price_avg - 2 * dfArticlePriceHistory.price_std
    dfArticlePriceHistory['upper'] = dfArticlePriceHistory.price_avg + 2 * dfArticlePriceHistory.price_std
    return dfArticlePriceHistory

### Function: Given an article_id, get the img object

In [None]:
def getImgFromArticle(article_id):
    """Load the product image of an article from the competition image folder.

    Parameters
    ----------
    article_id
        Article id, either as an integer or as a (possibly already
        zero-padded) string.

    Returns
    -------
    The value returned by ``matplotlib.image.imread`` for the image file.
    Errors raised by ``imread`` (for example when the file does not exist)
    propagate to the caller.
    """
    # Pad the id to 10 characters. For a 9-digit integer id this equals the
    # previous "'0' + id" construction, and an id that already carries its
    # leading zero no longer receives a second one.
    # NOTE(review): assumes image files are named by the 10-character id and
    # stored in a folder named after its first three characters - confirm.
    articleCode = str(article_id).zfill(10)
    subfolder = articleCode[:3]
    filename = articleCode + '.jpg'
    filename_root = '../input/h-and-m-personalized-fashion-recommendations/images/'
    filename_path = filename_root + subfolder + '/' + filename
    return mpimg.imread(filename_path)

### Function: Get the article info as dictionary for an article_id

In [None]:
def getArticleInfo(article_id):
    """Return the first row of ``articles`` matching *article_id* as a dict.

    The index is reset without being dropped, so besides the article columns
    the dict also has an ``index`` entry holding the row's original label.
    Raises ``IndexError`` when no row matches.
    """
    isWanted = articles.article_id == article_id
    firstMatch = articles[isWanted].reset_index().iloc[0]
    return firstMatch.to_dict()

### Function: Visualize one set of info group for an article_id

In [None]:
def visualizeRowArticle(article_id):
    """Draw one figure for an article: product image beside its price chart.

    The left panel shows the product image, or only a "missing" title when
    the image cannot be loaded. The right panel plots the weekly average
    price with a band of two standard deviations, and the weekly volume as
    bars on a secondary y-axis. The figure title combines the product name
    and its description.

    Parameters
    ----------
    article_id
        Article id passed to ``getImgFromArticle``,
        ``getArticlePriceHistory`` and ``getArticleInfo``.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # Loading the image is best effort. Catch Exception rather than using a
    # bare except so that KeyboardInterrupt / SystemExit still stop a long
    # plotting loop.
    try:
        imgSample = getImgFromArticle(article_id)
    except Exception:
        axes[0].set_title('Product Image is Missing')
    else:
        axes[0].imshow(imgSample)
        axes[0].set_title('Product Image')

    dfArticlePriceHistory = getArticlePriceHistory(article_id)
    dictArticleInfo = getArticleInfo(article_id)

    # Price line with its +/- 2 std band on the primary axis.
    axes[1].plot(dfArticlePriceHistory.price_avg, color='red', label='Prices')
    axes[1].fill_between(dfArticlePriceHistory.index, dfArticlePriceHistory.lower, dfArticlePriceHistory.upper, color='grey', alpha=0.1)
    axes[1].set_ylabel('Price')
    axes[1].legend(loc=2)  # upper left

    # Volume bars on a twin axis so both series share the time axis.
    ax1_twin = axes[1].twinx()
    ax1_twin.bar(x=dfArticlePriceHistory.index, height=dfArticlePriceHistory.volume, color='blue', label='Volume')
    ax1_twin.set_ylabel('Volume')
    ax1_twin.legend(loc=1)  # upper right
    axes[1].set_title('Historical Price Chart')

    # An empty CSV cell is read by pandas as NaN (a float), which cannot be
    # concatenated to a string; fall back to an empty description.
    detailDesc = dictArticleInfo['detail_desc']
    if not isinstance(detailDesc, str):
        detailDesc = ''
    fig.suptitle(str(dictArticleInfo['prod_name']) + ':\n' + detailDesc, horizontalalignment='left', x=0.1, y=1.05)

### Function: Draw a divider title between plotting sections

In [None]:
def createDividerTitle(title='Chart',color='mistyrose'):
    """Draw a thin banner figure used as a heading between groups of plots.

    title -- text written on the banner.
    color -- background colour of the banner figure.
    """
    _, bannerAx = plt.subplots(figsize=(20, 1), facecolor=color)
    # Hide ticks and frame so only the coloured strip and the text remain.
    bannerAx.axis('off')
    bannerAx.text(0.01, 0.5, title, fontdict=dict(size=20))

# Run functions to explore a couple of products (articles)

In [None]:
# Hand-picked article ids; each one gets its own image + price-history figure.
listArticle = [
    736489010,
    505221004,
    610776002,
]
for article_id in listArticle:
    visualizeRowArticle(article_id)

## Visualize top selling articles per year

In [None]:
# Replace transactionsSample with a smaller random sample for the rankings
# below. The fixed seed keeps the rankings identical across notebook runs;
# the cap avoids an error if the table has fewer rows than requested.
transactionsSample = transactions.sample(n=min(100000, len(transactions)), random_state=42)
transactionsSample['t_dat'] = pd.to_datetime(transactionsSample['t_dat'])
# Calendar year of each transaction, via the datetime accessor.
transactionsSample['year'] = transactionsSample['t_dat'].dt.year
transactionsSample.head()

In [None]:
# Number of sampled transactions for every (year, article) pair, flattened
# into the columns ['year', 'article_id', 'count'].
countsPerYearArticle = transactionsSample.groupby(['year', 'article_id'])['customer_id'].count()
groupedTrx = countsPerYearArticle.rename('count').reset_index()

In [None]:
# For every year in the sample: draw a banner, then one figure for each of
# the (up to) ten articles with the highest transaction count in that year.
years = groupedTrx.year.unique()
for year in years:
    isThisYear = groupedTrx.year == year
    dfYear = groupedTrx.loc[isThisYear].sort_values(by='count', ascending=False)
    topArticleId = dfYear.head(10).article_id.values
    titleText = "Top Articles in Year {}".format(year)
    createDividerTitle(title=titleText, color='mistyrose')
    for article_id in topArticleId:
        visualizeRowArticle(article_id)

## Visualize top products for each Index category, based on 2020 Sales

In [None]:
# Preview the first rows of the current transaction sample.
transactionsSample.head()

In [None]:
# Get top index name
# 1) Per-article transaction counts, restricted to the sampled 2020 rows.
trx2020 = transactionsSample[transactionsSample['year'] == 2020]
volume2020 = trx2020.groupby('article_id')['t_dat'].count().sort_values(ascending=False)
article_volume2020 = volume2020.rename('volume').reset_index()
# 2) Attach those counts to the article table (articles without 2020 sales keep NaN).
articles_withVolume2020 = articles.merge(article_volume2020, how='left', on='article_id')
# 3) Total 2020 volume per index_name, largest first.
volumePerIndex = articles_withVolume2020.groupby('index_name')['volume'].sum()
dfTopIndex = volumePerIndex.sort_values(ascending=False)
dfTopIndex.head(10)

In [None]:
# Ids of the five Menswear articles with the highest 2020 volume.
index = 'Menswear'
isInIndex = articles_withVolume2020.index_name == index
dfTopArticleID = articles_withVolume2020.loc[isInIndex].sort_values(by='volume', ascending=False)
listTopArticleID = dfTopArticleID['article_id'].head(5).values
listTopArticleID

In [None]:
# Ids of the five Ladieswear articles with the highest 2020 volume.
index = 'Ladieswear'
isInIndex = articles_withVolume2020.index_name == index
dfTopArticleID = articles_withVolume2020.loc[isInIndex].sort_values(by='volume', ascending=False)
listTopArticleID = dfTopArticleID['article_id'].head(5).values
listTopArticleID

In [None]:
# List the distinct index_name values; these are the keys used for the banner colours below.
articles_withVolume2020.index_name.unique()

In [None]:
# Banner colour for every index_name: light grey by default, with a few
# categories highlighted.
colorIndex = dict.fromkeys(articles_withVolume2020.index_name.unique(), 'lightgrey')
colorIndex['Ladieswear'] = 'lightcoral'
# Key corrected from 'Ladies ACcessories': the mis-capitalised key only added
# an unused entry, leaving the accessories category with the default grey.
# NOTE(review): assumes the category is spelled 'Ladies Accessories' in
# index_name - confirm against the unique values listed above.
colorIndex['Ladies Accessories'] = 'lightcoral'
colorIndex['Menswear'] = 'lightskyblue'
colorIndex['Lingeries/Tights'] = 'purple'

In [None]:
# For each of the ten index categories with the largest 2020 volume: draw a
# banner in the category colour, then one figure for each of its five
# highest-volume articles.
topIndexNames = dfTopIndex.head(10).index
for index in topIndexNames:
    isInIndex = articles_withVolume2020.index_name == index
    dfTopArticleID = articles_withVolume2020.loc[isInIndex].sort_values(by='volume', ascending=False)
    listTopArticleID = dfTopArticleID['article_id'].head(5).values
    titleText = "Top Articles in Index {} for Year 2020".format(index)
    createDividerTitle(title=titleText, color=colorIndex[index])
    for article_id in listTopArticleID:
        visualizeRowArticle(article_id)