In [None]:
import pandas as pd
import numpy as np
import os
from glob import glob
from tqdm import tqdm
import lightgbm as lgbm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
import seaborn as sns
from matplotlib import pyplot as plt
import random
from PIL import Image
import math
import itertools
from plotly.graph_objects import treemap
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import plotly.subplots as sp
import gc
from collections import Counter

def show_clear_plt() -> None:
    """Lay out, display and then clear the current matplotlib figure.

    Calls ``plt.tight_layout()`` and ``plt.show()`` on whatever pyplot
    currently holds, then ``plt.clf()`` so that subsequent pyplot calls do
    not draw on top of the content just shown.
    """
    plt.tight_layout()
    plt.show()
    plt.clf()


# Global plot styling used by every figure in the notebook: larger fonts,
# seaborn dark grid, a 10x6 default figure size and a white legend background.
sns.set(font_scale=1.5)
sns.set_style(style='darkgrid')
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['legend.facecolor'] = 'white'

def reduce_memory_usage(df, columns, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    The frame is modified in place (columns are reassigned) and also returned.

    Args:
        df: DataFrame whose columns should be downcast.
        columns: Iterable of column labels to consider. Columns whose dtype is
            not one of the plain numpy int/float dtypes are left untouched.
        verbose: When True, print the memory usage after downcasting and the
            percentage reduction.

    Returns:
        The same ``df`` object, with downcast columns.

    Notes:
        * Range checks are inclusive, so a column whose extremes sit exactly
          on a dtype's limits (e.g. 127 for int8) still uses that dtype.
        * Float columns are chosen by range only. float16 keeps roughly three
          significant decimal digits, so values may be rounded.
        * Columns whose min/max is NaN (empty or all-NaN) fail every range
          check: integer columns are left as they are, float columns become
          float64.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    # Candidate dtypes, narrowest first; the first one whose range fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not yield nan/inf.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

The aim of this notebook is to provide some quick EDA of the H&M competition data by producing a number of graphs / outputs with limited comments.

This should provide an impression of the data and some examples and understanding of 
the article (product) categorisations and transactions.

If you happen to be looking at this notebook and spot any mistakes, errors, misconceptions etc., or any other problem, please feel free to post in the comments section.


# Setup

In [None]:
class CONFIG:
    """Notebook-wide settings; the class body prints the detected environment."""

    # True when the Kaggle competition input directory is present.
    KAGGLE = os.path.exists('../input/h-and-m-personalized-fashion-recommendations/')

    # Data directory prefix; 'NA' is a placeholder when not on Kaggle.
    PATH = '../input/h-and-m-personalized-fashion-recommendations/' if KAGGLE else 'NA'
    print('running on Kaggle' if KAGGLE else 'not running on Kaggle')

    # DEBUG switches on subsampling; DEBUG_PC is the fraction of ids kept.
    DEBUG = False
    DEBUG_PC = 0.1

    print(f'debugging / reduce data rows is {DEBUG}')

    # Maximum number of categories / examples shown per plot.
    EXAMPLE_LIMIT = 10

In this notebook I am not analysing the submission file.

In [None]:
# Load the three competition tables from CONFIG.PATH; the transaction date
# column 't_dat' is parsed as a datetime.
customers = pd.read_csv(CONFIG.PATH + 'customers.csv')
articles = pd.read_csv(CONFIG.PATH + 'articles.csv')
transactions = pd.read_csv(CONFIG.PATH + 'transactions_train.csv',
                           parse_dates=['t_dat'])

# reduce memory usage by downcasting the numeric columns of each frame
customers = reduce_memory_usage(customers, customers.columns)
articles = reduce_memory_usage(articles, articles.columns)
transactions = reduce_memory_usage(transactions, transactions.columns)

print('dataframe shapes, customes / articles / transactions')
print(customers.shape, articles.shape, transactions.shape)

# Data subsampling

(this reduces data size if DEBUG selected in CONFIG)

In [None]:
# When CONFIG.DEBUG is set, keep a random CONFIG.DEBUG_PC fraction of article
# ids and of customer ids, and only the transactions that involve both a
# sampled customer and a sampled article. The seed is fixed, and articles are
# sampled before customers, so the selection is repeatable.
if CONFIG.DEBUG:
    random.seed(42)

    # subsample the articles
    unique_articles = articles['article_id'].unique().tolist()
    sample_articles = random.sample(unique_articles, int(CONFIG.DEBUG_PC * len(unique_articles)))
    print(f'number of sample articles {len(sample_articles)}')

    print(articles.shape)
    articles = articles[articles['article_id'].isin(sample_articles)].reset_index(drop=True)
    print(articles.shape)

    # subsample the customers
    unique_customers = customers['customer_id'].unique().tolist()
    sample_customers = random.sample(unique_customers, int(CONFIG.DEBUG_PC * len(unique_customers)))
    print(f'number of sample customers {len(sample_customers)}')

    print(customers.shape)
    customers = customers[customers['customer_id'].isin(sample_customers)].reset_index(drop=True)
    print(customers.shape)

    # keep only transactions whose customer AND article were both sampled
    print(f'original train transactions shape {transactions.shape}')
    transactions = transactions[(transactions['customer_id'].isin(sample_customers)) &
                                (transactions['article_id'].isin(sample_articles))].reset_index(drop=True)

    print(f'reduced train transactions shape {transactions.shape}')
    
else:
    print('running with all train data')

# Explore Articles

In [None]:
# List the column names available in the articles table.
print('Article data columns')
print(articles.columns.tolist())

In [None]:
# Row count, then the number of distinct values in each articles column.
print(f'there are {articles.shape[0]} rows in the articles data')
print(' ')
for c in articles.columns:
    print(f'for {c} there are {articles[c].nunique()} unique entries')

Numbers of unique entries by column

In [None]:
# Horizontal bar chart of the number of distinct values per articles column.
# nunique() scans every column, so compute it once and reuse it for both the
# bar labels and the bar widths.
article_nunique = articles.nunique()

fig, axes = plt.subplots(figsize=(10, 9))
plt.barh(y=article_nunique.index,
         width=article_nunique.values,
         color='Green')
plt.title('Count of uniques for article categories')
show_clear_plt()

In [None]:
# Same chart restricted to columns with fewer than 500 distinct values, so the
# low-cardinality categories are readable. nunique() is computed once rather
# than four times.
article_nunique = articles.nunique()
low_cardinality = article_nunique[article_nunique < 500]

fig, axes = plt.subplots(figsize=(12, 9))
plt.barh(y=low_cardinality.index,
         width=low_cardinality.values,
         color='Green')
plt.title('Count of uniques for article categories (lower count categories)')
show_clear_plt()

Top descriptions in each category (sorted by the number of articles)

In [None]:
# categories more related to department / type of product
count_columns = [
    'product_type_name',
    'product_group_name',
    'department_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
]
# One row of axes per column: left = % share of each value, right = cumulative
# % share. The two panels in a row share their y axis (the category labels).
fig, axes = plt.subplots(ncols=2,
                         nrows=len(count_columns),
                         figsize=(15, len(count_columns) * 5),
                        
                         sharey='row',)
                      #  sharex='col')

for count, cc in enumerate(count_columns):
    # percentage of articles per value, most common first, limited to the
    # top CONFIG.EXAMPLE_LIMIT values
    vc = articles[cc].value_counts() / len(articles) * 100
    vc = vc.sort_values(ascending=False)
    vc = vc[:CONFIG.EXAMPLE_LIMIT]

    axes[count, 0].barh(width=vc.values,
                 y=vc.index,
                 color='Green',
                 )

    axes[count, 1].barh(width=vc.values.cumsum(),
                 y=vc.index,
                 color='Green',
                 )

    # fixed x ranges so rows are visually comparable
    axes[count, 0].set_xlim(0, 50)
    axes[count, 1].set_xlim(0, 100)
    axes[count, 0].set_title(f'% of total {cc}')
    axes[count, 1].set_title(f'cumulative % {cc}')
    
show_clear_plt()

Categories related to colour / design

In [None]:
# columns related to colour or pattern/design
count_columns = [
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
]

# One row of axes per column: left = % share, right = cumulative % share.
fig, axes = plt.subplots(ncols=2,
                         nrows=len(count_columns),
                         figsize=(15, len(count_columns) * 5),
                         sharey='row',
                        )

for count, cc in enumerate(count_columns):
    # percentage of articles per value, limited to the first
    # CONFIG.EXAMPLE_LIMIT entries returned by value_counts
    vc = articles[cc].value_counts() / len(articles)  * 100
    vc = vc[:CONFIG.EXAMPLE_LIMIT]

    axes[count, 0].barh(width=vc.values,
                 y=vc.index,
                 color='Green',
                 )

    axes[count, 1].barh(width=vc.values.cumsum(),
                 y=vc.index,
                 color='Green',
                 )

    # fixed x ranges so rows are visually comparable
    axes[count, 0].set_xlim(0, 50)
    axes[count, 1].set_xlim(0, 100)
    axes[count, 0].set_title(f'% of total {cc}')
    axes[count, 1].set_title(f'cumulative % {cc}')
    
show_clear_plt()

Overlaps between some of the categories

In [None]:
# heatmaps to examine the overlap between pairs of article categories
# (each entry is a [row column, heatmap column] pair)
combo_columns = [
    ['product_type_name',
     'product_group_name', ],
    ['section_name',
     'garment_group_name', ],
    ['product_type_name',
     'graphical_appearance_name', ],
    ['product_type_name',
     'department_name', ],
    ['product_type_name',
     'index_name', ],
    ['colour_group_name',
     'perceived_colour_value_name', ],
    ['colour_group_name',
     'graphical_appearance_name', ],
    ['colour_group_name',
     'product_group_name', ],

]

for cc in combo_columns:
    # % of all articles falling in each (cc[0], cc[1]) combination;
    # cc[1] is pivoted to the columns
    gp = articles.groupby(cc)['article_id'].count().unstack(cc[1]) / len(articles) * 100

    #sort by most common entries in each category
    gp = gp.loc[gp.sum(axis=1).sort_values(ascending=False).index.tolist(),
                gp.sum(axis=0).sort_values(ascending=False).index.tolist()
    ]
    #select examples (most common)
    gp = gp.iloc[:CONFIG.EXAMPLE_LIMIT, :CONFIG.EXAMPLE_LIMIT]

    # figure height grows with the number of rows, with a minimum of 6
    fig, axes = plt.subplots(figsize=(15, max(6, int(len(gp) / 1))))
    sns.heatmap(gp,
                annot=True,
                fmt=".1f",
                linewidths=1,
                cmap='Greens')
    plt.yticks(rotation=0)
    plt.title(f'percentage of data by columns {cc}')
    show_clear_plt()

# Explore Customers

In [None]:
# Basic structure of the customers table: shape, column names and dtypes.
print('customers data shape, columns, data types')
print(customers.shape)
print(customers.columns.tolist())
print(customers.dtypes)

In [None]:
# rename column to make it easier to understand for EDA
customers = customers.rename(columns={'FN': 'fashion_news'})

# Row count, then distinct-value and missing-value counts for every column.
print(f'there are {customers.shape[0]} rows in the customers data')
print(' ')
for c in customers.columns:
    print(f'for {c} there are {customers[c].nunique()} unique entries and {customers[c].isna().sum()} NAN')

Group the ages into brackets

In [None]:
# fill the NAN and tidy up the entries to make the data clearer to understand
# The flag columns are recoded to 'Yes'/'No' (missing -> 'No'). An existing
# 'No' is also treated as "No", so re-running this cell is idempotent instead
# of turning every row into 'Yes' once the NaN have been replaced.
for flag_col in ['fashion_news', 'Active']:
    # compare as objects so the check works both before the recode (numeric
    # values / NaN) and after it ('Yes' / 'No' strings)
    is_no = customers[flag_col].isna() | customers[flag_col].astype(object).eq('No')
    customers[flag_col] = np.where(is_no, 'No', 'Yes')
customers['club_member_status'] = customers['club_member_status'].fillna('No data')
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].fillna('No data')


#group the age into brackets of 10 years for convenience in analysis
# missing ages become -1 and therefore land in the '-10-0' bracket
customers['age'] = customers['age'].fillna(value=-1)
customers['age_decade'] = (customers['age'] // 10) * 10
customers['age_decade'] = [f'{int(x)}-{int(x)+10}' for x in customers['age_decade']]
customers['age_decade'].value_counts()

In [None]:
# show distribution by age bracket; brackets are ordered by sorting their
# string labels, and the '-10-0' bracket holds customers with no age
sns.countplot(x=customers['age_decade'], color='Green',
              order=sorted(customers['age_decade'].unique()))
plt.title('Age Distribution of Customers, (-10-0) = No Data')
show_clear_plt()

In [None]:
# Customer counts per age bracket, split by the 'Active' flag.
sns.countplot(x=customers['age_decade'], color='Green',
              order=sorted(customers['age_decade'].unique()),
              hue=customers['Active'],
              palette='tab10',
              )
plt.title('Age Distribution of Customers and Activity, (-10-0) = No Age Data')
show_clear_plt()

In [None]:
# Customer counts per age bracket, split by fashion_news_frequency.
sns.countplot(x=customers['age_decade'], color='Green',
              order=sorted(customers['age_decade'].unique()),
              hue=customers['fashion_news_frequency'],
              palette='tab10',
              )
plt.title('Age Distribution of Customers and fashion_news_frequency, (-10-0) = No Age Data')
show_clear_plt()

In [None]:
# Customer counts per age bracket, split by club_member_status.
sns.countplot(x=customers['age_decade'], color='Green',
              order=sorted(customers['age_decade'].unique()),
              hue=customers['club_member_status'],
              palette='tab10',
              )
plt.title('Age Distribution of Customers and club_member_status, (-10-0) = No Age Data')
show_clear_plt()

In [None]:
# Transactions per customer, aggregated by age bracket.
# Customers without any transaction get NaN from the map, and 'count' ignores
# NaN, so 'count' is the number of customers with at least one transaction and
# 'sum' is the total number of transactions in the bracket.
customers['transaction_count'] = customers['customer_id'].map(transactions['customer_id'].value_counts())
age_transactions = customers.groupby(['age_decade'])['transaction_count'].agg(['count', 'sum'])
age_transactions['transaction_per_cust'] = age_transactions['sum'] / age_transactions['count']

fig,axes=plt.subplots()
ax2=axes.twinx()
# Green bars: number of customers per bracket, as stated by the title and the
# '# customers' axis label (the transaction total 'sum' was drawn before,
# which contradicted both labels).
axes.bar(x=age_transactions.index,
         height=age_transactions['count'],
         color='Green')


# Red line on the secondary axis: average transactions per customer.
ax2.plot(age_transactions.index,
         age_transactions['transaction_per_cust'],
          linewidth=5,
         color='Red'
        )

ax2.set_ylim(0,)
plt.title('# of customers (green) vs transactions per customer (red)')
axes.set_ylabel('# customers')
ax2.set_ylabel('transactions per customer')
show_clear_plt()

In [None]:
# Heatmaps of the % of customers in each combination of two customer
# attributes, for every pair of the columns below.
combo_columns = [
    'fashion_news',
    'Active',
    'club_member_status',
    'fashion_news_frequency',
]

for cc in itertools.combinations(combo_columns, 2):
    cc = list(cc)
    # % of all customers per (cc[0], cc[1]) combination; cc[1] goes to columns
    gp = customers.groupby(cc)['customer_id'].count().unstack(cc[1]) / len(customers) * 100

    #sort by most common entries in each category
    gp = gp.loc[gp.sum(axis=1).sort_values(ascending=False).index.tolist(),
                gp.sum(axis=0).sort_values(ascending=False).index.tolist()
    ]

    # keep at most CONFIG.EXAMPLE_LIMIT rows and columns
    gp = gp.iloc[:CONFIG.EXAMPLE_LIMIT, :CONFIG.EXAMPLE_LIMIT]

    fig, axes = plt.subplots(figsize=(12, max(6, int(len(gp) / 2))))
    sns.heatmap(gp,
                annot=True,
                fmt=".1f",
                linewidths=1,
                cmap='Greens')
    plt.yticks(rotation=0)
    plt.title(f'percentage of data by columns {cc}')
    show_clear_plt()
    

Analyse postcodes - do they tell us anything useful?

In [None]:
# Number of customers registered under each postal code; show how many
# distinct codes there are and the ten most populated ones.
postal_codes = customers.groupby(['postal_code'])['customer_id'].count()
print('customers per postcode')
print(postal_codes.shape)
postal_codes.sort_values(ascending=False).head(10)

In [None]:
# most postcodes have only 1 customer, so this may need more work to see if there could
# be anything useful
# Histogram of customers-per-postcode, dropping postcodes with 300 or more
# customers so the bulk of the distribution is visible.
sns.histplot(postal_codes.values[postal_codes<300], discrete=True)
plt.title('Customers per postcode, for values < 300 (clipped outlier)')
show_clear_plt()

Do customers have >1 postal code?

In [None]:
# Largest number of distinct postal codes recorded for any single customer_id.
max_codes = customers.groupby(['customer_id'])['postal_code'].nunique().max()
print(f'max postal codes per customer is {max_codes}')

# Explore Transactions

In [None]:
# Basic structure of the transactions table: shape, column names and dtypes.
print('transactions data shape, columns, data types')
print(transactions.shape)
print(transactions.columns.tolist())
print(transactions.dtypes)

In [None]:
# Row count, then distinct-value and missing-value counts for every column.
print(f'there are {transactions.shape[0]} rows in the transactions data')
print(' ')
for c in transactions.columns:
    print(f'for {c} there are {transactions[c].nunique()} unique entries and {transactions[c].isna().sum()} NAN')

Some data processing

In [None]:
# make naming easier to remember for EDA purposes
# (sales_channel_id values not listed here map to NaN)
channel_dict = {
    1: 'store',
    2: 'online',
}
transactions['sales_channel_name'] = transactions['sales_channel_id'].map(channel_dict)

# add weeks, days, etc derived from the transaction date
transactions['quarter'] = transactions['t_dat'].dt.quarter
transactions['month'] = transactions['t_dat'].dt.month
transactions['week'] = transactions['t_dat'].dt.isocalendar().week  # ISO week number
transactions['weekday'] = transactions['t_dat'].dt.weekday
transactions['day_name'] = transactions['t_dat'].dt.day_name()

# cyclic encode - for the week - for demo later
def cyclic_encode(df, column, period=None):
    """Add sine/cosine encodings of a cyclic column to ``df`` in place.

    Args:
        df: DataFrame containing ``column``; it is modified in place.
        column: Name of the numeric column to encode.
        period: Length of one full cycle. When None (the default), the
            maximum value found in ``df[column]`` is used. Pass an explicit
            period when the data may not span a full cycle, otherwise the
            encoding depends on which values happen to be present.

    Returns:
        The same ``df`` with ``f'{column}_sin'`` and ``f'{column}_cos'`` added.
    """
    if period is None:
        period = df[column].max()
    # angle on the unit circle; computed once and shared by both encodings
    angle = 2 * np.pi * df[column] / period
    df[f'{column}_sin'] = np.sin(angle)
    df[f'{column}_cos'] = np.cos(angle)
    return df

# columns to cyclic-encode
encode_cols = [
    'week',
]

for ec in encode_cols:
    transactions = cyclic_encode(transactions, ec)

# display example: daily mean of the encodings, averaged again per calendar week

daily_transactions = transactions.groupby(['t_dat'])[['week_sin', 'week_cos']].mean()

for c in daily_transactions.columns:
    # resample once per column and reuse it for x and y;
    # 'W' is the canonical pandas alias for weekly frequency
    weekly_mean = daily_transactions[c].resample('W').mean()
    sns.lineplot(x=weekly_mean.index,
                 y=weekly_mean.values,
                 linewidth=4)
plt.title('Week cyclic encoding')
plt.legend(daily_transactions.columns.tolist())
plt.ylabel('Cyclic encoding')
show_clear_plt()

Pareto - Customers / Products

The 20% most frequent customers account for around 65-70% of transactions

The 20% highest selling articles account for around 80% of transactions

In [None]:
# Pareto curves: cumulative % of transactions against cumulative % of
# customers (left) and of articles (right), both ranked by transaction count.
cust_pareto = transactions.groupby(['customer_id'])['customer_id'].count()
prod_pareto = transactions.groupby(['article_id'])['article_id'].count()

fig,axes=plt.subplots(figsize=(14,7), ncols=2)

# y: running share of transactions; x: running share of customers (rank / N)
cust_pareto = cust_pareto.sort_values(ascending=False)
axes[0].scatter(y=cust_pareto.cumsum() / cust_pareto.sum() * 100,
           x=np.ones(cust_pareto.shape).cumsum() / len(cust_pareto) * 100,
               color='Red')
axes[0].set_xlabel('% of customers')
axes[0].set_ylabel('% of transactions')
axes[0].set_title('Customer Pareto')


# same construction for articles
prod_pareto = prod_pareto.sort_values(ascending=False)
axes[1].scatter(y=prod_pareto.cumsum() / prod_pareto.sum() * 100,
           x=np.ones(prod_pareto.shape).cumsum() / len(prod_pareto) * 100,
               color='Red')
axes[1].set_xlabel('% of Article')
axes[1].set_ylabel('% of transactions')
axes[1].set_title('Article Pareto')

show_clear_plt()

As we can make 12 predictions per customer, what is the cumulative percentage of the top 12 articles?

It appears to be around 1% of total transactions (though as we are predicting at a customer level, I think this is not exactly equivalent to filling in the submission file with a top 12)

In [None]:
# Cumulative % of all transactions covered by the 12 best-selling articles
# (prod_pareto is already sorted by descending transaction count).
plt.barh(width=(prod_pareto.cumsum() / prod_pareto.sum())[:12] * 100,
        y=[str(x) for x in prod_pareto.index[:12]],
        color='Green')

plt.title('Cumulative % of top 12 articles')
show_clear_plt()

Sum of transaction distributions by week, month, day (name)

In [None]:
# For each time grouping, count transactions per group and channel, then plot
# the density of those counts with one curve per sales channel.
group_cols = [
    't_dat',  # day
    'week',
    'month',
    'day_name'
]

for g_col in group_cols:
    # rows = values of g_col, columns = sales channel, values = transaction count
    cust_t_count = transactions.groupby(['sales_channel_name',
                                         g_col])['customer_id'].count().unstack('sales_channel_name').sort_index().fillna(
        value=0)

    for c in cust_t_count.columns:
        sns.kdeplot(cust_t_count[c],
                    linewidth=3)
    plt.title(f'Distribution # of sum of transactions per {g_col} by channel')
    plt.legend(cust_t_count.columns.tolist())
    plt.xlabel(f'{g_col} transaction count')
    show_clear_plt()

Online vs Store by Weekday (name)

We can see that Sunday/Monday are strongest for online % share, while Friday/Saturday are strongest for store share

Online absolute numbers are more steady over the week

In [None]:
# store vs online by weekday
# rows = day name, columns = sales channel, values = transaction count
daily_transactions = transactions.groupby(['sales_channel_name',
                                           'day_name'])['customer_id'].count().unstack(
    'sales_channel_name').sort_index().fillna(value=0)
# explicit Monday-to-Sunday order (sort_index alone would be alphabetical)
order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
         'Sunday']

daily_transactions.loc[order].plot(kind='barh', stacked=True)
plt.title('Total Transactions by Weekday')
show_clear_plt()

# divide each row by its total so every weekday sums to 1 (channel share)
daily_transactions = daily_transactions / daily_transactions.sum(axis=1).values.reshape(-1, 1)
daily_transactions.loc[order].plot(kind='barh', stacked=True, width=0.8)
plt.title('% of Weekday Total Transactions')
show_clear_plt()

Transaction trend over time

In [None]:
#weekly transactions by channel
# rows = transaction date, columns = sales channel, values = transaction count
daily_transactions = transactions.groupby(['sales_channel_name',
                                           't_dat'])['customer_id'].count().unstack(
    'sales_channel_name').sort_index().fillna(value=0)

for c in daily_transactions.columns:
    # resample once per channel; 'W' is the canonical weekly alias
    weekly_total = daily_transactions[c].resample('W').sum()
    sns.lineplot(x=weekly_total.index,
                 y=weekly_total.values,
                 linewidth=4)
plt.title('Weekly transaction volumes over time by channel')
# legend labels come from the frame that was actually plotted, not from a
# variable left over from an earlier cell
plt.legend(daily_transactions.columns.tolist())
plt.ylabel('Weekly transaction count')
show_clear_plt()

In [None]:
# Monthly transaction totals per channel, from the daily per-channel counts.
for c in daily_transactions.columns:
    # resample once per channel and reuse it for x and y
    # NOTE(review): recent pandas releases prefer the 'ME' alias for month-end;
    # 'M' is kept for compatibility with older versions - confirm the target version.
    monthly_total = daily_transactions[c].resample('M').sum()
    sns.lineplot(x=monthly_total.index,
                 y=monthly_total.values,
                 linewidth=4)
plt.title('Monthly transaction volumes over time by channel')
# legend labels come from the frame that was actually plotted, not from a
# variable left over from an earlier cell
plt.legend(daily_transactions.columns.tolist())
plt.ylabel('Monthly transaction count')
show_clear_plt()

Transactions per Customer

In [None]:
# Histogram of the number of transaction rows per customer.
cust_t_count = transactions['customer_id'].value_counts()
sns.histplot(cust_t_count, discrete=True)
plt.title('Transactions per Customer in Transaction Data, x-axis clipped')
# ignoring outliers: only the 0-40 range of the x axis is shown
plt.xlim(0, 40)
show_clear_plt()

Order sizes (assume 1 order = all items purchased by a single customer in 1 day)

In [None]:
# One row per (customer, day) with the number of article rows bought that
# day, which is used here as the "order size".
cust_day_group = transactions.groupby(['customer_id', 't_dat'], as_index=False)['article_id'].count()
print(cust_day_group.shape, transactions.shape)
sns.histplot(cust_day_group['article_id'], discrete=True,
            shrink=0.9)
plt.title('Order Sizes (# articles by customer & day), x-axis clipped')
# ignoring outlier: only the 0-15 range of the x axis is shown
plt.xlim(0, 15)
show_clear_plt()

Mix of store / online at a customer level

In [None]:
# split by channel, by customer
# relatively few customers have a mix of online and store shopping
# rows = customer, columns = sales channel, values = transaction count
cust_t_count = transactions.groupby(['sales_channel_name',
                                     'customer_id', ])['customer_id'].count().unstack(
    'sales_channel_name').sort_index().fillna(value=0)

# fraction (0-1) of each customer's transactions made online
cust_t_count['pc_online'] = cust_t_count['online'] / cust_t_count[['online', 'store']].sum(axis=1)
sns.histplot(cust_t_count['pc_online'], 
             binwidth=0.1,
            shrink=0.9)
plt.title('Distribution of customer % purchase online')
plt.xlabel('Distribution of customer % purchase online')
show_clear_plt()

In [None]:
# if we look at customers with >2 items there is more of a mix but still 100% online or store in many cases
# same histogram, restricted to customers whose online + store count exceeds 2
sns.histplot(cust_t_count['pc_online'][cust_t_count[['online', 'store']].sum(axis=1)>2],
            binwidth=0.1,
            shrink=0.9)
plt.title('Distribution of customer % purchase online for customers with >2 items')
plt.xlabel('Distribution of customer % purchase online for customers with >2 items')
show_clear_plt()

Popular Articles - transactions split by Channel

In [None]:
# split by channel, by article
# is there a difference in most popular articles?

# lookup from article_id to product name, used for readable bar labels
article_dict = dict(zip(articles['article_id'],
                        articles['prod_name']))
# rows = article, columns = sales channel, values = transaction count
article_t_count = transactions.groupby(['sales_channel_name',
                                        'article_id', ])['article_id'].count().unstack(
    'sales_channel_name').sort_index().fillna(value=0)

# rank articles by their combined online + store transaction count
article_t_count['total_transactions'] = article_t_count[['online', 'store']].sum(axis=1)
article_t_count = article_t_count.sort_values('total_transactions', axis=0, ascending=False)

# keep the top CONFIG.EXAMPLE_LIMIT articles and label them "<id> - <name>"
article_t_count = article_t_count.iloc[:CONFIG.EXAMPLE_LIMIT]
article_t_count.index = [f'{x} - {article_dict[x]}' for x in article_t_count.index]

fig, axes = plt.subplots(figsize=(12, 7))
article_t_count[['online', 'store']].plot(kind='barh', stacked=True,
                                          width=0.8,
                                          ax=axes)
plt.title('Popular articles - Online / Store transactions')
plt.xlabel('transacton counts')
show_clear_plt()

Popular Items - mix by Age Group

In [None]:
# Attach each customer's age bracket to their transactions.
age_dict = dict(zip(
    customers['customer_id'],
    customers['age_decade']
))
transactions['age_decade'] = transactions['customer_id'].map(age_dict)
print(f'# rows missing customer age data = {transactions["age_decade"].isna().sum()}')

# rows = article, columns = age bracket, values = transaction count
article_t_count = transactions.groupby(['age_decade',
                                        'article_id', ])['article_id'].count().unstack(
    'age_decade').sort_index().fillna(value=0)
# total is only used for ranking; it is dropped again before plotting
article_t_count['total_transactions'] = article_t_count.sum(axis=1)
article_t_count = article_t_count.sort_values('total_transactions', axis=0, ascending=False)
article_t_count = article_t_count.drop('total_transactions', axis=1)

# keep the top CONFIG.EXAMPLE_LIMIT articles and label them "<id> - <name>"
article_t_count = article_t_count.iloc[:CONFIG.EXAMPLE_LIMIT]
article_t_count.index = [f'{x} - {article_dict[x]}' for x in article_t_count.index]

fig, axes = plt.subplots(figsize=(12, 7))
article_t_count.plot(kind='barh', stacked=True,
                                          width=0.8,
                                          ax=axes)
plt.title('Popular articles - By Customer Age Bracket')
plt.xlabel('transaction counts')
show_clear_plt()

In [None]:
#there is some variation in the age profiles of top sellers
# divide each row by its total so every article's age brackets sum to 1
article_t_count = article_t_count / article_t_count.sum(axis=1).values.reshape(-1, 1)
fig, axes = plt.subplots(figsize=(12, 7))
article_t_count.plot(kind='barh', stacked=True,
                                          width=0.8,
                                          ax=axes)
plt.title('Popular articles - By Customer Age Bracket')
plt.xlabel('transaction % mix')
show_clear_plt()

What does it look like if we take a top 5 for 3 different age brackets, rather than a top n overall?

In [None]:
# For each selected age bracket, plot the 5 articles with the most
# transactions in that bracket, stacked by all age brackets.
brackets = ['20-30','40-50','60-70']
top_items = []
# rows = article, columns = age bracket, values = transaction count
article_t_count = transactions.groupby(['age_decade',
                                        'article_id', ])['article_id'].count().unstack(
    'age_decade').sort_index().fillna(value=0)
for b in brackets:
    # top 5 articles ranked by the count in bracket b
    temp = article_t_count.loc[article_t_count.sort_values(b, ascending=False).index[:5].tolist()]

    temp.index = [f'{x} - {article_dict[x]}' for x in temp.index]

    fig, axes = plt.subplots(figsize=(12, 7))
    temp.plot(kind='barh', stacked=True,
                                              width=0.8,
                                              ax=axes)
    plt.title(f'Most Popular articles - For Customer Age Bracket {b}')
    plt.xlabel('transaction counts')
    show_clear_plt()

Do customers buy the same thing more than once (on different days)?

In [None]:
# count of transactions per article, for articles seen in transaction data
article_freq = transactions['article_id'].value_counts().sort_values(ascending=False)

# do customers buy the same thing more than once (on different days)?
# Step 1 collapses rows to one per (customer, article, day); step 2 counts
# those rows per (customer, article), so the resulting 't_dat' column holds
# the number of distinct days on which the customer bought the article.
prod_customers = transactions.groupby(['customer_id',
                                       'article_id', 't_dat'], as_index=False)['t_dat'].count().groupby(['customer_id',
                                                                                                         'article_id', ],
                                                                                                        as_index=False)[
    't_dat'].count()
# keep only pairs bought on more than one day, most repeated first
prod_customers_ = prod_customers[prod_customers['t_dat'] > 1.0].sort_values('t_dat', ascending=False).reset_index(
    drop=True)
print(f'{len(prod_customers_)} instances found')
prod_customers_.head(10)

Double check the data - yes, this customer purchased the same item on different dates

Some of these examples look a bit odd at a quick glance - may need further investigation

In [None]:
# Show the raw transactions behind the most-repeated (customer, article) pair
# (row 0 of prod_customers_, which is sorted by descending day count).
print(article_dict[prod_customers_.loc[0, 'article_id']])
transactions[(transactions['customer_id'] == prod_customers_.loc[0, 'customer_id']) &
             (transactions['article_id'] == prod_customers_.loc[0, 'article_id'])]

It is definitely possible for a customer to buy the same item on different dates

In [None]:
# Histogram of the number of distinct purchase days per (customer, article)
# pair; a value of 1 means the article was bought on a single day only.
fig, axes = plt.subplots(figsize=(15, 7))
sns.histplot(prod_customers['t_dat'], discrete=True)
plt.title('# of instances when customer when customer purchased same item on multiple days (1 = no repeat purchase)')
show_clear_plt()

Sales trends for popular items - It is clear that some items are popular only during specific periods

Either seasonality, or introduced / removed from range in some cases?

% of sales online vs store shows some significant differences by article

In [None]:
# One subplot per top-selling article: weekly transaction counts over time,
# with one line per sales channel. All subplots share the x (date) axis.
fig, axes = plt.subplots(nrows=CONFIG.EXAMPLE_LIMIT,
                         ncols=1,
                         figsize=(12, 6*CONFIG.EXAMPLE_LIMIT),
                         sharex=True)

for count, a in enumerate(article_freq.index[:CONFIG.EXAMPLE_LIMIT]):
    # rows = transaction date, columns = sales channel, for article `a` only
    cust_t_count = transactions[transactions['article_id'] == a].groupby(['sales_channel_name',
                                                                          't_dat'])['customer_id'].count().unstack(
        'sales_channel_name').fillna(value=0).sort_index()


    for c in cust_t_count.columns:
        # 'W' is the canonical pandas alias for weekly frequency
        temp = cust_t_count[c].resample('W').sum()
        sns.lineplot(x=temp.index,
                     y=temp.values,
                     linewidth=5,
                     ax=axes[count])
    axes[count].set_title(f'{article_dict[a]} art_ID {a} Weekly transaction volumes over time by channel')
    axes[count].legend(cust_t_count.columns.tolist())
    axes[count].set_ylabel('Weekly transaction count')
    axes[count].set_xlabel('Date')
show_clear_plt()

In [None]:
# Drop the repeat-purchase frames, which are not used below, and trigger a
# garbage collection pass.
del prod_customers, prod_customers_
gc.collect()

# Transaction Locations

What is the impact of Location on store/online?

There is one postcode with a huge number of transactions, primarily Store (not online)

Is this some form of NAN? Needs exploration.

In [None]:
# Attach each customer's postal code to their transactions.
transactions['postal_code'] = transactions['customer_id'].map(dict(zip(customers['customer_id'],
                                                                       customers['postal_code'])))

# rows = postal code, columns = sales channel, values = transaction count;
# then keep the CONFIG.EXAMPLE_LIMIT postcodes with the most transactions
transactions_locations = transactions.groupby(['postal_code','sales_channel_name'])['customer_id'].count().unstack('sales_channel_name').fillna(value=0)
transactions_locations = transactions_locations.loc[transactions_locations.sum(axis=1).sort_values(ascending=False).index.tolist()].iloc[:CONFIG.EXAMPLE_LIMIT,:]
fig, axes = plt.subplots(figsize=(20, 7))
transactions_locations.plot(kind='barh', stacked=True,
                                          width=0.8,
                                          ax=axes)
plt.title('Common Postcodes - Online / Store transactions')
plt.xlabel('transacton counts')
show_clear_plt()

In [None]:
# Channel mix for the most common postcodes: each row is divided by its total
# so the online and store shares of a postcode sum to 1.
fig, axes = plt.subplots(figsize=(20, 7))
transactions_locations = transactions_locations / transactions_locations.sum(axis=1).values.reshape(-1,1)
transactions_locations.plot(kind='barh', stacked=True,
                                          width=0.8,
                                          ax=axes)
plt.title('Common Postcodes - % Online / Store transactions')
# the bars show normalized shares, not counts, so label the axis accordingly
# (same wording as the age-mix plot)
plt.xlabel('transaction % mix')
show_clear_plt()

# Transaction Price

In [None]:
# Histogram of the transaction price over all rows, in bins of width 0.01.
sns.histplot(transactions['price'], 
             binwidth=0.01)
plt.title('Distribution of Price - All Transactions')
show_clear_plt()

In [None]:
# Price histograms for online (blue) and store (orange) transactions, drawn
# on the same axes with the same bin width.
sns.histplot(transactions['price'][transactions['sales_channel_name'] == 'online'], binwidth=0.01,
        color='Blue')
sns.histplot(transactions['price'][transactions['sales_channel_name'] == 'store'], binwidth=0.01,
             color='Orange')
plt.legend(['online', 'store'])
plt.title('Distribution of Price by channel')
show_clear_plt()

Not surprisingly, the articles with the highest mean prices are generally not high volume items

In [None]:
#not surprisingly, the articles with the highest mean prices are generally not
#high volume items
# per-article price statistics over all transactions of that article
articles_prices_volumes = transactions.groupby(['article_id'])['price'].agg(['mean', 'count',
                                                                             'max', 'min', 'std'])
# one point per article: transaction count (x) against mean price (y)
plt.scatter(x=articles_prices_volumes['count'],
            y=articles_prices_volumes['mean'],
            color='Red',
            s=2)
plt.title('Article transaction count vs mean price')
plt.xlabel('article transaction count')
plt.ylabel('article price mean')
show_clear_plt()

Inspect most expensive products

In [None]:
# Look up a label for every article in article_dict (presumably article_id -> name;
# confirm where article_dict is built), then show the ten highest mean prices.
article_names = articles_prices_volumes.index.map(article_dict)
articles_prices_volumes['name'] = article_names
articles_prices_volumes.sort_values(by='mean', ascending=False).iloc[:10]

Inspect cheapest products

In [None]:
# Ten articles with the lowest mean transaction price.
articles_prices_volumes.sort_values(by='mean').iloc[:10]

Inspect largest standard deviations in pricing

In [None]:
# Ten articles whose transaction prices have the largest standard deviation.
articles_prices_volumes.sort_values(by='std', ascending=False).iloc[:10]

Looking at article transaction price / article mean transaction price, there are some outliers far to the right

In [None]:
# Express each transaction's price relative to the mean transaction price of its article.
mean_price_by_article = articles_prices_volumes['mean']
transactions['article_mean_price'] = transactions['article_id'].map(mean_price_by_article)
transactions['article_transaction_price_pc_mean'] = transactions['price'].div(
    transactions['article_mean_price'])

fig, ax = plt.subplots()
sns.histplot(transactions['article_transaction_price_pc_mean'], binwidth=0.05, ax=ax)
ax.set_title('Distribution of transactions - price as a % of article mean transaction price')
# Only the 0-4 range of the ratio is shown; values beyond it are outside the view.
ax.set_xlim(0, 4)
show_clear_plt()

Look at article transaction price / article mean transaction price by channel

In [None]:
# Same price-to-article-mean ratio as above, drawn separately for each sales channel.
fig, ax = plt.subplots()
for channel, colour in (('online', 'Blue'), ('store', 'Orange')):
    is_channel = transactions['sales_channel_name'] == channel
    sns.histplot(transactions.loc[is_channel, 'article_transaction_price_pc_mean'],
                 binwidth=0.05,
                 color=colour,
                 ax=ax)
ax.legend(['online', 'store'])
ax.set_title('Distribution of Transaction Price vs Article Mean Transaction Price by channel')
show_clear_plt()

Look at sales vs price trends for some popular items

In [None]:
# For each of the first CONFIG.EXAMPLE_LIMIT articles in article_freq, compare weekly
# sales volume (black, left axis) with weekly mean price (red, right axis).
for a in article_freq.index[:CONFIG.EXAMPLE_LIMIT]:
    # Mean price and number of transactions per t_dat value for this article.
    cust_t_count = (transactions[transactions['article_id'] == a]
                    .groupby(['t_dat'])['price']
                    .agg(['mean', 'count']))

    fig, axes = plt.subplots(figsize=(12, 6))
    ax2 = axes.twinx()

    # 'W' is the canonical weekly alias; the lowercase spelling is deprecated in
    # recent pandas releases.
    weekly_price = cust_t_count['mean'].resample('W').mean()
    sns.lineplot(x=weekly_price.index,
                 y=weekly_price.values,
                 linewidth=2,
                 ax=ax2,
                 color='Red')

    weekly_sales = cust_t_count['count'].resample('W').sum()
    sns.lineplot(x=weekly_sales.index,
                 y=weekly_sales.values,
                 linewidth=4,
                 ax=axes,
                 color='Black')

    axes.set_ylabel('Weekly sales')
    # Label the primary axes explicitly: twinx() hides the twin's x-axis, so a
    # pyplot-level xlabel (which targets the most recently created axes) is not drawn.
    axes.set_xlabel('Date')
    ax2.set_ylim(0, )
    ax2.set_ylabel('Weekly mean Price')
    axes.set_title(f'{article_dict[a]} Weekly price (red) vs volume (black)')

    # One legend covering the line on each axis; it is attached to the twin, which is
    # drawn on top of the primary axes.
    ax2.legend(handles=[ax2.lines[-1], axes.lines[-1]],
               labels=['Weekly mean price', 'Weekly sales'])

    show_clear_plt()
    # Release the figure so that figures do not accumulate across iterations.
    plt.close(fig)

# Transactions vs Article Features

In [None]:
# Number of transactions recorded for each article; articles that never appear in
# the transactions get 0.
article_sales = transactions.groupby('article_id')['t_dat'].count()
articles['sales'] = articles['article_id'].map(article_sales).fillna(value=0)

# The mean of a boolean mask is the fraction of True rows; format it as a percentage
# so the printed value matches the wording of the message.
no_sales_share = (articles['sales'] == 0.0).mean()
print(f'percent of articles with no transactions {no_sales_share:.2%}')

Contributions to total transactions by category

In [None]:
# Article category columns for which the share of total transactions is charted.
count_columns = [
    'product_type_name',
    'product_group_name',
    'department_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
]

# One horizontal bar chart per column, stacked vertically in a single figure.
n_panels = len(count_columns)
fig, axes = plt.subplots(nrows=n_panels, figsize=(12, 6 * n_panels))

total_sales = articles['sales'].sum()
for ax, cc in zip(axes, count_columns):
    # Percentage of all transactions contributed by each category of this column,
    # limited to the largest CONFIG.EXAMPLE_LIMIT categories.
    share_pct = articles.groupby([cc])['sales'].sum() / total_sales * 100
    top_share = share_pct.sort_values(ascending=False).iloc[:CONFIG.EXAMPLE_LIMIT]

    ax.barh(y=top_share.index, width=top_share.values, color='Green')
    ax.set_title(f'percentage transaction counts {cc}')
    ax.set_xlabel('percent of total transactions')
show_clear_plt()

Contributions to total transactions - categories more related to the colour / design

In [None]:
# Colour / design related article columns for which the share of total transactions
# is charted.
count_columns = [
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
]

# One horizontal bar chart per column, stacked vertically in a single figure.
n_panels = len(count_columns)
fig, axes = plt.subplots(nrows=n_panels, figsize=(12, 6 * n_panels))

total_sales = articles['sales'].sum()
for ax, cc in zip(axes, count_columns):
    # Percentage of all transactions contributed by each category of this column,
    # limited to the largest CONFIG.EXAMPLE_LIMIT categories.
    share_pct = articles.groupby([cc])['sales'].sum() / total_sales * 100
    top_share = share_pct.sort_values(ascending=False).iloc[:CONFIG.EXAMPLE_LIMIT]

    ax.barh(y=top_share.index, width=top_share.values, color='Green')
    ax.set_title(f'percentage transaction counts {cc}')
    ax.set_xlabel('percent of total transactions')
show_clear_plt()

Seasonality by article category (examples - not all categories)

We can see types of clothing and even some colours (maybe white, pink, green) could be seasonal

There may also be some longer-term trends in colour: for example, Blue looks less popular in 2020 while Green looks more popular. Without further analysis it is not clear whether this is an overall trend or is driven by a few specific top-selling products.

In [None]:
# Article columns used as examples when looking for seasonal patterns.
seasonality_example_columns = [
    'product_group_name',
    'perceived_colour_master_name',
    'garment_group_name',
]

for cc in seasonality_example_columns:
    # Largest categories of this column by share of total sales.
    vc = articles.groupby([cc])['sales'].sum() / articles['sales'].sum()
    vc = vc.sort_values(ascending=False)[:CONFIG.EXAMPLE_LIMIT]

    # Tag every transaction with its article's category for the current column.
    category_by_article = dict(zip(articles['article_id'], articles[cc]))
    transactions['article_cat'] = transactions['article_id'].map(category_by_article)

    for v in vc.index.tolist():
        # Transactions per t_dat value for this category.
        cust_t_count = (transactions[transactions['article_cat'] == v]
                        .groupby(['t_dat'])['customer_id']
                        .count()
                        .sort_index())

        fig, axes = plt.subplots(figsize=(12, 6))

        # 'W' is the canonical weekly alias; the lowercase spelling is deprecated in
        # recent pandas releases.
        temp = cust_t_count.resample('W').sum()
        sns.lineplot(x=temp.index,
                     y=temp.values,
                     linewidth=5,
                     color='Black',
                     ax=axes)
        axes.set_title(f'Category {cc} - {v} Weekly transaction volumes')
        axes.set_ylabel('Weekly transaction count')
        axes.set_xlabel('Date')
        show_clear_plt()
        # Release the figure so that figures do not accumulate across the nested loops.
        plt.close(fig)

Displaying seasonality using cyclic feature with 2 examples

In [None]:
# Tag every transaction with the product type of its article.
product_type_by_article = dict(zip(articles['article_id'], articles['product_type_name']))
transactions['article_cat'] = transactions['article_id'].map(product_type_by_article)

# For two example product types, draw one point per week at
# (week_sin * count, week_cos * count), so the distance from the origin grows with
# that week's transaction count.
fig, ax = plt.subplots()
for example, colour in (('Sweater', 'Red'), ('Dress', 'Blue')):
    cust_t_count = (transactions[transactions['article_cat'] == example]
                    .groupby(['week', 'week_sin', 'week_cos'], as_index=False)['customer_id']
                    .count()
                    .sort_index())
    ax.scatter(x=cust_t_count['week_sin'] * cust_t_count['customer_id'],
               y=cust_t_count['week_cos'] * cust_t_count['customer_id'],
               color=colour,
               s=100)

# we can see a pattern indicative of summer-winter cycle
ax.legend(['Sweater', 'Dress'])
ax.set_title('Sweater / Dress sales around year (as cycle)')
ax.set_xlabel('week of year - sin')
ax.set_ylabel('week of year - cos')
show_clear_plt()

# Customer purchases by Ladies / Mens wear etc

What can we deduce about customers from the purchases - example, using index_group_name

Are customers (for example) more likely to purchase from Ladieswear in the future, if most past purchases have been from that department?

In [None]:
# Tag every transaction with the index group of its article.
index_group_by_article = dict(zip(articles['article_id'], articles['index_group_name']))
transactions['index_group_name'] = transactions['article_id'].map(index_group_by_article)

# Column order for the per-customer table.
order = ['Ladieswear', 'Divided', 'Sport', 'Baby/Children', 'Menswear']

# One row per customer, one column per index group, holding the number of purchases
# (0 where the customer bought nothing from that group).
purchases_per_group = transactions.groupby(['customer_id', 'index_group_name'])['article_id'].count()
customer_split = purchases_per_group.unstack('index_group_name').fillna(value=0)[order]
customer_split.head(10)

A simple analysis is just to look at correlations

In [None]:
# Correlation between index groups of the per-customer purchase counts, over all customers.
corr = customer_split.corr()

fig, axes = plt.subplots(figsize=(13, 7))
heatmap_options = dict(annot=True, fmt=".2f", cmap='seismic_r', vmin=0, vmax=1, linewidth=1)
sns.heatmap(corr, ax=axes, **heatmap_options)
axes.set_title('Customer Purchase Correlations')
show_clear_plt()

Look at customers with >1 purchase and <41 purchases

In [None]:
# Restrict to customers whose total purchases across the index groups is more than 1
# and fewer than 41 before computing the correlations.
items_per_customer = customer_split.sum(axis=1)
in_range = (items_per_customer > 1) & (items_per_customer < 41)
corr = customer_split[in_range].corr()

fig, axes = plt.subplots(figsize=(13, 7))
heatmap_options = dict(annot=True, fmt=".2f", cmap='seismic_r', vmin=-0.2, vmax=1, linewidth=1)
sns.heatmap(corr, ax=axes, **heatmap_options)
axes.set_title('Customer Purchase Correlations, for customers buying 2-40 items')
show_clear_plt()

Check correlation if we reduce to a Y/N (0 or 1)

In [None]:
# Binary version of the purchase table: 1 if the customer bought anything from the
# index group, 0 otherwise. The original counts in customer_split are left untouched.
customer_split_ = customer_split.copy()
customer_split_[:] = np.where(customer_split_[:]==0, 0, 1)
corr = customer_split_.corr()

fig, axes = plt.subplots(figsize=(13, 7))
heatmap_options = dict(annot=True, fmt=".2f", cmap='seismic_r', vmin=-0.2, vmax=1, linewidth=1)
sns.heatmap(corr, ax=axes, **heatmap_options)
axes.set_title('Customer Purchase Correlations, binary Y/N')
show_clear_plt()

Check correlation if we reduce to a Y/N (0 or 1) and filter to customers buying 2-40 items

In [None]:
# Binary purchase flags, restricted to customers whose total purchase count (taken
# from the unbinarised customer_split) is more than 1 and fewer than 41.
items_per_customer = customer_split.sum(axis=1)
in_range = (items_per_customer > 1) & (items_per_customer < 41)
corr = customer_split_[in_range].corr()

fig, axes = plt.subplots(figsize=(13, 7))
heatmap_options = dict(annot=True, fmt=".2f", cmap='seismic_r', vmin=-0.2, vmax=1, linewidth=1)
sns.heatmap(corr, ax=axes, **heatmap_options)
axes.set_title('Customer Purchase Correlations, binary Y/N, for customers buying 2-40 items')
show_clear_plt()

# Items Commonly Bought by the same Customer

Review some examples (based on a subset of the data) of items most commonly found together within a customer's purchase history

In [None]:
# Free the customers table to reduce memory pressure; it is not referenced by the
# cells that follow. The guard keeps this cell re-runnable: an unconditional `del`
# raises NameError the second time the cell is executed.
if 'customers' in globals():
    del customers
gc.collect()

# Per-row totals: how many transactions the row's article and the row's customer
# have overall. They are used below to filter to common articles / active customers.
transactions['article_total'] = transactions['article_id'].map(transactions['article_id'].value_counts())
transactions['customer_total'] = transactions['customer_id'].map(transactions['customer_id'].value_counts())

In [None]:
# Downcast the numeric columns of transactions to smaller dtypes via the helper
# defined at the top of the notebook, and rebind the name to what it returns.
# NOTE(review): the helper can cast float columns to float16, so price-derived
# columns may lose precision from here on - confirm this is acceptable.
transactions = reduce_memory_usage(df=transactions, columns=transactions.columns)

To avoid out of memory, need to filter to most common items (pending finding a more efficient approach)

In [None]:
# Thresholds for the co-purchase analysis: keep rows whose article has more than
# FILTER transactions and whose customer has more than CUST_FILTER transactions.
FILTER = 5000
CUST_FILTER = 3

frequent_rows = ((transactions['article_total'] > FILTER) &
                 (transactions['customer_total'] > CUST_FILTER))

# The mean of a boolean mask is the fraction of True rows. It is computed vectorised,
# unlike the builtin sum(), which iterates the whole mask element by element.
pc_rows = frequent_rows.mean()

# Format as a percentage so the value matches the '%' in the message.
print(f'% of rows to include based on product volume filter - {pc_rows:.2%}')

In [None]:
# Purchase history (list of article ids) per customer, restricted to common articles
# and to customers with enough transactions.
frequent_rows = ((transactions['article_total'] > FILTER) &
                 (transactions['customer_total'] > CUST_FILTER))
cust_hist = transactions[frequent_rows].groupby(['customer_id'])['article_id'].apply(list)
print(len(cust_hist))

# Count how many customers bought each unordered pair of distinct articles.
# Pairs are streamed straight into the Counter rather than first being collected in
# per-customer lists and one flat list, which keeps peak memory far lower.
counter = Counter()
for purchased in tqdm(cust_hist.values):
    # De-duplicate and sort so each pair always gets the same 'a_b' key (a < b).
    distinct_articles = sorted(set(purchased))
    counter.update(f'{a}_{b}' for a, b in itertools.combinations(distinct_articles, 2))

# Total number of (customer, pair) occurrences counted.
print(sum(counter.values()))

Let's look at what these products are (description + image)

The results look quite logical; however, some of these pairs may make more sense as joint purchases (bought at the same time) than as predictors of a future purchase.

Copied the image path code from the notebook below + some edits

https://www.kaggle.com/gpreda/h-m-eda-and-prediction

In [None]:
image_path = "/kaggle/input/h-and-m-personalized-fashion-recommendations/images/"

# Rank the pairs once. most_common(n) returns the same leading entries as
# most_common()[:n] without re-sorting the whole counter on every iteration, and it
# simply returns fewer entries when fewer than n pairs exist.
top_pairs = counter.most_common(CONFIG.EXAMPLE_LIMIT)

for pair_key, _pair_count in top_pairs:
    # Keys have the form '<article_a>_<article_b>'.
    sel_articles = pair_key.split('_')

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 5))

    for ax, a in zip(axes.ravel(), sel_articles):
        # Image files are named by the article id zero-padded to 10 characters and
        # stored in a folder named after its first three characters.
        article_id = str(a).zfill(10)
        ax.axis('off')

        try:
            image = Image.open(f"{image_path}{article_id[:3]}/{article_id}.jpg")
            ax.imshow(image)
        except OSError:
            # Missing or unreadable image file: report it and leave the panel empty.
            print(f'image not found for article {a}')
        ax.set_title(str(a), fontsize=18)

    plt.suptitle(sel_articles, fontsize=24)

    show_clear_plt()
    # Release the figure so that figures do not accumulate across iterations.
    plt.close(fig)

gc.collect()

# Treemaps of Transactions / Articles

In [None]:
# Treemap of each article's contribution to total transactions, nested by index
# group -> product type -> colour group. Articles without sales are excluded.
# `width` is also read by the treemap cells that follow.
width = 800

sold_articles = articles[articles['sales'] > 0]
fig = px.treemap(sold_articles,
                 path=[px.Constant("Total"),
                       'index_group_name',
                       'product_type_name',
                       'colour_group_name'],
                 values='sales',
                 # `labels` takes a dict mapping column names to display names
                 # (it was previously given a bare string).
                 labels={'sales': 'transactions'},
                 color='index_group_name',
                 color_discrete_sequence=px.colors.qualitative.Pastel2)

fig.update_layout(title=dict(text='<b>Contribution to Transactions by index group <br>name, product type name, color group',
                             font=dict(family="Arial", size=24, color='#000000')),
                  margin=dict(l=20, r=20, t=100, b=20),
                  height=800,
                  width=width,
                  font=dict(family="Arial Black", size=16, color='#000000'))

# Thin grey borders between the tiles.
fig.update_traces(marker_line_width=1, marker_line_color='grey')
fig.show()

We can see that trousers are steadily popular throughout the whole year, while T-shirts and Dresses, for example, are more popular in Q2/Q3 (presumably summer)

Sweater and jacket are more popular in Q4/Q1 (presumably winter)

In [None]:
# Treemap of transaction counts by product type, split and coloured by quarter.
product_type_by_article = dict(zip(articles['article_id'], articles['product_type_name']))
transactions['product_type_name'] = transactions['article_id'].map(product_type_by_article)

# Number of transactions per (product type, quarter).
summary = transactions.groupby(['product_type_name', 'quarter'],
                               as_index=False)['customer_id'].count()

fig = px.treemap(summary,
                 path=[px.Constant("Total"),
                       'product_type_name',
                       'quarter'],
                 values='customer_id',
                 color='quarter',
                 color_continuous_scale='twilight',
                 # `labels` takes a dict mapping column names to display names
                 # (it was previously given a bare string). The counted column holds
                 # transaction counts, so label it accordingly.
                 labels={'customer_id': 'transactions'})

fig.update_layout(title=dict(text='<b>Contribution to Transactions <br>by Product Type / Quarter',
                             font=dict(family="Arial", size=24, color='#000000')),
                  margin=dict(l=20, r=20, t=100, b=20),
                  height=800,
                  width=width,
                  font=dict(family="Arial Black", size=16, color='#000000'))

# Thin grey borders between the tiles.
fig.update_traces(marker_line_width=1, marker_line_color='grey')
fig.show()

There seem to be some seasonal colour trends, with lighter colours more popular in Q2/Q3.

Red seems to be particularly popular in Q4 (festive?)

In [None]:
# Treemap of transaction counts by perceived colour (master), split and coloured by quarter.
colour_by_article = dict(zip(articles['article_id'], articles['perceived_colour_master_name']))
transactions['perceived_colour_master_name'] = transactions['article_id'].map(colour_by_article)

# Number of transactions per (perceived colour, quarter).
summary = transactions.groupby(['perceived_colour_master_name', 'quarter'],
                               as_index=False)['customer_id'].count()

fig = px.treemap(summary,
                 path=[px.Constant("Total"),
                       'perceived_colour_master_name',
                       'quarter'],
                 values='customer_id',
                 color='quarter',
                 color_continuous_scale='twilight',
                 # `labels` takes a dict mapping column names to display names
                 # (it was previously given a bare string). The counted column holds
                 # transaction counts, so label it accordingly.
                 labels={'customer_id': 'transactions'})

fig.update_layout(title=dict(text='<b>Contribution to Transactions <br>by Product Perceived Colour / Quarter',
                             font=dict(family="Arial", size=24, color='#000000')),
                  margin=dict(l=20, r=20, t=100, b=20),
                  height=800,
                  width=width,
                  font=dict(family="Arial Black", size=16, color='#000000'))

# Thin grey borders between the tiles.
fig.update_traces(marker_line_width=1, marker_line_color='grey')
fig.show()

It is hard to pick out obvious age trends below.

30s-40s looks under-represented in Menswear, and Baby/Children is more popular with customers in range 30-50.

In [None]:
# Treemap of transaction counts by index group, split and coloured by customer age decade.
index_group_by_article = dict(zip(articles['article_id'], articles['index_group_name']))
transactions['index_group_name'] = transactions['article_id'].map(index_group_by_article)

# Number of transactions per (index group, age decade).
summary = transactions.groupby(['index_group_name', 'age_decade'],
                               as_index=False)['customer_id'].count()

fig = px.treemap(summary,
                 path=[px.Constant("Total"),
                       'index_group_name',
                       'age_decade'],
                 values='customer_id',
                 color='age_decade',
                 color_discrete_sequence=px.colors.qualitative.Pastel2,
                 # `labels` takes a dict mapping column names to display names
                 # (it was previously given a bare string). The counted column holds
                 # transaction counts, so label it accordingly.
                 labels={'customer_id': 'transactions'})

fig.update_layout(title=dict(text='<b>Contribution to Transactions <br>by Customer Age Bracket',
                             font=dict(family="Arial", size=24, color='#000000')),
                  margin=dict(l=20, r=20, t=100, b=20),
                  height=800,
                  width=width,
                  font=dict(family="Arial Black", size=16, color='#000000'))

# Thin grey borders between the tiles.
fig.update_traces(marker_line_width=1, marker_line_color='grey')
fig.show()

Most product types see the majority of sales online.

Socks is an exception with store purchases >50% (impulse purchase?)

In [None]:
# Treemap of transaction counts by product type, split and coloured by sales channel.
# Relies on the 'product_type_name' column added to transactions by an earlier cell.
summary = transactions.groupby(['product_type_name', 'sales_channel_name'],
                               as_index=False)['customer_id'].count()

fig = px.treemap(summary,
                 path=[px.Constant("Total"),
                       'product_type_name',
                       'sales_channel_name'],
                 values='customer_id',
                 color='sales_channel_name',
                 color_discrete_sequence=px.colors.qualitative.Pastel2,
                 # `labels` takes a dict mapping column names to display names
                 # (it was previously given a bare string). The counted column holds
                 # transaction counts, so label it accordingly.
                 labels={'customer_id': 'transactions'})

fig.update_layout(title=dict(text='<b>Contribution to Transactions <br>by Product Type / Channel Name',
                             font=dict(family="Arial", size=30, color='#000000')),
                  margin=dict(l=20, r=20, t=100, b=20),
                  height=800,
                  width=width,
                  font=dict(family="Arial Black", size=16, color='#000000'))

# Thin grey borders between the tiles.
fig.update_traces(marker_line_width=1, marker_line_color='grey')
fig.show()

Age group 30-40 is the 2nd largest demographic for Online but 4th largest for Store - this age group seems relatively less inclined to shop in person

In [None]:
# Treemap of transaction counts by sales channel, split by customer age decade and
# coloured by channel.
summary = transactions.groupby(['age_decade', 'sales_channel_name'],
                               as_index=False)['customer_id'].count()

fig = px.treemap(summary,
                 path=[px.Constant("Total"),
                       'sales_channel_name',
                       'age_decade'],
                 values='customer_id',
                 color='sales_channel_name',
                 color_discrete_sequence=px.colors.qualitative.Pastel2,
                 # `labels` takes a dict mapping column names to display names
                 # (it was previously given a bare string). The counted column holds
                 # transaction counts, so label it accordingly.
                 labels={'customer_id': 'transactions'})

# The title names the dimensions actually plotted (channel, then age decade); it
# previously repeated the product-type title of the preceding chart.
fig.update_layout(title=dict(text='<b>Contribution to Transactions <br>by Channel Name / Customer Age Bracket',
                             font=dict(family="Arial", size=30, color='#000000')),
                  margin=dict(l=20, r=20, t=100, b=20),
                  height=800,
                  width=width,
                  font=dict(family="Arial Black", size=16, color='#000000'))

# Thin grey borders between the tiles.
fig.update_traces(marker_line_width=1, marker_line_color='grey')
fig.show()

# Image Sampling for various Categories

Copied the code from the notebook below + some edits

https://www.kaggle.com/gpreda/h-m-eda-and-prediction


In [None]:
def plot_image_samples(image_article_df, col_name, cols=3, max_examples=10, max_images=10,
                       image_path="/kaggle/input/h-and-m-personalized-fashion-recommendations/images/"):
    """Show sample article images for the most common values of a category column.

    For each of the ``max_examples`` most frequent values of ``col_name``, the
    matching rows are shuffled (seeded by the value's rank, so output is
    repeatable) and the images of up to ``max_images`` distinct articles are drawn
    in a grid ``cols`` wide, one figure per value.

    Parameters
    ----------
    image_article_df : DataFrame
        Must contain ``col_name`` and an ``article_id`` column.
    col_name : str
        Category column whose most common values are illustrated.
    cols : int
        Number of grid columns per figure.
    max_examples : int
        Number of category values to illustrate.
    max_images : int
        Maximum number of article images per category value.
    image_path : str
        Root folder of the images; files are looked up as
        ``<image_path><first 3 chars of id>/<id>.jpg`` with the id zero-padded to
        10 characters.

    Returns
    -------
    None. Figures are displayed via ``show_clear_plt``; a missing or unreadable
    image is reported with a printed message and its panel is left empty.
    """
    # extract list of top unique entries
    unique_entries = image_article_df[col_name].value_counts().sort_values(ascending=False).index[
                     :max_examples].tolist()

    for count, u in enumerate(unique_entries):
        _df = image_article_df.loc[image_article_df[col_name] == u].sample(frac=1.0, random_state=count)

        sel_articles = _df['article_id'].unique().tolist()[:max_images]
        if not sel_articles:
            # Nothing to draw (e.g. max_images=0); a zero-row grid cannot be created.
            continue
        nr = math.ceil(len(sel_articles) / cols)

        # squeeze=False always returns a 2-D array of axes, so ravel() also works
        # for a 1x1 grid.
        fig, axes = plt.subplots(nrows=nr, ncols=cols, figsize=(8 * cols, 4 + 3 * nr),
                                 squeeze=False)
        panels = axes.ravel()

        # Hide the frame of every panel, including grid cells left without an image.
        for ax in panels:
            ax.axis('off')

        for ax, a in zip(panels, sel_articles):
            article_id = str(a).zfill(10)

            try:
                image = Image.open(f"{image_path}{article_id[:3]}/{article_id}.jpg")
                ax.imshow(image)
            except OSError:
                print(f'image not found for article {a}')
            # f-string rather than `u + str(a)`: works for non-string category
            # values and separates the category from the article id.
            ax.set_title(f'{u} {a}', fontsize=34)
        plt.suptitle(u, fontsize=50)

        show_clear_plt()
        # Release the figure so that figures do not accumulate across categories.
        plt.close(fig)


The code cycles through categories and provides random examples of some of the categorisations using the images to provide a better understanding of how the terminology relates to products.

In [None]:
# Article category columns to illustrate with sample images.
count_columns = [
    'product_type_name',
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'department_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
]

# For every column: the 5 most common values, up to 5 article images each, laid out
# on a grid 5 columns wide.
sample_settings = dict(cols=5, max_examples=5, max_images=5)
for cc in count_columns:
    plot_image_samples(articles, cc, **sample_settings)