This notebook was heavily inspired by, and borrows from, the following kernel:
* https://www.kaggle.com/code/remekkinas/h-m-eda-first-look-into-data/notebook

If you have already gone through that notebook, this kernel will not offer much that is new to you.

In [None]:
from termcolor import colored
import pandas as pd
from glob import glob
import os
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import cv2
import random

In [None]:
# Switch seaborn to its 'darkgrid' style for the plots drawn after this cell.
sns.set_style('darkgrid')

In [None]:
# Load the three competition tables. article_id is parsed as a string in the
# transaction and article tables (presumably to keep leading zeros intact --
# TODO confirm against the raw CSVs).
train = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    dtype={'article_id': str},
)
article = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/articles.csv',
    dtype={'article_id': str},
)
customer = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/customers.csv',
)

# Dataset DESC
> 31M transactions, 105k unique articles, and 1M customers.

> An image is available for almost every article.

In [None]:
# Report the (rows, columns) shape of each table, highlighted in yellow.
for table_label, table in (
    ('TRANSACTIONS', train),
    ('Articles', article),
    ('Customers', customer),
):
    highlighted_shape = colored(table.shape, 'yellow')
    print(f"Number of observations in {table_label}: {highlighted_shape}")

# **ARTICLES**

# Check 
* For how many Articles do we have corresponding Images and for how many are we missing.
* Missing Articles Images category wise distribution
* Transactions involving Article with/without Images

# Inferred
* The majority of products are unique, while some are almost identical and differ only slightly in design and colour scheme.
* Images are placed in subfolders named after the first three digits of the article_id.

In [None]:
# Quick overview of the article table: column names, distinct values per
# column, a two-row preview and the number of nulls per column.
article_overview = (
    article.columns,
    article.nunique(),
    article.head(2),
    article.isnull().sum(),
)
for overview_part in article_overview:
    display(overview_part)

In [None]:
# Truncate long text cells to 40 characters when DataFrames are rendered.
pd.set_option('display.max_colwidth', 40)

In [None]:
# For every article column print: its name, the number of distinct values and
# the first five distinct values.
for column_name in article.columns:
    column = article[column_name]
    print(column_name, column.nunique(), column.unique()[:5])

In [None]:
# Index the image folder once: count the files in every sub-folder and collect
# the file names (without extension), which are compared against article_id.
images_csv = []
articles_photo = []
for folder in tqdm(glob('../input/h-and-m-personalized-fashion-recommendations/images/*')):
    ids_in_folder = [
        os.path.basename(image_file).split('.')[0]
        for image_file in glob(os.path.join(folder, '*'))
    ]
    articles_photo += ids_in_folder
    images_csv.append([os.path.basename(folder), len(ids_in_folder)])
images_csv = pd.DataFrame(images_csv, columns=['article_img', 'num_image'])

# Articles listed in the article table for which no image file was found.
missing_photo = set(article.article_id) - set(articles_photo)
# Vectorised membership test (1 = image present, 0 = missing) instead of a
# per-row Python lambda.
article['img_present'] = (~article.article_id.isin(missing_photo)).astype(int)

# The file count is taken from the scan above rather than globbing the whole
# image tree a second time.
print('We have Images for', len(articles_photo), 'Articles')
print(f"We are missing Images for {colored(len(missing_photo),'red')} Articles")
print(colored("Missing Article ID categorization","red"))
display(article.query('article_id in @missing_photo').groupby('product_group_name').article_id.nunique())

In [None]:
# Percentage of all transactions whose article is in missing_photo, and the
# percentage whose article is in articles_photo.
n_transactions = len(train)
for id_pool, caption in (
    (missing_photo, '% of Total Transactions are of Non ImageArticle'),
    (articles_photo, '% of Total Transactions are of ImageArticle'),
):
    matching_transactions = train.query('article_id in @id_pool')
    print(len(matching_transactions) * 100 / n_transactions, colored(caption, 'blue'))

# Distinct article count per product group, split by the img_present flag.
articles_by_group_and_image = article.groupby(['product_group_name', 'img_present']).article_id.nunique()
display(articles_by_group_and_image)

In [None]:
# This code was borrowed from https://www.kaggle.com/ishandutta/v7-shopee-indepth-eda-one-stop-for-all-your-needs
def getImagePaths(path):
    """
    Collect the full path of every file found under a directory tree.

    parameters: path(string) - Root directory to walk (sub-directories included)
    returns: image_names(list of string) - One full path per file, in os.walk order
    """
    return [
        os.path.join(current_dir, file_name)
        for current_dir, _, file_names in os.walk(path)
        for file_name in file_names
    ]

def display_multiple_img(images_paths, rows, cols):
    """
    Function to Display Images from Dataset in a rows x cols grid.

    Each panel is titled with the product_group_name found in the global
    `article` frame for the image's file name (without extension).

    parameters: images_paths(list of string) - Paths of Images to be displayed
                rows(int) - No. of Rows in Output
                cols(int) - No. of Columns in Output

    Only the first rows*cols paths are drawn; surplus paths are ignored without
    being read. An image that cannot be read leaves its panel empty, and an
    image whose id is not in `article` is shown without a title.
    """
    # squeeze=False always returns a 2-D array of axes, so ravel() also works
    # for a single-row, single-column or 1x1 grid.
    figure, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(16, 8), squeeze=False)
    # zip stops at the shorter sequence, so extra paths never reach cv2.
    for axis, image_path in zip(ax.ravel(), images_paths):
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable/missing file with None instead
            # of raising; skip it rather than crash in cvtColor.
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        curr_id = os.path.basename(image_path).split('.')[0]
        group_names = article.query('article_id == @curr_id').product_group_name.values
        axis.imshow(image)
        if len(group_names) > 0:
            axis.set_title(group_names[0])
        axis.set_axis_off()
    plt.tight_layout()
    plt.show()

def plot_distribution(x, data, title):
    """
    Show an 800x500 histogram built with px.histogram.

    parameters: x - Forwarded as the `x` argument of px.histogram
                data - Forwarded as the first positional argument of px.histogram
                title - Forwarded as the figure title

    NOTE(review): `px` is not imported in the visible part of this notebook
    (presumably plotly.express) -- confirm an import exists before calling.
    """
    histogram_options = {
        'x': x,
        'width': 800,
        'height': 500,
        'title': title,
    }
    fig = px.histogram(data, **histogram_options)
    fig.show()
        
def disply_multiple_img_ids(idx, rows, cols):
    """
    Display the images belonging to the given article ids in a rows x cols grid.

    The image path is built from the global `images_dir` as
    `{images_dir}/{first three characters of the id}/{id}.jpg`.

    parameters: idx(iterable) - Article ids whose images should be displayed
                rows(int) - No. of Rows in Output
                cols(int) - No. of Columns in Output

    Only the first rows*cols ids are drawn; surplus ids are ignored without
    being read. An id whose image cannot be read leaves its panel empty.
    """
    # squeeze=False always returns a 2-D array of axes, so ravel() also works
    # when the grid collapses to a single panel.
    figure, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(12, 60), squeeze=False)
    # zip stops at the shorter sequence, so ids beyond the grid are not loaded.
    for axis, im_id in zip(ax.ravel(), idx):
        image_path = f'{images_dir}/{str(im_id)[:3]}/{im_id}.jpg'
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None for a missing/unreadable file; handle that
            # explicitly instead of hiding every error behind a bare except.
            continue
        axis.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        axis.set_axis_off()
    plt.tight_layout()
    plt.show()

In [None]:
# Root folder of the product images; disply_multiple_img_ids reads this global
# when it builds image paths.
images_dir = '../input/h-and-m-personalized-fashion-recommendations/images'
# Full path of every file found under images_dir (recursive walk).
images_path = getImagePaths(images_dir)

In [None]:
# Show a random grid of product images. Sample exactly as many images as the
# grid has panels (and never more than are available), so no image is read
# without being displayed and random.sample cannot fail on a small folder.
grid_rows, grid_cols = 5, 8
sample_size = min(len(images_path), grid_rows * grid_cols)
display_multiple_img(random.sample(images_path, sample_size), grid_rows, grid_cols)

* Product Code

In [None]:
print(colored(f'Count of Unique Product Code is {article.product_code.nunique()}','blue'))
print(colored(f'Total Number of Articles {len(article)}','blue'))

# The ten product codes with the most distinct articles, largest first.
articles_per_code = article.groupby('product_code').article_id.nunique().sort_values(ascending=False)
max_panels = 20  # at most this many articles are shown per product code
for i in list(articles_per_code.index[:10]):
    tmp = article.query("product_code == @i").article_id
    # Pass only as many ids as there are panels, so the plotting helper is
    # never handed more images than it can place.
    shown_ids = tmp.iloc[:max_panels]
    disply_multiple_img_ids(shown_ids, 1, len(shown_ids))

In [None]:
# How many product codes have 1, 2, 3, ... rows in the article table.
fig, ax = plt.subplots(figsize=(15, 15))
# Pass the values as `x` explicitly: the first positional parameter of
# sns.countplot is `data` in recent seaborn releases, not `x`.
sns.countplot(x=article.product_code.value_counts(), ax=ax)
ax.set_title('Distribution of Number of Articles falling under same product code')
ax.set_xlabel('Number of Unique Articles')
plt.show()

**Product Group**

In [None]:
# Number of articles per product group, and the same figure as a percentage of
# all rows in the article table, shown side by side.
a = article['product_group_name'].value_counts()
a_len = a.div(len(article)).mul(100)
group_table = pd.concat([a, a_len], axis=1)
display(group_table)

# **Customers**

In [None]:
# Shape of the customer table (in red), followed by column names, distinct
# values per column, a two-row preview and the number of nulls per column.
print(colored(customer.shape, 'red'))
customer_overview = (
    customer.columns,
    customer.nunique(),
    customer.head(2),
    customer.isnull().sum(),
)
for overview_part in customer_overview:
    display(overview_part)

In [None]:
# For every customer column print: its name, the number of distinct values and
# the first five distinct values.
for column_name in customer.columns:
    column = customer[column_name]
    print(column_name, column.nunique(), column.unique()[:5])

In [None]:
# Fill missing values and unify the spelling of 'None'/'none'.
# Each result is assigned back to its column: calling fillna/replace with
# inplace=True on `customer.<column>` is chained assignment, which does not
# update the frame once pandas Copy-on-Write is active.
customer['FN'] = customer['FN'].fillna(0)
customer['Active'] = customer['Active'].fillna(0)
customer['club_member_status'] = customer['club_member_status'].fillna("UNK")
# Normalise the literal string 'None' first, then mark real missing values.
customer['fashion_news_frequency'] = (
    customer['fashion_news_frequency']
    .replace('None', 'none')
    .fillna("UNK")
)

In [None]:
# Fraction of all customers falling into each value of the four columns.
n_customers = len(customer)
for share_column in ('FN', 'Active', 'club_member_status', 'fashion_news_frequency'):
    value_shares = customer[share_column].value_counts() / n_customers
    display(value_shares)

We have 2 peaks at age 24 and 54.

In [None]:
# Summary statistics of customer age, rendered in fixed-point notation rather
# than scientific notation, followed by a histogram of the ages.
age_summary = customer['age'].describe()
display(age_summary.apply('{:f}'.format))
sns.histplot(customer['age'])

# **Transactions**

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
train.info()
display(train.columns,train.nunique(),train.head(2),train.isnull().sum())
# Convert the transaction date column to datetime. Item assignment is used so
# the column is always replaced explicitly.
train['t_dat'] = pd.to_datetime(train['t_dat'])

In [None]:
# Number of transactions per sales channel. The values are passed as `x`
# explicitly because the first positional parameter of sns.countplot is `data`
# in recent seaborn releases, not `x`.
sns.countplot(x=train.sales_channel_id)

In [None]:
# Number of transactions for each distinct t_dat value, in index order.
fig = plt.figure(figsize=(20, 8))
transactions_per_date = train['t_dat'].value_counts().sort_index()
plt.plot(transactions_per_date)

* Some days have more transactions than others.

In [None]:
# Summary statistics of price, computed separately for each sales channel.
price_by_channel = train.groupby('sales_channel_id')['price'].describe()
display(price_by_channel)

In [None]:
# sns.countplot(train.price)