# H&M - EDA - first look into data 



This is a very simple EDA for the H&M Personalized Fashion Recommendations competition. I created it as a quick jump-start into the competition. I hope you find it useful for your own start. Enjoy and ... have fun in the competition!

<div align="center"><img src="https://i.ibb.co/xJqnpfJ/HM.jpg"/></div>

# COMPETITION GOAL

In this competition, H&M Group invites you to develop product recommendations based on data from previous transactions, as well as from customer and product meta data. The available meta data spans from simple data, such as garment type and customer age, to text data from product descriptions, to image data from garment images.

This competition requires broad ML knowledge:
- computer vision - there are product images in dataset
- tabular data - three datasets
- NLP - product description contains interesting data ... eg. words "EXCLUSIVE" ..

In [None]:
import cv2
import numpy as np # linear algebra
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from os import listdir
from os.path import isfile, join

from termcolor import colored
from IPython.display import HTML

import warnings
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('float_format', '{:f}'.format)
warnings.filterwarnings('ignore')

In [None]:
articles = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/articles.csv")
customers = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/customers.csv")
transactions = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")

In [None]:
# Root folder of the product images; the entries directly under it are
# counted later in the notebook as the image "categories".
images_dir = '../input/h-and-m-personalized-fashion-recommendations/images'
cat_images = listdir(images_dir)

# EVALUATION METRIC

<div align="center"><img src="https://i.ibb.co/tDcYJhs/hm-metrics.jpg" width=800/></div>

# DATASET INFORMATION

In [None]:
print(f"Number of observations in ARTICLES: {colored(articles.shape, 'yellow')}")
print(f"Number of observations in CUSTOMERS: {colored(customers.shape, 'yellow')}")
print(f"Number of observations in TRANSACTIONS: {colored(transactions.shape, 'yellow')}")

* The competition dataset consists of tabular data (three datasets - Articles, Customers, Transactions) and product images
* There are three datasets in the competition:
    * Articles - 105.542 observations with 25 features
    * Customers - 1.371.980 observations with 7 features
    * Transactions - 31.788.324 observations with 5 features

# DATABASE RELATIONS
<div align="center"><img src="https://i.ibb.co/pRNSPDh/rel.jpg" width="480"/></div>

In [None]:
# This code was borrowed from https://www.kaggle.com/ishandutta/v7-shopee-indepth-eda-one-stop-for-all-your-needs
def getImagePaths(path):
    """
    Collect the full path of every file found below a directory.

    parameters: path(string) - Directory that is walked recursively
    returns: image_names(list of strings) - One joined path per file, in
             the order os.walk yields them (empty if nothing is found)
    """
    return [
        os.path.join(folder, name)
        for folder, _, names in os.walk(path)
        for name in names
    ]

def display_multiple_img(images_paths, rows, cols):
    """
    Function to Display Images from Dataset.

    Images fill a rows x cols grid in row-major order. Paths beyond the
    grid capacity are ignored without being read, and a path that OpenCV
    cannot read leaves its cell blank instead of aborting the figure.

    parameters: images_paths(iterable of strings) - Paths of Images to be displayed
                rows(int) - No. of Rows in Output
                cols(int) - No. of Columns in Output
    """
    # squeeze=False always yields a 2-D array of axes, so a 1x1 grid
    # (a bare Axes object otherwise) can be flattened like any other.
    figure, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(16, 8), squeeze=False)
    # zip stops at the shorter input: an image is only decoded when a
    # grid cell exists for it.
    for cell, image_path in zip(ax.ravel(), images_paths):
        cell.set_axis_off()
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None;
            # passing that to cvtColor would raise.
            continue
        cell.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.tight_layout()
    plt.show()

def plot_distribution(x, data, title):
    """
    Show a plotly histogram (800 x 500) of one column.

    parameters: x - Column of `data` to put on the x axis
                data - Data passed to px.histogram
                title(string) - Figure title
    """
    histogram_options = dict(x=x, width=800, height=500, title=title)
    px.histogram(data, **histogram_options).show()

In [None]:
def disply_multiple_img_ids(idx, rows, cols):
    """
    Display the images of the given article ids in a rows x cols grid.

    Each id is mapped to '<images_dir>/<first 3 chars>/<id>.jpg' after
    left-padding it with zeros to 10 characters. For the 9-digit integer
    ids produced by reading article_id as a number this is the same path
    the previous hard-coded '0' prefix built; ids kept as zero-padded
    strings now work too.
    NOTE(review): assumes image file names are 10 characters long - confirm.

    Ids beyond the grid capacity are ignored, and an id without a readable
    image leaves its cell blank instead of aborting the figure.

    parameters: idx(iterable of int or string) - Article ids to display
                rows(int) - No. of Rows in Output
                cols(int) - No. of Columns in Output
    """
    # squeeze=False always yields a 2-D array of axes, so single-row,
    # single-column and 1x1 grids can all be flattened the same way.
    figure, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(16, 8), squeeze=False)
    for cell, im_id in zip(ax.ravel(), idx):
        cell.set_axis_off()
        article = str(im_id).zfill(10)
        image_path = f'{images_dir}/{article[:3]}/{article}.jpg'

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None when the file is missing or
            # unreadable; passing that to cvtColor would raise.
            continue
        cell.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.tight_layout()
    plt.show()

In [None]:
images_path = getImagePaths(images_dir)

In [None]:
print(f"There are {colored(len(images_path), 'yellow')} number of images in dataset")

In [None]:
# A 5 x 5 grid holds 25 images, so pass exactly 25 paths; any extra
# paths would be decoded and then thrown away.
display_multiple_img(images_path[50:75], 5, 5)

# QUICK LOOK INTO DATA

## A. ARTICLES

In [None]:
articles.head(3)

In [None]:
articles.info()

In [None]:
articles.iloc[:, :-1].describe().T.sort_values(by='std' , ascending = False)\
                     .style.background_gradient(cmap='GnBu')\
                     .bar(subset=["max"], color='#F8766D')\
                     .bar(subset=["mean",], color='#00BFC4')

In [None]:
# Cardinality of the main identifier columns of the articles table.
# The first message used to say "customers dataset" although the count
# comes from `articles`.
print(f"There are {colored(articles.article_id.nunique(), 'yellow')} unique ARTICLES in articles dataset")
print(f"There are {colored(articles.product_code.nunique(), 'yellow')} unique PRODUCT CODES in dataset")
print(f"There are {colored(articles.prod_name.nunique(), 'yellow')} unique PRODUCT NAMES in dataset")
print(f"There are {colored(articles.product_type_no.nunique(), 'yellow')} unique PRODUCT TYPES in dataset")

### LET'S DISCOVER ARTICLES STRUCTURE

In [None]:
articles.query("product_code == 108775").T

In [None]:
disply_multiple_img_ids(articles.query("product_code == 108775").article_id[:3], 1, 3)

### A. PRODUCT TYPE NAME

In [None]:
# Number of articles per product type and its share of all articles (in %).
s = articles.product_type_name.value_counts()
s_len = s / len(articles.index) * 100

# Column names are set through `keys` rather than
# set_axis(..., inplace=False): the `inplace` argument was removed in
# pandas 2.0 and raises TypeError there.
res = pd.concat([s, s_len], axis=1, keys=['TOP 10 - Product Type Name', '%'])

res.head(10)

In [None]:
plot_distribution('product_type_name', articles, 'Product Type Name')

### LET'S LOOK INTO PRODUCTS
#### TROUSERS

In [None]:
disply_multiple_img_ids(articles.query("product_type_name == 'Trousers'").article_id[:5], 1, 5)

#### T-SHIRTS

In [None]:
disply_multiple_img_ids(articles.query("product_type_name == 'T-shirt'").article_id[:5], 1, 5)

### B. PRODUCT GROUP NAME

In [None]:
# Number of articles per product group and its share of all articles (in %).
s = articles.product_group_name.value_counts()
s_len = s / len(articles.index) * 100

# Column names are set through `keys` rather than
# set_axis(..., inplace=False): the `inplace` argument was removed in
# pandas 2.0 and raises TypeError there.
res = pd.concat([s, s_len], axis=1, keys=['Product Group Name', '%'])

res

In [None]:
plot_distribution('product_group_name', articles, 'Product Group Name')

#### SWIMWEAR

In [None]:
disply_multiple_img_ids(articles.query("product_group_name == 'Swimwear'").article_id[:5], 1, 5)

#### BAGS

In [None]:
disply_multiple_img_ids(articles.query("product_group_name == 'Bags'").article_id[:5], 1, 5)

#### GARMENT UPPER BODY AND ... SWEATER

In [None]:
disply_multiple_img_ids(articles.query("product_group_name == 'Garment Upper body' \
                                        and product_type_name =='Sweater'").article_id[:10], 2, 5)

### C. GRAPHICAL APPEARANCE NAME

In [None]:
# Number of articles per graphical appearance and its share of all articles (in %).
s = articles.graphical_appearance_name.value_counts()
s_len = s / len(articles.index) * 100

# Column names are set through `keys` rather than
# set_axis(..., inplace=False): the `inplace` argument was removed in
# pandas 2.0 and raises TypeError there.
res = pd.concat([s, s_len], axis=1, keys=['TOP 10 - Graphical Appearance Name', '%'])

res.head(10)

In [None]:
plot_distribution('graphical_appearance_name', articles, 'Product Appearance Name')

#### PLACEMENT PRINT

In [None]:
disply_multiple_img_ids(articles.query("graphical_appearance_name == 'Placement print'").article_id[:5], 1, 5)

#### SEQUIN

In [None]:
disply_multiple_img_ids(articles.query("graphical_appearance_name == 'Sequin'").article_id[:5], 1, 5)

#### PLACEMENT PRINT AND SWIMWEAR

In [None]:
disply_multiple_img_ids(articles.query("graphical_appearance_name == 'Placement print' \
                                        and product_group_name == 'Swimwear'").article_id[:5], 1, 5)

### D. COLOR GROUP NAME

In [None]:
# Number of articles per colour group and its share of all articles (in %).
s = articles.colour_group_name.value_counts()
s_len = s / len(articles.index) * 100

# Column names are set through `keys` rather than
# set_axis(..., inplace=False): the `inplace` argument was removed in
# pandas 2.0 and raises TypeError there.
res = pd.concat([s, s_len], axis=1, keys=['TOP 10 - Colour Group Name', '%'])

res.head(10)

In [None]:
plot_distribution('colour_group_name', articles, 'Colour Group Name')

#### OTHER TURQUOISE

In [None]:
disply_multiple_img_ids(articles.query("colour_group_name == 'Other Turquoise'").article_id[:5], 1, 5)

#### GOLD

In [None]:
disply_multiple_img_ids(articles.query("colour_group_name == 'Gold'").article_id[:5], 1, 5)

#### BUT WHAT ABOUT ... GREEN PLACEMENT PRINT SWIMWEAR

In [None]:
disply_multiple_img_ids(articles.query("graphical_appearance_name == 'Placement print' \
                                        and product_group_name == 'Swimwear' \
                                        and colour_group_name == 'Green'").article_id[:5], 1, 5)

### D. PERCEIVED COLOUR VALUE NAME

In [None]:
# Number of articles per perceived colour value and its share of all articles (in %).
s = articles.perceived_colour_value_name.value_counts()
s_len = s / len(articles.index) * 100

# Column names are set through `keys` rather than
# set_axis(..., inplace=False): the `inplace` argument was removed in
# pandas 2.0 and raises TypeError there.
# The label used to read 'TOP 10 - Colour Group Name' (copied from the
# colour-group table); it now names the column that is actually counted,
# and the whole table - not a top 10 - is shown.
res = pd.concat([s, s_len], axis=1, keys=['Perceived Colour Value Name', '%'])

res

In [None]:
plot_distribution('perceived_colour_value_name', articles, 'Perceived Colour Value Name')

#### MEDIUM DUSTY

In [None]:
disply_multiple_img_ids(articles.query("perceived_colour_value_name == 'Medium Dusty'").article_id[:5], 1, 5)

#### DARK

In [None]:
disply_multiple_img_ids(articles.query("perceived_colour_value_name == 'Dark'").article_id[:5], 1, 5)

### E. PERCEIVED COLOUR MASTER NAME

In [None]:
# Number of articles per perceived colour master and its share of all articles (in %).
s = articles.perceived_colour_master_name.value_counts()
s_len = s / len(articles.index) * 100

# Column names are set through `keys` rather than
# set_axis(..., inplace=False): the `inplace` argument was removed in
# pandas 2.0 and raises TypeError there.
res = pd.concat([s, s_len], axis=1, keys=['TOP 10 - Perceived Colour Master Name', '%'])

res

#### BLUE

In [None]:
disply_multiple_img_ids(articles.query("perceived_colour_master_name == 'Blue'").article_id[:5], 1, 5)

#### METAL

In [None]:
disply_multiple_img_ids(articles.query("perceived_colour_master_name == 'Metal'").article_id[:5], 1, 5)

### F. GARMENT GROUP NAME

In [None]:
# Number of articles per garment group and its share of all articles (in %).
s = articles.garment_group_name.value_counts()
s_len = s / len(articles.index) * 100

# Column names are set through `keys` rather than
# set_axis(..., inplace=False): the `inplace` argument was removed in
# pandas 2.0 and raises TypeError there.
res = pd.concat([s, s_len], axis=1, keys=['TOP 10 - Garment Group Name', '%'])

res

In [None]:
plot_distribution('garment_group_name', articles, 'Garement Group Name')

#### JERSEY BASIC

In [None]:
disply_multiple_img_ids(articles.query("garment_group_name == 'Jersey Basic'").article_id[:5], 1, 5)

#### OUTDOOR

In [None]:
disply_multiple_img_ids(articles.query("garment_group_name == 'Outdoor'").article_id[:5], 1, 5)

### LET'S LOOK INTO ARTICLE DESCRIPTION

#### SHOW 10 PRODUCT DESCRIPTIONS

In [None]:
HTML(pd.DataFrame(articles.detail_desc.sample(10)).to_html())

#### SHOW MOST COMMON WORDS IN DESCRIPTION

In [None]:
prod_desc = articles[articles.detail_desc.notnull()].detail_desc.sample(5000).values

In [None]:
from wordcloud import WordCloud, STOPWORDS

stopwords = set(STOPWORDS) 
wordcloud = WordCloud(width = 800, 
                      height = 800,
                      background_color ='white',
                      min_font_size = 10,
                      stopwords = stopwords,).generate(' '.join(prod_desc)) 

# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 

plt.show() 

### MORE INFORMATION ABOUT ARTICLES DATASET

In [None]:
print(f"Are there any NaN values? {colored(articles.isnull().values.any(), 'yellow')}")

In [None]:
print("NaN values dsitribution in ARTICLES dataset: ")
articles.isnull().sum(axis = 0)

## B. CUSTOMERS

In [None]:
customers.head(5)

In [None]:
customers.info()

In [None]:
customers.iloc[:, :-1].describe().T.sort_values(by='std' , ascending = False)\
                     .style.background_gradient(cmap='GnBu')\
                     .bar(subset=["max"], color='#F8766D')\
                     .bar(subset=["mean",], color='#00BFC4')

In [None]:
plot_distribution('age', customers, 'Age distribution')

In [None]:
print(f"There are {colored(customers.customer_id.nunique(), 'yellow')} unique customer_id in customers dataset")
print(f"There are {colored(customers.postal_code.nunique(), 'yellow')} unique postal codes in dataset")

In [None]:
print("Fasion frequency news for customer in dataset: ")
customers.fashion_news_frequency.value_counts()

In [None]:
plot_distribution('fashion_news_frequency', customers, 'Fasion News Frequency')

In [None]:
print("Customer status distribution in dataset: ")
customers.club_member_status.value_counts()

In [None]:
plot_distribution('club_member_status', customers, 'Club Member Status')

In [None]:
print(f"Are there any NaN values: {customers.isnull().values.any()}")

In [None]:
print("NaN values dsitribution in CUSTOMER dataset: ")
customers.isnull().sum(axis = 0)

In [None]:
print(f"How many duplicates we have in customer dataset? {colored(customers.duplicated().sum(), 'yellow')}\n")
customers[customers.duplicated()]

## C. TRANSACTIONS

In [None]:
transactions.head(5)

In [None]:
transactions.info()

In [None]:
transactions.iloc[:, :-1].describe().T.sort_values(by='std' , ascending = False)\
                     .style.background_gradient(cmap='GnBu')\
                     .bar(subset=["max"], color='#F8766D')\
                     .bar(subset=["mean",], color='#00BFC4')

In [None]:
print(f"There are {colored(transactions.customer_id.nunique(), 'yellow')} unique customer_id in dataset")
print(f"There are {colored(transactions.article_id.nunique(), 'yellow')} unique articles_id in dataset")
print(f"There are {colored(transactions.sales_channel_id.nunique(), 'yellow')} unique sales_channel_id in dataset")

In [None]:
print("Channel transaction distribution")
transactions.sales_channel_id.value_counts()

In [None]:
print(f"Are there any NaN values? {colored(transactions.isnull().values.any(), 'yellow')}")

In [None]:
print(f"How many duplicates we have in transactions dataset? {colored(transactions.duplicated().sum(), 'yellow')}\n")
transactions[transactions.duplicated()][:10]

## IMAGES DATASET

In [None]:
print(f"There are {colored(len(images_path), 'yellow')} number of images in dataset")
print(f"They are grouped into {colored(len(cat_images), 'yellow')} categories")

### IMAGE RESOLUTION DISTRIBUTION (TOP10) - FOR 1000 IMAGES (DATASET CONSISTS OF 105100 IMAGES)

In [None]:
# Only a sample of the images is decoded to collect shapes.
# The previous counter-based loop stopped after 1001 images, one more
# than the 1000 the section heading states.
sample_size = 1000
img_shapes = []
for img in images_path[:sample_size]:
    image = cv2.imread(img)
    if image is None:
        # cv2.imread returns None for a file it cannot read; such a
        # file has no shape to record.
        continue
    img_shapes.append(image.shape)

# Ten most frequent shapes among the sampled images.
df_img_shapes = pd.DataFrame({'Shapes': img_shapes})
img_shape_counts = df_img_shapes['Shapes'].value_counts().head(10)

for shape, shape_count in img_shape_counts.items():
    print(f"Shape {shape} counts: {shape_count}")

In [None]:
plt.figure(figsize=(14, 10))
sns.barplot(x = img_shape_counts.index, y = img_shape_counts.values)
plt.title("Images Dataset")

plt.show()

## WORK IN PROGRESS ...