In [None]:
import gc

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from PIL import Image

# Load All Required Data

In [None]:
# Single place to change if the competition data is mounted elsewhere.
DATA_DIR = '../input/h-and-m-personalized-fashion-recommendations'

# Three source tables: article metadata, customer metadata and the purchase log.
articles_df = pd.read_csv(f'{DATA_DIR}/articles.csv')
customers_df = pd.read_csv(f'{DATA_DIR}/customers.csv')
transactions_df = pd.read_csv(f'{DATA_DIR}/transactions_train.csv')

In [None]:
# Quick look at the articles table: dimensions first, then the first rows.
print(articles_df.shape)
display(articles_df.head())

In [None]:
# Quick look at the customers table: dimensions first, then the first rows.
print(customers_df.shape)
display(customers_df.head())

In [None]:
# Quick look at the transactions table: dimensions first, then the first rows.
print(transactions_df.shape)
display(transactions_df.head())

# Articles EDA

In [None]:
# Column dtypes and non-null counts for the articles table.
articles_df.info()

In [None]:
# (rows, columns) of the articles table.
articles_df.shape

In [None]:
# Column names available in the articles table.
articles_df.columns


In [None]:
# Number of distinct values per column; the EDA below starts with the
# columns that have the fewest distinct values.
articles_df.nunique()

## `index_group_name` column

In [None]:
# Number of articles in each index group.
fig, ax = plt.subplots(figsize=(12, 9))
sns.countplot(x='index_group_name', data=articles_df, ax=ax)
plt.show()

* `index_group_name` and `index_group_no` carry the same information (checked in the next cell).
* `index_group_no` can be treated as a label encoding of `index_group_name`.
* Ladieswear has the highest number of available products.
* Sportswear has the fewest.

In [None]:
# For every index_group_no, list the index_group_name values it maps to.
for group_no in (1, 2, 3, 4, 26):
    matches_group = articles_df['index_group_no'] == group_no
    print(articles_df.loc[matches_group, 'index_group_name'].unique())

## `index_name` grouped with `index_group_name`

In [None]:
# Stacked horizontal histogram: one bar per index group, split by index_name.
fig = plt.figure(figsize=(20, 10))
sns.histplot(data=articles_df, y='index_group_name', hue='index_name', multiple='stack', shrink=.5)
# plt.show() is used instead of fig.show(): Figure.show() needs a GUI backend
# and only emits a warning under a non-GUI (inline notebook) backend.
plt.show()

* Ladieswear has 3 sections
* Baby/Children has 4 sections
* All other groups have 1 section each

## Perceived Colour Count

In [None]:
# Number of articles per perceived colour value.
fig, ax = plt.subplots(figsize=(12, 9))
sns.countplot(x='perceived_colour_value_name', data=articles_df, ax=ax)
plt.show()

## Garment Type

In [None]:
# Stacked horizontal histogram: one bar per garment group, split by index group.
fig = plt.figure(figsize=(20, 10))
sns.histplot(data=articles_df, y='garment_group_name', hue='index_group_name', multiple='stack', shrink=.5)
# plt.show() is used instead of fig.show(): Figure.show() needs a GUI backend
# and only emits a warning under a non-GUI (inline notebook) backend.
plt.show()

## Product Group Count Plot

In [None]:
# Number of articles in each product group (horizontal bars).
fig = plt.figure(figsize=(20, 10))
sns.countplot(data=articles_df, y='product_group_name')
# plt.show() is used instead of fig.show(): Figure.show() needs a GUI backend
# and only emits a warning under a non-GUI (inline notebook) backend.
plt.show()

## Top 20 Products in Inventory

In [None]:
# The 20 product names that occur most often in the articles table.
fig, ax = plt.subplots(figsize=(20, 10))
top_20_name_counts = articles_df['prod_name'].value_counts().head(20)
top_20_name_counts.plot.barh(ax=ax)
plt.show()

# Customers EDA

In [None]:
# Bare expression: the notebook renders its default preview of the frame.
customers_df

In [None]:
# Column dtypes and non-null counts for the customers table.
customers_df.info()

In [None]:
# Summary statistics for the numeric customer columns.
customers_df.describe()

## Customers Age Distribution

In [None]:
# Age histogram (70 bins, KDE overlay), coloured by fashion news frequency.
fig, ax = plt.subplots(figsize=(12, 9))
sns.histplot(
    data=customers_df,
    x='age',
    hue='fashion_news_frequency',
    bins=70,
    kde=True,
    ax=ax,
)
plt.show()

* There are two prominent peaks in the age distribution, at `20-30` and `45-60`.
* The fashion news frequency groups peak in the same age ranges.
* The Monthly and None categories are so small that they are not visible in the graph.

In [None]:
# Number of customers per fashion news frequency setting.
news_frequency_counts = customers_df['fashion_news_frequency'].value_counts()
news_frequency_counts.plot.barh()
plt.show()

In [None]:
# Number of customers per club membership status.
member_status_counts = customers_df['club_member_status'].value_counts()
member_status_counts.plot.barh()
plt.show()

In [None]:
# Customers per postal code, most frequent first (descending is the default order).
customers_df['postal_code'].value_counts()

* Almost all customers share a single postal code: `2c29ae653a9282cce4151bd87643c907644e09541abc28ae87dea0d1f6603b1c`

# Transaction Data EDA

In [None]:
# First rows of the transactions table.
transactions_df.head()

In [None]:
# Column dtypes and memory footprint of the transactions table.
transactions_df.info()

In [None]:
# Convert t_dat to pandas datetimes (the CSV was read without parse_dates),
# so the date-based grouping and date-part extraction below work.
transactions_df['t_dat'] = pd.to_datetime(transactions_df['t_dat'])

## Transactions Data as a Time Series

In [None]:
# Daily revenue: sum of `price` over all transactions on each date.
time_series_df = transactions_df.groupby('t_dat')['price'].sum().reset_index()
# Vectorised .dt accessors replace the per-row Python lambdas.
time_series_df['year'] = time_series_df['t_dat'].dt.year
time_series_df['month'] = time_series_df['t_dat'].dt.month
time_series_df['day_of_week'] = time_series_df['t_dat'].dt.dayofweek  # Monday=0 ... Sunday=6
time_series_df

In [None]:
# Interactive line chart of daily revenue; the range slider allows zooming
# into a date window.
fig = (
    px.line(time_series_df, x='t_dat', y='price', title='Sales as Time Series')
    .update_xaxes(rangeslider_visible=True)
)
fig.show()

* Big spikes in April, September, and November.

In [None]:
# Number of transactions per sales channel (channel 2 has more).
fig, ax = plt.subplots(figsize=(12, 4))
channel_counts = transactions_df['sales_channel_id'].value_counts()
channel_counts.plot.barh(color='skyblue', ax=ax)
plt.show()

## It's time to merge articles with transactions data and see the sales of each product
* Don't merge the two dataframes directly.
* First group the transactions by `article_id`.
* Then merge the result with `articles_df`; otherwise you will run out of memory.

In [None]:
# Revenue per article (sum of `price`), then attach the article metadata.
# Aggregating before the merge keeps the join small.
prod_sales = (
    transactions_df
    .groupby('article_id')['price']
    .sum()
    .reset_index()
    .rename(columns={'price': 'total_sales'})
    .merge(articles_df, how='left', on='article_id')
)
prod_sales

In [None]:
# The 20 articles with the highest total sales, re-indexed 0..19.
top_20_articles_sold = (
    prod_sales
    .sort_values(by='total_sales', ascending=False)
    .head(20)
    .reset_index(drop=True)
)

In [None]:
def plot_images(image_article_df, rows, cols):
    """Show article product images in a ``rows`` x ``cols`` grid.

    The first ``rows * cols`` rows of ``image_article_df`` are drawn in
    row-major order, each panel titled with the row's ``prod_name``.

    Parameters
    ----------
    image_article_df : pandas.DataFrame
        Must contain ``article_id`` and ``prod_name`` columns. Rows are read by
        position, so the index does not need to be 0..n-1.
    rows, cols : int
        Grid dimensions. If the frame has fewer than ``rows * cols`` rows, the
        unused panels are left blank.

    Raises
    ------
    FileNotFoundError
        Propagated from ``Image.open`` when an image file does not exist.
    """
    # squeeze=False always returns a 2-D axes array, so rows == 1 or cols == 1
    # no longer breaks the indexing.
    _, axes = plt.subplots(rows, cols, figsize=(25, 15), squeeze=False)
    article_ids = image_article_df['article_id']
    prod_names = image_article_df['prod_name']
    n_items = len(image_article_df)
    for k, axis in enumerate(axes.flat):
        if k >= n_items:
            # Fewer articles than grid cells: blank the panel instead of raising.
            axis.axis('off')
            continue
        article_id = str(article_ids.iloc[k])
        # Path layout: images/0<first two chars of id>/0<id>.jpg
        # NOTE(review): presumably article_id was parsed as an integer and lost
        # a leading zero -- confirm against the image folder layout.
        file_path = f"../input/h-and-m-personalized-fashion-recommendations/images/0{article_id[:2]}/0{article_id}.jpg"
        # The context manager closes the file handle once the image is drawn.
        with Image.open(file_path) as image:
            axis.imshow(image)
        axis.set_title(prod_names.iloc[k])
    plt.show()

## Top 20 Products in Sales

In [None]:
# 4 x 5 grid = the 20 best-selling articles.
plot_images(top_20_articles_sold, rows=4, cols=5)

## Percentage of Sales by Index Group, Coloured by `index_name`

In [None]:
# Weight every article by its total sales and normalise to percent, so each
# bar shows that index group's share of sales. Without the weights, histplot
# only counts how many distinct articles appear in prod_sales.
fig = plt.figure(figsize=(15, 9))
sns.histplot(data=prod_sales, y='index_group_name', hue='index_name', weights='total_sales',
             stat='percent', multiple='stack', shrink=0.6)
plt.show()

* Unsurprisingly, Ladieswear has high sales.
* Within the children's section, Children Sizes 92-140 has the most sales.

## Top 20 Product Sales

In [None]:
# Total sales summed per product name; keep the 20 largest for the bar chart.
fig, ax = plt.subplots(figsize=(20, 9))
top_20_prod_sales = (
    prod_sales
    .groupby('prod_name')['total_sales']
    .sum()
    .sort_values(ascending=False)
    .head(20)
    .reset_index()
)
sns.barplot(data=top_20_prod_sales, x='total_sales', y='prod_name', ax=ax)
plt.tight_layout()
plt.show()

## Log Scale Distribution of Total Sales

In [None]:
# Distribution of per-article total sales on a log-scaled x axis,
# stacked by index group.
fig, ax = plt.subplots(figsize=(12, 9))
sns.histplot(
    data=prod_sales,
    x='total_sales',
    hue='index_group_name',
    multiple='stack',
    bins=15,
    log_scale=True,
    ax=ax,
)
plt.show()