In [None]:
#Importing libraries
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image

import gc

In [None]:
# Load the three competition tables from the Kaggle input directory.
DATA_DIR = '../input/h-and-m-personalized-fashion-recommendations'
articles = pd.read_csv(DATA_DIR + '/articles.csv')
customers = pd.read_csv(DATA_DIR + '/customers.csv')
transactions_train = pd.read_csv(DATA_DIR + '/transactions_train.csv')

# Parse the transaction date once, then expose its calendar parts as plain
# columns (`year`, `mon`, `day`) that the later cells group by.
transactions_train['t_dat'] = pd.to_datetime(transactions_train['t_dat'])
date_parts = transactions_train['t_dat'].dt
transactions_train['year'] = date_parts.year
transactions_train['mon'] = date_parts.month
transactions_train['day'] = date_parts.day

# Total Vs Transacting Customers

In [None]:
# Bucket every customer by age. `pd.cut` intervals are right-closed, so the
# buckets are (15, 18], (18, 25], (25, 30], (30, 40], (40, 50], (50, 100].
# Ages that are missing or fall outside those bins get a NaN bucket.
customers['age_bucket'] = pd.cut(customers['age'], bins=[15, 18, 25, 30, 40, 50, 100])

# Customers with at least one transaction, counted per age bucket.
# Filtering the customer table by the distinct purchasing ids yields the same
# per-bucket unique counts as merging against every transaction row, without
# building a transaction-sized intermediate frame.
active_ids = transactions_train['customer_id'].unique()
transacting = customers.loc[
    customers['customer_id'].isin(active_ids),
    ['customer_id', 'age_bucket'],
]
transacting_by_bucket = (
    transacting
    .groupby(by='age_bucket', observed=False)  # explicit: keep empty buckets
    .agg({'customer_id': 'nunique'})
    .reset_index()
)

# Both panels share the y-axis so the bar heights are directly comparable.
fig, (ax_total, ax_active) = plt.subplots(1, 2, figsize=(15, 6), sharey=True)

sns.countplot(data=customers, x='age_bucket', palette='crest', ax=ax_total)
ax_total.set(title='Total Customers', xlabel='Age Bucket', ylabel='Count of Customers')

sns.barplot(data=transacting_by_bucket, x='age_bucket', y='customer_id', palette='crest', ax=ax_active)
ax_active.set(title='Transacting Customers', xlabel='Age Bucket', ylabel='Count of Transacting Customers')

plt.show()

<h4 style="color:black;">The counts of total customers and transacting customers are almost the same.</h4>

# Article Group - Stock vs Yearly Sales Percentage

In [None]:
# --- Yearly sales share per article group -----------------------------------
# `year` was already derived from the parsed `t_dat` in the data-loading cell,
# so the dates are not parsed a second time here.
group_sales = pd.merge(
    transactions_train[['article_id', 'year']],
    articles[['article_id', 'index_group_name']],
    how='inner',
    on='article_id',  # explicit key instead of "all shared columns"
)
sales_counts = group_sales.pivot_table(
    index='year', columns='index_group_name', values='article_id', aggfunc='count'
)
# Row-wise percentage for however many article groups exist (the row total
# skips NaN cells, i.e. year/group combinations without any sale).
sales_pct = sales_counts.div(sales_counts.sum(axis=1), axis=0).mul(100).round(2)

# --- Stock (number of articles) per article group ---------------------------
# Series.value_counts keeps a flat index of group names, so the heatmap rows
# are labelled with the plain names rather than one-element tuples.
stock = articles['index_group_name'].value_counts().rename('cnt').to_frame()
stock['pct'] = np.round((stock['cnt'] / stock['cnt'].sum()) * 100, 2)

# --- Plots ------------------------------------------------------------------
fig, (ax_cnt, ax_pct, ax_sales) = plt.subplots(1, 3, figsize=(25, 6))

sns.heatmap(stock[['cnt']], cmap='Blues', annot=True, fmt='d', ax=ax_cnt)
ax_cnt.set(xlabel='Stock Count', ylabel='Article group')

sns.heatmap(stock[['pct']], cmap='Blues', annot=True, fmt='g', ax=ax_pct)
ax_pct.set(xlabel='Stock in Percentage', ylabel='Article group')

sns.heatmap(sales_pct, annot=True, cmap='Blues', fmt='g', ax=ax_sales)
ax_sales.set(title="Year wise Article Group Sales Percentage", xlabel='Article group', ylabel='Year')

plt.show()

<h4 style="color:black;">Ladieswear is top in both stock and sales percentage.</h4>
<h4 style="color:black;">In stock, the Baby/Children group is almost equal to Ladieswear; however, its sales share is low compared with its share of items in stock.</h4>

# Sales Channel wise Yearly Sales

In [None]:
# Number of transactions per year, split by sales channel.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(
    x='year',
    hue='sales_channel_id',
    data=transactions_train,
    palette='Blues',
    ax=ax,
)

<h4 style="color:black;">The distribution between Sales Channel Id 1 and 2 is almost the same each year.</h4>


# Top Selling Items and Avg Sales Price - Overall

In [None]:
# Sell count and average price per article, best sellers first.
# A few more candidates than grid cells are kept so that articles without an
# image can be skipped and the grid can still be filled.
N_CANDIDATES = 12
GRID_ROWS, GRID_COLS = 2, 5
IMAGE_ROOT = "../input/h-and-m-personalized-fashion-recommendations/images"

top_items = transactions_train.groupby(by=['article_id']).agg({'article_id': 'count', 'price': 'mean'})
top_items.columns = ['Count', 'Avg Sales Price']
top_items = (
    top_items
    .reset_index()
    .sort_values(by=['Count'], ascending=[False])
    .head(N_CANDIDATES)
)

plt.figure(figsize=(28, 10))
shown = 0
rows = zip(
    top_items['article_id'].tolist(),
    top_items['Count'].tolist(),
    top_items['Avg Sales Price'].round(3).tolist(),
)
for article_id, sell_count, avg_price in rows:
    if shown == GRID_ROWS * GRID_COLS:
        # Grid is full; asking for another subplot index would raise.
        break

    # Image files are named by the 10-digit, zero-padded article id and live
    # in a folder named after the first three digits of that padded id.
    code = str(article_id).zfill(10)
    image_path = IMAGE_ROOT + "/" + code[:3] + "/" + code + ".jpg"

    try:
        # The context manager closes the file once imshow has read the pixels.
        with Image.open(image_path) as image:
            shown += 1
            plt.subplot(GRID_ROWS, GRID_COLS, shown)
            plt.imshow(image)
    except FileNotFoundError:
        # Best effort: articles without an image file are skipped. Any other
        # error (corrupt image, plotting bug) is allowed to surface.
        continue

    plt.title(article_id)
    plt.xlabel("Sell Count = " + str(sell_count))
    plt.ylabel("Average Sell Price= " + str(avg_price))

plt.show()
    

Note: Products for which an image is not available are skipped.

# Checking Monthly Seasonality

In [None]:
# Transactions per calendar month, drawn as one line per year.
monthly_counts = (
    transactions_train
    .groupby(by=['year', 'mon'])
    .agg({'customer_id': 'count'})
    .reset_index()
)

plt.figure(figsize=(20, 5))
ax = sns.lineplot(
    data=monthly_counts,
    x='mon',
    y='customer_id',
    hue='year',
    marker="o",
    palette='gist_rainbow_r',
    linewidth=4,
)
# Put a tick on every month that occurs in the data.
ax.set(xticks=monthly_counts['mon'].values)
plt.xlabel("Month")
plt.ylabel("Transaction Count")
plt.show()

<h4 style="color:black;">For 2018, data is available only from September onwards.</h4>
<h4 style="color:black;">For 2020, data is available from January to September.</h4>
<h4 style="color:black;">We can see growth in transactions from February to June.</h4>

# Top Selling Color Analysis

In [None]:
# Attach each sold article's colour group to its transaction row.
a = transactions_train[['article_id', 't_dat', 'year', 'mon']]
b = articles[['article_id', 'colour_group_name']]
# The merge result already has a fresh 0..n-1 index, so no reset_index() is
# needed; calling it would only add a meaningless `index` column to a
# transaction-sized frame.
c = pd.merge(a, b, how='inner', on=['article_id'])

## Color wise Count of items sold

In [None]:
# Number of items sold per colour group, most frequent colour first.
sold_colours = c['colour_group_name']
sold_colours.value_counts()

# Let's check whether there is a correlation between the colors

In [None]:
# Daily number of items sold per colour group (rows: dates, columns: colours),
# followed by the pairwise correlation of those daily series as a heatmap.
color_piv = c.pivot_table(
    index='t_dat',
    columns=['colour_group_name'],
    values='article_id',
    aggfunc='count',
)
colour_corr = color_piv.corr()

plt.figure(figsize=(40, 40))
sns.heatmap(colour_corr, annot=True, cmap='crest')
plt.show()

### White has a high correlation with Blue, Light Blue and Dark Orange, so it seems people prefer these color combinations (just my intuition). We can see some other good correlations, such as Red and Blue.