In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the article catalogue, discard exact duplicate rows and store article_id as int32.
articles_csv = "/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv"
df_articles = (
    pd.read_csv(articles_csv)
    .drop_duplicates()
    .astype({'article_id': 'int32'})
)

# Share of missing values per column, expressed in percent of all rows.
articles_missing_pct = df_articles.isna().sum() * 100 / len(df_articles)
print("Missing values (%):")
print(articles_missing_pct)

In [None]:
# Load the customer table and discard exact duplicate rows.
customers_csv = "/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv"
df_customers = pd.read_csv(customers_csv).drop_duplicates()

# Replace the string customer_id with an integer key: the last 16 characters are
# parsed as a base-16 number and the result is cast to int64.
df_customers = df_customers.assign(
    customer_id=lambda frame: (
        frame['customer_id']
        .apply(lambda hex_id: int(hex_id[-16:], 16))
        .astype('int64')
    )
)

# Share of missing values per column, expressed in percent of all rows.
customers_missing_pct = df_customers.isna().sum() * 100 / len(df_customers)
print("Missing values (%):")
print(customers_missing_pct)

In [None]:
# Load the transaction log. Columns are converted one at a time, directly on the
# frame, so no full copy of the table is made.
transactions_csv = "/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv"
df_transactions = pd.read_csv(transactions_csv)

# Compact integer keys: article_id as int32, customer_id as the int64 value of the
# last 16 hex characters of the original string id.
df_transactions['article_id'] = df_transactions['article_id'].astype('int32')
df_transactions['customer_id'] = (
    df_transactions['customer_id']
    .apply(lambda hex_id: int(hex_id[-16:], 16))
    .astype('int64')
)

# Parse the transaction date and split it into small integer parts
# (year is stored as an offset from 2000 so that it fits into int8).
df_transactions['t_dat'] = pd.to_datetime(df_transactions['t_dat'])
purchase_date = df_transactions['t_dat'].dt
df_transactions['year'] = (purchase_date.year - 2000).astype('int8')
df_transactions['month'] = purchase_date.month.astype('int8')
df_transactions['day'] = purchase_date.day.astype('int8')

# Share of missing values per column, expressed in percent of all rows.
transactions_missing_pct = df_transactions.isna().sum() * 100 / len(df_transactions)
print("Missing values (%):")
print(transactions_missing_pct)

In [None]:
# Count the non-null entries of every column per article, then order the articles
# from most to fewest customer_id entries (i.e. most to least purchased).
counts_per_article = df_transactions.groupby('article_id').count()
bestsellers_ranking = counts_per_article.sort_values(by='customer_id', ascending=False)

In [None]:
# Per-article counts: one row per article_id, each column holding the number of
# non-null values of that column within the article's transactions.
article_sales = df_transactions.groupby('article_id').count()


def f(x):
    """Return the number of transactions recorded for article id ``x``.

    Returns 0 when the article does not appear in ``article_sales``. Only the
    missing-key case is handled; any other error is allowed to surface instead
    of being silently turned into a zero.
    """
    try:
        return article_sales.loc[x]['customer_id']
    except KeyError:
        return 0


# Vectorised equivalent of ``df_articles['article_id'].apply(f)``: map every
# article_id onto its transaction count in one pass; articles without any
# transaction come back as NaN and are set to 0 before restoring the integer dtype.
# NOTE(review): the column name says 2019, but no date filter is applied here --
# confirm whether the count is meant to cover the whole transaction history.
df_articles['total_sales_2019'] = (
    df_articles['article_id']
    .map(article_sales['customer_id'])
    .fillna(0)
    .astype('int64')
)

In [None]:
# Code/number columns to remove from the article table. The name-based columns
# used by the plots in this notebook (colour_group_name, garment_group_name,
# index_group_name) are not in this list and are therefore kept.
to_drop = [
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
]

# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone and a strict drop would raise KeyError. Rebinding the name
# (instead of inplace=True) avoids mutating a frame other cells may still hold.
df_articles = df_articles.drop(columns=to_drop, errors='ignore')

In [None]:
# Number of top-selling articles that are treated as "bestsellers".
# Without a cut-off the frame would contain every article (including those with
# zero sales), so any "bestsellers vs. all articles" comparison would compare the
# data with itself. NOTE(review): 1000 is a tunable choice, not a value derived
# from the data -- adjust to the intended definition of a bestseller.
TOP_N_BESTSELLERS = 1000

# Articles ordered from most to least sold, limited to the top N.
bestsellers = (
    df_articles
    .sort_values(by='total_sales_2019', ascending=False)
    .head(TOP_N_BESTSELLERS)
)

**Color Group of Bestsellers on H&M**

In [None]:
# Bar chart: number of bestseller articles per colour group.
fig, ax = plt.subplots(figsize=(15, 5))
g = sns.countplot(x="colour_group_name", data=bestsellers, palette="pastel", ax=ax)

# Label every bar. Depending on the seaborn version the bars may be split over
# several containers, so labelling only containers[0] can leave bars unlabelled.
for container in g.containers:
    g.bar_label(container)

g.tick_params(axis='x', rotation=90)
g.set_title('Color Group of Bestsellers on H&M')

# plt.show() takes no positional figure/axes argument; it renders the current figure.
plt.show()

In [None]:
def plot_category_counts(data, column, title):
    """Draw a labelled count plot of ``column`` in ``data`` and return its Axes.

    Parameters
    ----------
    data : DataFrame holding the rows to count.
    column : name of the categorical column shown on the x axis.
    title : title placed above the chart.
    """
    fig, ax = plt.subplots(figsize=(15, 5))
    sns.countplot(x=column, data=data, palette="pastel", ax=ax)
    # Label every bar; bars may be spread over several containers, so iterating
    # all of them is required to avoid unlabelled bars.
    for container in ax.containers:
        ax.bar_label(container)
    ax.tick_params(axis='x', rotation=90)
    ax.set_title(title)
    # plt.show() takes no positional figure/axes argument.
    plt.show()
    return ax


# Garment-group distribution of the full catalogue versus the bestsellers.
g = plot_category_counts(df_articles, "garment_group_name", 'All Articles on H&M')
h = plot_category_counts(bestsellers, "garment_group_name", 'Bestselling Articles on H&M')

In [None]:
# Transactions whose article appears in the bestsellers table.
bestseller_article_ids = bestsellers['article_id']
is_bestseller_purchase = df_transactions['article_id'].isin(bestseller_article_ids)
bestsellers_transactions = df_transactions[is_bestseller_purchase]

# Customers that appear in at least one of those transactions.
bestseller_buyer_ids = bestsellers_transactions['customer_id']
is_bestseller_buyer = df_customers['customer_id'].isin(bestseller_buyer_ids)
bestsellers_contributors = df_customers[is_bestseller_buyer]

In [None]:
# Age distribution of all customers.
# Each histogram gets its own explicitly created figure so the two charts never
# end up drawn on the same axes, regardless of the plotting backend.
fig, ax = plt.subplots()
g = sns.histplot(df_customers['age'], kde=False, ax=ax)
g.set_title('Age of All Customers')
# plt.show() takes no positional figure/axes argument.
plt.show()

# Age distribution of the customers found in bestsellers_contributors.
fig, ax = plt.subplots()
g = sns.histplot(bestsellers_contributors['age'], kde=False, ax=ax)
g.set_title('Customers Age Who Bought Bestsellers')
plt.show()

In [None]:
# Display the column labels currently present in df_articles.
df_articles.columns

In [None]:
# Per-article revenue: sum of `price` over all transactions of each article.
# Only `price` is aggregated: summing every column would also try to sum the
# datetime column t_dat, which raises TypeError on pandas >= 2.0, and the sums of
# id/date-part columns carry no meaning. The result stays a DataFrame so that
# ``article_revenue.loc[x]['price']`` and ``article_revenue['price']`` keep working.
article_revenue = df_transactions.groupby('article_id')[['price']].sum()


def f(x):
    """Return the summed price of all transactions of article id ``x``.

    Returns 0 when the article does not appear in ``article_revenue``. Only the
    missing-key case is handled; any other error is allowed to surface instead
    of being silently turned into a zero.
    """
    try:
        return article_revenue.loc[x]['price']
    except KeyError:
        return 0


# Vectorised equivalent of ``df_articles['article_id'].apply(f)``: map every
# article_id onto its revenue in one pass; articles without any transaction come
# back as NaN and are set to 0.
# NOTE(review): the column name says 2019, but no date filter is applied here --
# confirm whether the revenue is meant to cover the whole transaction history.
df_articles['total_revenue_2019'] = (
    df_articles['article_id']
    .map(article_revenue['price'])
    .fillna(0)
)

In [None]:
# Total revenue per index group, returned as a two-column frame
# (index_group_name, total_revenue_2019).
index_group_revenue = (
    df_articles
    .groupby('index_group_name')['total_revenue_2019']
    .sum()
    .reset_index()
)

**Revenue Share (%) by Index Group**

In [None]:
# Pie chart: each index group's share of the summed revenue.
plt.figure(figsize=(8, 8))
colors = sns.color_palette('pastel')

revenue_by_group = index_group_revenue['total_revenue_2019']
group_labels = index_group_revenue['index_group_name']

plt.pie(
    x=revenue_by_group,
    labels=group_labels,
    colors=colors,
    autopct='%1.1f%%',  # print each slice's percentage with one decimal place
)
plt.title('Revenue Contribution by Index Group')
plt.show()