[](https://www.qwant.com/?client=brz-brave&t=images&q=H%26M&o=0%3AD0088AD8AB7DF001697104F51E1D4EDFCA0CCA1B)

<img src="https://images.unsplash.com/photo-1578983662508-41895226ebfb?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=1211&q=80" width=600></img>


Data: There are four csv files:

* articles.csv: A dictionary of every product sold by H&M, with its characteristics.
* transactions_train.csv: Our main data file, which contains all the relevant training data.
* customers.csv: A dictionary of every customer (like articles.csv, but with customer-related information).
* submission.csv: A sample showing the expected submission format.


In [None]:
#Import packages
import os
import pandas as pd
import numpy as np
import plotly
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import umap
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

In [None]:
# Root of the competition data inside the Kaggle input directory.
path = Path('/kaggle/input/h-and-m-personalized-fashion-recommendations/')

# Load the three tables; the unpacking order mirrors the order of the file names.
articles, customers, transaction = (
    pd.read_csv(path / csv_name)
    for csv_name in ('articles.csv', 'customers.csv', 'transactions_train.csv')
)


# 1.Data head and structure

In [None]:
# Structural overview of the articles table. DataFrame.info() writes its report
# to stdout and returns None, so it is called directly: wrapping it in print()
# would append a stray "None" line after the report.
articles.info()
print("#" * 30)
# Null count per column, columns with the most missing values first.
print("missing data:", articles.isnull().sum().sort_values(ascending=False))
# Last expression of the cell: displayed as a preview of the first two rows.
articles.head(2)

* For the articles, 416 values are missing in the detail description column; otherwise, there is no missing data.
* There are many different kinds of articles; we could aggregate them by class.

In [None]:
# Structural overview of the customers table. DataFrame.info() writes its report
# to stdout and returns None, so it is called directly: wrapping it in print()
# would append a stray "None" line after the report.
customers.info()
print("#" * 30)
# Null count per column, columns with the most missing values first.
print("missing data:", customers.isnull().sum().sort_values(ascending=False))
# Last expression of the cell: displayed as a preview of the first two rows.
customers.head(2)

* The customers file has a lot of missing data: almost all of its columns contain missing values.


In [None]:
# Convert t_dat to datetime64 so it can be grouped and sorted chronologically.
# Re-running this cell is safe: converting an already-converted column is a no-op.
transaction['t_dat'] = pd.to_datetime(transaction['t_dat'])
# DataFrame.info() prints its own report (column dtypes included) and returns
# None, so it is called directly rather than through print(), which would add a
# stray "None" line. The former bare `transaction.dtypes` statement displayed
# nothing (only a cell's last expression is rendered) and is covered by info().
transaction.info()
print("#" * 30)
# Null count per column, columns with the most missing values first.
print("missing data:", transaction.isnull().sum().sort_values(ascending=False))
# Last expression of the cell: displayed as a preview of the first two rows.
transaction.head(2)

* The transaction file has no missing data.
* We can link the transaction file with the others by:
    - customer_id => customers
    - article_id => articles

# 2.Visualisations:

## 2.1Transaction data

## What is the best sales channel?

In [None]:
# Number of distinct customers who bought through each sales channel.
# nunique() is used instead of count(): count() tallies transaction rows, so a
# customer with many purchases would be counted once per purchase, which does
# not match the "Number of customers" title of the plot.
cust_ch = transaction.groupby(['sales_channel_id'])['customer_id'].nunique()

plt.figure(figsize=(8, 6))
g1 = sns.barplot(x=cust_ch.index, y=cust_ch.values)
g1.set_ylabel('Number of distinct customers')
plt.title('Number of customers by channel')
# Last expression of the cell: shows the per-channel figures next to the plot.
cust_ch

* Channel 2 is the best channel, as it has the largest number of clients.


## What are the 100 best buyers?

In [None]:
# Total spend per (sales channel, customer) pair. Because the channel is part of
# the grouping key, a customer who bought through both channels contributes one
# row per channel, so the ranking below is over pairs rather than over customers.
top1 = transaction.groupby(['sales_channel_id', 'customer_id'])['price'].sum()
top10 = top1.reset_index()
# The 100 rows with the highest total spend.
top100 = top10.sort_values(by=['price'], ascending=False)[:100]

plt.figure(figsize=(20, 8))
g2 = sns.barplot(x=top100.customer_id, y=top100.price, hue=top100.sales_channel_id)
# Customer ids are used as tick labels; rotate them so they do not overlap.
g2.tick_params(axis='x', labelrotation=90)
# The trailing semicolon keeps the cell free of the Text(...) repr of the title.
plt.title('100 Best Buyers');

## What are the 50 best days of sales?

In [None]:
# Revenue per (day, sales channel) pair, flattened back into regular columns.
date = transaction.groupby(['t_dat', 'sales_channel_id'])['price'].sum()
date = date.reset_index()
# The 50 (day, channel) rows with the highest revenue, best first.
date_50 = date.sort_values(by=['price'], ascending=False)[:50]

plt.figure(figsize=(18, 6))
# seaborn treats the x values of a bar plot as categories. Formatting the dates
# as YYYY-MM-DD strings up front keeps the tick labels short instead of relying
# on how datetime64 values get stringified. This needs t_dat to be datetime64,
# which the earlier pd.to_datetime conversion of transaction['t_dat'] provides.
day_labels = date_50.t_dat.dt.strftime('%Y-%m-%d')
g3 = sns.barplot(x=day_labels, y=date_50.price, hue=date_50.sales_channel_id)
g3.tick_params(axis='x', labelrotation=90)
plt.title('The 50 top best days of sales')

* The best days and the best clients are both on sales channel 2.

## 2.2Customers Data

### What is the age distribution of the clients?

In [None]:
# Customers per age value, then ordered from the most to the least common age.
age1 = customers.groupby('age')['customer_id'].count()
age = age1.sort_values(ascending=False)

# Draw on an explicitly created axes rather than the implicit current figure.
fig, ax = plt.subplots(figsize=(20, 8))
g4 = sns.barplot(x=age.index, y=age.values, ax=ax)
g4.tick_params(axis='x', labelrotation=90)
g4.set_title('Number of customers per age')

* Most of the clients are young.

### What is the distribution of customers by fashion news frequency?

In [None]:
# Customers per fashion_news_frequency value, largest group first.
freq = (
    customers
    .groupby('fashion_news_frequency')['customer_id']
    .count()
    .sort_values(ascending=False)
)

# Draw on an explicitly created axes rather than the implicit current figure.
fig, ax = plt.subplots(figsize=(8, 8))
g3 = sns.barplot(x=freq.index, y=freq.values, ax=ax)
g3.tick_params(axis='x', labelrotation=90)
g3.set_title('Number of customers per each Fashion News Frequency')

In [None]:
def _plot_age_counts(frequency, title):
    """Plot the number of customers per age for one fashion_news_frequency value.

    Parameters
    ----------
    frequency : str
        Value of ``customers.fashion_news_frequency`` to keep (exact match).
    title : str
        Title of the bar plot.

    Returns
    -------
    tuple
        ``(subset, counts, ax)``: the filtered customers rows, the customer
        count per age, and the axes the bars were drawn on.
    """
    subset = customers.loc[customers.fashion_news_frequency == frequency]
    counts = subset.groupby(['age'])['customer_id'].count()
    plt.figure(figsize=(16, 8))
    ax = sns.barplot(x=counts.index, y=counts.values)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    plt.title(title)
    return subset, counts, ax


# Same plot for the two groups being compared; one helper replaces the two
# copy-pasted stanzas so both figures are guaranteed to be built the same way.
fashion, fash, g4 = _plot_age_counts('NONE', 'Number of None customers by age')
fashion1, fash1, g5 = _plot_age_counts('Regularly', 'Number of customers regularly by age')

* The fashion news frequency does not appear to be a significant feature: the age distribution of the clients is the same for None and Regularly.

## 2.3Articles Data

### Distribution of the articles by product type name

In [None]:
# Articles per product type, most frequent type first.
product = (
    articles
    .groupby('product_type_name')['article_id']
    .count()
    .sort_values(ascending=False)
)

# Draw on an explicitly created axes rather than the implicit current figure.
fig, ax = plt.subplots(figsize=(22, 6))
g6 = sns.barplot(x=product.index, y=product.values, ax=ax)
g6.tick_params(axis='x', labelrotation=90)
g6.set_title('Number of articles by product type')

### Distribution of the articles by garment group?

In [None]:
# Articles per garment group, most frequent group first.
garment = (
    articles
    .groupby('garment_group_name')['article_id']
    .count()
    .sort_values(ascending=False)
)

# Draw on an explicitly created axes rather than the implicit current figure.
fig, ax = plt.subplots(figsize=(16, 8))
g7 = sns.barplot(x=garment.index, y=garment.values, ax=ax)
g7.tick_params(axis='x', labelrotation=90)
g7.set_title('Number of articles by Garment group name')

### Distribution of the articles by section name

In [None]:
# Articles per section, most frequent section first.
section = (
    articles
    .groupby('section_name')['article_id']
    .count()
    .sort_values(ascending=False)
)

# Draw on an explicitly created axes rather than the implicit current figure.
fig, ax = plt.subplots(figsize=(22, 6))
g8 = sns.barplot(x=section.index, y=section.values, ax=ax)
g8.tick_params(axis='x', labelrotation=90)
g8.set_title('Number of articles by section name')

### Distribution of the articles by index group name

In [None]:
# Distinct article ids per index group, largest group first.
# Note: this rebinds `product`, which previously held the per-product-type counts.
product = (
    articles
    .groupby('index_group_name')['article_id']
    .nunique()
    .sort_values(ascending=False)
)

# Draw on an explicitly created axes rather than the implicit current figure.
fig, ax = plt.subplots(figsize=(22, 6))
g8 = sns.barplot(x=product.index, y=product.values, ax=ax)
g8.set_title('Number of articles by index group name')

* Index group name is a good column for filtering articles by subcategory. Its values are:
'Ladieswear', 'Baby/Children', 'Menswear', 'Sport', 'Divided'



# 3.Merge the transaction and article table
### What is the total revenue by index group name?

In [None]:
# Columns needed from each table for the revenue breakdown.
article_columns = ['article_id', 'product_type_no', 'product_type_name', 'index_group_name', 'section_name']
df = transaction[['customer_id', 'article_id', 'price']]
artic1 = articles[article_columns]

# Left join from articles to transactions: every article is kept, and an article
# with no matching transaction gets a missing price, which sum() skips below.
res = pd.merge(artic1, df, on='article_id', how='left')

# Summed price per index group, highest first.
best_article = (
    res
    .groupby('index_group_name')['price']
    .sum()
    .sort_values(ascending=False)
)

# Draw on an explicitly created axes rather than the implicit current figure.
fig, ax = plt.subplots(figsize=(18, 8))
g9 = sns.barplot(x=best_article.index, y=best_article.values, ax=ax)
g9.set_title('Revenue by index group name')

* Ladieswear generates the most revenue.