In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.graph_objects as go
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Load the transaction log. `t_dat` is parsed to datetime64 while reading so
# the column is not kept as Python strings and the later year / month / weekday
# extraction does not have to re-parse the same strings on every call.
transactions_train = pd.read_csv(
    "/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    parse_dates=["t_dat"],
)

In [None]:
# Show the last row of the transactions table.
transactions_train.tail(n=1)

In [None]:
# Number of transaction rows per article_id, most frequent first.
transactions_train["article_id"].value_counts(sort=True, ascending=False)

In [None]:
# Print a concise summary (columns, dtypes, memory usage) of the transactions table.
transactions_train.info()

In [None]:
# Count missing values in each column of the transactions table.
transactions_train.isna().sum()

In [None]:
# Load the article metadata table.
articles = pd.read_csv(
    "/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv"
)

In [None]:
# Show the first row of the articles table.
articles.head(n=1)

In [None]:
# Count missing values in each column of the articles table.
articles.isna().sum()

### Merge articles and transactions 

In [None]:
# Attach article metadata to every transaction row. An inner join (the pandas
# default) keeps only rows whose article_id is present in both tables.
df = pd.merge(transactions_train, articles, how="inner", on="article_id")

In [None]:
# Show the first row of the merged table.
df.head(n=1)

### Extract year, month name and weekday name from the date

In [None]:
def create_year(x):
    """Return the calendar year of every date in ``x``.

    ``x`` is any date-like sequence accepted by ``pd.DatetimeIndex`` (for
    example a column of ISO date strings). The result is a pandas ``Index``
    of integer years in the same order as the input.
    """
    return pd.DatetimeIndex(x).year

# Store the year extracted from `t_dat` in a new `year` column.
df['year'] = create_year(df['t_dat'])

In [None]:
# Helper used to build a month-name column from the date column
def create_month(x):
    """Return the month name (e.g. ``"January"``) of every date in ``x``.

    ``x`` is any date-like sequence accepted by ``pd.DatetimeIndex``. The
    result is a pandas ``Index`` of strings in the same order as the input.
    """
    return pd.DatetimeIndex(x).month_name()

# Store the month name extracted from `t_dat` in a new `month` column.
df['month'] = create_month(df['t_dat'])

In [None]:
# Helper used to build a weekday-name column from the date column

def create_day(x):
    """Return the weekday name (e.g. ``"Monday"``) of every date in ``x``.

    ``x`` is any date-like sequence accepted by ``pd.DatetimeIndex``. The
    result is a pandas ``Index`` of strings in the same order as the input.
    """
    return pd.DatetimeIndex(x).day_name()

# Store the weekday name extracted from `t_dat` in a new `day` column.
df['day'] = create_day(df['t_dat'])


In [None]:
# Show the first two rows, now including the year / month / day columns.
df.head(n=2)

In [None]:
# Count rows with a non-null customer_id for each index group.
data = (
    df.groupby(["index_group_name"])["customer_id"]
    .count()
    .reset_index()
)

### Index group name vs. total customer_id count (bar plot)

In [None]:
# Bar chart of the customer_id counts held in `data`, one colour per index group.
px.bar(
    data,
    x="index_group_name",
    y="customer_id",
    color="index_group_name",
    color_discrete_sequence=px.colors.qualitative.G10,
    title="Index group name Vs Total customer_id",
    width=900,
    height=700,
)

In [None]:
# Total price per (product type, sales channel, product group), largest first.
dff = (
    df.groupby(["product_type_name", "sales_channel_id", "product_group_name"])["price"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
    

In [None]:
# Show the five combinations with the highest total price.
dff.head(n=5)

In [None]:
# Keep the first 20 rows of `dff` for the "Top 20" charts.
dff_20 = dff.head(n=20)

In [None]:
# Bar chart over every row of `dff`, coloured by product type.
# NOTE(review): `y` is the categorical product_group_name column, so the
# aggregated `price` in `dff` is not shown here - confirm this is intended.
px.bar(
    dff,
    x="product_type_name",
    y="product_group_name",
    color="product_type_name",
    color_discrete_sequence=px.colors.qualitative.D3,
    title="Product type name vs product group name - all",
    width=900,
    height=700,
)

### Product type name vs product group name - Top 20 barplot

In [None]:
# Same chart restricted to the 20 rows in `dff_20`, coloured by product type.
# NOTE(review): `y` is the categorical product_group_name column, so the
# aggregated `price` is not shown here - confirm this is intended.
px.bar(
    dff_20,
    x="product_type_name",
    y="product_group_name",
    color="product_type_name",
    color_discrete_sequence=px.colors.qualitative.D3,
    title="Product type name vs product group name - Top 20",
    width=900,
    height=700,
)

In [None]:
# Top-20 chart again, this time coloured by sales channel instead of product type.
# NOTE(review): `y` is the categorical product_group_name column, so the
# aggregated `price` is not shown here - confirm this is intended.
px.bar(
    dff_20,
    x="product_type_name",
    y="product_group_name",
    color="sales_channel_id",
    color_discrete_sequence=px.colors.qualitative.D3,
    title="Product type name vs product group name - Top 20",
    width=900,
    height=700,
)

In [None]:
# Number of rows (non-null article_id) per product name, most frequent first.
products = (
    df.groupby("prod_name")["article_id"]
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:
# Show the five most frequent product names.
products.head(n=5)

In [None]:
# Keep the ten most frequent product names for the pie chart.
products_10 = products.head(n=10)

### Top 10 product names

In [None]:
# Pie chart of the rows in `products_10`: one slice per product name, sized by
# its row count. Slices display the raw count; hovering shows label and percent.
labels = products_10["prod_name"]
values = products_10["article_id"]
# NOTE(review): only three colours are listed for ten slices; presumably Plotly
# uses its default colours for the remaining slices - confirm.
colors = ["maroon", "black", "orange"]
fig = go.Figure(
    data=[
        go.Pie(
            labels=labels,
            values=values,
            textinfo="value",
            hoverinfo="label+percent",
        )
    ]
)
fig.update_traces(marker={"colors": colors})
fig.show()

### Which sales channel has the most sales?


In [None]:
# Total price per sales channel, largest first.
sales_channel = (
    df.groupby("sales_channel_id")["price"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:
# Donut chart of total price per sales channel.
px.pie(
    sales_channel,
    names="sales_channel_id",
    values="price",
    hole=0.2,
    color_discrete_sequence=px.colors.sequential.Aggrnyl,
)

### Total price over the years

In [None]:
# Total price per calendar year.
data = (
    df.groupby("year")["price"]
    .sum()
    .reset_index()
)

In [None]:
# Show the first five rows of the yearly totals.
data.head(n=5)

In [None]:
# Line chart of the yearly price totals held in `data`.
px.line(
    data,
    x="year",
    y="price",
)

In [None]:
# List the column names of the merged table.
df.columns

In [None]:
# Total price per (weekday name, product name, product type).
data = (
    df.groupby(["day", "prod_name", "product_type_name"])["price"]
    .sum()
    .reset_index()
)

In [None]:
# Keep the 100 rows with the largest total price. A plain `head(100)` would
# take the first 100 groups in group-key order (groupby sorts by key), i.e. an
# arbitrary alphabetical slice, whereas the other "top N" cells in this
# notebook sort by the aggregated value before truncating.
data = data.sort_values("price", ascending=False).head(100)

In [None]:
# Sunburst of price; `path` is read from root to leaf, so prod_name forms the
# inner ring and product_type_name the outer ring.
# NOTE(review): product_type_name looks like the broader category - confirm
# that this nesting order is intended.
fig = px.sunburst(
    data,
    path=["prod_name", "product_type_name"],
    values="price",
    color_discrete_sequence=px.colors.qualitative.D3,
)

In [None]:
# Render the figure currently bound to `fig`.
fig.show()

In [None]:
# Total price per graphical appearance, largest first.
graphical = (
    df.groupby("graphical_appearance_name")["price"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:
# Keep the ten graphical appearances with the highest total price.
graphical_10 = graphical.head(n=10)

In [None]:
# Donut chart of total price for the rows in `graphical_10`.
px.pie(
    graphical_10,
    names="graphical_appearance_name",
    values="price",
    hole=0.2,
    color_discrete_sequence=px.colors.sequential.Jet_r,
)

In [None]:
# Only 1000 rows are plotted because running the entire dataset leads to
# memory problems on Kaggle. A seeded random sample replaces `df.head(1000)`:
# the first 1000 rows are just the start of the table, which is not a
# representative slice for a breakdown by day / section / index. The fixed
# seed keeps the figure reproducible; `min` guards frames under 1000 rows.
icicle_sample = df.sample(n=min(1000, len(df)), random_state=42)
fig = px.icicle(
    icicle_sample,
    path=[px.Constant("all"), "day", "section_name", "index_name"],
    values="price",
    color_discrete_sequence=px.colors.sequential.Sunset,
)
fig.update_traces(root_color="lightgrey")
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
fig.show()

### Most used words in the article description

In [None]:
# Fails due to memory constraints
# df_ = df.dropna()
# corpus = ' '.join(df_['detail_desc'])

# corpus = corpus.replace('.', '. ')
# wordcloud = WordCloud(stopwords=STOPWORDS, background_color='white',width=2400,height=2000).generate(corpus)
# plt.figure(figsize=(12,15))
# plt.imshow(wordcloud)
# plt.axis('off')
# plt.show()