![image.png](attachment:37bbb267-cea9-4fac-bbf4-f0ba0febc707.png)

In [None]:
#Load all libraries
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import plotly.graph_objects as go
import datetime as dt
import matplotlib.image as mpimg

# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:150%;text-align:center;border-radius:10px 10px;border-style:solid;border-color:#3e0542;">EDA on Datasets</p>

In [None]:
# Load the three competition tables: articles, customers and transactions.
# The shared input directory is named once so it only has to be changed in one place.
DATA_DIR = "../input/h-and-m-personalized-fashion-recommendations"
articles = pd.read_csv(f"{DATA_DIR}/articles.csv")
customers = pd.read_csv(f"{DATA_DIR}/customers.csv")
transactions = pd.read_csv(f"{DATA_DIR}/transactions_train.csv")

In [None]:
# Summarise the dimensions (rows, columns) of the three dataframes in one small table
frames = {'articles': articles, 'customers': customers, 'transactions': transactions}
shape = pd.DataFrame(
    [frame.shape for frame in frames.values()],
    columns=["Total Rows", "Total Columns"],
    index=list(frames),
)
shape

In [None]:
# Compare the customers listed in `customers` with those that appear in
# `transactions`. Each distinct-customer count is computed once and reused
# instead of rebuilding a Python set of all transaction customer ids per print.
n = transactions['customer_id'].nunique()    # customers with at least one transaction
m = customers['customer_id'].nunique()       # distinct customers in the customer table
length = n / customers.shape[0]              # fraction of customer rows that purchased
npur = 100 - (length * 100)                  # percentage of customers without a purchase
print("Total No of customers:",m)
print("No of customers who made at least one transaction:",n)
print("% of customers who made at least one transaction : ",length*100)
print("Number of customers who did not make a purchase : ",(customers.shape[0] - n))
print("% of customers who did not make a purchase : ",npur)
# The closing remark reports the computed percentage rather than a fixed number
print(f"It seems that not all customers made a purchase, there is around {npur:.0f}% with no purchase history.")

In [None]:
# Preview the first five transaction records
transactions.head(n=5)

In [None]:
# Preview the first five customer records
customers.head(n=5)

In [None]:
# Preview the first five article records
articles.head(n=5)

In [None]:
# Count the missing values in each column of the transactions table
transactions.isnull().sum()

# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:150%;text-align:center;border-radius:10px 10px;border-style:solid;border-color:#3e0542;">Articles Sold</p>

In [None]:
# Number of articles sold per day. `article_id` is selected before counting so
# that only the needed column of the transactions table is aggregated.
year = transactions.groupby('t_dat')[['article_id']].count()
yearwise1 = year.reset_index()
# First and last transaction dates present in the data
print (transactions.t_dat.min())
print (transactions.t_dat.max())

In [None]:
# The five days with the largest number of articles sold
yearasecact = yearwise1.sort_values(by='article_id', ascending=False).iloc[:5]
yearasecact

In [None]:
# Parse the day column into datetimes. The dates appear in dashed 'YYYY-MM-DD'
# form (see the plot title below), so the format string uses dashes; the former
# '%Y/%m/%d' does not match such strings under strict format parsing.
yearwise1['t_dat'] = pd.to_datetime(yearwise1['t_dat'], format='%Y-%m-%d')

# Locate the busiest day from the data instead of hard-coding its coordinates,
# so the annotation arrow always points at the real maximum.
peak_row = yearwise1.loc[yearwise1['article_id'].idxmax()]
x_line_annotation = pd.Timestamp(peak_row['t_dat']).to_pydatetime()

sns.set(rc={'axes.facecolor':'#faf0fa'}) # graph facecolor
font_color = '#3e0542'
csfont = {'fontname':'Georgia'} # title font
hfont = {'fontname':'Calibri'} # main font
fig, ax = plt.subplots(facecolor='#d6b6d5')
fig.set_size_inches(18,12)

# Daily number of articles sold over time
ax.plot('t_dat', 'article_id', data=yearwise1, color='#6a0573', linewidth=2)

plt.tight_layout()
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
ax.xaxis.set_major_locator(ticker.MultipleLocator(100))  # one major tick every 100 days
plt.xticks(rotation=90)
ttl = ax.set_title('Articles Sold from 2018-09-20 to 2020-09-22', fontsize=18, pad=18, color=font_color, **csfont,fontweight = 'bold')

# Call-out box placed 55 points to the right of the peak
ax.annotate('Maximum Articles Sold',
            xy=(x_line_annotation, peak_row['article_id']),
            xycoords='data',
            xytext=(55, 0), textcoords='offset points',
            size=15, va="center",
            color='#3e0542',
            bbox=dict(boxstyle="round",facecolor='#f7daf7', edgecolor='#3e0542'),
            arrowprops=dict(arrowstyle="wedge,tail_width=1.",
                            facecolor='#f7daf7',
                            edgecolor='#3e0542',
                            relpos=(0.1, 0.4)))

ttl.set_position([.5, 1.02])
ax.set_ylabel('No of Articles', color=font_color, fontsize=16, **hfont,fontweight = 'bold')
plt.xticks(color=font_color, **hfont,fontweight = 'bold')
plt.yticks(color=font_color, **hfont,fontweight = 'bold')
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(14)
# Adjust subplots so that the title and labels would fit
plt.subplots_adjust(top=0.85, bottom=0.3, left=0.1, right=0.9)

The graph shows distinct spikes across the 3 years. The maximum, 198,622 articles, was sold on 2019-09-28, which breaks all records for the 3 years, whereas the second-highest total of 162,799 articles was sold on 2020-04-11 and the third-highest total of 160,875 articles was sold on 2019-11-29.

# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:150%;text-align:center;border-radius:10px 10px;border-style:solid;border-color:#3e0542;">Transaction Amount</p>

In [None]:
# Total transaction amount per day. `price` is selected before summing so the
# aggregation does not also run over the other columns (including the id
# columns), which is wasteful and, depending on the pandas version, may
# concatenate or reject string columns.
yearp = transactions.groupby('t_dat')[['price']].sum()
yearwise1p = yearp.reset_index()
# Largest daily transaction amount
print (yearwise1p.price.max())

In [None]:
# The five days with the largest total transaction amount
yearasec = yearwise1p.sort_values(by='price', ascending=False).iloc[:5]
yearasec

In [None]:
# Parse the day column into datetimes. The dates appear in dashed 'YYYY-MM-DD'
# form (see the plot title below), so the format string uses dashes; the former
# '%Y/%m/%d' does not match such strings under strict format parsing.
yearwise1p['t_dat'] = pd.to_datetime(yearwise1p['t_dat'], format='%Y-%m-%d')

# Locate the day with the largest amount from the data instead of hard-coding
# its coordinates, so the annotation arrow always points at the real maximum.
peak_row = yearwise1p.loc[yearwise1p['price'].idxmax()]
x_line_annotation = pd.Timestamp(peak_row['t_dat']).to_pydatetime()

sns.set(rc={'axes.facecolor':'#faf0fa'}) # graph facecolor
font_color = '#3e0542'
csfont = {'fontname':'Georgia'} # title font
hfont = {'fontname':'Calibri'} # main font
fig, ax = plt.subplots(facecolor='#d6b6d5')
fig.set_size_inches(18,12)

# Daily transaction amount over time
ax.plot('t_dat', 'price', data=yearwise1p, color='#6a0573', linewidth=2)

plt.tight_layout()
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
ax.xaxis.set_major_locator(ticker.MultipleLocator(100))  # one major tick every 100 days
plt.xticks(rotation=90)
ttl = ax.set_title('Transaction amount from 2018-09-20 to 2020-09-22', fontsize=18, pad=18, color=font_color, **csfont,fontweight = 'bold')

# Call-out box placed 55 points to the right of the peak
ax.annotate('Maximum Amount Transaction',
            xy=(x_line_annotation, peak_row['price']),
            xycoords='data',
            xytext=(55, 0), textcoords='offset points',
            size=15, va="center",
            color='#3e0542',
            bbox=dict(boxstyle="round",facecolor='#f7daf7', edgecolor='#3e0542'),
            arrowprops=dict(arrowstyle="wedge,tail_width=1.",
                            facecolor='#f7daf7',
                            edgecolor='#3e0542',
                            relpos=(0.1, 0.4)))

ttl.set_position([.5, 1.02])
ax.set_ylabel('Amount', color=font_color, fontsize=16, **hfont,fontweight = 'bold')
plt.xticks(color=font_color, **hfont,fontweight = 'bold')
plt.yticks(color=font_color, **hfont,fontweight = 'bold')
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(14)
# Adjust subplots so that the title and labels would fit
plt.subplots_adjust(top=0.85, bottom=0.3, left=0.1, right=0.9)

We see that the maximum daily transaction amount is 6,161, on 2019-09-28, whereas the second-highest is 4,444, on 2019-11-29, i.e. roughly 28% lower than the peak.

# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:150%;text-align:center;border-radius:10px 10px;border-style:solid;border-color:#3e0542;">Top Ten Customers</p>

In [None]:
# Number of purchased articles per customer, largest first. `article_id` is
# selected before counting so only that column is aggregated per customer.
check = (
    transactions.groupby('customer_id')[['article_id']]
    .count()
    .sort_values('article_id', ascending=False)
)
# Keep the ten most active customers and expose customer_id as a regular column
newcheck = check.head(10)
newcheck1 = newcheck.reset_index()

In [None]:
# Render the ten most active customers as a two-column Plotly table
table_header = dict(
    values=[['<b>Customer ID</b><br>Top 10'], ['<b>Item Count</b>']],
    fill_color='#d6b6d5',
    font_color="#3e0542",
    align='left',
)
table_cells = dict(
    values=[newcheck1.customer_id, newcheck1.article_id],
    fill_color='#faf0fa',
    align='left',
)
fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])

fig.show()

As the table above shows, the top 10 customers are those who each bought more than 1,000 articles in the last three years.

# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:150%;text-align:center;border-radius:10px 10px;border-style:solid;border-color:#3e0542;">Articles Sold by Age Distribution</p>

In [None]:
# Keep only the customer key and the age. Plain column selection raises a
# KeyError on a misspelled column name instead of silently adding a NaN column.
custfin = customers[['customer_id', 'age']]

In [None]:
# Attach each customer's age to every transaction (all transaction rows are kept)
total_tran = custfin.merge(transactions, how='right', on='customer_id')

In [None]:
# Missing ages become 0 and the column is stored as integers. The result is
# assigned back to the column: an in-place fillna on a column selection is a
# chained assignment and is not guaranteed to modify `total_tran` itself.
total_tran["age"] = total_tran["age"].fillna(0).astype(int)

In [None]:
# Histogram of purchased articles by customer age (age 0 stands for "not given")
sns.set(rc={'axes.facecolor':'#faf0fa'}) # graph facecolor
font_color = '#3e0542'
csfont = {'fontname':'Georgia'} # title font
hfont = {'fontname':'Calibri'} # main font
fig, ax = plt.subplots(facecolor='#d6b6d5')
fig.set_size_inches(18,12)
sns.histplot(data=total_tran, x='age', bins=50, color='#6a0573', ax=ax)

# Create this figure's title before positioning it. Previously `ttl` was
# repositioned before being assigned here, so the call acted on the title
# object left over from an earlier figure.
ttl = ax.set_title('Articles sold by Age', fontsize=18, pad=18, color=font_color, **csfont,fontweight = 'bold')
ttl.set_position([.5, 1.02])

# Call-out for the bar at age 0; the y coordinate is a fixed value chosen for this dataset
ax.annotate('Age \n not given',
            xy=(0,216100),
            xycoords='data',
            xytext=(55, 0), textcoords='offset points',
            size=10, va="center",
            color='#3e0542',
            bbox=dict(boxstyle="round",facecolor='#f7daf7', edgecolor='#3e0542'),
            arrowprops=dict(arrowstyle="wedge,tail_width=1.",
                            facecolor='#f7daf7',
                            edgecolor='#3e0542',
                            relpos=(0.1, 0.4)))

# Dashed guides marking the age range highlighted as the main buyer group
ax.axvline(x=20, linestyle='dashed', alpha=0.5,color='#3e0542')
ax.axvline(x=38, linestyle='dashed', alpha=0.5,color='#3e0542')
ax.text(x=20.5, y=3161000, s='Maximum buyers range', alpha=0.7, color='#3e0542',fontweight = 'bold')
ax.set_ylabel('No of Articles', color=font_color, fontsize=16, **hfont,fontweight = 'bold')
ax.set_xlabel('Age', color=font_color, fontsize=16, **hfont,fontweight = 'bold')
plt.xticks(color=font_color, **hfont,fontweight = 'bold')
plt.yticks(color=font_color, **hfont,fontweight = 'bold')
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(14)

# Adjust subplots so that the title and labels would fit
plt.subplots_adjust(top=0.85, bottom=0.3, left=0.1, right=0.9)

The distribution shows that the maximum buyer range is from 21 to 39 years. For a few customers the age is not given.

# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:150%;text-align:center;border-radius:10px 10px;border-style:solid;border-color:#3e0542;">Articles Sold by Sales Channel</p>

In [None]:
# Keep only the columns needed for the sales-channel analysis. Plain column
# selection raises a KeyError on a misspelled name instead of adding a NaN column.
datanew = total_tran[['price', 't_dat', 'sales_channel_id', 'article_id']]
# Number of articles sold per day and sales channel
df = datanew.groupby(["t_dat", "sales_channel_id"])["article_id"].count().reset_index()
print (df.article_id.max())
print (df.article_id.min())

In [None]:
# The five (day, sales channel) combinations with the fewest articles sold
dfasec = df.sort_values(by='article_id', ascending=True).iloc[:5]
dfasec

In [None]:
# Parse the day column into datetimes. The format uses dashes to match the
# dashed 'YYYY-MM-DD' dates used elsewhere in this notebook; '%Y/%m/%d' does
# not match such strings under strict format parsing.
df['t_dat'] = pd.to_datetime(df['t_dat'], format='%Y-%m-%d')

In [None]:
# Axis limits for the sales-channel plot: the full date span on x, and the
# range of daily counts padded by 1,000 on each side on y
xmin, xmax = df["t_dat"].min(), df["t_dat"].max()

article_counts = df["article_id"]
ymin = article_counts.min() - 1000
ymax = article_counts.max() + 1000

In [None]:
# Articles sold per day, one line per sales channel.
# Expects `df['t_dat']` to hold datetimes and xmin/xmax/ymin/ymax to be set by the preceding cells.
sns.set(rc={'axes.facecolor':'#faf0fa'}) # graph facecolor
font_color = '#3e0542'
csfont = {'fontname':'Georgia'} # title font
hfont = {'fontname':'Calibri'} # main font
fig, ax = plt.subplots(facecolor='#d6b6d5')
fig.set_size_inches(18,12)

for channel_id, line_color in ((1, "Darkblue"), (2, "Magenta")):
    channel_rows = df[df["sales_channel_id"] == channel_id]
    ax.plot(channel_rows["t_dat"], channel_rows["article_id"], label=f"Sales Channel {channel_id}", color=line_color)

# Locate the overall peak in this cell's own data rather than relying on an
# `x_line_annotation` value left behind by an earlier plotting cell.
peak_row = df.loc[df["article_id"].idxmax()]
ax.annotate('Maximum Items Sold',
            xy=(pd.Timestamp(peak_row["t_dat"]).to_pydatetime(), peak_row["article_id"]),
            xycoords='data',
            xytext=(55, 0), textcoords='offset points',
            size=15, va="center",
            color='#3e0542',
            bbox=dict(boxstyle="round",facecolor='#f7daf7', edgecolor='#3e0542'),
            arrowprops=dict(arrowstyle="wedge,tail_width=1.",
                            facecolor='#f7daf7',
                            edgecolor='#3e0542',
                            relpos=(0.1, 0.4)))
ax.set_ylim(ymin, ymax)
ax.set_xlim(xmin, xmax)

# Shade the period highlighted as having missing transactions. The dates are
# converted with date2num instead of being written as raw day numbers; with
# matplotlib's default 1970 epoch these evaluate to 18343 and 18384.
gap_start = mdates.date2num(dt.datetime(2020, 3, 22))
gap_end = mdates.date2num(dt.datetime(2020, 5, 2))
ax.fill_betweenx([ymin,ymax], gap_start, gap_end, color="gray", alpha=0.3)

props = dict(boxstyle='round',facecolor='#f7daf7', edgecolor='#3e0542', alpha=0.5)
# Arrow tip just before the shaded band; text box placed further left (data coordinates)
ax.annotate("Missing transaction\n period",
            (mdates.date2num(dt.datetime(2020, 3, 21)), 17000),
            (mdates.date2num(dt.datetime(2020, 1, 9)), 67000),
            arrowprops={"arrowstyle": "->", "color":"C1"},
            color='#3e0542',
            bbox=props,
            fontstyle='italic'  # `fontproperties='italic'` would be read as a font family name
            )

ax.set_ylabel('No of Articles', color=font_color, fontsize=16, **hfont,fontweight = 'bold')

plt.xticks(color=font_color, **hfont,fontweight = 'bold')
plt.yticks(color=font_color, **hfont,fontweight = 'bold')
plt.xticks(rotation=90)
legend=plt.legend(title="Sales Channel ID",labelcolor='linecolor')
plt.setp(legend.get_title(), color='#3e0542',fontweight = 'bold')
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(14)
plt.title("Articles sold by Sales Channel",fontsize=18, pad=18, color=font_color, **csfont,fontweight = 'bold')
plt.subplots_adjust(top=0.85, bottom=0.3, left=0.1, right=0.9)

There are two sales channels, 1 and 2. Sales channel 2 records higher sales than sales channel 1. Transactions are also missing for sales channel 1 for a few months.

# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:150%;text-align:center;border-radius:10px 10px;border-style:solid;border-color:#3e0542;">Transaction Amount by Sales Channel</p>

In [None]:
# Total transaction amount per day and sales channel
channel_keys = ["t_dat", "sales_channel_id"]
dfp = datanew.groupby(channel_keys)["price"].sum().reset_index()
# Largest daily amount reached by a single sales channel
print (dfp.price.max())

In [None]:
# Parse the day column into datetimes. The format uses dashes to match the
# dashed 'YYYY-MM-DD' dates used elsewhere in this notebook; '%Y/%m/%d' does
# not match such strings under strict format parsing.
dfp['t_dat'] = pd.to_datetime(dfp['t_dat'], format='%Y-%m-%d')
sns.set(rc={'axes.facecolor':'#faf0fa'}) # graph facecolor
font_color = '#3e0542'
csfont = {'fontname':'Georgia'} # title font
hfont = {'fontname':'Calibri'} # main font
fig, ax = plt.subplots(facecolor='#d6b6d5')
fig.set_size_inches(18,12)

# Daily transaction amount, one line per sales channel
for channel_id, line_color in ((1, "Darkblue"), (2, "Magenta")):
    channel_rows = dfp[dfp["sales_channel_id"] == channel_id]
    ax.plot(channel_rows["t_dat"], channel_rows["price"], label=f"Sales Channel {channel_id}", color=line_color)

# Locate the overall peak in this cell's own data rather than relying on an
# `x_line_annotation` value left behind by an earlier plotting cell.
peak_row = dfp.loc[dfp["price"].idxmax()]
ax.annotate('Maximum amount',
            xy=(pd.Timestamp(peak_row["t_dat"]).to_pydatetime(), peak_row["price"]),
            xycoords='data',
            xytext=(55, 0), textcoords='offset points',
            size=15, va="center",
            color='#3e0542',
            bbox=dict(boxstyle="round",facecolor='#f7daf7', edgecolor='#3e0542'),
            arrowprops=dict(arrowstyle="wedge,tail_width=1.",
                            facecolor='#f7daf7',
                            edgecolor='#3e0542',
                            relpos=(0.1, 0.4)))

ax.set_ylabel('Amount', color=font_color, fontsize=16, **hfont,fontweight = 'bold')

plt.xticks(color=font_color, **hfont,fontweight = 'bold')
plt.yticks(color=font_color, **hfont,fontweight = 'bold')
plt.xticks(rotation=90)
legend=plt.legend(title="Sales Channel ID",labelcolor='linecolor')
plt.setp(legend.get_title(), color='#3e0542',fontweight = 'bold')
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(14)
plt.title("Transactions Amount by Sales Channel",fontsize=18, pad=18, color=font_color, **csfont,fontweight = 'bold')
plt.subplots_adjust(top=0.85, bottom=0.3, left=0.1, right=0.9)

Sales channel 2 has the highest transaction amount in the 3 years. The maximum transaction amount, on 2019-09-28, was also reached by sales channel 2. Transaction amounts are missing for sales channel 1 for a few months.

# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:150%;text-align:center;border-radius:10px 10px;border-style:solid;border-color:#3e0542;">Articles sold by Product Group</p>

In [None]:
# Slim views used for the article-attribute analyses. Plain column selection
# raises a KeyError on a misspelled name instead of silently adding a NaN column.
datanew1 = transactions[['article_id', 'price']]
artdept = articles[['article_id', 'colour_group_name', 'department_name', 'index_group_name', 'product_group_name']]


In [None]:
# Attach the article attributes to every transaction (all transaction rows are kept)
department = artdept.merge(datanew1, how='right', on='article_id')

In [None]:
# Articles sold per (product group, index group) pair, largest first
group_keys = ['product_group_name', 'index_group_name']
departmentc = department.groupby(group_keys)['article_id'].count().reset_index()
departmentc1 = departmentc.sort_values(by='article_id', ascending=False)

In [None]:
# Horizontal grouped bar chart: articles sold per product group, split by index group
sns.set(rc={'axes.facecolor':'#faf0fa'}) # graph facecolor
font_color = '#3e0542'
csfont = {'fontname':'Georgia'} # title font
hfont = {'fontname':'Calibri'} # main font
flatui = ["#9b59b6", "#3498db", "#e74c3c", "#34495e", "#2ecc71"]
fig, ax = plt.subplots(facecolor='#d6b6d5')
fig.set_size_inches(18,16)

# Draw on the axes created above (they are also the current pyplot axes)
sns.barplot(
    x="article_id", y="product_group_name", hue="index_group_name",
    data=departmentc1, palette=flatui,
    alpha=0.6, edgecolor='k', linewidth=2, ax=ax,
)

plt.tight_layout()

ttl = ax.set_title('Articles Sold by Product Group', fontsize=18, pad=18, color=font_color, **csfont,fontweight = 'bold')
ttl.set_position([.5, 1.02])

# Shared text styling for the axis labels and the tick labels
axis_label_style = dict(color=font_color, fontsize=16, fontweight='bold', **hfont)
tick_style = dict(color=font_color, fontweight='bold', **hfont)
ax.set_ylabel('Product Group', **axis_label_style)
ax.set_xlabel('No of Articles', **axis_label_style)
plt.xticks(**tick_style)
plt.yticks(**tick_style)
legend = plt.legend(title="Index Group Name")
plt.setp(legend.get_title(), fontweight='bold')
for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
    tick_label.set_fontsize(14)

# Leave room around the axes for the title and the labels
plt.subplots_adjust(top=0.85, bottom=0.3, left=0.1, right=0.9)

As the chart shows, the Ladieswear index group is dominant in the top 5 product groups. However, the second place is divided among the other index groups.

# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:150%;text-align:center;border-radius:10px 10px;border-style:solid;border-color:#3e0542;">Articles Sold by Department</p>

In [None]:
# Articles sold per department; keep the 50 best-selling departments
departmentp = department.groupby('department_name')['article_id'].count().reset_index()
departmentp1 = departmentp.sort_values(by='article_id', ascending=False).iloc[:50]

In [None]:
# Horizontal bar chart: articles sold per department
sns.set(rc={'axes.facecolor':'#faf0fa'}) # graph facecolor
font_color = '#3e0542'
csfont = {'fontname':'Georgia'} # title font
hfont = {'fontname':'Calibri'} # main font
fig, ax = plt.subplots(facecolor='#d6b6d5')
fig.set_size_inches(18,16)

# Draw on the axes created above (they are also the current pyplot axes)
sns.barplot(
    x="article_id", y="department_name", data=departmentp1,
    palette="flare", alpha=0.6, edgecolor='k', linewidth=2, ax=ax,
)
plt.tight_layout()

ttl = ax.set_title('Articles Sold by Department', fontsize=18, pad=18, color=font_color, **csfont,fontweight = 'bold')
ttl.set_position([.5, 1.02])

# Shared text styling for the axis labels and the tick labels
axis_label_style = dict(color=font_color, fontsize=16, fontweight='bold', **hfont)
tick_style = dict(color=font_color, fontweight='bold', **hfont)
ax.set_ylabel('Department Name', **axis_label_style)
ax.set_xlabel('No of Articles', **axis_label_style)
plt.xticks(**tick_style)
plt.yticks(**tick_style)

for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
    tick_label.set_fontsize(14)

# Leave room around the axes for the title and the labels
plt.subplots_adjust(top=0.85, bottom=0.3, left=0.1, right=0.9)

As the chart shows, the swimwear, trouser and blouse departments are the most dominant.

# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:150%;text-align:center;border-radius:10px 10px;border-style:solid;border-color:#3e0542;">Articles Sold by Colour</p>

In [None]:
# Article attributes used for the colour and garment analyses. Plain column
# selection raises a KeyError on a misspelled name instead of adding a NaN column.
art1 = articles[['article_id', 'colour_group_name', 'garment_group_name', 'section_name', 'product_type_name']]


In [None]:
# Attach the article attributes to every transaction (all transaction rows are kept)
dep1 = art1.merge(datanew1, how='right', on='article_id')

In [None]:
# Articles sold per colour group, largest first
dep2 = dep1.groupby('colour_group_name')['article_id'].count().reset_index()
depa = dep2.sort_values(by='article_id', ascending=False)

In [None]:
# Horizontal bar chart: articles sold per colour group
sns.set(rc={'axes.facecolor':'#faf0fa'}) # graph facecolor
font_color = '#3e0542'
csfont = {'fontname':'Georgia'} # title font
hfont = {'fontname':'Calibri'} # main font
fig, ax = plt.subplots(facecolor='#d6b6d5')
fig.set_size_inches(18,16)

# Draw on the axes created above (they are also the current pyplot axes)
sns.barplot(
    x="article_id", y="colour_group_name", data=depa,
    palette="flare", alpha=0.6, edgecolor='k', linewidth=2, ax=ax,
)
plt.tight_layout()

ttl = ax.set_title('Articles Sold by Colour', fontsize=18, pad=18, color=font_color, **csfont,fontweight = 'bold')
ttl.set_position([.5, 1.02])

# Shared text styling for the axis labels and the tick labels
axis_label_style = dict(color=font_color, fontsize=16, fontweight='bold', **hfont)
tick_style = dict(color=font_color, fontweight='bold', **hfont)
ax.set_ylabel('Colour Group', **axis_label_style)
ax.set_xlabel('No of Articles', **axis_label_style)
plt.xticks(**tick_style)
plt.yticks(**tick_style)

for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
    tick_label.set_fontsize(14)

# Leave room around the axes for the title and the labels
plt.subplots_adjust(top=0.85, bottom=0.3, left=0.1, right=0.9)

Black color garments are highly demanded by the customers.

# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:150%;text-align:center;border-radius:10px 10px;border-style:solid;border-color:#3e0542;">Articles Sold by Garment</p>

In [None]:
# Articles sold per garment group, largest first
dep3 = dep1.groupby('garment_group_name')['article_id'].count().reset_index()
depa1 = dep3.sort_values(by='article_id', ascending=False)

In [None]:
# Lollipop chart: articles sold per garment group
sns.set(rc={'axes.facecolor':'#faf0fa'}) # graph facecolor
font_color = '#3e0542'
csfont = {'fontname':'Georgia'} # title font
hfont = {'fontname':'Calibri'} # main font
fig, ax = plt.subplots(facecolor='#d6b6d5')
fig.set_size_inches(18,16)
# Stems from 0 to each group's count. Only `color` is passed: the former extra
# `edgecolor='k'` was overridden by `color` and had no visible effect.
plt.hlines(data=depa1,y="garment_group_name",xmin=0,xmax="article_id",alpha = 0.6, linewidth = 2,color='#3e0542')
# Diamond marker at the end of every stem
plt.plot(depa1['article_id'], depa1['garment_group_name'], "D")

plt.tight_layout()
ttl = ax.set_title('Articles Sold by Garment', fontsize=18, pad=18, color=font_color, **csfont,fontweight = 'bold')
ttl.set_position([.5, 1.02])
# Axis label spelling corrected (was 'Grament Group')
ax.set_ylabel('Garment Group', color=font_color, fontsize=16, **hfont,fontweight = 'bold')
ax.set_xlabel('No of Articles', color=font_color, fontsize=16, **hfont,fontweight = 'bold')
plt.xticks(color=font_color, **hfont,fontweight = 'bold')
plt.yticks(color=font_color, **hfont,fontweight = 'bold')

for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(14)
# Adjust subplots so that the title and labels would fit

plt.subplots_adjust(top=0.85, bottom=0.3, left=0.1, right=0.9)

Jersey Fancy and Jersey Basic are the garment groups that customers buy most frequently.

# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:150%;text-align:center;border-radius:10px 10px;border-style:solid;border-color:#3e0542;">Fashion News Frequency</p>

In [None]:
# Customer attributes and the per-transaction customer key. Plain column
# selection raises a KeyError on a misspelled name instead of adding a NaN column.
cus = customers[['club_member_status', 'fashion_news_frequency', 'customer_id']]
trans = transactions[['customer_id']]

In [None]:
# Attach the customer attributes to every transaction row (one output row per transaction)
custran = cus.merge(trans, how='right', on='customer_id')

In [None]:
# Number of distinct purchasing customers per fashion-news frequency.
# `custran` holds one row per transaction, so a plain count() would count
# transactions; nunique() counts each customer once, which is what the chart
# built from this table presents ("No of Customers").
ct1 = custran.groupby(['fashion_news_frequency'])['customer_id'].nunique().reset_index()
ct2 = ct1.sort_values(['customer_id'], ascending=False)

In [None]:
# Bar chart of the per-category totals in `ct2`, one bar per fashion-news frequency
sns.set(rc={'axes.facecolor':'#faf0fa'}) # graph facecolor
font_color = '#3e0542'
csfont = {'fontname':'Georgia'} # title font
hfont = {'fontname':'Calibri'} # main font
fig, ax = plt.subplots(facecolor='#d6b6d5')
fig.set_size_inches(18,8)
sns.barplot(data=ct2,x="fashion_news_frequency", y="customer_id",palette="flare",alpha = 0.6, edgecolor = 'k', linewidth = 2, ax=ax)
ttl = ax.set_title('Fashion News Frequency', fontsize=18, pad=18, color=font_color, **csfont,fontweight = 'bold')
ttl.set_position([.5, 1.02])
# Axis label spelling corrected (was 'No of Cutomers')
ax.set_ylabel('No of Customers', color=font_color, fontsize=16, **hfont,fontweight = 'bold')
ax.set_xlabel('Fashion News Frequency', color=font_color, fontsize=16, **hfont,fontweight = 'bold')
plt.xticks(color=font_color, **hfont,fontweight = 'bold')
plt.yticks(color=font_color, **hfont,fontweight = 'bold')
# Adjust subplots so that the title and labels would fit
plt.subplots_adjust(top=0.85, bottom=0.3, left=0.1, right=0.9)

We see that most of the customers do not receive any regular updates about fashion news.

# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:150%;text-align:center;border-radius:10px 10px;border-style:solid;border-color:#3e0542;">Club Member Status</p>

In [None]:
# Number of distinct purchasing customers per club-member status.
# `custran` holds one row per transaction, so a plain count() would count
# transactions; nunique() counts each customer once, which is what the chart
# built from this table presents ("No of Customers").
cms1 = custran.groupby(['club_member_status'])['customer_id'].nunique().reset_index()
cms2 = cms1.sort_values(['customer_id'], ascending=False)

In [None]:
# Bar chart of the per-category totals in `cms2`, one bar per club-member status
sns.set(rc={'axes.facecolor':'#faf0fa'}) # graph facecolor
font_color = '#3e0542'
csfont = {'fontname':'Georgia'} # title font
hfont = {'fontname':'Calibri'} # main font
fig, ax = plt.subplots(facecolor='#d6b6d5')
fig.set_size_inches(18,8)
sns.barplot(data=cms2,x="club_member_status", y="customer_id",palette="flare",alpha = 0.6, edgecolor = 'k', linewidth = 2, ax=ax)
ttl = ax.set_title('Club Member Status', fontsize=18, pad=18, color=font_color, **csfont,fontweight = 'bold')
ttl.set_position([.5, 1.02])
# Axis label spelling corrected (was 'No of Cutomers')
ax.set_ylabel('No of Customers', color=font_color, fontsize=16, **hfont,fontweight = 'bold')
ax.set_xlabel('Club Member Status', color=font_color, fontsize=16, **hfont,fontweight = 'bold')
plt.xticks(color=font_color, **hfont,fontweight = 'bold')
plt.yticks(color=font_color, **hfont,fontweight = 'bold')
# Adjust subplots so that the title and labels would fit
plt.subplots_adjust(top=0.85, bottom=0.3, left=0.1, right=0.9)

Most of the customers have an active membership status; only a few are in pre-create status, and no one is in left-club status.

# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:150%;text-align:center;border-radius:10px 10px;border-style:solid;border-color:#3e0542;">Word Cloud for Description</p>

In [None]:
# Sample up to 5,000 non-null article descriptions for the word cloud.
# A fixed random_state makes the sample (and so the cloud) reproducible across
# runs, and the sample size is capped at the number of available descriptions
# so the call cannot fail when fewer than 5,000 exist.
available_desc = articles['detail_desc'].dropna()
prod_desc = available_desc.sample(n=min(5000, len(available_desc)), random_state=42).values

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Build one text from all sampled descriptions and generate the cloud from it,
# leaving out the library's default stop words
stopwords = set(STOPWORDS)
description_text = ' '.join(prod_desc)
cloud_builder = WordCloud(
    width=800,
    height=800,
    background_color='white',
    min_font_size=10,
    stopwords=stopwords,
)
wordcloud = cloud_builder.generate(description_text)

# Show the cloud as an image without axes or padding
plt.figure(figsize=(8, 8), facecolor='#d6b6d5')
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:150%;text-align:center;border-radius:10px 10px;border-style:solid;border-color:#3e0542;">Mean Price for Articles</p>

In [None]:
# Article key together with its product group, used for the price-over-time charts
merge_columns = ['article_id', 'product_group_name']
articles_for_merge = articles[merge_columns]

In [None]:
# Price and date of every transaction together with the article's product group.
# The right-hand side is taken from `articles` directly rather than from the
# previous value of `articles_for_merge`, so re-running this cell gives the
# same result instead of merging the already-merged frame again.
articles_for_merge = transactions[['article_id', 'price', 't_dat']].merge(
    articles[['article_id', 'product_group_name']], on='article_id', how='left'
)

In [None]:
# Parse the day column into datetimes. The format uses dashes to match the
# dashed 'YYYY-MM-DD' dates used elsewhere in this notebook; '%Y/%m/%d' does
# not match such strings under strict format parsing.
articles_for_merge['t_dat'] = pd.to_datetime(articles_for_merge['t_dat'], format='%Y-%m-%d')

In [None]:
# Monthly mean price per product group with a +/- 2 standard deviation band,
# one panel per product group on a 3 x 2 grid.
product_list = ['Shoes', 'Garment Full body', 'Bags', 'Garment Lower body', 'Underwear/nightwear','Accessories']
colors = ['cadetblue', 'orange', 'mediumspringgreen', 'tomato', 'lightseagreen','skyblue']
sns.set(rc={'axes.facecolor':'#faf0fa'}) # graph facecolor
font_color = '#3e0542'
csfont = {'fontname':'Georgia'} # title font
hfont = {'fontname':'Calibri'} # main font
fig, ax = plt.subplots(3, 2, figsize=(20, 15),facecolor = '#d6b6d5')

# Pair every panel (row by row) with its product group and colour, instead of
# advancing a manual counter and catching IndexError once the list runs out.
panels = ax.ravel()
for panel, product, color in zip(panels, product_list, colors):
    product_prices = articles_for_merge.loc[articles_for_merge.product_group_name == product, ['t_dat', 'price']]
    # Month-end bins; mean and standard deviation come from a single aggregation.
    # Months without data are filled with 0.
    # NOTE(review): newer pandas releases prefer the alias 'ME' for month-end bins.
    monthly = product_prices.groupby(pd.Grouper(key="t_dat", freq='M'))['price'].agg(['mean', 'std']).fillna(0)
    panel.plot(monthly.index, monthly['mean'], linewidth=4, color=color)
    panel.fill_between(monthly.index,
                       (monthly['mean'] - 2*monthly['std']).values,
                       (monthly['mean'] + 2*monthly['std']).values,
                       color=color, alpha=.1)
    panel.set_title(f'Mean {product} price in time',fontsize=12, pad=18, color=font_color, **csfont,fontweight = 'bold')
    panel.tick_params(axis='x', colors=font_color)
    panel.tick_params(axis='y', colors=font_color)

# Hide any panel that did not receive a product group
used_panels = min(len(panels), len(product_list), len(colors))
for panel in panels[used_panels:]:
    panel.set_visible(False)
plt.show()


# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:150%;text-align:center;border-radius:10px 10px;border-style:solid;border-color:#3e0542;">Top 5 Articles with maximum price</p>

In [None]:
# Five highest-priced transactions on the most recent transaction date
# (only that single day is considered, not the whole history)
latest_day_rows = transactions[transactions.t_dat == transactions.t_dat.max()]
max_price_ids = latest_day_rows.sort_values('price', ascending=False).iloc[:5][['article_id', 'price']]

In [None]:
# Show the image, price and description of each article in `max_price_ids`, one panel per article.
f, ax = plt.subplots(1, 5, figsize=(20,10),facecolor = '#d6b6d5')
# itertuples keeps article_id in its own dtype (iterrows would upcast it to
# float next to the float price); zip pairs every row with its panel so no
# manual counter is needed.
for panel, row in zip(ax, max_price_ids.itertuples(index=False)):
    # Image files are named with a leading '0' in front of the numeric id and
    # live in a folder named after the first three characters of that code.
    article_code = f'0{int(row.article_id)}'
    desc = articles.loc[articles['article_id'] == row.article_id, 'detail_desc'].iloc[0]
    # A missing description is not a string; show an empty label instead of failing
    words = desc.split(' ') if isinstance(desc, str) else []
    # Break the line after every 5th word so the label stays within its panel
    words = [word + '\n' if pos > 0 and pos % 5 == 0 else word for pos, word in enumerate(words)]
    desc = ' '.join(words)
    img = mpimg.imread(f'../input/h-and-m-personalized-fashion-recommendations/images/{article_code[:3]}/{article_code}.jpg')
    panel.imshow(img)
    panel.set_title(f'price: {row.price:.2f}',fontweight = 'bold')
    panel.set_xticks([])
    panel.set_yticks([])
    panel.grid(False)
    panel.set_xlabel(desc, fontsize=10,fontweight = 'bold')
plt.show()

# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:150%;text-align:center;border-radius:10px 10px;border-style:solid;border-color:#3e0542;">Top 5 Articles with minimum price</p>

In [None]:
# Five lowest-priced transactions on the earliest transaction date
# (only that single day is considered, not the whole history)
first_day_rows = transactions[transactions.t_dat == transactions.t_dat.min()]
min_price_ids = first_day_rows.sort_values('price', ascending=True).iloc[:5][['article_id', 'price']]

In [None]:
# Show the image, price and description of each article in `min_price_ids`, one panel per article.
f, ax = plt.subplots(1, 5, figsize=(20,10),facecolor = '#d6b6d5')
# itertuples keeps article_id in its own dtype (iterrows would upcast it to
# float next to the float price); zip pairs every row with its panel so no
# manual counter is needed.
for panel, row in zip(ax, min_price_ids.itertuples(index=False)):
    # Image files are named with a leading '0' in front of the numeric id and
    # live in a folder named after the first three characters of that code.
    article_code = f'0{int(row.article_id)}'
    desc = articles.loc[articles['article_id'] == row.article_id, 'detail_desc'].iloc[0]
    # A missing description is not a string; show an empty label instead of failing
    words = desc.split(' ') if isinstance(desc, str) else []
    # Break the line after every 4th word so the label stays within its panel
    words = [word + '\n' if pos > 0 and pos % 4 == 0 else word for pos, word in enumerate(words)]
    desc = ' '.join(words)
    img = mpimg.imread(f'../input/h-and-m-personalized-fashion-recommendations/images/{article_code[:3]}/{article_code}.jpg')
    panel.imshow(img)
    panel.set_title(f'price: {row.price:.4f}',fontweight = 'bold')
    panel.set_xlabel(desc, fontsize=10,fontweight = 'bold')
    panel.set_xticks([])
    panel.set_yticks([])
    panel.grid(False)
# No trailing plt.axis('off'): it acted only on the last panel and hid that
# panel's description label, unlike the other four panels.
plt.show()

# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:90%;text-align:center;">Please check out my notebook on H and M Fashion Recommendation. Click on the link below.</p>

# https://www.kaggle.com/code/nadianizam/h-m-fashion-recommendation-with-pyspark

# <p style="background-color:#d6b6d5;font-family:newtimeroman;color:#3e0542;font-size:150%;text-align:center;border-radius:10px 10px;border-style:solid;border-color:#3e0542;">Please do leave your comments / suggestions, and if you like this kernel, an UPVOTE would be greatly appreciated</p>