In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the three competition tables (articles, customers, transactions) from the
# Kaggle input directory, in that order.
articles, customers, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/h-and-m-personalized-fashion-recommendations/articles.csv',
        '../input/h-and-m-personalized-fashion-recommendations/customers.csv',
        '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    )
)

**Article data**

In [None]:
# Preview the first rows of the article table.
articles.head()

In [None]:
# (number of rows, number of columns) of the article table.
articles.shape

In [None]:
# Print, for every article column, how many distinct values it holds
# (unique() keeps NaN, so a missing value counts as one distinct value).
for column_name in articles.columns:
    distinct_values = articles[column_name].unique()
    print('{} unique number:'.format(column_name), len(distinct_values))

In [None]:
# Column dtypes and non-null counts of the article table.
articles.info()

In [None]:
# Number of articles (non-null article_id) per index_group_name / index_name pair.
articles.groupby(['index_group_name', 'index_name'])['article_id'].count()

In [None]:
# Bar chart of the number of distinct articles under each index_name.
data = articles.groupby(['index_name'])['article_id'].nunique()
x = data.index
y = data.values

fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(x, y)
ax.set_title('Number of Products per each index_name')
ax.set_ylabel('counts')
# Restyle the tick labels that bar() already produced instead of calling
# set_xticklabels(): that method is discouraged unless the tick positions were
# fixed first, and can otherwise warn or attach labels to the wrong ticks.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

In [None]:
# Number of article rows in each product_group_name.
articles.groupby('product_group_name').size()

In [None]:
# Number of articles per product_group_name / product_type_name pair.
# Tip: set `pd.options.display.max_rows = None` beforehand to see every row.
articles.groupby(['product_group_name', 'product_type_name'])['article_id'].count()

In [None]:
# Number of article rows in each garment_group_name.
articles.groupby('garment_group_name').size()

In [None]:
# Pick the first five articles of the 'Special Offers' and 'Unknown' garment
# groups so their images can be inspected in the following cells.
import matplotlib.image as mpimg

sample_columns = ['article_id', 'prod_name', 'detail_desc']
special_offers = articles.loc[articles['garment_group_name'] == 'Special Offers', sample_columns].head(5)
unknowns = articles.loc[articles['garment_group_name'] == 'Unknown', sample_columns].head(5)

In [None]:
# Show the sampled 'Special Offers' articles side by side: product image,
# product name as the title and the word-wrapped description underneath.
f, ax = plt.subplots(1, 5, figsize=(20,10))
for panel, (_, row) in zip(ax, special_offers.iterrows()):
    # detail_desc is already a column of the sample frame, so it is read from the
    # row instead of re-filtering `articles` on every iteration. A missing
    # (non-string) description becomes an empty label rather than crashing on
    # .split().
    desc = row['detail_desc'] if isinstance(row['detail_desc'], str) else ''
    # Append a newline to every word whose index is a positive multiple of 5 so
    # the label wraps under the image.
    desc_list = [
        word + '\n' if j > 0 and j % 5 == 0 else word
        for j, word in enumerate(desc.split(' '))
    ]
    desc = ' '.join(desc_list)

    # Rebuild the zero-padded id used in the image path. Padding to 10 digits
    # gives the same path as the previous hard-coded single '0' prefix for
    # 9-digit ids and stays consistent for shorter ones.
    # NOTE(review): assumes images are stored as <first 3 digits>/<10-digit id>.jpg - confirm.
    article_code = str(int(row['article_id'])).zfill(10)
    image_path = f'../input/h-and-m-personalized-fashion-recommendations/images/{article_code[:3]}/{article_code}.jpg'
    try:
        panel.imshow(mpimg.imread(image_path))
    except FileNotFoundError:
        # Keep the rest of the figure usable when one image file is absent.
        panel.text(0.5, 0.5, 'image not available', ha='center', va='center',
                   transform=panel.transAxes)

    panel.set_title(f'product name: {row.prod_name}')
    panel.set_xticks([])
    panel.set_yticks([])
    panel.grid(False)
    panel.set_xlabel(desc, fontsize=10)
plt.show()

In [None]:
# Show the sampled 'Unknown' garment-group articles side by side: product image,
# product name as the title and the word-wrapped description underneath.
f, ax = plt.subplots(1, 5, figsize=(20,10))
for panel, (_, row) in zip(ax, unknowns.iterrows()):
    # detail_desc is already a column of the sample frame, so it is read from the
    # row instead of re-filtering `articles` on every iteration. A missing
    # (non-string) description becomes an empty label rather than crashing on
    # .split().
    desc = row['detail_desc'] if isinstance(row['detail_desc'], str) else ''
    # Append a newline to every word whose index is a positive multiple of 5 so
    # the label wraps under the image.
    desc_list = [
        word + '\n' if j > 0 and j % 5 == 0 else word
        for j, word in enumerate(desc.split(' '))
    ]
    desc = ' '.join(desc_list)

    # Rebuild the zero-padded id used in the image path. Padding to 10 digits
    # gives the same path as the previous hard-coded single '0' prefix for
    # 9-digit ids and stays consistent for shorter ones.
    # NOTE(review): assumes images are stored as <first 3 digits>/<10-digit id>.jpg - confirm.
    article_code = str(int(row['article_id'])).zfill(10)
    image_path = f'../input/h-and-m-personalized-fashion-recommendations/images/{article_code[:3]}/{article_code}.jpg'
    try:
        panel.imshow(mpimg.imread(image_path))
    except FileNotFoundError:
        # Keep the rest of the figure usable when one image file is absent.
        panel.text(0.5, 0.5, 'image not available', ha='center', va='center',
                   transform=panel.transAxes)

    panel.set_title(f'product name: {row.prod_name}')
    panel.set_xticks([])
    panel.set_yticks([])
    panel.grid(False)
    panel.set_xlabel(desc, fontsize=10)
plt.show()

In [None]:
# Number of articles (non-null article_id) per garment_group_name / section_name pair.
articles.groupby(['garment_group_name', 'section_name'])['article_id'].count()

In [None]:
# Distinct article descriptions, in order of first appearance.
detail = articles['detail_desc'].unique().tolist()

In [None]:
# Look at the first 20 distinct descriptions.
detail[:20]

**Customer data**

In [None]:
# Preview the first rows of the customer table.
customers.head()

In [None]:
# Column dtypes and non-null counts of the customer table.
customers.info()

In [None]:
# Number of distinct customer ids in the customer table.
len(pd.unique(customers['customer_id']))

In [None]:
# Number of distinct (non-null) postal codes.
customers['postal_code'].nunique()

In [None]:
# Distinct values of FN (NaN included if present) before cleaning.
customers['FN'].unique()

In [None]:
# Encode a missing FN flag as 0, then confirm the remaining distinct values.
customers['FN'] = customers['FN'].fillna(0)
customers['FN'].unique()

In [None]:
# Distinct values of Active (NaN included if present) before cleaning.
customers['Active'].unique()

In [None]:
# Encode a missing Active flag as 0, then confirm the remaining distinct values.
customers['Active'] = customers['Active'].fillna(0)
customers['Active'].unique()

In [None]:
# Distinct values of club_member_status (NaN included if present).
customers['club_member_status'].unique()

In [None]:
# Distinct values of fashion_news_frequency (NaN included if present) before cleaning.
customers['fashion_news_frequency'].unique()

In [None]:
# Collapse every fashion_news_frequency value other than 'Regularly' or
# 'Monthly' (missing values included) into the single label 'None'.
is_subscribed = customers['fashion_news_frequency'].isin(['Regularly', 'Monthly'])
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].where(is_subscribed, 'None')
customers['fashion_news_frequency'].unique()

In [None]:
# Summary statistics of customer age.
customers['age'].describe()

In [None]:
# Switch to seaborn's default theme, then plot the age distribution in 50 bins.
sns.set()
sns.histplot(data=customers, x='age', bins=50)

In [None]:
# Number of customers per club_member_status.
sns.countplot(data=customers, x='club_member_status')

In [None]:
# Count customers per fashion_news_frequency; kept as a one-column frame
# (customer_id) for the pie chart.
fnf_pie = customers.groupby('fashion_news_frequency')[['customer_id']].count()
fnf_pie

In [None]:
# Pie chart of the share of customers per fashion_news_frequency.
f, ax = plt.subplots(figsize=(5, 5))
ax.pie(
    fnf_pie['customer_id'],
    labels=fnf_pie.index,
    autopct='%.2f%%',
)
ax.set(xlabel='Distribution of fashion news frequency')
plt.show()

**Transaction data**

In [None]:
# Preview the first rows of the transaction table.
transactions.head()

In [None]:
# Column dtypes of the transaction table before any conversion.
transactions.info()

In [None]:
# Convert t_dat to datetime, parsing with the explicit '%Y-%m-%d' format.
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'], format='%Y-%m-%d')

In [None]:
# Re-check the dtypes to confirm that t_dat is now a datetime column.
transactions.info()

In [None]:
# Number of customers who didn't purchase any articles: distinct ids in the
# customer table minus distinct ids seen in the transactions.
customers_purchased = transactions['customer_id'].unique().tolist()
all_customer_ids = customers['customer_id'].unique()
len(all_customer_ids) - len(customers_purchased)

In [None]:
# Number of articles that weren't purchased: distinct ids in the article table
# minus distinct ids seen in the transactions.
purchased_articles = transactions['article_id'].unique().tolist()
all_article_ids = articles['article_id'].unique()
len(all_article_ids) - len(purchased_articles)

In [None]:
# Number of transaction rows per sales_channel_id (input for the pie chart).
channel_id_pie = transactions.groupby('sales_channel_id').size()
channel_id_pie

In [None]:
# Pie chart of the share of transactions per sales_channel_id.
f, ax = plt.subplots(figsize=(5, 5))
ax.pie(
    channel_id_pie,
    labels=channel_id_pie.index,
    autopct='%.1f%%',
)
ax.set(xlabel='Distribution of sales_channel_id')
plt.show()

**Merge Customer and Transaction data**

In [None]:
# Total price per (customer, date, article) combination.
purchase_keys = ['customer_id', 't_dat', 'article_id']
transactions.groupby(purchase_keys)['price'].sum()

In [None]:
# Same aggregation as a flat table: one row per (customer, date, article) with
# the summed price.
new_transactions = (
    transactions
    .groupby(['customer_id', 't_dat', 'article_id'])['price']
    .sum()
    .reset_index()
)
new_transactions

In [None]:
# Number of transaction rows (non-null t_dat) per customer, as a flat table.
num_of_transaction = (
    transactions
    .groupby('customer_id')['t_dat']
    .count()
    .reset_index()
)

In [None]:
# Give the per-customer count column a descriptive name.
df1 = num_of_transaction.rename({'t_dat': 'num_of_transactions'}, axis='columns')
df1

In [None]:
# Summary statistics of the number of transactions per customer.
df1['num_of_transactions'].describe()

In [None]:
# Box plot of the number of transactions per customer.
sns.boxplot(data=df1, y='num_of_transactions')

In [None]:
# Total amount spent by each customer, as a flat (customer_id, price) table.
# 'price' is selected before aggregating: summing the whole frame would also
# try to sum t_dat (datetime64 after the earlier conversion), which pandas >= 2.0
# rejects with a TypeError, and would needlessly add up the id columns.
sum_of_purchase = transactions.groupby(['customer_id'])['price'].sum().reset_index()
sum_of_purchase

In [None]:
# Rename the summed price to consumption_amount and summarise its distribution.
df2 = sum_of_purchase.rename({'price': 'consumption_amount'}, axis='columns')
df2['consumption_amount'].describe()

In [None]:
# Box plot of the total amount spent per customer.
sns.boxplot(data=df2, y='consumption_amount')

In [None]:
# Combine per-customer transaction counts and total spend into one table;
# customer_id is the only column the two frames share.
df3 = df1.merge(df2, on='customer_id', how='outer')

In [None]:
# Average amount per transaction for each customer.
df3['consumption_mean'] = df3['consumption_amount'].div(df3['num_of_transactions'])
df3

In [None]:
# Attach the purchase aggregates to the customer table (outer join on the
# columns both frames share).
customers1 = customers.merge(df3, how='outer')
customers1

In [None]:
# Rows without purchase aggregates after the merge get 0 for all three measures.
purchase_columns = ['num_of_transactions', 'consumption_amount', 'consumption_mean']
customers1[purchase_columns] = customers1[purchase_columns].fillna(0)

In [None]:
# consumption_amount summary statistics per FN value
# (Active and fashion_news_frequency follow in the next cells).
customers1.groupby(['FN'])['consumption_amount'].describe()

In [None]:
# consumption_amount summary statistics per Active value.
customers1.groupby(['Active'])['consumption_amount'].describe()

In [None]:
# consumption_amount summary statistics per fashion_news_frequency value.
customers1.groupby(['fashion_news_frequency'])['consumption_amount'].describe()

In [None]:
# Label a missing club_member_status as 'None' so it forms its own group.
customers1['club_member_status'] = customers1['club_member_status'].fillna('None')

In [None]:
# consumption_amount summary statistics per club_member_status value.
customers1.groupby(['club_member_status'])['consumption_amount'].describe()

In [None]:
# Bar chart of the mean total spend (consumption_amount) of customers at each age.
data = customers1.groupby(['age'])['consumption_amount'].mean()
x = data.index
y = data.values

fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(x, y)
ax.set_title('consumption  by age')
# Tick every second year across the 16-100 range.
ax.set_xticks(np.arange(16, 101, step=2))
ax.set_xlabel('age')
# The bars show mean consumption_amount, not a transaction count, so the axis
# is labelled accordingly.
ax.set_ylabel('mean consumption_amount')
plt.show()