In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Directory holding the competition CSVs. Change this single constant to run the
# notebook against a different copy of the data.
DATA_DIR = '/kaggle/input/h-and-m-personalized-fashion-recommendations'

# Load the four raw competition tables.
articles_data = pd.read_csv(f'{DATA_DIR}/articles.csv')
customers_data = pd.read_csv(f'{DATA_DIR}/customers.csv')
submission_data = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
trans_data = pd.read_csv(f'{DATA_DIR}/transactions_train.csv')

> **Feature Engineering of Customer Data**
> - Check club_member_status Data and take 'ACTIVE' Only 
> - Check Missing Value and Fill Missing Value of Age Data
> - Convert type of Age from float64 to int64

In [None]:
# Distribution of membership status. dropna=False also shows how many customers
# have no status at all; those rows never match an == 'ACTIVE' comparison.
customers_data['club_member_status'].value_counts(dropna=False)

In [None]:
# Keep only ACTIVE club members. .copy() makes the result an independent frame, so
# modifying it later neither triggers SettingWithCopyWarning nor depends on whether
# pandas handed back a view of customers_data.
customers_data_new = customers_data.loc[customers_data['club_member_status'] == 'ACTIVE'].copy()

In [None]:
# Preview the first five rows of the ACTIVE-only customer table.
customers_data_new.head(5)

> *Only `customer_id` and `age` are needed in `customers_data_new`.*

In [None]:
# Remove the columns this analysis does not use and renumber the rows from 0.
# Reassigning the result (instead of inplace=True) avoids SettingWithCopyWarning on
# the filtered frame; errors='ignore' lets the cell be re-run once the columns are
# already gone.
customers_data_new = (
    customers_data_new
    .drop(columns=['FN', 'Active', 'club_member_status', 'fashion_news_frequency'],
          errors='ignore')
    .reset_index(drop=True)
)

In [None]:
# Preview the customer table after the unused columns were removed.
customers_data_new.head(5)

In [None]:
# Drop postal_code as well and renumber the rows from 0.
# Reassigning (not inplace=True) avoids SettingWithCopyWarning on the filtered frame,
# and errors='ignore' keeps the cell re-runnable after the column has been removed.
customers_data_new = (
    customers_data_new
    .drop(columns=['postal_code'], errors='ignore')
    .reset_index(drop=True)
)
customers_data_new.head(5)

Check the memory usage of `customers_data_new`.

In [None]:
# Column dtypes, non-null counts and memory footprint. memory_usage='deep' measures
# the actual size of object (string) columns instead of reporting a lower-bound
# estimate.
customers_data_new.info(memory_usage='deep')

> **Feature Engineering of Articles Data** \
> *Use `prod_name`, `product_type_name` and `product_group_name` as the article attributes for the EDA of the 2019 H&M transactions.*

In [None]:
# Preview the first five rows of the raw articles table.
articles_data.head(5)

In [None]:
# Summary statistics for the three descriptive article columns used in the EDA.
article_text_cols = ['prod_name', 'product_type_name', 'product_group_name']
articles_data[article_text_cols].describe()

In [None]:
# Independent copy of the articles table restricted to the join key (article_id)
# and the three descriptive columns used in the EDA.
article_eda_cols = ['article_id', 'prod_name', 'product_type_name', 'product_group_name']
articles_data_new = articles_data.loc[:, article_eda_cols].copy()

In [None]:
# Preview the reduced articles table.
articles_data_new.head(5)

In [None]:
# Number of missing values in each column of the reduced articles table.
articles_data_new.isna().sum()

In [None]:
# Column dtypes, non-null counts and memory footprint of the reduced articles table.
articles_data_new.info()

> **Feature Engineering of Transactional Data** 
> - *Drop `sales_channel_id`, which is not used in the EDA of the 2019 H&M transactions*
> - Extract the day name, month and year from `t_dat` as attributes for the EDA
> - Add a new column holding the season of each purchase

In [None]:
# Column dtypes of the raw transactions, inspected before t_dat is converted.
trans_data.dtypes

In [None]:
# Parse the transaction date into a datetime column so the .dt accessor can be used.
# pd.to_datetime is used instead of astype('datetime64'): the unit-less dtype string
# is rejected with a TypeError by pandas 2.x.
trans_data['t_dat'] = pd.to_datetime(trans_data['t_dat'])
trans_data.dtypes

In [None]:
# Derive calendar attributes from the transaction date: weekday name, month number
# and year, each stored as its own column.
t_dat_parts = trans_data['t_dat'].dt
trans_data['day_trans'] = t_dat_parts.day_name()
trans_data['month_trans'] = t_dat_parts.month
trans_data['year_trans'] = t_dat_parts.year

> ***Now keep only the 2019 purchase transactions for the EDA and prediction.***

In [None]:
# Restrict the transactions to those dated in 2019.
is_2019 = trans_data['year_trans'] == 2019
sample_trans_data = trans_data.loc[is_2019]

In [None]:
# Number of missing values per column in the 2019 transactions.
sample_trans_data.isna().sum()

> ***Now exclude (drop) `t_dat` and `sales_channel_id` from the sample dataset.***

In [None]:
# Remove the raw date (already split into day/month/year columns) and the sales
# channel, then renumber the rows from 0.
# The result is reassigned rather than dropped in place: sample_trans_data is a
# filtered selection of trans_data, so an in-place drop raises SettingWithCopyWarning.
# errors='ignore' keeps the cell re-runnable once the columns are gone.
sample_trans_data = (
    sample_trans_data
    .drop(columns=['t_dat', 'sales_channel_id'], errors='ignore')
    .reset_index(drop=True)
)

In [None]:
# Re-check missing values per column after t_dat and sales_channel_id were dropped.
sample_trans_data.isna().sum()

In [None]:
# Column dtypes, non-null counts and memory footprint of the 2019 transactions.
sample_trans_data.info()

In [None]:
# Preview the first five rows of the 2019 transactions.
sample_trans_data.head(5)

> ***AND NOW, Build EDA of Data Customers***

In [None]:
# Histogram of customer age on a white-grid seaborn theme.
sns.set_style('whitegrid')
customers_data_new['age'].plot.hist()

> *Build a new attribute, the **age interval (age group)**, for the next step of the customer EDA of the H&M transactions.*

In [None]:
# Ten right-closed brackets: (0, 10], (10, 20], ..., (90, 100].
interval_range_age = pd.interval_range(start=0, end=100, freq=10)

# Attach each customer's age bracket. Ages that are missing or fall outside
# (0, 100] get NaN. .assign returns a new frame, so this neither writes into a
# filtered slice (SettingWithCopyWarning) nor changes result when the cell is re-run.
customers_data_new = customers_data_new.assign(
    age_group=pd.cut(customers_data_new['age'], bins=interval_range_age)
)
customers_data_new.head(5)

In [None]:
# Missing values per column. age_group is NaN wherever age is missing or lies
# outside the (0, 100] bins used by pd.cut.
customers_data_new.isna().sum()

> ***Let's combine the 2019 purchase transactions with the customer data for the EDA of customer purchases in 2019.***

In [None]:
# Left join: every 2019 transaction row is kept. Customers that are absent from
# customers_data_new (it holds ACTIVE members only) end up with NaN in the
# customer columns.
purchases_2019 = pd.merge(
    sample_trans_data,
    customers_data_new,
    how='left',
    on='customer_id',
)

In [None]:
# Rows of purchases_2019 per age bracket. count() tallies rows with a non-null
# customer_id, i.e. purchases rather than distinct customers.
# observed=False is stated explicitly so empty brackets stay on the axis and pandas
# does not warn about the changing default for categorical groupers.
customers_temp = purchases_2019.groupby('age_group', observed=False)['customer_id'].count()
data_temp_customer = pd.DataFrame({
    'Group Age': customers_temp.index,
    'Customers': customers_temp.values,
})
data_temp_customer = data_temp_customer.sort_values('Group Age', ascending=False)

plt.figure(figsize=(7, 7))
plt.title('Group Age')
sns.set_color_codes('pastel')
s = sns.barplot(x='Group Age', y='Customers', data=data_temp_customer)
plt.xticks(rotation=45)  # tilt the interval labels so they do not overlap
plt.show()  # must be called; a bare `plt.show` only echoes the function object

> ***Take the age group with the most purchases, (20, 30], for the EDA of transaction day and season.***

In [None]:
# Purchases per weekday for the most frequent age group.
# mode()[0] is the age bracket that occurs on the most rows of purchases_2019.
top_age_group = purchases_2019['age_group'].mode()[0]
most_age_group_transaction = purchases_2019[purchases_2019['age_group'] == top_age_group]

# count() tallies purchase rows per weekday name (not distinct customers).
customers_temp_most = most_age_group_transaction.groupby('day_trans')['customer_id'].count()
data_temp_customer_most = pd.DataFrame({
    'Day Transaction': customers_temp_most.index,
    'Customers': customers_temp_most.values,
})
# Busiest weekday first.
data_temp_customer_most = data_temp_customer_most.sort_values('Customers', ascending=False)

plt.figure(figsize=(7, 7))
plt.title('Day Transaction of Most Age Group Customers')
sns.set_color_codes('pastel')
s = sns.barplot(x='Day Transaction', y='Customers', data=data_temp_customer_most)
plt.show()

> *Build a new attribute, the **season of the transaction**, for the next step of the customer EDA of the H&M transactions.*

In [None]:
# New column 'Seasons': the month number is binned into calendar quarters and each
# quarter is labelled with a season. The bins are right-closed, so
# months 1-3 -> Winter, 4-6 -> Spring, 7-9 -> Summer, 10-12 -> Autumn.
season_edges = [0, 3, 6, 9, 12]
season_names = ['Winter', 'Spring', 'Summer', 'Autumn']
purchases_2019['Seasons'] = pd.cut(
    purchases_2019['month_trans'],
    bins=season_edges,
    labels=season_names,
)
purchases_2019.head(5)

In [None]:
# Purchases per season for the most frequent age group.
# The subset is rebuilt here so that it includes the 'Seasons' column.
top_age_group = purchases_2019['age_group'].mode()[0]
most_age_group_transaction = purchases_2019[purchases_2019['age_group'] == top_age_group]

# 'Seasons' is categorical: observed=False is stated explicitly so every season is
# kept (with 0 if empty) and pandas does not warn about the changing default.
customers_temp_most = (
    most_age_group_transaction.groupby('Seasons', observed=False)['customer_id'].count()
)
data_temp_customer_most = pd.DataFrame({
    'Seasons Transaction': customers_temp_most.index,
    'Customers': customers_temp_most.values,
})
data_temp_customer_most = data_temp_customer_most.sort_values('Customers', ascending=False)

plt.figure(figsize=(7, 7))
plt.title('Seasons Transaction of Most Age Group Customers')
sns.set_color_codes('pastel')
s = sns.barplot(x='Seasons Transaction', y='Customers', data=data_temp_customer_most)
plt.show()

> **EDA of Customer attribute is done**, \
> This conclusion:
> 

> ***Let's combine the 2019 purchase transactions and customer data with the articles data for the EDA of article purchases in 2019.***

In [None]:
# Preview the merged transactions + customers table before trimming it.
purchases_2019.head(5)

> *Drop the `price` and `age` columns: they are not used in the article EDA, and removing them reduces the memory usage of the processing.*

In [None]:
# Drop the columns the article EDA does not need and renumber the rows from 0.
# The result is reassigned with errors='ignore' so that re-running the cell after
# the columns are already gone does not raise a KeyError.
purchases_2019 = (
    purchases_2019
    .drop(columns=['price', 'age'], errors='ignore')
    .reset_index(drop=True)
)

In [None]:
# Column dtypes, non-null counts and memory footprint after the drop.
# memory_usage='deep' measures the real size of object (string) columns, so the
# effect of removing columns on memory is reported accurately.
purchases_2019.info(memory_usage='deep')

In [None]:
# Attach the article attributes to every purchase row (left join on article_id).
# NOTE: this rebinds purchases_2019, so running the cell a second time merges the
# article columns again and pandas will suffix the duplicates with _x / _y.
purchases_2019 = pd.merge(
    purchases_2019,
    articles_data_new,
    how='left',
    on='article_id',
)

In [None]:
# Preview the purchases table now that the article columns are attached.
purchases_2019.head(5)

In [None]:
# prod_name: the 15 product names with the most purchase rows in 2019.
# count() tallies rows with a non-null customer_id, i.e. purchases rather than
# distinct customers.
articles_temp_data = purchases_2019.groupby('prod_name')['customer_id'].count()
data_temp_articles = pd.DataFrame({
    'Product Name': articles_temp_data.index,
    'Customers': articles_temp_data.values,
})
data_temp_articles = data_temp_articles.sort_values('Customers', ascending=False).head(15)

plt.figure(figsize=(7, 7))
plt.title('Top 15 Product Name in 2019 Transaction')
sns.set_color_codes('pastel')
s = sns.barplot(x='Product Name', y='Customers', data=data_temp_articles)
plt.xticks(rotation=90)  # vertical labels: product names are long
plt.show()  # must be called; a bare `plt.show` only echoes the function object


In [None]:
# product_type_name: the 15 product types with the most purchase rows in 2019.
# count() tallies rows with a non-null customer_id, i.e. purchases rather than
# distinct customers.
articles_temp_data = purchases_2019.groupby('product_type_name')['customer_id'].count()
data_temp_articles = pd.DataFrame({
    'Product Type': articles_temp_data.index,
    'Customers': articles_temp_data.values,
})
data_temp_articles = data_temp_articles.sort_values('Customers', ascending=False).head(15)

plt.figure(figsize=(7, 7))
plt.title('Top 15 Product Type in 2019 Transaction')
sns.set_color_codes('pastel')
s = sns.barplot(x='Product Type', y='Customers', data=data_temp_articles)
plt.xticks(rotation=90)  # vertical labels so the type names do not overlap
plt.show()  # must be called; a bare `plt.show` only echoes the function object

In [None]:
# product_group_name: up to 15 product groups with the most purchase rows in 2019.
# count() tallies rows with a non-null customer_id, i.e. purchases rather than
# distinct customers.
articles_temp_data = purchases_2019.groupby('product_group_name')['customer_id'].count()
data_temp_articles = pd.DataFrame({
    'Product Group': articles_temp_data.index,
    'Customers': articles_temp_data.values,
})
data_temp_articles = data_temp_articles.sort_values('Customers', ascending=False).head(15)

plt.figure(figsize=(7, 7))
plt.title('Top 15 Product Group in 2019 Transaction')
sns.set_color_codes('pastel')
s = sns.barplot(x='Product Group', y='Customers', data=data_temp_articles)
plt.xticks(rotation=90)  # vertical labels so the group names do not overlap
plt.show()  # must be called; a bare `plt.show` only echoes the function object

In [None]:
# Final look at the combined purchases table that the EDA above was built on.
purchases_2019.head(5)

In [None]:
#PREDICTION 