In [None]:
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Directory that holds the competition CSV files.
DIR = "../input/h-and-m-personalized-fashion-recommendations"

# Load the four tables used throughout the notebook.
articles = pd.read_csv(os.path.join(DIR, "articles.csv"))
customers = pd.read_csv(os.path.join(DIR, "customers.csv"))
transactions = pd.read_csv(os.path.join(DIR, "transactions_train.csv"))
sample_sub = pd.read_csv(os.path.join(DIR, "sample_submission.csv"))

# Report the (rows, columns) shape of every table.
# The labels previously read "artiles" and "cusomters".
print(f"articles data shape: {articles.shape}")
print(f"customers data shape: {customers.shape}")
print(f"transactions data shape: {transactions.shape}")
print(f"sample submission shape: {sample_sub.shape}")

# Preview the first rows of every table.
display(articles.head())
display(customers.head())
display(transactions.head())
display(sample_sub.head())

## Data description  

**Articles**

|column|type|describe|nunique|  
|:---:|:---:|:---:|:---:|  
|article_id|int|id of the article|105542|  
|product_code|int|product code|47224|
|prod_name|object|product name. corresponds to product_code|45875|  
|product_type_no|int|product type number|132|  
|product_type_name|object|product type name. corresponds to product_type_no|131|  
|product_group_name|object|product group name|19|
|graphical_appearance_no|int|number mapped to each product appearance|30|  
|graphical_appearance_name|object|product appearance name. corresponds to graphical_appearance_no|30|  
|colour_group_code|int|code mapped to each specific color|50|  
|colour_group_name|object|specific color name. corresponds to colour_group_code|50|  
|perceived_colour_value_id|int|id mapped to each color theme|8|  
|perceived_colour_value_name|object|color theme. corresponds to perceived_colour_value_id|8|  
|perceived_colour_master_id|int|id mapped to each abstract color|20|  
|perceived_colour_master_name|object|abstract color name. corresponds to perceived_colour_master_id|20|  
|department_no|int|number mapped to each department|299|  
|department_name|object|department name. corresponds to department_no|250|  
|index_code|object|index code|10|  
|index_name|object|index name. corresponds to index_code|10|  
|index_group_no|int|index group number|5|  
|index_group_name|object|index group name. corresponds to index_group_no|5|  
|section_no|int|section number|57|  
|section_name|object|section name|56|  
|garment_group_no|int|garment group number|21|  
|garment_group_name|object|garment group name. corresponds to garment_group_no|21|  
|detail_desc|object(NL)|sentence explaining the article|43404|  


**customers**  

|column|type|describe|nunique|  
|:---:|:---:|:---:|:---:|  
|customer_id|int|id of customers|1371980|  
|FN|float|unknown|1(1.0 or NaN)|  
|Active|float|indicates whether the customer is active|1(1.0 or NaN)|  
|club_member_status|object|indicates whether the customer is a club member|3|  
|fashion_news_frequency|object|how often the customer receives news|4|  
|age|float|the customer age|84|  
|postal_code|object|the customer postal code|352899|   

**transactions**  

|column|type|describe|nunique|
|:---:|:---:|:---:|:---:|  
|t_dat|object|date when the transaction occurred|734(about 2 years)|  
|customer_id|int|customer id|1362281|  
|article_id|int|article id|104547|  
|price|float|the article price|9857|  
|sales_channel_id|int|channel id|2|  

- Some ids and names don't have a one-to-one correspondence. For example, the number of unique "product_type_no" values is 132, but the number of unique "product_type_name" values is 131.  
- The articles a customer buys are expected to differ by customer age, and to be related to what the customer bought most recently.
- The forecast period is the end of September (2020.9.23-2020.9.29). -> I should analyse the same period in the earlier years.  

In [None]:
# Look for the first product type name that is paired with two different
# product type numbers, print both pairings, and stop.
product_dict = {}

type_pairs = zip(articles["product_type_no"].values, articles["product_type_name"].values)
for type_no, type_name in type_pairs:
    # setdefault stores the number the first time a name is seen and returns
    # the stored number on every later occurrence.
    first_no = product_dict.setdefault(type_name, type_no)
    if first_no != type_no:
        print(f"name: {type_name}, no: {first_no}")
        print(f"name: {type_name}, no: {type_no}\n")

        break

In [None]:
# np.unique returns the distinct product type numbers already sorted ascending.
np.unique(articles["product_type_no"])

In [None]:
# Show every article whose product type number is -1.
display(articles.loc[articles["product_type_no"] == -1])

- Rows whose "product_type_name" is umbrella have either 83 or 532 as "product_type_no".  
- A "product_type_no" of -1 indicates that the "product_type_name" is unknown.  

### Preprocessing data  
**add columns**  
- year  
- month  
- day  
- binning age  

**merge data**  
- transactions + product_type_name(articles) + age(customers) + bin_age(customers)  

In [None]:
# Parse the transaction date so its calendar parts are available through `.dt`.
transactions["t_dat"] = pd.to_datetime(transactions["t_dat"])

# Add one column per calendar part of the transaction date: year, month, day.
date_parts = transactions["t_dat"].dt
for part in ("year", "month", "day"):
    transactions[part] = getattr(date_parts, part)

display(transactions.head())

In [None]:
# Attach each article's product type name to its transactions.
# how="left" keeps every transaction even when its article_id has no row in
# `articles`; the default inner join would drop such rows silently.
# validate="many_to_one" raises pandas.errors.MergeError if article_id is
# duplicated in `articles`, which would otherwise multiply transaction rows.
transactions = pd.merge(
    transactions,
    articles[["article_id", "product_type_name"]],
    on="article_id",
    how="left",
    validate="many_to_one",
)
display(transactions.head())

In [None]:
# Attach each customer's age to their transactions.
# how="left" keeps every transaction even when its customer_id has no row in
# `customers`; the default inner join would drop such rows silently.
# validate="many_to_one" raises pandas.errors.MergeError if customer_id is
# duplicated in `customers`, which would otherwise multiply transaction rows.
transactions = pd.merge(
    transactions,
    customers[["customer_id", "age"]],
    on="customer_id",
    how="left",
    validate="many_to_one",
)
display(transactions.head())

In [None]:
# Bin customer ages into decades.
# Edges are 10, 20, ..., 100 and the labels are 1, 2, ..., 9.
bins = list(range(10, 101, 10))
labels = list(range(1, len(bins)))

# right=False closes every bin on the left: label 1 is [10, 20), label 2 is
# [20, 30), ..., label 9 is [90, 100), so label * 10 is the decade of the age.
# With the default right=True the bins were (10, 20], (20, 30], ..., which put
# ages 20, 30, ... into the decade below.
# NOTE(review): an age of exactly 100 now falls outside the bins (NaN) --
# confirm that no such age exists in the data.
customers["bin_age"] = pd.cut(customers["age"], bins=bins, labels=labels, right=False)

# Attach the age bin to the transactions. how="left" keeps every transaction;
# validate="many_to_one" raises pandas.errors.MergeError if customer_id is
# duplicated in `customers`.
transactions = pd.merge(
    transactions,
    customers[["customer_id", "bin_age"]],
    on="customer_id",
    how="left",
    validate="many_to_one",
)
display(transactions.head())

### Analyse by period  
- Plot the popular products of each year.  
- Plot the popular products of each year for the period from 9.23 to 9.29.  
- To analyse another column, replace "product_type_name" with that column.  

In [None]:
# Count how many times each product type was sold in each year.
year_prod_count = transactions.groupby(["year"])["product_type_name"].value_counts()

# Pivot the counts into a table with one row per year and one column per
# product type name.
# unstack moves the product type level into the columns; reindex then fixes the
# row and column layout and fills combinations that were never sold with 0.
# This replaces a cell-by-cell .loc loop over an object-dtype frame and yields
# an integer table.
# NOTE: a year that is not listed in `index` is left out of the table.
year_prod = (
    year_prod_count
    .unstack(level="product_type_name", fill_value=0)
    .reindex(
        index=[2018, 2019, 2020],
        columns=articles["product_type_name"].unique(),
        fill_value=0,
    )
)

display(year_prod)

In [None]:
# One panel per year, each showing the ten best-selling product types.
f, ax = plt.subplots(nrows=3, ncols=1, figsize=(12, 24))
ax = ax.flatten()

for row, year in enumerate(year_prod.index):
    sales = year_prod.loc[year]

    # argsort is ascending: reverse it and keep the first ten positions
    best = year_prod.columns[sales.values.argsort()[::-1][:10]]

    # bar chart of the ten best sellers of this year
    panel = ax[row]
    panel.bar(best, sales[best])
    panel.set_xticklabels(best)
    panel.set_title(f"top 10 product name on {year}")

plt.show()

### Analyse end of September each year  
- We need to predict the products that customers will buy in the next week (from 2020.9.23 to 2020.9.29), so I plot the data for the same period in the earlier years (from 2018.9.23 to 2018.9.29 and from 2019.9.23 to 2019.9.29).  

In [None]:
# Count sales per year and product type for the target period (9.23 to 9.29).
target_prod_count = transactions.query("month == 9 and day >= 23 and day <= 29").groupby(["year"])["product_type_name"].value_counts()

# Pivot the counts into a table with one row per year and one column per
# product type name.
# unstack moves the product type level into the columns; reindex then fixes the
# row and column layout and fills combinations that were never sold with 0.
# This replaces a cell-by-cell .loc loop over an object-dtype frame and yields
# an integer table.
# NOTE: a year that is not listed in `index` is left out of the table.
target_prod = (
    target_prod_count
    .unstack(level="product_type_name", fill_value=0)
    .reindex(
        index=[2018, 2019],
        columns=articles["product_type_name"].unique(),
        fill_value=0,
    )
)

display(target_prod)

In [None]:
# Comparison data for the target period: the days just before it.
# Count sales per year and product type for 9.1 to 9.22.
close_prod_count = transactions.query("month == 9 and day < 23").groupby(["year"])["product_type_name"].value_counts()

# Pivot the counts into a table with one row per year and one column per
# product type name.
# unstack moves the product type level into the columns; reindex then fixes the
# row and column layout and fills combinations that were never sold with 0.
# This replaces a cell-by-cell .loc loop over an object-dtype frame and yields
# an integer table.
# NOTE: a year that is not listed in `index` is left out of the table.
close_prod = (
    close_prod_count
    .unstack(level="product_type_name", fill_value=0)
    .reindex(
        index=[2018, 2019, 2020],
        columns=articles["product_type_name"].unique(),
        fill_value=0,
    )
)

display(close_prod)

In [None]:
def _plot_top_products(axis, counts, title, top_n=10):
    """Draw the `top_n` largest entries of `counts` as a bar chart on `axis`.

    axis:   matplotlib Axes to draw on.
    counts: Series of sales counts indexed by product name.
    title:  title of the panel.
    top_n:  number of bars to draw.
    """
    # argsort is ascending: reverse it and keep the first `top_n` positions
    order = counts.values.argsort()[::-1][:top_n]
    names = counts.index[order]

    # bar() labels the ticks itself when the x values are strings
    axis.bar(names, counts.iloc[order])
    axis.set_title(title)


# Grid of panels: left column = close period (9.1 to 9.22),
# right column = target period (9.23 to 9.29), one row per year.
f, ax = plt.subplots(nrows=3, ncols=2, figsize=(40, 40))
ax = ax.flatten()

# In the flattened grid the left panel of row r is at 2*r, the right at 2*r + 1.
for row, year in enumerate(close_prod.index):
    _plot_top_products(
        ax[2 * row],
        close_prod.loc[year],
        f"top 10 product name on {year}(close period data)",
    )

for row, year in enumerate(target_prod.index):
    _plot_top_products(
        ax[2 * row + 1],
        target_prod.loc[year],
        f"top 10 product name on {year}(target period data)",
    )

# `target_prod` can have fewer years than `close_prod`; hide every panel that
# received no bars instead of showing an empty frame.
for axis in ax:
    if not axis.has_data():
        axis.set_visible(False)

plt.show()

### Analyse by customers  
- Plot the popular products for each customer group (customers are divided by age bin).  

In [None]:
# Age distribution of the registered customers (one value per row of `customers`).
# Missing ages are dropped explicitly instead of relying on how hist() treats NaN.
plt.figure(figsize=(10, 10))
plt.hist(customers["age"].dropna())
plt.title("registered customers distribution")
plt.xlabel("age")
plt.ylabel("number of customers")
plt.show()

In [None]:
# Age distribution over the transaction rows: a customer contributes one value
# per transaction, so this shows which ages shop most often.
shop_fig = plt.figure(figsize=(10, 10))
shop_ax = shop_fig.add_subplot()
shop_ax.hist(transactions["age"])
shop_ax.set_title("shop customers distribution")
plt.show()

In [None]:
# Count how many times each product type was sold in each age bin.
# groupby leaves out rows whose bin_age is NaN.
customer_prod_count = transactions.groupby(["bin_age"])["product_type_name"].value_counts()

# Age bins that occur among the customers, as a sorted array of their labels.
age_bins = np.sort(np.array(customers["bin_age"].unique().dropna()))

# Pivot the counts into a table with one row per age bin and one column per
# product type name. This replaces a cell-by-cell .loc loop over an
# object-dtype frame and yields an integer table.
customer_prod = customer_prod_count.unstack(level="product_type_name", fill_value=0)

# bin_age is categorical (it comes from pd.cut); convert the row labels to the
# plain dtype of `age_bins` so that reindex matches them by value.
customer_prod.index = customer_prod.index.astype(age_bins.dtype)

# Fix the row and column layout; combinations that were never sold become 0.
customer_prod = customer_prod.reindex(
    index=age_bins,
    columns=articles["product_type_name"].unique(),
    fill_value=0,
)

display(customer_prod)

In [None]:
# One panel per age bin, each showing the ten best-selling product types.
f, ax = plt.subplots(nrows=9, ncols=1, figsize=(12, 24))
ax = ax.flatten()

# collects every product name that reaches the top 10 of at least one age bin
top_products = set()

for row, bin_age in enumerate(customer_prod.index):
    sales = customer_prod.loc[bin_age]

    # argsort is ascending: reverse it and keep the first ten positions
    best = customer_prod.columns[sales.values.argsort()[::-1][:10]]
    top_products.update(best.tolist())

    # bar chart of the ten best sellers of this age bin
    panel = ax[row]
    panel.bar(best, sales[best])
    panel.set_xticklabels(best)
    panel.set_title(f"top 10 product name for {int(bin_age)*10}'s")

plt.tight_layout()
plt.show()

In [None]:
# Number of distinct product names collected in `top_products`.
print(len(top_products), "\n")

# A set of strings has no stable iteration order between kernel runs, so
# print the names sorted to keep the output reproducible.
for name in sorted(top_products):
    print(name)