In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from collections import Counter, defaultdict
from PIL import Image
from pathlib import Path
path = Path("/kaggle/input/h-and-m-personalized-fashion-recommendations/")

def show_images(article_ids, cols=1, rows=-1):
    """Plot the product images of the given articles on a grid of subplots.

    Parameters
    ----------
    article_ids : int, str, or sequence of int/str
        A single article id or several of them. Every id is converted to a
        string, prefixed with "0" and cut to its last 10 characters; the
        result is used both as the subplot title and as the image file name.
    cols : int
        Number of grid columns.
    rows : int
        Number of grid rows. A negative value (the default) derives the
        smallest number of rows that holds all articles.

    Notes
    -----
    An article whose image file cannot be opened keeps an empty, titled cell.
    """
    # A lone id (including numpy integers pulled out of a DataFrame) is
    # handled as a one-element list.
    if isinstance(article_ids, (int, str, np.integer)):
        article_ids = [article_ids]
    article_count = len(article_ids)
    if rows < 0:
        # Ceiling division. The former `count // cols + 1` appended an empty
        # row whenever the count was an exact multiple of `cols`.
        rows = max(1, -(-article_count // cols))
    plt.figure(figsize=(3 + 3.5 * cols, 3 + 5 * rows))
    for i in range(article_count):
        article_id = ("0" + str(article_ids[i]))[-10:]
        plt.subplot(rows, cols, i + 1)
        plt.axis('off')
        plt.title(article_id)
        # Images are stored in sub-folders named after the first three
        # characters of the padded id.
        image_file = path / "images" / article_id[:3] / f"{article_id}.jpg"
        try:
            image = Image.open(image_file)
        except OSError:
            # Missing or unreadable image: leave the cell blank. Anything
            # else (e.g. a plotting error) is no longer silently swallowed.
            continue
        plt.imshow(image)

# Article metadata; article ids are read as strings.
articles = pd.read_csv(path / "articles.csv", dtype={"article_id": str})

# Only these three transaction columns are used in this notebook, so the
# remaining ones are never loaded into memory.
TRANSACTION_COLUMNS = ["t_dat", "article_id", "sales_channel_id"]
train = pd.read_csv(
    path / "transactions_train.csv",
    usecols=TRANSACTION_COLUMNS,
    dtype={"article_id": str},
)[TRANSACTION_COLUMNS]
train["t_dat"] = pd.to_datetime(train["t_dat"])

# Keep sales channel 2 only (the notebook's overview describes this as the
# restriction to online transactions), ordered newest-first within each article.
train = (
    train[train["sales_channel_id"] == 2]
    .sort_values(["article_id", "t_dat"], ascending=False)
)

In [None]:
# --- Total number of sales per article ---------------------------------------
# value_counts + map replaces a Python-level loop over every article row.
# Articles without any transaction get 0; the column stays float64, the dtype
# produced by the earlier element-wise construction.
sales_counts = train["article_id"].value_counts()
articles["sales_count"] = (
    articles["article_id"].map(sales_counts).fillna(0).astype("float64")
)

# --- Latest / earliest purchase date and the span between them ---------------
# Values are matched on article_id instead of relying on row position.
# Articles without transactions get NaT dates and a NaN period.
period_df = train.groupby("article_id")["t_dat"].agg(["max", "min"])
articles["latest"] = articles["article_id"].map(period_df["max"])
articles["earliest"] = articles["article_id"].map(period_df["min"])
# Whole days between the earliest and the latest purchase.
articles["period"] = (
    (articles["latest"] - articles["earliest"]).dt.total_seconds() // (60 * 60 * 24)
)

# --- Monthly sales: one integer-labelled column per month --------------------
# Each transaction gets a YYYYMM key, e.g. 2019-08-17 -> 201908.
month_key = train["t_dat"].dt.year * 100 + train["t_dat"].dt.month

year_month = 201809
while year_month <= 202009:
    monthly_sales = train.loc[month_key == year_month, "article_id"].value_counts()
    articles[year_month] = (
        articles["article_id"].map(monthly_sales).fillna(0).astype("int64")
    )

    print("\r Done :", year_month, end="")
    # Step to the next month; December rolls over to January of the next year.
    year_month = (year_month + 100 - 11) if year_month % 100 == 12 else (year_month + 1)

# NOTE(review): iloc[:, 5:] drops the first five columns, including column 0,
# which plot_cluster reads as article_id. Rows of the exported file can then
# only be matched back through the saved index - confirm this is intended.
articles.iloc[:, 5:].to_csv("articles_sales_extension.csv")

# Overview

*Modification: I have limited the data to online transactions.*

In this notebook, I would like to think about the sales cycle and the seasonality of items.  
In the H&M competition, the following two points both seem to be important:  
1. Articles that sold well in the last week are more likely to sell well in the next week than articles that sold well over the last year but not in the last week.  
2. A certain number of customers buy the same items repeatedly at H&M stores.  

These two points may have something to do with the sales cycle.  
This is because articles that are not in stock in stores will not be sold well the following week, nor can they be sold repeatedly.  

I hope something in this notebook could be helpful to you.  

# Sales Period

The figures below show the distribution of  
- the earliest purchase date of each item (after Sept. 20, 2018)  
- the latest purchase date of each item (before Sept. 22, 2020)  
- the sales period calculated as the number of days between the earliest and the latest purchase dates

In [None]:
# Side-by-side histograms of the earliest and the latest purchase date,
# stacked by index group; articles whose period equals 0 are left out.
articles_with_period = articles.query("period != 0")
fig, (ax_earliest, ax_latest) = plt.subplots(1, 2, figsize=(16, 6))
sns.histplot(
    data=articles_with_period,
    x="earliest",
    hue="index_group_name",
    multiple="stack",
    ax=ax_earliest,
)
sns.histplot(
    data=articles_with_period,
    x="latest",
    hue="index_group_name",
    multiple="stack",
    ax=ax_latest,
)

In [None]:
# Histogram of the sales period in days, stacked by index group;
# articles whose period equals 0 are left out.
plt.figure(figsize=(8, 6))
sns.histplot(
    data=articles[articles["period"] != 0],
    x="period",
    hue="index_group_name",
    multiple="stack",
)

### Sales Periods of the Monthly Best Selling Items

In [None]:
def plot_sales(article_id, imshow=False):
    """Draw a bar chart of one article's monthly sales.

    Parameters
    ----------
    article_id : str
        Article id exactly as stored in ``articles.article_id``.
    imshow : bool
        When True, the article's product image is shown as well.

    Raises
    ------
    ValueError
        If ``article_id`` does not occur in ``articles``.
    """
    plot_df = articles.query(f"article_id == '{article_id}'")
    if plot_df.empty:
        # Fail with a clear message instead of an opaque indexing error.
        raise ValueError(f"Unknown article_id: {article_id!r}")
    row = plot_df.iloc[0]

    plt.figure(figsize=(24, 1.5))
    # Columns from position 29 onwards are plotted as the monthly counts
    # (same slice that the clustering cells use).
    monthly = row.iloc[29:]
    sns.barplot(x=monthly.index, y=monthly.tolist(), palette=sns.husl_palette(12))
    # Dates are looked up by column name rather than by magic position;
    # only the YYYY-MM-DD part of each timestamp goes into the title.
    plt.title(" ".join(["Monthly Sales of ID :", article_id, "    earliest :", str(row["earliest"])[:10], "    latest :", str(row["latest"])[:10]]))
    if imshow:
        # The previous code indexed with an undefined name `loc` here and
        # raised NameError whenever imshow=True.
        show_images(article_id)

In [None]:
# The 100 articles with the highest September 2020 sales (ties broken by the
# longer sales period), split by how long they have been selling.
temp_df = articles.sort_values(by=[202009, "period"], ascending=False).iloc[:100]
longsellers = temp_df[temp_df["period"] > 300]
newitems = temp_df[temp_df["period"] <= 30]
temp_df.loc[:, ["article_id", "product_type_name", "colour_group_name", "period"]].iloc[:30]

# - *Longtime Sellers*

Some of the monthly best selling items are *longtime sellers*, i.e., they sold well throughout the whole training period.  
Many of them are popular product types and have popular dark colors.

In [None]:
# Image gallery of up to 20 longtime sellers, then monthly sales of the first 5.
longseller_ids = longsellers["article_id"].tolist()
show_images(longseller_ids[:20], 10)
for article_id in longseller_ids[:5]:
    plot_sales(article_id)

# - *New Items*

Another part of the monthly best selling items are *new items*, i.e., they were just launched around September 2020.  
It seems to me that many of them are popular seasonal articles and have light colors.

In [None]:
# Image gallery of up to 20 new items, then monthly sales of the first 5.
newitem_ids = newitems["article_id"].tolist()
show_images(newitem_ids[:20], 10)
for article_id in newitem_ids[:5]:
    plot_sales(article_id)

# - *Summer Clothing*

Some articles that sold well in August did not sell at all in September.  
They seem to be *summer items*, and probably disappeared from the stores by the end of August.  

2020

In [None]:
# Articles with more than 500 sales in August 2020 but fewer than 100 in
# September 2020, strongest August sellers first.
sold_in_august = articles[202008] > 500
faded_in_september = articles[202009] < 100
temp_df = articles[sold_in_august & faded_in_september].sort_values(by=[202008], ascending=False)
summer_ids = temp_df["article_id"].tolist()
show_images(summer_ids[:20], 10)
for article_id in summer_ids[:5]:
    plot_sales(article_id)

2019

In [None]:
# Articles with more than 400 sales in August 2019 but fewer than 100 in
# September 2019, strongest August sellers first.
sold_in_august = articles[201908] > 400
faded_in_september = articles[201909] < 100
temp_df = articles[sold_in_august & faded_in_september].sort_values(by=[201908], ascending=False)
summer_ids = temp_df["article_id"].to_numpy()
show_images(summer_ids[:20], 10)
for article_id in summer_ids[:5].tolist():
    plot_sales(article_id)

# - *Autumn Clothing*

Conversely, a number of items only started selling in September.  
Probably we can call them *autumn clothing*.  
They tend to have autumn-like colours.

2019

In [None]:
# Articles with fewer than 100 sales in August 2019 but more than 500 in
# September 2019, strongest September sellers first.
quiet_in_august = articles[201908] < 100
sold_in_september = articles[201909] > 500
temp_df = articles[quiet_in_august & sold_in_september].sort_values(by=[201909], ascending=False)
autumn_ids = temp_df["article_id"].tolist()
show_images(autumn_ids[:20], 10)
for article_id in autumn_ids[:5]:
    plot_sales(article_id)

# K-means Clustering by Monthly Sales

The figures below show the results of K-means clustering based on monthly sales of each article.

In [None]:
# Order articles by total sales so that the head/tail of each cluster shown
# later are its least / most sold members.
articles = articles.sort_values(by="sales_count")

from sklearn import cluster
n_clusters = 9
# A fixed seed keeps the cluster numbering stable between runs; without it the
# cells below would show a different group under the same cluster number on
# every execution.
model = cluster.KMeans(n_clusters=n_clusters, random_state=0)
# Features: the columns from position 29 onwards (the monthly sales counts).
model.fit(articles.iloc[:, 29:])

def plot_cluster(k, n=10):
    """Visualise cluster ``k`` of the fitted K-means ``model``.

    Draws a bar chart of the cluster's mean monthly sales, followed by two
    image grids: the first ``n`` and the last ``n`` member articles in the
    current row order of ``articles``.

    Returns the cluster's rows restricted to column 0 and columns 29-53.
    """
    members = articles[model.labels_ == k]
    monthly_mean = members.iloc[:, 29:].mean()

    plt.figure(figsize=(24, 1.5))
    sns.barplot(
        x=monthly_mean.index,
        y=monthly_mean.tolist(),
        palette=sns.husl_palette(12),
    )
    plt.title(" ".join(["Mean Monthly Sales of Cluster :", str(k)]))

    member_ids = members["article_id"].tolist()
    show_images(member_ids[:n], 10)
    show_images(member_ids[-n:], 10)

    kept_columns = [0, *range(29, 54)]
    return members.iloc[:, kept_columns]

In [None]:
# One loop replaces nine copy-pasted cells. Clusters are shown in the original
# order: 1 .. n_clusters - 1 first, cluster 0 last.
for k in [*range(1, n_clusters), 0]:
    temp_df = plot_cluster(k, 20)
    # Monthly sales of the first three and the last three members of the cluster.
    for ID in list(temp_df.head(3).article_id): plot_sales(ID)
    for ID in list(temp_df.tail(3).article_id): plot_sales(ID)
    # Render this cluster's figures now so that open figures do not
    # accumulate across all clusters.
    plt.show()