In [None]:
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

# Introduction

Not all products are equal when it comes to repeat purchases. You're far more likely to rebuy the same pair of socks you bought a few months ago than a wedding dress. This notebook explores what share of the purchases made from 2020 onwards were repurchases of a specific product by the same customer. Only products sold in the last month of the data (from 2020-09-01 on) are included in this analysis, as a simple way of controlling for changes in the product range. The rate is calculated as the number of purchases made after a customer's first purchase of the product, divided by the product's total number of purchases. To keep it simple, the quantity per purchase and first purchases before 2020 are not taken into account. Products with fewer than 20 total purchases in this time period are also discarded. Note that a low rate can also stem from a recent introduction of the product.

In [None]:
# Load the raw tables. article_id is read as str (ids such as '0156231001'
# start with a zero).
transactions = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    dtype={'article_id': str},
)
articles = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/articles.csv',
    dtype={'article_id': str},
)

# Ids of articles with at least one sale on or after 2020-09-01.
recently_sold = transactions.loc[transactions['t_dat'] >= '2020-09-01', 'article_id']

# Keep only those articles, only their purchases from 2020-01-01 on, and
# collapse fully identical rows.
keep_row = transactions['article_id'].isin(recently_sold) & (transactions['t_dat'] >= '2020-01-01')
transactions = transactions[keep_row].drop_duplicates()

In [None]:
# cust_dict:  customer id -> set of article ids that customer has already bought.
# repurchase: article id  -> [repeat_purchases, total_purchases].
cust_dict = {}
repurchase = {}

# Rows are consumed in frame order, so a purchase counts as a repeat only when
# the same customer bought the same article in an earlier row.
# zip() has no length, so the row count is passed explicitly; otherwise tqdm
# can only show a running counter without percentage or ETA.
for customer, article in tqdm(
    zip(transactions.customer_id, transactions.article_id),
    total=len(transactions),
):
    seen_articles = cust_dict.get(customer)
    if seen_articles is None:
        seen_articles = cust_dict[customer] = set()

    counts = repurchase.get(article)
    if counts is None:
        counts = repurchase[article] = [0, 0]

    if article in seen_articles:
        counts[0] += 1
    else:
        seen_articles.add(article)
    counts[1] += 1
        
        

# Flatten the per-article counters ([repeat_purchases, total_purchases]) into
# parallel columns. Dict iteration order is kept as the row order.
art = list(repurchase)
rate = [repeats / total for repeats, total in repurchase.values()]
total_sales = [total for _, total in repurchase.values()]

df = pd.DataFrame({'article': art, 'rate': rate, 'total_purchases': total_sales})
# Discard articles with fewer than 20 purchases in the period.
df = df[df.total_purchases >= 20]

# Product names keyed by article id, used to label both rankings.
prod_names = articles.set_index('article_id')['prod_name']

# join() is a left join on the index: the ranking order and the 'article'
# index name of the left frame are kept, independent of how pd.concat aligns
# (and, on older pandas, sorts) mismatched indexes.
top20 = df.sort_values('rate', ascending=False).head(20).set_index('article').join(prod_names)
bottom20 = df.sort_values('rate').head(20).set_index('article').join(prod_names)

In [None]:
# Index both frames by article id. The guards make the cell safe to re-run:
# once the id column has been moved into the index, set_index would raise
# KeyError on a second execution.
if 'article_id' in articles.columns:
    articles = articles.set_index('article_id')
if 'article' in df.columns:
    df = df.set_index('article')

# Append the rate columns to the article metadata, aligned on the index.
# Columns left over from a previous run are dropped first so that
# re-executing the cell does not duplicate them.
articles = pd.concat([articles.drop(columns=df.columns, errors='ignore'), df], axis=1)

# code for displaying images grabbed from https://www.kaggle.com/negoto/best-selling-items-catalog-like-eda-of-articles
from PIL import Image
def show_images(article_ids, cols=1, rows=-1,
                image_dir="/kaggle/input/h-and-m-personalized-fashion-recommendations/images"):
    """Draw the product photos of the given articles on a grid of subplots.

    Parameters
    ----------
    article_ids : int, str, or iterable of int/str
        One article id or several. Ids are converted to text, left-padded
        with zeros to 10 characters and cut to their last 10 characters.
    cols : int
        Number of grid columns.
    rows : int
        Number of grid rows. A negative value (the default) picks the
        smallest number of rows that fits all articles.
    image_dir : str
        Directory containing ``<first 3 id characters>/<id>.jpg`` files.

    Each panel is titled with the article id. An article whose image cannot
    be opened keeps its titled but empty panel.
    """
    if isinstance(article_ids, (int, str)):
        article_ids = [article_ids]
    article_count = len(article_ids)
    if rows < 0:
        # Ceiling division: `count // cols + 1` would add a spare empty row
        # whenever the count is an exact multiple of `cols`.
        rows = -(-article_count // cols)
    plt.figure(figsize=(3 + 3.5 * cols, 3 + 5 * rows))
    for position, raw_id in enumerate(article_ids, start=1):
        article_id = str(raw_id).zfill(10)[-10:]
        plt.subplot(rows, cols, position)
        plt.axis('off')
        plt.title(article_id)
        image_path = f"{image_dir}/{article_id[:3]}/{article_id}.jpg"
        try:
            # The context manager closes the file once the image is drawn.
            with Image.open(image_path) as image:
                plt.imshow(image)
        except OSError:
            # Best effort: a missing or unreadable image leaves the panel
            # empty. Anything that is not an I/O problem still propagates.
            continue

# Top 20

In [None]:
# Styled table of the top ranking: captioned 'Top 20', rendered with an
# inline display style.
top20.style.set_caption('Top 20').set_table_attributes("style='display:inline'")

In [None]:
# Product photos of the top-ranked articles, five per row.
show_images(list(top20.index), cols=5)

# Bottom 20

In [None]:
# Styled table of the bottom ranking: captioned 'Bottom 20', rendered with an
# inline display style.
bottom20.style.set_caption('Bottom 20').set_table_attributes("style='display:inline'")

In [None]:
# Product photos of the bottom-ranked articles, five per row.
show_images(list(bottom20.index), cols=5)

# 4 Articles over time

In [None]:
def plot_daily_purchases(ax, article_id, title):
    """Plot the number of transaction rows per day for one article on `ax`.

    Rows of `transactions` with the given `article_id` are counted per
    `t_dat` value; the dates are parsed for the x axis. An article without
    any rows produces an empty panel.
    """
    # size() counts rows per date directly, so no helper column has to be
    # added to `transactions` and no other column is aggregated.
    daily = transactions[transactions.article_id == article_id].groupby('t_dat').size()
    ax.plot(pd.to_datetime(daily.index), daily)
    ax.set_title(title)
    ax.set_ylabel('Purchases per day')
    ax.tick_params(axis='x', rotation=45)


# (article id, panel title), in row-major panel order. The rate in each title
# is hard-coded text; it is not recomputed from the data.
articles_over_time = [
    ('0156231001', 'Box 4p Tights: 0.297305'),
    ('0760729003', 'Maj skirt: 0.0'),
    ('0571041001', 'Illaria: 0.283019'),
    ('0919770001', 'Kylie Denim Shirt Dress: 0.0'),
]

fig, axs = plt.subplots(2, 2, figsize=(15, 10))
for ax, (article_id, title) in zip(axs.flat, articles_over_time):
    plot_daily_purchases(ax, article_id, title)
plt.tight_layout()
plt.show()