# Kaggle Setup

In [None]:
! pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
mkdir ~/.kaggle

In [None]:
# Copy kaggle.json from the working directory into ~/.kaggle/
# (presumably the Kaggle API token read by the `kaggle` CLI -- confirm).
! cp kaggle.json ~/.kaggle/

In [None]:
# Restrict kaggle.json to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the tbierhance/hm-fashion-recommendation-parquet dataset archive with the kaggle CLI.
! kaggle datasets download -d tbierhance/hm-fashion-recommendation-parquet

Downloading hm-fashion-recommendation-parquet.zip to /content
 96% 234M/244M [00:05<00:00, 91.0MB/s]
100% 244M/244M [00:05<00:00, 46.1MB/s]


In [None]:
!unzip hm-fashion-recommendation-parquet.zip

Archive:  hm-fashion-recommendation-parquet.zip
  inflating: articles.parquet        
  inflating: customer_ids.parquet    
  inflating: customers.parquet       
  inflating: sales.parquet           
  inflating: sample_submission.parquet  


# Data Upload

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from tqdm.notebook import tqdm

In [None]:
# Load the article, sales and customer tables from the extracted parquet files.
articles, sales, customers = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('articles.parquet', 'sales.parquet', 'customers.parquet')
)

# Some EDA

In [None]:
# Weekly unit sales per article, ordered by week so that the per-article diff
# below is the number of weeks since the article was previously sold.
sales_per_week = (
    sales
    .groupby(['article_id', 'week'], as_index=False)
    .agg(unit_sales=('price', 'size'))
    .sort_values('week')
)
# An article's first recorded week has no predecessor (NaN); store 0 so it
# lands in the 'new' bin.
sales_per_week['last_purchase_week'] = (
    sales_per_week.groupby('article_id')['week'].diff().fillna(0)
)
# Left-closed bins: [0, 1) -> 'new', [1, 2) -> '1 week before', ..., [5, 105) -> '>=5 weeks before'.
article_gap_edges = [0, 1, 2, 3, 4, 5, 105]
article_gap_labels = [
    'new',
    '1 week before',
    '2 weeks before',
    '3 weeks before',
    '4 weeks before',
    '>=5 weeks before',
]
sales_per_week['last_purchase_week_bin'] = pd.cut(
    sales_per_week['last_purchase_week'],
    bins=article_gap_edges,
    labels=article_gap_labels,
    right=False,
)

In [None]:
# number of unique articles per week
# Aggregate per (week, recency bin): total units sold and distinct article count.
# NOTE: this rebinds `sales_per_week` to the aggregated frame, which no longer
# has an `article_id` column.
article_bin_groups = sales_per_week.groupby(['week', 'last_purchase_week_bin'])
sales_per_week = article_bin_groups.agg(
    unit_sales=('unit_sales', 'sum'),
    article_count=('article_id', pd.Series.nunique),
).reset_index()
# Each bin's share (in %) of the total units sold in that week.
weekly_unit_totals = sales_per_week.groupby('week')['unit_sales'].transform('sum')
sales_per_week['unit_sales_pct'] = sales_per_week['unit_sales'] / weekly_unit_totals * 100
px.bar(
    sales_per_week,
    x='week',
    y='article_count',
    color='last_purchase_week_bin',
    title='Number of unique articles per week',
)

In [None]:
# Bar chart of weekly unit sales, coloured by the recency bin of the articles sold.
px.bar(
    sales_per_week,
    x='week',
    y='unit_sales',
    color='last_purchase_week_bin',
    title='Unit sales per week grouped by week of last purchase',
)

In [None]:
# To further note
# Average the weekly figures per recency bin over weeks > 5, then accumulate
# the sales share and the article count across the bins.
tmp = (
    sales_per_week.loc[sales_per_week['week'] > 5]
    .groupby('last_purchase_week_bin')
    .mean()
    .reset_index()
    .assign(
        unit_sales_pct_cum=lambda frame: frame['unit_sales_pct'].cumsum(),
        article_count_cum=lambda frame: frame['article_count'].cumsum().astype('int'),
    )
)
article_summary_cols = [
    'last_purchase_week_bin',
    'unit_sales_pct',
    'unit_sales_pct_cum',
    'article_count_cum',
]
tmp[article_summary_cols]

Unnamed: 0,last_purchase_week_bin,unit_sales_pct,unit_sales_pct_cum,article_count_cum
0,new,2.422093,2.422093,703
1,1 week before,95.17606,97.598153,17034
2,2 weeks before,1.194241,98.792394,18711
3,3 weeks before,0.413603,99.205997,19405
4,4 weeks before,0.211029,99.417026,19776
5,>=5 weeks before,0.582974,100.0,20829


In [None]:
# for customers pattern
# Weekly unit sales per article (not referenced again in this cell).
articles_per_week = (
    sales
    .groupby(['article_id', 'week'], as_index=False)
    .agg(unit_sales=('price', 'size'))
    .sort_values('week')
)
# Weekly purchases per customer; `sales_per_week` is rebound to this
# customer-level frame from here on.
sales_per_week = (
    sales
    .groupby(['customer_id', 'week'], as_index=False)
    .agg(unit_sales=('price', 'size'))
    .sort_values('week')
)
# Weeks since the customer's previous purchase week; a customer's first
# recorded week has no predecessor (NaN) and is stored as 0 -> 'new'.
sales_per_week['last_purchase_week'] = (
    sales_per_week.groupby('customer_id')['week'].diff().fillna(0)
)
# Left-closed bins, e.g. [2, 4) -> '2 weeks before', [52, 205) -> '>=1 year before'.
customer_gap_edges = [0, 1, 2, 4, 9, 17, 26, 52, 205]
customer_gap_labels = [
    'new',
    '1 week before',
    '2 weeks before',
    '1 month before',
    '2-3 months before',
    '4-6 months before',
    '6-12 months before',
    '>=1 year before',
]
sales_per_week['last_purchase_week_bin'] = pd.cut(
    sales_per_week['last_purchase_week'],
    bins=customer_gap_edges,
    labels=customer_gap_labels,
    right=False,
)

In [None]:
# Per (week, recency bin): total units bought and number of distinct customers.
customer_bin_groups = sales_per_week.groupby(['week', 'last_purchase_week_bin'])
sales_per_week_plot = customer_bin_groups.agg(
    unit_sales=('unit_sales', 'sum'),
    customer_count=('customer_id', pd.Series.nunique),
).reset_index()
# Each bin's share (in %) of the total units bought in that week.
weekly_customer_units = sales_per_week_plot.groupby('week')['unit_sales'].transform('sum')
sales_per_week_plot['unit_sales_pct'] = (
    sales_per_week_plot['unit_sales'] / weekly_customer_units * 100
)
px.bar(
    sales_per_week_plot,
    x='week',
    y='customer_count',
    color='last_purchase_week_bin',
    title='Number of unique customers per week',
)

In [None]:
# for further note
# Average the weekly figures per recency bin over weeks > 40, then accumulate
# the sales share and the customer count across the bins.
tmp = (
    sales_per_week_plot.loc[sales_per_week_plot['week'] > 40]
    .groupby('last_purchase_week_bin')
    .mean()
    .reset_index()
    .assign(
        unit_sales_pct_cum=lambda frame: frame['unit_sales_pct'].cumsum(),
        customer_count_cum=lambda frame: frame['customer_count'].cumsum().astype('int'),
    )
)
customer_summary_cols = [
    'last_purchase_week_bin',
    'unit_sales_pct',
    'unit_sales_pct_cum',
    'customer_count_cum',
]
tmp[customer_summary_cols]

Unnamed: 0,last_purchase_week_bin,unit_sales_pct,unit_sales_pct_cum,customer_count_cum
0,new,8.732496,8.732496,6891
1,1 week before,17.541619,26.274116,20742
2,2 weeks before,19.18564,45.459756,35443
3,1 month before,24.33991,69.799666,53101
4,2-3 months before,15.142405,84.942071,63885
5,4-6 months before,6.866222,91.808294,68870
6,6-12 months before,6.5032,98.311494,73628
7,>=1 year before,1.688506,100.0,75140


# Train and Test Estimate using popularity

From the EDA above: we want to predict the top 12 articles for each customer in week 94 (the first week of 2020.07), and most of the articles bought in a given week were also sold during the preceding month. Our train set is therefore all sales from the last month, and the popularity baseline simply selects the 12 best-selling articles from that period as the predicted articles for every customer in week 94.

In [None]:
# MAP @ k
def average_precision_score(y_true, y_score, k=None):
    """Average precision at ``k`` for a single ranked list of predictions.

    Parameters
    ----------
    y_true : sequence
        The relevant (actually purchased) items.
    y_score : sequence
        Predicted items, ordered from most to least confident.
    k : int, optional
        Only the first ``k`` predictions are scored. Defaults to
        ``len(y_score)``.

    Returns
    -------
    float
        ``sum_i P@i * rel_i / min(len(y_true), k)``, where ``rel_i`` is 1 when
        the i-th prediction is relevant and has not already appeared earlier
        in the list. Returns ``0.0`` when there is nothing to score (no
        relevant items, no predictions, or ``k <= 0``).
    """
    if k is None:
        k = len(y_score)
    top_k = np.asarray(y_score)[:k]
    n_possible = min(len(y_true), k)  # normaliser as defined by the competition
    if n_possible <= 0 or len(top_k) == 0:
        # Avoid a 0/0 division (NaN plus a RuntimeWarning) on empty input.
        return 0.0

    # A repeated prediction must only be credited once, otherwise duplicates
    # inflate the score (it could even exceed 1).
    _, first_positions = np.unique(top_k, return_index=True)
    is_first_occurrence = np.zeros(len(top_k), dtype=bool)
    is_first_occurrence[first_positions] = True

    relevant = np.isin(top_k, y_true) & is_first_occurrence  # relevant[i] is True if top_k[i] is a new hit
    patk = np.cumsum(relevant) / np.arange(1, len(top_k) + 1)  # patk[0]==P@1, patk[1]==P@2, ...
    return np.sum(patk * relevant) / n_possible

In [None]:
# train and test dataset

# Popularity baseline: recommend the same best-selling articles to every
# customer and score the recommendation with MAP@12 on a held-out week.
EVAL_WEEK = 93       # week whose purchases are predicted and scored
N_TRAIN_WEEKS = 7    # look-back window ending the week before EVAL_WEEK
TOP_K = 12           # number of articles recommended per customer
# NOTE(review): the surrounding notes talk about week 94 and "the last month",
# while this cell evaluates week 93 with a 7-week window -- confirm the intended
# week indexing and window. Other weeks can be chosen; cross validation would be better.

# sales in the N_TRAIN_WEEKS weeks before EVAL_WEEK (both bounds inclusive)
train = sales[sales.week.between(EVAL_WEEK - N_TRAIN_WEEKS, EVAL_WEEK - 1)]
# the TOP_K best-selling articles of the training window
best_sellers = train.groupby('article_id').size().nlargest(TOP_K).index.values
# only include customers that bought something in the week to predict
test = sales[sales.week == EVAL_WEEK].groupby('customer_id').article_id.unique().reset_index(name='y_true')
# every customer gets the same prediction: the best sellers
test['y_score'] = pd.Series([best_sellers] * len(test), index=test.index)
# AP@12 for every customer; k is passed explicitly so the normaliser is
# min(len(y_true), TOP_K) even if fewer than TOP_K articles were found
test['ap@12'] = [
    average_precision_score(y_true, best_sellers, k=TOP_K)
    for y_true in test['y_true']
]
# MAP@12 over all customers
test['ap@12'].mean()

0.0022710656821311923

# Finding: popularity using only articles from the last week has a higher accuracy than using articles from the last month