In this notebook, I would like to make a submission whose score is probably around 0.007.   

Basically, I will follow [this notebook](https://www.kaggle.com/julian3833/h-m-content-based-12-most-popular-items-0-003) and adopt the "*recommend the most popular items*" approach.  
What I'm going to try here is to adjust the popularity using two kinds of information: the customer's age and the recency of each transaction.  

I would be glad if you find something useful here.  

In [None]:
import numpy as np, pandas as pd, datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from collections import Counter, defaultdict
from PIL import Image
from pathlib import Path
# Competition input directory; sample_submission.csv is read from here.
path = Path("/kaggle/input/h-and-m-personalized-fashion-recommendations/")

def show_images(article_ids, cols=1, texts=(), suptitle=''):
    """Plot the product images of the given articles in a grid.

    Args:
        article_ids: A single article id (int or str) or a sized iterable of
            ids. Each id is prefixed with one "0" and cut to its last 10
            characters to build the image file name.
        cols: Number of columns of the grid.
        texts: Optional per-article captions; ``texts[i]`` is appended on a
            new line below the id in the title of the i-th image.
        suptitle: Optional figure-level title; nothing is drawn when empty.

    Articles whose image cannot be opened are shown as an empty, titled cell.
    """
    if isinstance(article_ids, (int, str)):
        article_ids = [article_ids]
    # Ceiling division so that a full last row does not add a spare empty row
    # (e.g. 12 images in 12 columns need 1 row, not 2); always at least 1 row.
    rows = max(1, -(-len(article_ids) // cols))
    plt.figure(figsize=(3 + 3.5 * cols, 3 + 5 * rows))
    for i, article_id in enumerate(article_ids):
        article_id = ("0" + str(article_id))[-10:]
        text = '' if len(texts) <= i else ('\n' + texts[i])
        plt.subplot(rows, cols, i + 1)
        plt.axis('off')
        plt.title(f"{article_id}{text}", fontsize=16)
        try:
            # Images are stored in sub-folders named after the first 3 characters.
            with Image.open(f"/kaggle/input/h-and-m-personalized-fashion-recommendations/images/{article_id[:3]}/{article_id}.jpg") as image:
                plt.imshow(image)
        except OSError:
            # Best effort: a missing or unreadable image leaves the cell blank.
            pass
    if suptitle != '':
        plt.suptitle(suptitle, fontsize=36, fontweight='bold')
    plt.tight_layout()

def iter_to_str(iterable):
    """Join the items into one space-separated string, prefixing each with "0"."""
    return " ".join("0" + str(item) for item in iterable)

def apk(actual, predicted, k=12):
    """Return the average precision at k for one ranked prediction list.

    Args:
        actual: Collection of relevant items.
        predicted: Ranked sequence of predicted items; only the first ``k``
            entries are scored.
        k: Cut-off rank.

    Returns:
        The sum of precision values at each rank holding a first-time hit,
        divided by ``min(len(actual), k)``; ``0.0`` when ``actual`` is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted
    hits = 0
    precision_sum = 0.0
    for rank, item in enumerate(top_k, start=1):
        # Skip misses, and repeats of an item already seen at an earlier rank.
        if item not in actual or item in top_k[:rank - 1]:
            continue
        hits += 1
        precision_sum += hits / rank
    if not actual:
        return 0.0
    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=12, return_apks=False):
    """Return the mean average precision at k over paired lists.

    Args:
        actual: Sequence of relevant-item collections, one per query.
        predicted: Sequence of ranked predictions, same length as ``actual``.
        k: Cut-off rank passed on to ``apk``.
        return_apks: When true, return the list of per-query scores instead
            of their mean.

    Queries whose ``actual`` entry is empty are left out of the result.
    """
    assert len(actual) == len(predicted)
    apks = []
    for truth, ranked in zip(actual, predicted):
        if len(truth) > 0:
            apks.append(apk(truth, ranked, k))
    return apks if return_apks else np.mean(apks)

# Transactions, customers and the submission template.
df = pd.read_parquet('../input/hm-parquets-of-datasets/transactions_train.parquet')
cdf = pd.read_parquet('../input/hm-parquets-of-datasets/customers.parquet')
sub = pd.read_csv(path / 'sample_submission.csv')

valid_week = 105  # number of the week to be used as validation

# One row per customer in `cdf`; `article_id` holds the space-separated
# purchases made in the validation week (NaN for customers without any).
valid = (
    df[df['week'] == valid_week]
    .groupby('customer_id')['article_id']
    .apply(iter_to_str)
    .reset_index()
    .merge(cdf['customer_id'], on='customer_id', how='right')
)
# Ground-truth lists: split the purchase string, empty list when missing.
actual = valid['article_id'].apply(lambda s: s.split() if not pd.isna(s) else [])
# Latest transaction date strictly before the validation week.
last_date = df.loc[df['week'] < valid_week, 't_dat'].max()

# Age-adjusted and Time-discounted Popular Items

### Time-discounting
Since H&M is a fast-fashion brand and time is one of the most important factors in this competition, I use only the data from the last 21 days and weight each transaction according to how many days old it is.  
The weight of a transaction on the t-th day counting back from the last day (t = 1 for the last day itself) is calculated as follows.
$$
weight(t) = \frac{1}{t ^ {1.4}}
$$
### Age-Adjustment
Then, a recommendation for x-year-old customers is built from the 12 items that are most popular among customers whose ages are in the range from x-w to x+w, where the half-width w depends on x.

In [None]:
# Training window: the 21 days ending on (and including) last_date.
init_date = last_date - dt.timedelta(days=20)
train = df[df['t_dat'].between(init_date, last_date)].copy()

# Time discount: weight = (1 / t) ** 1.4, where t = 1 on last_date.
train['factor'] = (
    1 / ((last_date - train['t_dat']).dt.days + 1)
) ** 1.4

# Missing customer ages are replaced by the mean age.
cdf.loc[cdf['age'].isna(), 'age'] = cdf['age'].mean()

# Attach each buyer's age to their transactions.
train = train.merge(
    cdf[['customer_id', 'age']],
    how='left',
    on='customer_id',
)

def age_adjusted_popular_items(x, width, k=12):
    """Return the k top-weighted articles among buyers aged x-width..x+width.

    Args:
        x: Centre of the age window.
        width: Half-width of the age window (both bounds inclusive).
        k: Number of articles to return.

    Returns:
        A space-separated string of article ids (formatted by ``iter_to_str``)
        ordered by decreasing summed ``factor`` in the global ``train`` frame.
    """
    in_window = train['age'].between(x - width, x + width)
    scores = train[in_window].groupby('article_id')['factor'].sum()
    return iter_to_str(scores.nlargest(k).index.to_list())

# Half-width of the age window for every integer age from 0 to 99.
# NOTE(review): any other key (ages of 100 or more, non-integer values such
# as a mean-imputed age) gets width 0 through defaultdict(int) — confirm
# that this is intended.
width_dict = defaultdict(int)
width_dict.update(
    (age_key, half_width)
    for ages, half_width in (
        (range(22), 2),
        (range(22, 30), 3),
        (range(30, 36), 4),
        (range(36, 50), 5),
        (range(50, 60), 6),
        (range(60, 100), 10),
    )
    for age_key in ages
)

# One recommendation string per distinct customer age.
recommend = defaultdict(str)
for x in cdf['age'].unique():
    recommend[x] = age_adjusted_popular_items(x, width=width_dict[x])

# NOTE(review): this pairs rows of `sub` and `cdf` by index, not by
# customer_id — assumes both list the customers in the same order; confirm.
sub['prediction'] = cdf['age'].map(recommend)

In [None]:
# Preview the recommended items for ages 18, 26, ..., 66.
for age in range(18, 70, 8):
    show_images(
        recommend[age].split(),
        cols=12,
        suptitle=f'Recommendation for {age}-year-old Customers',
    )

# Submission

In [None]:
# Show the first rows, then write the submission file without the index.
display(sub.head(n=5))
sub.to_csv(path_or_buf="submission.csv", index=False)