# H&M:CV_Tutorial_FasterTrendingProductsWeekly[JP/EN]

ノートブックを見ていただきありがとうございます。　　

このノートブックでは、[FasterTrendingProductsWeekly](https://www.kaggle.com/code/hervind/h-m-faster-trending-products-weekly/notebook)の予測モデルを3つのCV(交差検証)によって結果の比較を行います。

- fold 0

    train <= '2020-09-15'
    
    valid >= '2020-09-16'
    
- fold 1

    train <= '2020-09-08'
    
    valid >= '2020-09-09' & valid <= '2020-09-15' 

- fold 2

    train <= '2020-09-01'
    
    valid >= '2020-09-02' & valid <= '2020-09-08' 
    
Reference:[How To Setup Local CV](https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308919)


また、予測モデルを関数に置き換えているので使い易くしています。

ーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーー

Thank you for taking a look at this notebook.

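This notebook compares the results of the [FasterTrendingProductsWeekly](https://www.kaggle.com/code/hervind/h-m-faster-trending-products-weekly/notebook) predictive model across three cross-validation (CV) folds, defined above.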

The predictive model is also wrapped in a function, which makes it easier to reuse.

ーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーー

If this notebook helps you, please leave a comment and an upvote. :)

## 1. Library
このノートブックではGPU（cudf）を使います。

This notebook uses GPU(cudf).

In [None]:
import numpy as np
import pandas as pd
import gc
import os
import time
import random
from tqdm.auto import tqdm
import cudf

In [None]:
def visualize_df(df):
    """Print the shape of ``df`` and then render its first rows.

    Relies on ``display`` being available in the running namespace
    (as it is inside a notebook session).
    """
    frame_shape = df.shape
    print(frame_shape)
    preview = df.head()
    display(preview)

## 2. Function

- Reference

[H&M : How to calculate MAP@12](https://www.kaggle.com/code/kaerunantoka/h-m-how-to-calculate-map-12)

In [None]:
# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/306007
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py


def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average precision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists.
            0.0 when ``actual`` is empty or ``k`` is not positive, because
            there is nothing that could be hit in those cases.
    """
    # The score is normalised by the best achievable number of hits.
    # Guard the degenerate cases up front instead of dividing by zero.
    max_hits = min(len(actual), k)
    if max_hits <= 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # Count a prediction only the first time it appears, so repeated
        # predictions of the same item cannot be rewarded twice.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            # Precision at this cut-off: hits so far / predictions so far.
            score += num_hits / (i + 1.0)

    return score / max_hits


def mapk(actual, predicted, k=10):
    """
    Mean of the per-row average precision at k.

    Parameters
    ----------
    actual : list
             One list of relevant elements per row
             (order inside each list is irrelevant)
    predicted : list
                One list of predicted elements per row
                (order inside each list matters)
    k : int, optional
        The maximum number of predicted elements scored per row
    Returns
    -------
    score : double
            ``np.mean`` of ``apk`` evaluated on each (actual, predicted)
            pair; pairs are formed with ``zip``, so extra rows in the longer
            input are ignored.
    """
    per_row_scores = []
    for truth, guess in zip(actual, predicted):
        per_row_scores.append(apk(truth, guess, k))
    return np.mean(per_row_scores)

def train_valid_split(transactions, nfold=0):
    """Split ``transactions`` into a training part and a validation week.

    Parameters
    ----------
    transactions : DataFrame
        Must support ``query`` on a ``t_dat`` column comparable with
        'YYYY-MM-DD' strings.
    nfold : int, optional
        1 or 2 select the earlier windows; any other value (including the
        default 0) selects the latest window.

    Returns
    -------
    (train_data, valid_data)
        Both with a fresh 0..n-1 index. The chosen dates and a preview of
        each frame are printed/displayed as a side effect.
    """
    # Each window is (last training day, first validation day,
    # last validation day).
    if nfold == 1:
        window = ('2020-09-08', '2020-09-09', '2020-09-15')
    elif nfold == 2:
        window = ('2020-09-01', '2020-09-02', '2020-09-08')
    else:
        window = ('2020-09-15', '2020-09-16', '2020-09-22')
    train_end_date, val_start_date, val_end_date = window

    print(f'TrainEndDate:{train_end_date}')
    print(f'ValidDate:{val_start_date} to {val_end_date}')

    train_filter = f"t_dat <= '{train_end_date}'"
    valid_filter = f"t_dat >= '{val_start_date}' & t_dat <='{val_end_date}'"

    train_data = transactions.query(train_filter).reset_index(drop=True)
    valid_data = transactions.query(valid_filter).reset_index(drop=True)

    for frame in (train_data, valid_data):
        visualize_df(frame)

    return train_data, valid_data
        
    

    

## 3. Make Dataset

In [None]:
# Create the output directory for the per-fold CSV files.
# exist_ok=True lets this cell be re-run without failing when the
# directory is already there.
os.makedirs('nfold', exist_ok=True)

In [None]:
# Load the full transaction log with pandas and preview it.
# `transactions` is the frame that the fold-splitting cell below passes to
# train_valid_split().
transactions = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv')
visualize_df(transactions)

In [None]:
folds = [0, 1, 2]

# Write one train CSV and one validation CSV per fold into ./nfold so that
# later cells can load a single fold without holding all of them in memory.
for nfold in folds:
    fold_frames = train_valid_split(transactions, nfold=nfold)
    fold_frames[0].to_csv(f'nfold/train_fold{str(nfold)}.csv', index=False)
    fold_frames[1].to_csv(f'nfold/valid_fold{str(nfold)}.csv', index=False)

    # Drop the references and collect before building the next fold.
    del fold_frames
    gc.collect()

## 4. Prepare Validation Data

In [None]:
# Fold to evaluate; the cells below read the CSVs written for this fold.
nfold = 0

sub = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv', usecols=['customer_id'])
valid_data = pd.read_csv(f'./nfold/valid_fold{str(nfold)}.csv')

# One row per customer holding the list of article ids found in the
# validation file for that customer.
# NOTE(review): an article bought more than once stays in the list more than
# once, and apk() normalises by len(actual) - confirm whether the ids should
# be de-duplicated before scoring.
valid_unq = valid_data.groupby('customer_id')['article_id'].apply(list).reset_index()

# Turn each list into a space-separated string of ids, each prefixed with '0'
# (the same format pred_weekly() uses for its predictions). The ids are
# formatted element by element instead of slicing the text of str(list),
# which depends on how the list elements happen to be repr()'d.
valid_unq['valid_true'] = valid_unq['article_id'].map(
    lambda article_ids: ' '.join('0' + str(article_id) for article_id in article_ids)
)

# Default merge type: only customers present in both frames are kept.
df_mapk = pd.merge(sub, valid_unq[['customer_id', 'valid_true']], on=['customer_id'])
visualize_df(df_mapk)

## 5. Prediction

- Reference
[H&M: Faster Trending Products Weekly](https://www.kaggle.com/code/hervind/h-m-faster-trending-products-weekly/notebook)

In [None]:
def pred_weekly(trainpath, N=12):
    """Build per-customer article predictions from weekly sales trends.

    Every past purchase is weighted by
    ``quotient`` (sales of the article in the target week divided by its
    sales in the week of the purchase) times a time-decay term ``y``.
    The weights are summed per (customer, article), the best ``N`` ranks per
    customer are kept, and every customer's list is padded with the ``N``
    articles that have the largest summed ``quotient`` overall.

    Parameters
    ----------
    trainpath : str
        Path of a CSV containing ``t_dat``, ``customer_id`` and
        ``article_id`` columns.
    N : int, optional
        Number of articles per customer. It drives the global top list, the
        per-customer rank cut-off and the final truncation length.

    Returns
    -------
    pandas.DataFrame
        One row per ``customer_id`` of ``sample_submission.csv`` with a
        ``prediction`` column of space-separated ids, each prefixed by '0'.
    """
    df = cudf.read_csv(trainpath,
             usecols=['t_dat', 'customer_id', 'article_id'],
             dtype={'article_id':'int32', 't_dat':'string', 'customer_id':'string'})

    df['t_dat'] = cudf.to_datetime(df['t_dat'])
    # Use the last 16 hex characters of the id, parsed as int64, as the key.
    df['customer_id'] = df['customer_id'].str[-16:].str.hex_to_int().astype('int64')

    last_ts = df['t_dat'].max()

    # ldbw: the Tuesday on or after t_dat (dayofweek: Monday=0, Tuesday=1).
    # First move every date to the Tuesday of its Monday-based week, then
    # push Wednesday..Sunday forward by one week.
    tmp = df[['t_dat']].copy().to_pandas()
    tmp['dow'] = tmp['t_dat'].dt.dayofweek
    tmp['ldbw'] = tmp['t_dat'] - pd.to_timedelta(tmp['dow'] - 1, unit='D')
    after_tuesday = tmp['dow'] >= 2
    tmp.loc[after_tuesday, 'ldbw'] = tmp.loc[after_tuesday, 'ldbw'] + pd.Timedelta(7, unit='D')

    df['ldbw'] = tmp['ldbw'].values

    # Number of transactions per (week, article).
    weekly_sales = df.drop('customer_id', axis=1).groupby(['ldbw', 'article_id']).count().reset_index()
    weekly_sales = weekly_sales.rename(columns={'t_dat': 'count'})

    df = df.merge(weekly_sales, on=['ldbw', 'article_id'], how = 'left')

    weekly_sales = weekly_sales.reset_index().set_index('article_id')

    # Target week = the week whose ldbw equals the latest t_dat in the file.
    # Since ldbw is always a Tuesday, this only matches when the file ends on
    # a Tuesday.
    # NOTE(review): this merge uses the default join type; if that is an
    # inner join, articles without target-week sales are dropped here and the
    # fillna below has nothing to fill - confirm this is intended.
    df = df.merge(
        weekly_sales.loc[weekly_sales['ldbw']==last_ts, ['count']],
        on='article_id', suffixes=("", "_targ"))

    df['count_targ'].fillna(0, inplace=True)
    del weekly_sales

    df['quotient'] = df['count_targ'] / df['count']

    # Global fallback: the N articles with the largest summed quotient.
    target_sales = df.drop('customer_id', axis=1).groupby('article_id')['quotient'].sum()
    general_pred = target_sales.nlargest(N).index.to_pandas().tolist()
    general_pred = ['0' + str(article_id) for article_id in general_pred]
    general_pred_str =  ' '.join(general_pred)
    del target_sales

    tmp = df.copy().to_pandas()
    # x: age of the purchase in whole days, floored at 1 so sqrt/division
    # below never see 0.
    tmp['x'] = ((last_ts - tmp['t_dat']) / np.timedelta64(1, 'D')).astype(int)
    tmp['dummy_1'] = 1
    tmp['x'] = tmp[["x", "dummy_1"]].max(axis=1)

    # y: decay weight as a function of age, clipped at 0 from below.
    a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3
    tmp['y'] = a / np.sqrt(tmp['x']) + b * np.exp(-c*tmp['x']) - d

    tmp['dummy_0'] = 0
    tmp['y'] = tmp[["y", "dummy_0"]].max(axis=1)
    tmp['value'] = tmp['quotient'] * tmp['y']

    tmp = tmp.groupby(['customer_id', 'article_id']).agg({'value': 'sum'})
    tmp = tmp.reset_index()

    # Keep only sufficiently strong candidates, then the best N dense ranks
    # per customer (previously hard-coded to 12 regardless of N).
    tmp = tmp.loc[tmp['value'] > 100]
    tmp['rank'] = tmp.groupby("customer_id")["value"].rank("dense", ascending=False)
    tmp = tmp.loc[tmp['rank'] <= N]

    # Concatenate each customer's ids, highest value first.
    purchase_df = tmp.sort_values(['customer_id', 'value'], ascending = False).reset_index(drop = True)
    purchase_df['prediction'] = '0' + purchase_df['article_id'].astype(str) + ' '
    purchase_df = purchase_df.groupby('customer_id').agg({'prediction': sum}).reset_index()
    purchase_df['prediction'] = purchase_df['prediction'].str.strip()
    purchase_df = cudf.DataFrame(purchase_df)

    sub  = cudf.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv',
                            usecols= ['customer_id'],
                            dtype={'customer_id': 'string'})

    # Same int64 key as above so the submission rows can be joined.
    sub['customer_id2'] = sub['customer_id'].str[-16:].str.hex_to_int().astype('int64')

    sub = sub.merge(purchase_df, left_on = 'customer_id2', right_on = 'customer_id', how = 'left', suffixes = ('', '_ignored'))

    sub = sub.to_pandas()
    # Customers without personal candidates get the global list; everyone is
    # then padded with the global list and cut back to N ids.
    sub['prediction'] = sub['prediction'].fillna(general_pred_str)
    sub['prediction'] = sub['prediction'] + ' ' +  general_pred_str
    sub['prediction'] = sub['prediction'].str.strip()
    # N ids plus N-1 separating spaces (131 characters for N=12).
    # assumes every formatted id is 10 characters long - TODO confirm
    max_chars = max(11 * N - 1, 0)
    sub['prediction'] = sub['prediction'].str[:max_chars]
    sub = sub[['customer_id', 'prediction']]

    return sub

In [None]:
# Build predictions from the training CSV of the fold selected by `nfold`.
trainpath = f'./nfold/train_fold{str(nfold)}.csv'
pred_df = pred_weekly(trainpath=trainpath)

In [None]:
# Attach the predictions to the ground-truth table. The left join keeps every
# row of df_mapk, so only customers with validation purchases are scored.
df_mapk = pd.merge(df_mapk, pred_df, on=['customer_id'], how='left')
visualize_df(df_mapk)

In [None]:
# Enables tqdm's pandas integration; no progress_* call is made in this cell.
tqdm.pandas()

# MAP@12 for the selected fold: both columns hold space-separated id strings,
# so each is split into a list of ids before scoring. The bare expression is
# left as the last statement so the notebook shows the score.
mapk(
    df_mapk['valid_true'].map(lambda x: x.split()), 
    df_mapk['prediction'].map(lambda x: x.split()), 
    k=12
)

## 6. Conclusion

- Result(CV)
||fold0|fold1|fold2|
|:-:|:-:|:-:|:-:|
|CV|0.0236|0.0233|0.0224|