# Thanks for Visiting! 

In this notebook, I would like to share my approach to evaluating generated candidates.

# 5 Things to be evaluated



1. Maximum MAP@12: the MAP@12 we would get if the candidates were already sorted by relevance (i.e. the ranking model is optimal)
2. Fully covered customers: number of customers whose purchased articles are all inside the candidates
3. Not covered customers: number of customers with none of their purchased articles inside the candidates
4. Candidate multiplier: ratio between the number of rows in the candidate dataframe and in the validation dataframe
5. Total unique article ids: the fewer the better, to save memory

# Let's Code

Let's start with the starter (baseline) approach.

Here we simply take the 100 most popular items from the last week before the validation week and use them as the candidate items for every customer in the validation week.

In [None]:
import numpy as np
import pandas as pd 
import gc

import cudf

In [None]:
def _hex_tail_to_int64(id_col):
    """Convert the last 16 hex characters of each id string into an int64 value."""
    return id_col.str[-16:].str.hex_to_int().astype('int64')


def _distinct_pairs(frame, row_mask):
    """Return the distinct (customer_id, article_id) pairs of the rows selected by ``row_mask``."""
    selected = frame.loc[row_mask, ['customer_id', 'article_id']]
    return selected.drop_duplicates().reset_index(drop=True)


# Transaction log: only the date, customer and article columns are read.
transactions = cudf.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    usecols=['t_dat', 'customer_id', 'article_id'],
    dtype={'article_id': 'int32', 't_dat': 'string', 'customer_id': 'string'},
)
transactions['customer_id'] = _hex_tail_to_int64(transactions['customer_id'])

# Sample submission: only the customer ids are kept, encoded the same way as above.
submission = cudf.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv',
    usecols=['customer_id'],
)
submission['customer_id'] = _hex_tail_to_int64(submission['customer_id'])


# t_dat is read as a string, so the window bounds are compared as strings too
# (assumes t_dat uses the same YYYY-MM-DD layout as these constants — TODO confirm).
train_start_date = '2020-09-09'
valid_start_date = '2020-09-16'

# Training window: [train_start_date, valid_start_date).
train_df = _distinct_pairs(
    transactions,
    (transactions['t_dat'] >= train_start_date) & (transactions['t_dat'] < valid_start_date),
)
# Validation window: everything from valid_start_date onwards.
valid_df = _distinct_pairs(transactions, transactions['t_dat'] >= valid_start_date)

In [None]:
# Every customer present in valid_df receives the same candidate list:
# the 100 most frequent articles of train_df.
valid_cust = valid_df['customer_id'].unique().to_pandas().to_list()
top_100_popular = train_df['article_id'].value_counts().head(100).index.to_pandas().to_list()

# Customer-major layout: each customer id is repeated once per popular article and the
# popular list is tiled once per customer, so row i pairs a customer with one article.
# np.tile builds the article column as a single array instead of first materialising a
# Python list with one element per candidate row.
candidate_df = cudf.DataFrame({
    'customer_id': np.repeat(valid_cust, len(top_100_popular)),
    'article_id': np.tile(top_100_popular, len(valid_cust)),
})

print(len(valid_cust), len(top_100_popular), len(candidate_df))

In [None]:
def candidate_score(candidate_df, valid_df, k=12):
    """Print and return coverage statistics of a candidate set against validation purchases.

    Parameters
    ----------
    candidate_df : DataFrame
        Candidate pairs; must contain ``customer_id`` and ``article_id`` columns.
    valid_df : DataFrame
        Ground-truth purchases; must contain ``customer_id`` and ``article_id`` columns.
    k : int, default 12
        Cut-off of the MAP@k metric. Per-customer target and hit counts are capped at ``k``.

    Returns
    -------
    tuple
        ``(max_map, full_covered_ratio, not_covered_ratio, candidate_multiplier,
        num_unique_articles)`` where

        * ``max_map`` is the mean over validation customers of
          ``min(hits, k) / min(purchases, k)``, i.e. the score obtained if the hits were
          ranked first;
        * ``full_covered_ratio`` is the share of validation customers whose capped hit
          count equals their capped purchase count;
        * ``not_covered_ratio`` is the share of validation customers with no hit at all;
        * ``candidate_multiplier`` is ``len(candidate_df) / len(valid_df)``;
        * ``num_unique_articles`` is the number of distinct ``article_id`` values in
          ``candidate_df``.

    Notes
    -----
    Each (customer_id, article_id) pair is counted once for the coverage statistics, so
    repeated rows in either frame cannot push a customer's score above 1.
    """
    key_cols = ['customer_id', 'article_id']

    # De-duplicate the key pairs first: a repeated candidate row would otherwise be
    # counted as several hits after the inner merge.
    cand_pairs = candidate_df[key_cols].drop_duplicates()
    trgt_pairs = valid_df[key_cols].drop_duplicates()
    both_df = cand_pairs.merge(trgt_pairs, on=key_cols, how='inner')

    # Purchases per customer; anything above k counts as k.
    trgt_cnt_df = trgt_pairs.groupby('customer_id', as_index=False).agg({'article_id': 'count'}).\
        rename(columns={'article_id': 'trgt_cnt'})
    trgt_cnt_df.loc[trgt_cnt_df['trgt_cnt'] >= k, 'trgt_cnt'] = k

    # Hits (purchases found among the candidates) per customer, capped the same way.
    both_cnt_df = both_df.groupby('customer_id', as_index=False).agg({'article_id': 'count'}).\
        rename(columns={'article_id': 'both_cnt'})
    both_cnt_df.loc[both_cnt_df['both_cnt'] >= k, 'both_cnt'] = k

    # Customers without any hit are missing from both_cnt_df; the left join leaves them
    # as nulls, which become 0 hits.
    trgt_cnt_df = trgt_cnt_df.merge(both_cnt_df, on='customer_id', how='left').fillna(0)

    # Assume the candidates are optimally sorted: every hit sits at the top of the list.
    trgt_cnt_df['AP'] = trgt_cnt_df['both_cnt'] / trgt_cnt_df['trgt_cnt']
    max_map = trgt_cnt_df['AP'].mean()

    full_covered_cust = len(trgt_cnt_df.loc[trgt_cnt_df['AP'] == 1])
    not_covered_cust = len(trgt_cnt_df.loc[trgt_cnt_df['AP'] == 0])
    num_target_cust = len(trgt_cnt_df)

    # Size statistics use the frames exactly as they were passed in.
    num_candidate = len(candidate_df)
    num_target = len(valid_df)
    num_unq_artc = len(candidate_df['article_id'].unique())

    full_covered_ratio = full_covered_cust / num_target_cust
    not_covered_ratio = not_covered_cust / num_target_cust
    candidate_multiplier = num_candidate / num_target

    print(f"MAX MAP{k}: {round(max_map, 4)}")
    print(f"Full Covered Customer: {round(full_covered_ratio, 4)} ({full_covered_cust} / {num_target_cust}) ")
    print(f"Not Covered Customer: {round(not_covered_ratio, 4)} ({not_covered_cust} / {num_target_cust}) ")
    print(f"Candidate multiplier: {round(candidate_multiplier, 4)} ({num_candidate} / {num_target})")
    print(f"Unique Article Id: {num_unq_artc}")

    return max_map, full_covered_ratio, not_covered_ratio, candidate_multiplier, num_unq_artc

In [None]:
# Score the popularity-based candidates against the validation purchases;
# as the last expression of the cell, the returned tuple is also shown as its output.
candidate_score(candidate_df, valid_df)