# Repurchase
直近に購入したものを推薦する

In [1]:
import datetime

import numpy as np
import optuna
import pandas as pd

import schema
from metric import mapk

In [2]:
# Load the training transactions. Only the columns declared in
# schema.TRANSACTIONS plus 't_dat' are read; 't_dat' is parsed as a datetime
# and the remaining columns use the dtypes given by schema.TRANSACTIONS.
transactions = pd.read_csv('input/transformed/transactions_train.csv', parse_dates=['t_dat'], usecols=list(schema.TRANSACTIONS.keys())+['t_dat'], dtype=schema.TRANSACTIONS)
# Maximum number of articles kept per customer in a prediction list, and the
# number of globally popular articles used as fallback candidates.
TOPK = 12

In [3]:
def _repurchase_score(num_weeks, block_size, cumulative,
                      valid_start_date, valid_end_date):
    """Score the "recommend what the customer bought recently" baseline.

    Shared implementation behind ``repurchase_0`` and ``repurchase_1``; the
    two strategies differ only in how the look-back windows are cut.

    Reads the module-level ``transactions`` frame (columns ``t_dat``,
    ``customer_id_idx`` and ``article_id_idx``), the module-level ``TOPK``
    cut-off and the imported ``mapk`` metric.

    Args:
        num_weeks: Number of look-back blocks ("weeks"); must be >= 1.
        block_size: Length of one block in days; must be > 0.
        cumulative: If true, block ``k`` covers the last ``k * block_size``
            days before validation (windows are nested). Otherwise block
            ``k`` covers only the ``k``-th most recent slice of
            ``block_size`` days (windows are disjoint).
        valid_start_date: First day of the validation window (inclusive).
        valid_end_date: Last day of the validation window (inclusive).

    Returns:
        Whatever ``mapk`` returns for the validation purchases versus the
        predicted article lists (presumably MAP@K -- confirm in ``metric``).

    Raises:
        ValueError: If ``num_weeks`` < 1 or ``block_size`` <= 0.
    """
    if num_weeks < 1:
        raise ValueError(f"num_weeks must be >= 1, got {num_weeks!r}")
    if block_size <= 0:
        raise ValueError(f"block_size must be > 0, got {block_size!r}")

    # Normalise to Timestamps so that comparisons against the datetime64
    # 't_dat' column are well defined outside of DataFrame.query.
    valid_start = pd.Timestamp(valid_start_date)
    valid_end = pd.Timestamp(valid_end_date)
    history_start = valid_start - datetime.timedelta(days=block_size * num_weeks)
    history_end = valid_start - datetime.timedelta(days=1)

    # Ground truth: every article each customer bought in the validation window.
    transactions_valid = transactions[transactions['t_dat'].between(valid_start, valid_end)]
    val = (transactions_valid
           .groupby('customer_id_idx')['article_id_idx']
           .apply(list)
           .reset_index())

    # Slice the longest look-back window once. Every per-block window is a
    # subset of it, so the full transaction table is not re-scanned per block.
    history = transactions[transactions['t_dat'].between(history_start, history_end)]

    def rank_purchases(window_start, window_end):
        """Map customer -> articles bought in the window, most purchased first."""
        in_window = history['t_dat'].between(window_start, window_end)
        counts = (history[in_window]
                  .groupby(['customer_id_idx', 'article_id_idx'])
                  .size()
                  .reset_index(name='sz')
                  .sort_values(by=['customer_id_idx', 'sz'], ascending=False))
        # groupby keeps the row order inside each group, so each list stays
        # sorted by purchase count (descending).
        return counts.groupby('customer_id_idx')['article_id_idx'].apply(list).to_dict()

    weekly_rankings = []
    for week in range(1, num_weeks + 1):
        window_start = valid_start - datetime.timedelta(days=block_size * week)
        if cumulative:
            window_end = history_end
        else:
            window_end = valid_start - datetime.timedelta(days=block_size * (week - 1) + 1)
        weekly_rankings.append(rank_purchases(window_start, window_end))

    # Fallback candidates: the TOPK most purchased articles across all
    # customers over the whole look-back period.
    popular_articles = (history
                        .groupby('article_id_idx')
                        .size()
                        .reset_index(name='sz')
                        .sort_values(by='sz', ascending=False)['article_id_idx'][:TOPK]
                        .tolist())

    predictions = []
    for customer in val['customer_id_idx']:
        candidates = []
        for ranking in weekly_rankings:
            # Customers without purchases in a window contribute nothing.
            candidates.extend(ranking.get(customer, ()))
        candidates.extend(popular_articles)
        # dict.fromkeys removes duplicates while keeping first-seen order.
        predictions.append(list(dict.fromkeys(candidates))[:TOPK])

    pred = pd.Series(predictions, index=val.index, dtype=object, name='article_id_idx')
    return mapk(val.article_id_idx, pred)


def repurchase_0(num_weeks, block_size=7,
                 valid_start_date=datetime.date(2020, 9, 16),
                 valid_end_date=datetime.date(2020, 9, 22)):
    """Evaluate the repurchase baseline with nested (cumulative) windows.

    Candidates for each validation customer are concatenated in this order
    and then de-duplicated and cut to ``TOPK``:

    1. articles the customer bought in the most recent 1 block, by purchase
       count (descending);
    2. articles the customer bought in the most recent 2 blocks, by count;
    3. ... up to the most recent ``num_weeks`` blocks;
    4. the most purchased articles across all customers over the most
       recent ``num_weeks`` blocks.

    Args:
        num_weeks: Number of look-back blocks; must be >= 1.
        block_size: Block length in days (7 means calendar weeks).
        valid_start_date: First day of the validation window (inclusive).
        valid_end_date: Last day of the validation window (inclusive).

    Returns:
        The score returned by ``mapk`` on the validation window.

    Raises:
        ValueError: If ``num_weeks`` < 1 or ``block_size`` <= 0.
    """
    return _repurchase_score(num_weeks, block_size, True,
                             valid_start_date, valid_end_date)


def repurchase_1(num_weeks, block_size=7,
                 valid_start_date=datetime.date(2020, 9, 16),
                 valid_end_date=datetime.date(2020, 9, 22)):
    """Evaluate the repurchase baseline with disjoint windows.

    Candidates for each validation customer are concatenated in this order
    and then de-duplicated and cut to ``TOPK``:

    1. articles the customer bought during the most recent block, by
       purchase count (descending);
    2. articles bought between 2 blocks ago and 1 block ago, by count;
    3. ... up to the block between ``num_weeks`` and ``num_weeks - 1``
       blocks ago;
    4. the most purchased articles across all customers over the most
       recent ``num_weeks`` blocks.

    Args:
        num_weeks: Number of look-back blocks; must be >= 1.
        block_size: Block length in days (7 means calendar weeks).
        valid_start_date: First day of the validation window (inclusive).
        valid_end_date: Last day of the validation window (inclusive).

    Returns:
        The score returned by ``mapk`` on the validation window.

    Raises:
        ValueError: If ``num_weeks`` < 1 or ``block_size`` <= 0.
    """
    return _repurchase_score(num_weeks, block_size, False,
                             valid_start_date, valid_end_date)

In [4]:
def objective_0(trial):
    """Optuna objective: sample window settings and score ``repurchase_0``.

    Draws ``num_weeks`` from 1..10 and ``block_size`` from 1..14 (both
    inclusive integers) and returns the resulting validation score.
    """
    params = {
        'num_weeks': trial.suggest_int('num_weeks', 1, 10),
        'block_size': trial.suggest_int('block_size', 1, 14),
    }
    return repurchase_0(**params)

# Search for the (num_weeks, block_size) pair that maximises the score
# returned by objective_0, using 50 trials.
study_0 = optuna.create_study(direction='maximize')
study_0.optimize(objective_0, n_trials=50)
# Notebook display: the 20 best trials, highest objective value first.
# NOTE(review): the search space has only 10 * 14 = 140 combinations and the
# captured output shows the same parameters being evaluated repeatedly.
study_0.trials_dataframe().sort_values(by='value', ascending=False).head(20)

[32m[I 2022-03-05 17:25:15,813][0m A new study created in memory with name: no-name-582caa8d-4f9d-46cc-9450-eabd9c412ecc[0m
[32m[I 2022-03-05 17:25:23,067][0m Trial 0 finished with value: 0.02194956727912461 and parameters: {'num_weeks': 5, 'block_size': 2}. Best is trial 0 with value: 0.02194956727912461.[0m
[32m[I 2022-03-05 17:25:28,770][0m Trial 1 finished with value: 0.021366631302574827 and parameters: {'num_weeks': 2, 'block_size': 9}. Best is trial 0 with value: 0.02194956727912461.[0m
[32m[I 2022-03-05 17:26:17,436][0m Trial 2 finished with value: 0.020978134201580234 and parameters: {'num_weeks': 9, 'block_size': 10}. Best is trial 0 with value: 0.02194956727912461.[0m
[32m[I 2022-03-05 17:27:13,208][0m Trial 3 finished with value: 0.02074791869958917 and parameters: {'num_weeks': 9, 'block_size': 12}. Best is trial 0 with value: 0.02194956727912461.[0m
[32m[I 2022-03-05 17:27:45,964][0m Trial 4 finished with value: 0.021980519854883748 and parameters: {'num_

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_block_size,params_num_weeks,state
44,44,0.022172,2022-03-05 17:42:23.857567,2022-03-05 17:42:44.335747,0 days 00:00:20.478180,3,9,COMPLETE
42,42,0.022172,2022-03-05 17:41:50.973528,2022-03-05 17:42:11.970254,0 days 00:00:20.996726,3,9,COMPLETE
41,41,0.022172,2022-03-05 17:41:30.750386,2022-03-05 17:41:50.972374,0 days 00:00:20.221988,3,9,COMPLETE
40,40,0.022172,2022-03-05 17:41:09.517107,2022-03-05 17:41:30.749114,0 days 00:00:21.232007,3,9,COMPLETE
37,37,0.022172,2022-03-05 17:40:12.708983,2022-03-05 17:40:33.170966,0 days 00:00:20.461983,3,9,COMPLETE
38,38,0.022172,2022-03-05 17:40:33.172134,2022-03-05 17:40:53.702355,0 days 00:00:20.530221,3,9,COMPLETE
47,47,0.022131,2022-03-05 17:43:24.178570,2022-03-05 17:43:48.817157,0 days 00:00:24.638587,3,10,COMPLETE
46,46,0.022131,2022-03-05 17:43:00.180506,2022-03-05 17:43:24.177590,0 days 00:00:23.997084,3,10,COMPLETE
22,22,0.022104,2022-03-05 17:33:52.968379,2022-03-05 17:34:14.493748,0 days 00:00:21.525369,4,8,COMPLETE
25,25,0.022075,2022-03-05 17:34:47.258711,2022-03-05 17:35:11.883229,0 days 00:00:24.624518,5,8,COMPLETE


In [5]:
def objective_1(trial):
    """Optuna objective: sample window settings and score ``repurchase_1``.

    Draws ``num_weeks`` from 1..10 and ``block_size`` from 1..14 (both
    inclusive integers) and returns the resulting validation score.
    """
    params = {
        'num_weeks': trial.suggest_int('num_weeks', 1, 10),
        'block_size': trial.suggest_int('block_size', 1, 14),
    }
    return repurchase_1(**params)

# Search for the (num_weeks, block_size) pair that maximises the score
# returned by objective_1, using 50 trials.
study_1 = optuna.create_study(direction='maximize')
study_1.optimize(objective_1, n_trials=50)
# Notebook display: the 20 best trials, highest objective value first.
# NOTE(review): the search space has only 10 * 14 = 140 combinations and the
# captured output shows the same parameters being evaluated repeatedly.
study_1.trials_dataframe().sort_values(by='value', ascending=False).head(20)

[32m[I 2022-03-05 17:45:00,030][0m A new study created in memory with name: no-name-9f470ad6-cfb5-4ca7-8b5c-078f6c7c1616[0m
[32m[I 2022-03-05 17:45:04,915][0m Trial 0 finished with value: 0.021254400313146304 and parameters: {'num_weeks': 2, 'block_size': 12}. Best is trial 0 with value: 0.021254400313146304.[0m
[32m[I 2022-03-05 17:45:07,827][0m Trial 1 finished with value: 0.021423603443038914 and parameters: {'num_weeks': 1, 'block_size': 10}. Best is trial 1 with value: 0.021423603443038914.[0m
[32m[I 2022-03-05 17:45:10,651][0m Trial 2 finished with value: 0.01607632149779255 and parameters: {'num_weeks': 3, 'block_size': 1}. Best is trial 1 with value: 0.021423603443038914.[0m
[32m[I 2022-03-05 17:45:18,447][0m Trial 3 finished with value: 0.02170780561851498 and parameters: {'num_weeks': 4, 'block_size': 9}. Best is trial 3 with value: 0.02170780561851498.[0m
[32m[I 2022-03-05 17:45:22,711][0m Trial 4 finished with value: 0.021402888358014575 and parameters: {'n

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_block_size,params_num_weeks,state
22,22,0.022172,2022-03-05 17:47:55.535060,2022-03-05 17:48:04.755606,0 days 00:00:09.220546,3,9,COMPLETE
44,44,0.022172,2022-03-05 17:51:17.265914,2022-03-05 17:51:25.841909,0 days 00:00:08.575995,3,9,COMPLETE
41,41,0.022172,2022-03-05 17:50:52.779402,2022-03-05 17:51:01.462632,0 days 00:00:08.683230,3,9,COMPLETE
19,19,0.022172,2022-03-05 17:47:23.925413,2022-03-05 17:47:32.824907,0 days 00:00:08.899494,3,9,COMPLETE
18,18,0.022172,2022-03-05 17:47:15.088199,2022-03-05 17:47:23.924188,0 days 00:00:08.835989,3,9,COMPLETE
31,31,0.022172,2022-03-05 17:49:19.514912,2022-03-05 17:49:28.756829,0 days 00:00:09.241917,3,9,COMPLETE
21,21,0.022172,2022-03-05 17:47:46.782271,2022-03-05 17:47:55.533821,0 days 00:00:08.751550,3,9,COMPLETE
12,12,0.022149,2022-03-05 17:46:23.498306,2022-03-05 17:46:34.241448,0 days 00:00:10.743142,4,10,COMPLETE
43,43,0.022131,2022-03-05 17:51:07.639648,2022-03-05 17:51:17.264640,0 days 00:00:09.624992,3,10,COMPLETE
33,33,0.022124,2022-03-05 17:49:35.901673,2022-03-05 17:49:46.043297,0 days 00:00:10.141624,4,9,COMPLETE


参考にしたノートブックの手法では 0.0237 程度のスコアが出るが、この実装ではそこまで出ていない。その差は、全ユーザー共通の人気商品を集計する区間の違いによるものである。ノートブックでは直近 1 週間、この実装では最長区間(直近 num_weeks × block_size 日)を使っている。