# Import

In [1]:
from lightgbm.sklearn import LGBMRanker
import lightgbm as lgb
from datetime import timedelta
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime, date, timedelta

In [2]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []

    no_purchases_pattern = []
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)

# Read

In [4]:
INPUT_DIR = '../input/'
OUTPUT_DIR = '../output/'

DEBUG = True
PRE_YEAR = False
SEASON = True # 負例の作成方法を検討してから。
AGE = True
POSITIVE = False
BIG = True
FEATURE = True

In [5]:
transactions = pd.read_parquet(os.path.join(OUTPUT_DIR, 'transactions_train.parquet'))
customers = pd.read_parquet(os.path.join(OUTPUT_DIR, 'customers.parquet'))
articles = pd.read_parquet(os.path.join(OUTPUT_DIR, 'articles.parquet'))

In [6]:
test_week = transactions.week.max() + 1
valid_week = test_week-1
if DEBUG:
    if PRE_YEAR:
        tmp = transactions[transactions.week.isin(list(range(43, 53)))]
        transactions = transactions[transactions.week > transactions.week.max() - 10]
        transactions = pd.concat([tmp, transactions])
    else:
        if BIG:
            transactions = transactions[transactions.week > transactions.week.max() - 32]
        else:
            transactions = transactions[transactions.week > transactions.week.max() - 10]

# Make Candidates

In [7]:
c2weeks = transactions.groupby('customer_id')['week'].unique()

In [8]:
c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week
c2weeks2shifted_weeks[272412481300040]

{79: 81, 81: 84, 84: 86, 86: 91, 91: 95, 95: 96, 96: 103, 103: 105}

In [9]:
candidates_last_purchase = transactions.copy()
weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
    
candidates_last_purchase.week=weeks # 未来(次に商品を購入した週もしくはtestweekを入れている)←前回買ったものはこれだよという候補を示している

# Bestsellers candidates

In [10]:
def prepare_bestseller_candidates(transactions):
    if POSITIVE:
        k = 24
    else:
        k = 12
    mean_price = transactions.groupby(['week', 'article_id'])['price'].mean()
    sales = transactions \
        .groupby('week')['article_id'].value_counts() \
        .groupby('week').rank(method='dense', ascending=False) \
        .groupby('week').head(k).rename('bestseller_rank').astype('int8')
    bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
    bestsellers_previous_week.week += 1 # 翌週に使うため
    # 各週、各顧客についてユニーク
    unique_transactions = transactions \
        .groupby(['week', 'customer_id']) \
        .head(1) \
        .drop(columns=['article_id', 'price']) \
        .copy()

    # 前の週によく売れたものを候補にしている
    candidates_bestsellers = pd.merge(
        unique_transactions,
        bestsellers_previous_week,
        on='week',
    )

    test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
    test_set_transactions.week = test_week
    candidates_bestsellers_test_week = pd.merge(
        test_set_transactions,
        bestsellers_previous_week,
        on='week'
    )

    candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
    candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)
    return candidates_bestsellers, bestsellers_previous_week

if AGE:
    tmp = transactions.merge(customers[['customer_id', 'age']], on='customer_id')
#     tmp = transactions.merge(customers[['customer_id', 'club_member_status']], on='customer_id')
    transactions_35 = tmp[tmp['age']>60]
    transactions_20 = tmp[tmp['age']<=60]
#     transactions_35 = tmp[tmp['club_member_status']!='ACTIVE']
#     transactions_20 = tmp[tmp['club_member_status']=='ACTIVE']
    candidates_bestsellers_20, bestsellers_previous_week_20 = prepare_bestseller_candidates(transactions_20)
    candidates_bestsellers_35, bestsellers_previous_week_35 = prepare_bestseller_candidates(transactions_35)

    candidates_bestsellers = pd.concat([candidates_bestsellers_20, candidates_bestsellers_35])
    candidates_bestsellers.drop('age', axis=1, inplace=True)
#     candidates_bestsellers.drop('club_member_status', axis=1, inplace=True)

    bestsellers_previous_week = pd.concat([bestsellers_previous_week_20, bestsellers_previous_week_35]) # 20代の方優先
    bestsellers_previous_week = bestsellers_previous_week.drop_duplicates(['week', 'article_id'])
#     del candidates_bestsellers_20, bestsellers_previous_week_20, candidates_bestsellers_35, bestsellers_previous_week_35
#     del transactions_35, transactions_20, tmp
else:
    candidates_bestsellers, bestsellers_previous_week = prepare_bestseller_candidates(transactions)

# Season Candidates

In [11]:
if SEASON:
#     winter_no = [252, 245, 71, 264, 262, 305, 80, 263, 496, 303, 349]
    winter_no = [252, 245]
#     summer_no = [298, 59, 57, 255, 274, 91, 257, 299, 60]
    summer_no = [255, 274, 257]

    articles['season'] = 0
    articles.loc[articles['product_type_no'].isin(winter_no), 'season'] = -1
    articles.loc[articles['product_type_no'].isin(summer_no), 'season'] = 1
    articles.head()

In [12]:
if SEASON:
    sales = transactions.groupby(['week', 'article_id'])[['article_id']].count()
    sales.columns = ['count']
    sales = sales[sales['count']>300].reset_index(drop=False)
    season_sales = sales.merge(articles[['article_id', 'season']], on=['article_id'], how='left')
    season_sales_previous_week = season_sales[season_sales['season']!=0]
    
    season_sales_previous_week = season_sales_previous_week.groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('seasonseller_rank').astype('int8').reset_index(drop=False)
    season_sales_previous_week = season_sales_previous_week.merge(articles[['article_id', 'season']], on=['article_id'], how='left')
    
    mean_price = transactions.groupby(['week', 'article_id'])['price'].mean()
    season_sales_previous_week = pd.merge(season_sales_previous_week, mean_price, on=['week', 'article_id'])
    season_sales_previous_week.week += 1 # 翌週に使うため
    season_sales_previous_week.head()

In [13]:
if SEASON:
    # 各週、各顧客についてユニーク
    unique_transactions = transactions \
        .groupby(['week', 'customer_id']) \
        .head(1) \
        .drop(columns=['article_id', 'price']) \
        .copy()
    unique_transactions['month'] = unique_transactions['t_dat'].dt.month
    # articlesの方と逆にしておく。
    unique_transactions['season'] = -1
    unique_transactions.loc[(unique_transactions['month']>=2) & (unique_transactions['month']<8), 'season'] = 1

    # 季節物マージ
    candidates_season_sales = pd.merge(
        unique_transactions,
        season_sales_previous_week,
        on=['week', 'season'],
    )

In [14]:
if SEASON:
    test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
    test_set_transactions.week = test_week
    candidates_season_sales_test_week = pd.merge(
        test_set_transactions,
        season_sales_previous_week,
        on=['week', 'season']
    )

In [15]:
if SEASON:
    candidates_season_sales = pd.concat([candidates_season_sales_test_week, candidates_season_sales])
    candidates_season_sales.drop(columns=['month', 'season', 'seasonseller_rank'], inplace=True)

# Combining transactions and candidates / negative examples

In [16]:
if POSITIVE:
    dfs = [candidates_last_purchase, candidates_bestsellers]

    if SEASON:
        dfs += [candidates_season_sales]

    data = pd.concat(dfs)
    transactions['purchased'] = 1
    positive = transactions[['t_dat', 'customer_id', 'article_id', 'week', 'purchased']].drop_duplicates()
    data = data.merge(positive, on=['t_dat', 'customer_id', 'article_id', 'week'], how='left') # 紐づかないものがnegativeになる
    data.purchased.fillna(0, inplace=True)

    data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)
else:
    transactions['purchased'] = 1
    dfs = [transactions, candidates_last_purchase, candidates_bestsellers]

    if SEASON:
        dfs += [candidates_season_sales]

    data = pd.concat(dfs)
    data.purchased.fillna(0, inplace=True)

    # 購入されていたら、0の方が消される
    data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [17]:
data.purchased.mean() # 0.001980958883330536

0.12668658240629938

In [18]:
len(data[data.purchased==1].customer_id.unique()) # 28622

806957

In [19]:
len(data.customer_id.unique())

806957

In [20]:
def feature_engineering(data, is_train=True):
    # 前の週によく売れた商品かどうかわかるようにマージしている(top12)
    data = pd.merge(
        data,
        bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
        on=['week', 'article_id'],
        how='left'
    )
    if is_train:
        data = data[data.week != data.week.min()]
        if PRE_YEAR:
            data = data[data.week != 95]
    data.bestseller_rank.fillna(999, inplace=True)
    data = pd.merge(data, articles, on='article_id', how='left')
    data = pd.merge(data, customers, on='customer_id', how='left')
    data['month'] = data['t_dat'].dt.month
    
    if FEATURE:
        user_price_agg = pd.read_parquet(os.path.join(OUTPUT_DIR, 'user_price_agg.parquet'))
        data = data.merge(user_price_agg, on=['customer_id', 'week'], how='left')
        del user_price_agg
        data[['agg_price_mean', 'agg_price_max', 'agg_price_count']] = data[['agg_price_mean', 'agg_price_max', 'agg_price_count']].fillna(0)
        data['max_price_diff'] = data['price'] - data['agg_price_max']
        
        user_article_counts = pd.read_parquet(os.path.join(OUTPUT_DIR, 'user_article_counts.parquet'))
        data = data.merge(user_article_counts, on=['customer_id', 'article_id', 'week'], how='left')
        del user_article_counts
        data['user_article_count'] = data['user_article_count'].fillna(0)

    return data

data = feature_engineering(data)
data.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,product_code,prod_name,...,fashion_news_frequency,age,postal_code,month,agg_price_mean,agg_price_max,agg_price_count,week_diff,max_price_diff,user_article_count
0,2020-02-19,2042145971988316,436083002,0.011847,1,74,1.0,999.0,436083,19599,...,0,27,174283,2,0.204644,0.238881,12.0,3.0,-0.227034,0.0
1,2020-02-19,3050367810001883,683864001,0.005966,1,74,1.0,999.0,683864,3242,...,1,58,98825,2,0.288663,0.371,28.0,17.0,-0.365034,0.0
2,2020-02-19,3050367810001883,762846006,0.022424,1,74,1.0,999.0,762846,472,...,1,58,98825,2,0.288663,0.371,28.0,17.0,-0.348576,0.0
3,2020-02-19,3050367810001883,762846001,0.022407,1,74,1.0,999.0,762846,472,...,1,58,98825,2,0.288663,0.371,28.0,17.0,-0.348593,0.0
4,2020-02-19,4724881055684564,751471027,0.028797,1,74,1.0,999.0,751471,111,...,1,19,54980,2,0.08291,0.105237,13.0,23.0,-0.076441,0.0


# Make Valid Data

In [21]:
valid_transactions = transactions[transactions.week == valid_week]
valid_candidates_last_purchase = candidates_last_purchase[candidates_last_purchase.week==valid_week]
valid_candidates_bestsellers = candidates_bestsellers[candidates_bestsellers.week==valid_week]
if SEASON:
    valid_candidates_season_sales = candidates_season_sales[candidates_season_sales.week==valid_week]

valid_true = valid_transactions[['customer_id', 'article_id']].copy() # 正例

In [22]:
dfs = [valid_candidates_last_purchase, valid_candidates_bestsellers]
if SEASON:
    dfs += [valid_candidates_season_sales]

valid = pd.concat(dfs)
valid.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)
valid.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
22278313,2020-02-12,1064402967034756590,850917001,0.023153,1,104
22278314,2020-02-12,1064402967034756590,850917002,0.023153,1,104
22278315,2020-02-12,1064402967034756590,762856014,0.023136,1,104
22289718,2020-02-12,1409103460956453897,820308002,0.016932,1,104
22292922,2020-02-12,4140988305631444517,822862001,0.050831,2,104


In [23]:
valid = valid.merge(valid_transactions[['customer_id', 'article_id', 'purchased']], on=['customer_id', 'article_id'], how='left')
valid['purchased'] = valid['purchased'].fillna(0)
valid.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
0,2020-02-12,1064402967034756590,850917001,0.023153,1,104,0.0
1,2020-02-12,1064402967034756590,850917002,0.023153,1,104,0.0
2,2020-02-12,1064402967034756590,762856014,0.023136,1,104,0.0
3,2020-02-12,1409103460956453897,820308002,0.016932,1,104,0.0
4,2020-02-12,4140988305631444517,822862001,0.050831,2,104,0.0


In [24]:
# 正しい候補(customer_id*article_id)を持って来れてないと少なくなる
print(len(valid[valid['purchased']==1].customer_id.unique()))
print(len(valid.customer_id.unique()))

9838
68984


In [25]:
7819
68984

68984

# 実験開始

In [26]:
# valid = pd.concat([valid_candidates_last_purchase, valid_candidates_bestsellers, valid_candidates_season_sales])
# valid.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)
# valid['is_merged'] = 1

In [27]:
# tmp = valid_transactions[['customer_id', 'article_id']].merge(valid[['customer_id', 'article_id', 'is_merged']], on=['customer_id', 'article_id'], how='left')
# tmp['is_merged'] = tmp['is_merged'].fillna(0)
# tmp.groupby('is_merged').count()

In [28]:
# tmp2 = tmp[tmp['is_merged']==0].groupby('article_id')[['customer_id']].count()
# tmp2.sort_values('customer_id', ascending=False)

# 終了

In [29]:
valid = feature_engineering(valid, is_train=False)
valid.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,product_code,prod_name,...,fashion_news_frequency,age,postal_code,month,agg_price_mean,agg_price_max,agg_price_count,week_diff,max_price_diff,user_article_count
0,2020-02-12,1064402967034756590,850917001,0.023153,1,104,0.0,999.0,850917,2444,...,1,63,16708,2,0.306935,0.427254,54.0,31.0,-0.404102,0.0
1,2020-02-12,1064402967034756590,850917002,0.023153,1,104,0.0,999.0,850917,2444,...,1,63,16708,2,0.306935,0.427254,54.0,31.0,-0.404102,0.0
2,2020-02-12,1064402967034756590,762856014,0.023136,1,104,0.0,999.0,762856,1977,...,1,63,16708,2,0.306935,0.427254,54.0,31.0,-0.404119,0.0
3,2020-02-12,1409103460956453897,820308002,0.016932,1,104,0.0,999.0,820308,1014,...,0,20,43326,2,0.240559,0.262593,11.0,31.0,-0.245661,0.0
4,2020-02-12,4140988305631444517,822862001,0.050831,2,104,0.0,999.0,822862,15472,...,1,26,3387,2,0.050831,0.050831,1.0,31.0,0.0,0.0


# Train(CV)

In [30]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

valid.sort_values(['week', 'customer_id'], inplace=True)
valid.reset_index(drop=True, inplace=True)

In [31]:
train = data[data.week < valid_week]

if POSITIVE:
    ids = train[train.purchased==1].customer_id.unique()
    train = train[train['customer_id'].isin(ids)]

In [32]:
print(train.shape, valid.shape)

(51924797, 46) (1583312, 46)


In [33]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values
valid_baskets = valid.groupby(['week', 'customer_id'])['article_id'].count().values

In [34]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'bestseller_rank']

categorical_feature =  ['product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency']

if SEASON:
    columns_to_use += ['month', 'season']
if FEATURE:
#     columns_to_use += ['user_article_count']
    columns_to_use += ['agg_price_mean', 'agg_price_max', 'agg_price_count', 'user_article_count', 'max_price_diff', 'week_diff']
#     columns_to_use += ['agg_price_mean', 'agg_price_max', 'agg_price_count', 'max_price_diff', 'week_diff']

In [35]:
train_X = train[columns_to_use]
train_y = train['purchased']
valid_X = valid[columns_to_use]
valid_y = valid['purchased']

# test_X = test[columns_to_use]

In [36]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="map",
    map_eval_at=12,
    boosting_type="gbdt",
    max_depth=7,
    n_estimators=300,
    importance_type='gain',
    verbose=10
)

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
    categorical_feature=categorical_feature
)



[LightGBM] [Info] Total groups: 2239716, total data: 51924797
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.863112
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.154350
[LightGBM] [Debug] init for col-wise cost 1.394825 seconds, init for row-wise cost 1.726879 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 2028
[LightGBM] [Info] Number of data points in the train set: 51924797, number of used features: 25
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and dept

[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Traine

[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Traine

In [37]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.48374490596615777
month 0.12809859882280883
week_diff 0.11628779636361214
product_type_no 0.11603033313841794
department_no 0.0577693360580722
user_article_count 0.03502273446545924
colour_group_code 0.024385104896382945
article_id 0.01515044740938273
graphical_appearance_no 0.010240549245696786
max_price_diff 0.006061306361127401
perceived_colour_master_id 0.003714908566640011
season 0.0017004022053550853
age 0.0008625721976426222
agg_price_count 0.0004971749487551535
section_no 0.00018821368372672645
garment_group_no 0.00013082206070075443
perceived_colour_value_id 0.00011264233985964865
agg_price_mean 2.1512702020141956e-06
agg_price_max 0.0
index_group_no 0.0
index_code 0.0
FN 0.0
club_member_status 0.0
fashion_news_frequency 0.0
Active 0.0


# valid Inference

In [38]:
valid['pred'] = ranker.predict(valid_X)
preds = valid[['customer_id', 'article_id', 'pred']].copy()
preds.sort_values(['customer_id', 'pred'], ascending=False, inplace = True)
preds.head()

Unnamed: 0,customer_id,article_id,pred
1583309,18446737527580148316,896169005,9.207243
1583308,18446737527580148316,893141002,8.16649
1583304,18446737527580148316,761406001,6.62114
1583310,18446737527580148316,915526001,5.232112
1583300,18446737527580148316,762846027,3.889758


In [39]:
preds.customer_id

1583309    18446737527580148316
1583308    18446737527580148316
1583304    18446737527580148316
1583310    18446737527580148316
1583300    18446737527580148316
                   ...         
17             1402273113592184
8              1402273113592184
1              1402273113592184
0              1402273113592184
12             1402273113592184
Name: customer_id, Length: 1583312, dtype: uint64

In [40]:
candidates_bestsellers.week.unique()

array([ 83,  88,  76,  92,  96, 102,  75,  81,  85,  86,  91,  94, 103,
        90,  82,  99, 100,  78,  84,  93,  79,  89,  97, 104,  95, 101,
        74,  98,  77,  87,  80, 105])

In [41]:
# tmp = candidates_bestsellers[candidates_bestsellers['week']==104]
# tmp[tmp['customer_id']==1827730561464445]

In [42]:
# candidates_season_sales[(candidates_season_sales['article_id']==761406001) & (candidates_season_sales['week']!=104)]

In [43]:
# candidates_last_purchase[candidates_last_purchase['article_id']==761406001]

In [44]:
# train[train['article_id']==893141002][['purchased', 'week']].purchased.mean()

In [45]:
# transactions2 = pd.read_parquet(os.path.join(OUTPUT_DIR, 'transactions_train.parquet'))
# # transactions2 = transactions2[transactions2.week > transactions2.week.max() - 10]
# transactions2[transactions2['article_id']==761406001][['week', 'article_id']].groupby(['week', 'article_id'])[['article_id']].count()

In [46]:
# valid[valid['customer_id']==1951136007097426][['customer_id', 'article_id', 'pred']]

In [47]:
# articles2 = pd.read_csv(os.path.join(INPUT_DIR, 'articles.csv'), dtype={"article_id": "str"})
# articles2[articles2['article_id']=='0918525001']

In [48]:
pred_unq = preds.groupby('customer_id')['article_id'].apply(list).reset_index()
pred_unq['valid_pred'] = pred_unq['article_id'].map(lambda x: '0'+' 0'.join(str(x)[1:-1].split(', ')))
pred_unq.head()

Unnamed: 0,customer_id,article_id,valid_pred
0,1402273113592184,"[896169005, 893141002, 761406001, 915526001, 7...",0896169005 0893141002 0761406001 0915526001 07...
1,1827730561464445,"[894668002, 547780003, 760084006, 894668003, 7...",0894668002 0547780003 0760084006 0894668003 07...
2,1951136007097426,"[896169005, 893141002, 761406001, 915526001, 7...",0896169005 0893141002 0761406001 0915526001 07...
3,2639747769247776,"[865929003, 889870001, 893141002, 896169005, 7...",0865929003 0889870001 0893141002 0896169005 07...
4,3177658828628418,"[896169005, 893141002, 761406001, 915526001, 7...",0896169005 0893141002 0761406001 0915526001 07...


In [49]:
true_unq = valid_true.groupby('customer_id')['article_id'].apply(list).reset_index()
true_unq['valid_true'] = true_unq['article_id'].map(lambda x: '0'+' 0'.join(str(x)[1:-1].split(', ')))
true_unq.head()

Unnamed: 0,customer_id,article_id,valid_true
0,1402273113592184,"[885951001, 611415001]",0885951001 0611415001
1,1827730561464445,"[918603001, 921380001]",0918603001 0921380001
2,1951136007097426,[778745010],0778745010
3,2639747769247776,[819547001],0819547001
4,3177658828628418,"[869331006, 866731001]",0869331006 0866731001


In [50]:
merged = pd.merge(true_unq, pred_unq, on='customer_id', how='left').fillna('')

del merged['article_id_x'], merged['article_id_y']
merged.head()

Unnamed: 0,customer_id,valid_true,valid_pred
0,1402273113592184,0885951001 0611415001,0896169005 0893141002 0761406001 0915526001 07...
1,1827730561464445,0918603001 0921380001,0894668002 0547780003 0760084006 0894668003 07...
2,1951136007097426,0778745010,0896169005 0893141002 0761406001 0915526001 07...
3,2639747769247776,0819547001,0865929003 0889870001 0893141002 0896169005 07...
4,3177658828628418,0869331006 0866731001,0896169005 0893141002 0761406001 0915526001 07...


In [51]:
i = 3421
print(merged.loc[i].valid_true.split(' '))
print(set(merged.loc[i].valid_true.split(' ')) & set(merged.loc[i].valid_pred.split(' ')))

['0898684001']
set()


In [52]:
merged.loc[i].valid_pred

'0859125001 0858147006 0819113001 0894085001 0885951001 0851010008 0821397008 0700758017 0893141002 0896169005 0761406001 0915526001 0915529003 0863583001 0863595006 0924243001 0865799006 0918522001 0909370001 0751471001 0762846027 0809238005 0918292001 0706016001 0448509014 0863646001 0809238001 0673677002'

In [53]:
# tmp = train[(train.week!=104) & (train.purchased==1)]
# tmp[tmp['article_id']==918603001]

In [54]:
tqdm.pandas()

mapk(
    merged['valid_true'].map(lambda x: x.split()), 
    merged['valid_pred'].map(lambda x: x.split()), 
    k=12
)

0.03060168691107423

# Train

In [55]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [56]:
train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

In [57]:
del valid, train, data, candidates_last_purchase, candidates_bestsellers

In [58]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="map",
    map_eval_at=12,
    boosting_type="gbdt",
    max_depth=7,
    n_estimators=300,
    importance_type='gain',
    verbose=10
)

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)



[LightGBM] [Info] Total groups: 2308700, total data: 53708182
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.861823
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.153278
[LightGBM] [Debug] init for col-wise cost 1.144761 seconds, init for row-wise cost 1.489514 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1966
[LightGBM] [Info] Number of data points in the train set: 53708182, number of used features: 25
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and dept

[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Traine

[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Traine

In [59]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.4833273414845384
month 0.13931432569604105
season 0.1383501992243679
week_diff 0.11282859707080702
article_id 0.03702383266919957
user_article_count 0.0347729056470738
department_no 0.012784193038530843
garment_group_no 0.00738359896987632
max_price_diff 0.00724545981876552
colour_group_code 0.006379108525021797
section_no 0.005655685631504766
graphical_appearance_no 0.005245551980328391
perceived_colour_master_id 0.005194465754848331
product_type_no 0.0012395287700992118
perceived_colour_value_id 0.0011895940073667058
index_code 0.0010407460000014415
agg_price_count 0.0005390152646222332
age 0.00038104248147279306
index_group_no 0.00010269769769828608
agg_price_max 2.110267835642976e-06
agg_price_mean 0.0
FN 0.0
club_member_status 0.0
fashion_news_frequency 0.0
Active 0.0


# Inference

In [60]:
test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

# 余り用
bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

In [61]:
sub = pd.read_csv(os.path.join(INPUT_DIR, 'sample_submission.csv'))

In [62]:
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

In [63]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [64]:
import datetime
now = datetime.datetime.now()
now = now.strftime("%m%d_%H%M")

sub.to_csv(os.path.join(OUTPUT_DIR, f'predictions/submisssion_ranking_{now}.csv'), index = False)