Based on: https://www.kaggle.com/code/hengzheng/time-is-our-best-friend-v2/notebook

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta

In [None]:
# Root directory of the competition's input files.
data_path = Path('/kaggle/input/h-and-m-personalized-fashion-recommendations/')

# Load the full transaction log.
transactions = pd.read_csv(
    data_path / 'transactions_train.csv',
    # set dtype or pandas will drop the leading '0' and convert to int
    dtype={'article_id': str} 
)
# Parse the transaction date column into datetimes so it can be compared and sorted.
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

# submission = pd.read_csv(data_path / 'sample_submission.csv')

In [None]:
# Preview the first rows of the transaction log.
transactions.head()

In [None]:
# One row per distinct customer found in the transaction log; the prediction
# column is created empty (None).
submission = pd.DataFrame({'customer_id': transactions['customer_id'].unique()})
submission['prediction'] = None

In [None]:
# Print the (rows, columns) size of the transaction table, then show its first rows.
print(transactions.shape)
transactions.head()

In [None]:
# Print the (rows, columns) size of the submission table, then show its first rows.
print(submission.shape)
submission.head()

In [None]:
# Most recent transaction date in the log.
transactions['t_dat'].max()

## split train and valid

In [None]:
# Cut-off for the train/validation split: seven days before the most recent
# transaction date, kept as a 'YYYY-MM-DD' string.
cur_date = str(transactions['t_dat'].max())

one_week = timedelta(days=7)
# Subtract directly on the Timestamp instead of round-tripping through
# str()/strptime with a fixed format, which fails as soon as the timestamp
# carries sub-second precision.
last_week = (transactions['t_dat'].max() - one_week).strftime('%Y-%m-%d')
last_week

In [None]:
# the last week as valid, the other as train
# Rows dated on or before the cut-off are used for training; strictly later
# rows are held out for validation.
split_point = pd.to_datetime(last_week)
train_transactions = transactions[transactions['t_dat'] <= split_point].copy()
valid_transactions = transactions[transactions['t_dat'] > split_point].copy()

In [None]:
# Show the earliest and latest transaction date of the training split.
print('train data time:')
print(train_transactions['t_dat'].min(), train_transactions['t_dat'].max())

# Show the earliest and latest transaction date of the validation split.
print('valid data time:')
print(valid_transactions['t_dat'].min(), valid_transactions['t_dat'].max())

In [None]:
# Preview the first rows of the validation split.
valid_transactions.head()

In [None]:
# Ground truth for validation: for each customer, the articles bought during
# the held-out period, joined into one space-separated string.
solution = (
    valid_transactions
    .groupby('customer_id')['article_id']
    .agg(' '.join)
    .rename('prediction')
    .reset_index()
)

In [None]:
# Preview the first rows of the validation ground truth.
solution.head()

In [None]:
# The validation rows are no longer needed once `solution` has been built.
del valid_transactions
# From here on `transactions` refers to the training split only.
transactions = train_transactions

In [None]:
# Continue with a random 20% of the training rows. The seed is fixed so that
# reruns pick the same rows and downstream results can be compared.
transactions = transactions.sample(frac=0.2, random_state=42)

## recall + ranking. [ref](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/307288)

In [None]:
# Preview the first three rows of the (sub-sampled) training transactions.
transactions.head(3)

In [None]:
# Preview the first three rows of the submission table.
submission.head(3)

## recall

In [None]:
# Most recent transaction date left in the training data.
transactions['t_dat'].max()

In [None]:
# Three nested look-back windows over the training transactions; each keeps
# every row dated on or after its (hard-coded) start date.
df_3w = transactions.loc[transactions['t_dat'].ge(pd.Timestamp('2020-08-24'))].copy()
df_2w = transactions.loc[transactions['t_dat'].ge(pd.Timestamp('2020-08-31'))].copy()
df_1w = transactions.loc[transactions['t_dat'].ge(pd.Timestamp('2020-09-07'))].copy()

In [None]:
def _count_purchases(frame):
    """Count how often each customer bought each article in *frame*.

    Returns a dict mapping customer_id -> {article_id: number of purchases}.
    Within a customer, articles keep the order in which they first appear in
    *frame*.
    """
    counts = {}
    for cust_id, art_id in zip(frame['customer_id'], frame['article_id']):
        per_customer = counts.setdefault(cust_id, {})
        per_customer[art_id] = per_customer.get(art_id, 0) + 1
    return counts


def _most_popular(frame, top_n=12):
    """Return the ids of the *top_n* most frequently purchased articles in *frame*."""
    return list(frame['article_id'].value_counts().index)[:top_n]


# Per-window purchase counts and the popular-article list for each window.
purchase_dict_3w = _count_purchases(df_3w)
dummy_list_3w = _most_popular(df_3w)

purchase_dict_2w = _count_purchases(df_2w)
dummy_list_2w = _most_popular(df_2w)

purchase_dict_1w = _count_purchases(df_1w)
dummy_list_1w = _most_popular(df_1w)

In [None]:
from tqdm import tqdm, tqdm_notebook
def _rank_and_pad(article_counts, popular, n_candidates):
    """Order a customer's articles by purchase count and top up from *popular*."""
    ranked = sorted(article_counts.items(), key=lambda kv: kv[1], reverse=True)
    picked = [art_id for art_id, _ in ranked[:n_candidates]]
    seen = set(picked)
    for art_id in popular:
        if len(picked) >= n_candidates:
            break
        # Skip popular articles the customer already has, so the same
        # (customer, article) pair is never emitted twice.
        if art_id not in seen:
            picked.append(art_id)
            seen.add(art_id)
    return picked


def prepare_candidates(customers_id, n_candidates = 12):
    """
    Build a long-format table of candidate articles for the given customers.

    Each customer is looked up in the 1-week, then 2-week, then 3-week purchase
    dictionary; the first window that contains them is used. Their articles
    from that window, most-purchased first, are taken (at most `n_candidates`)
    and the list is topped up from that window's popular-article list, skipping
    articles already present. Customers found in no window get the first
    `n_candidates` entries of the 1-week popular-article list.

    The result can hold fewer than `n_candidates` articles for a customer when
    the popular-article list runs out.

    Reads the module-level `purchase_dict_1w/2w/3w` and `dummy_list_1w/2w/3w`.

    Parameters
    ----------
    customers_id : iterable
        Customer ids; each should appear once (a repeated id overwrites its
        earlier entry).
    n_candidates : int, optional
        Maximum number of candidate articles per customer.

    Returns
    -------
    pandas.DataFrame
        Columns `customer_id` and `article_id`, one row per candidate pair.
    """
    windows = (
        (purchase_dict_1w, dummy_list_1w),
        (purchase_dict_2w, dummy_list_2w),
        (purchase_dict_3w, dummy_list_3w),
    )
    # Built once and shared by every customer without recent purchases.
    fallback = dummy_list_1w[:n_candidates]

    prediction_dict = {}
    # Iterating the ids directly (no enumerate) lets tqdm report a total
    # whenever the input has a length.
    for cust_id in tqdm(customers_id):
        for purchases, popular in windows:
            if cust_id in purchases:
                prediction_dict[cust_id] = _rank_and_pad(
                    purchases[cust_id], popular, n_candidates
                )
                break
        else:
            prediction_dict[cust_id] = fallback

    negatives_df = pd.DataFrame({
        'customer_id': list(prediction_dict.keys()),
        'negatives': list(prediction_dict.values()),
    })
    # One row per (customer, article) instead of one list per customer.
    return (
        negatives_df
        .explode('negatives')
        .rename(columns={'negatives': 'article_id'})
    )

## prepare data

### pos

In [None]:
# Order chronologically so that the last rows of a customer are the most recent.
transactions = transactions.sort_values(['t_dat', 'customer_id'])

# Distance of each row from its customer's final row (0 = final row).
rows_from_end = transactions.groupby('customer_id').cumcount(ascending=False)

# Keep only the last 15 transactions of every customer and drop the columns
# that are not used further on.
transactions = (
    transactions[rows_from_end < 15]
    .drop(columns=['price', 'sales_channel_id'])
    .sort_values(['t_dat', 'customer_id'])
)

# Every remaining real purchase is a positive example.
transactions['label'] = 1

### neg

In [None]:
# Latest purchase date of every customer among the positive rows.
last_dates = transactions.groupby('customer_id')['t_dat'].max().to_dict()

# Recalled candidates for those customers form the label-0 rows; each one is
# stamped with the customer's latest purchase date.
negatives = prepare_candidates(transactions['customer_id'].unique(), 15).assign(
    t_dat=lambda frame: frame['customer_id'].map(last_dates),
    label=0,
)

### concat

In [None]:
# Stack positives on top of negatives, then order the rows by customer and date.
train = pd.concat([transactions, negatives]).sort_values(['customer_id', 't_dat'])

In [None]:
# Two passes of de-duplication followed by a fresh 0..n-1 index:
#   1. drop rows that are identical in every column;
#   2. when the same (customer, article, date) occurs more than once, keep only
#      the first row. The intent is that a recalled candidate which was really
#      purchased keeps its label-1 row and loses the label-0 one.
#      NOTE(review): this relies on the label-1 row coming first - confirm.
train = (
    train
    .drop_duplicates()
    .drop_duplicates(subset=['customer_id', 'article_id', 't_dat'], keep='first')
    .reset_index(drop=True)
)

## Merge additional features

In [None]:
user_features = pd.read_parquet('../input/ranking-features/user_features.parquet')

# Replace the two categorical customer attributes with small integer codes.
categorical_cols = ['club_member_status', 'fashion_news_frequency']
encoded = user_features[categorical_cols].apply(lambda column: pd.factorize(column)[0])
user_features[categorical_cols] = encoded.astype('int8')
del encoded

item_features = pd.read_parquet('../input/ranking-features/item_features.parquet')

In [None]:
# Move the index of each feature table into ordinary columns.
# NOTE(review): presumably the index holds article_id / customer_id, which the
# later merges join on - confirm against the parquet files.
item_features = item_features.reset_index()
user_features = user_features.reset_index()

In [None]:
# Convert article_id to its integer text form and put a '0' in front, so it
# matches the string ids used in the transaction data.
item_features['article_id'] = '0' + item_features['article_id'].astype(int).astype(str)

In [None]:
# Attach customer features, then article features, to every training row.
# Both joins use merge's default join type.
train = train.merge(user_features, on='customer_id')
train = train.merge(item_features, on='article_id')

## fit model

In [None]:
from lightgbm.sklearn import LGBMRanker

In [None]:
# Number of training rows per customer, passed to the ranker as group sizes.
# NOTE(review): assumes the rows of `train` are still ordered by customer_id
# so they line up with these groups - confirm.
train_baskets = train.groupby('customer_id')['article_id'].count().values

ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    max_depth=7,
    n_estimators=300,
    importance_type='gain',
    verbose=10
)

# Take the target out of `train` (the column is removed from the frame), then
# fit on everything except the date and the two id columns.
labels = train.pop('label')
ranker = ranker.fit(
    train.drop(columns=['t_dat', 'customer_id', 'article_id']),
    labels,
    group=train_baskets,
)

## predict

In [None]:
# Recall 12 candidate articles for every customer in the submission table and
# attach the same customer / article features that were used for training.
candidates = prepare_candidates(submission.customer_id.unique(), 12)
candidates = (
    candidates
    .merge(user_features, on='customer_id')
    .merge(item_features, on='article_id')
)

# Score the candidates in slices of `batch_size` rows.
preds = []
batch_size = 1_000_000
for bucket in tqdm(range(0, len(candidates), batch_size)):
    batch = (
        candidates.iloc[bucket: bucket + batch_size]
        .drop(columns=['customer_id', 'article_id'])
    )
    preds.append(ranker.predict(batch))

# np.concatenate raises on an empty list, so handle "no candidates" explicitly.
candidates['preds'] = np.concatenate(preds) if preds else np.array([])

# Per customer, list the article ids from highest to lowest score. Sorting is
# done on a new frame (not in place on a column subset), which avoids pandas'
# SettingWithCopyWarning.
preds = (
    candidates[['customer_id', 'article_id', 'preds']]
    .sort_values(['customer_id', 'preds'], ascending=False)
    .groupby('customer_id')[['article_id']]
    .aggregate(lambda x: x.tolist())
)


### recall1: Recommend Most Often Previously Purchased Items

In [None]:
# tmp = transactions.groupby(['customer_id','article_id'])['t_dat'].agg('count').reset_index()
# tmp.columns = ['customer_id','article_id','cnt']

# recall1 = transactions.copy()
# recall1 = recall1.merge(tmp, on = ['customer_id','article_id'], how='left')
# recall1 = recall1.sort_values(['cnt','t_dat'],ascending=False)
# recall1 = recall1.drop_duplicates(['customer_id','article_id'])
# recall1.index = range(len(recall1))
# recall1 = recall1[['customer_id', 'article_id', 'cnt']]

In [None]:
# recall1.head()

### recall2: Recommend Items Purchased Together. [ref](https://www.kaggle.com/code/poteman/customers-who-bought-this-frequently-buy-this?scriptVersionId=93072545)

In [None]:
# pairs = np.load('../input/hmitempairs/pairs_cudf.npy',allow_pickle=True).item()

In [None]:
# recall2 = transactions[['customer_id', 'article_id']].copy()
# recall2['recommend'] = recall2['article_id'].astype(int).map(pairs)
# recall2 = recall2[['customer_id', 'recommend']]
# recall2.columns = ['customer_id', 'article_id']
# recall2 = recall2.loc[recall2['article_id'].notnull()]
# recall2 = recall2.drop_duplicates(['customer_id','article_id'])
# recall2.index = range(len(recall2))
# recall2['article_id'] = recall2['article_id'].astype(int).astype(str)
# recall2['article_id'] = recall2['article_id'].apply(lambda x: '0' + x)

In [None]:
# recall2.head()

### recall3: Recommend Last Week's Most Popular Items. [ref](https://www.kaggle.com/code/cdeotte/recommend-items-purchased-together-0-021/notebook)

In [None]:
# cur_date = str(transactions['t_dat'].max())

# one_week = timedelta(days=7)
# last_week = str(datetime.strptime(cur_date, '%Y-%m-%d %H:%M:%S') - one_week)[:10]
# last_week

In [None]:
# print(transactions.loc[transactions['t_dat'] > pd.to_datetime(last_week), 't_dat'].min())
# print(transactions.loc[transactions['t_dat'] > pd.to_datetime(last_week), 't_dat'].max())

In [None]:
# recall3 = list(transactions[transactions['t_dat'] > pd.to_datetime(last_week)].article_id.value_counts().index[:12])   # last 1 week

In [None]:
# recall3

## ranking. [ref](https://www.kaggle.com/code/alexvishnevskiy/gbm-ranking)

# local map@12

In [None]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average precision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists; 0.0 when
            `actual` is empty
    """
    # Nothing to find: return 0.0 instead of dividing by zero below.
    # len() is used rather than truthiness so array-like inputs also work.
    if len(actual) == 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A repeated prediction is only credited the first time it appears.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)


def mapk(actual, predicted, k=10):
    """
    Mean average precision at k.

    Pairs up the entries of `actual` and `predicted`, scores each pair with
    `apk`, and averages the scores.

    Parameters
    ----------
    actual : list
             One list of relevant elements per query
             (order inside each list is irrelevant)
    predicted : list
                One list of predicted elements per query
                (order inside each list matters)
    k : int, optional
        The maximum number of predicted elements considered per query
    Returns
    -------
    score : double
            The mean of the per-query average precision at k
    """
    per_query_scores = [
        apk(truth, guess, k) for truth, guess in zip(actual, predicted)
    ]
    return np.mean(per_query_scores)

In [None]:
# Rename the ground-truth column to 'true'.
solution.columns = ['customer_id', 'true']

In [None]:
# `preds` holds, per customer, the candidate article ids ordered by ranker
# score. Join each list into the space-separated string format used by
# `solution`, so both columns can be split the same way when scoring.
not_so_fancy_but_fast_benchmark = (
    preds['article_id']
    .map(' '.join)
    .rename('prediction')
    .reset_index()
)

# Left join: every validation customer is kept, with or without a prediction.
merged = pd.merge(solution, not_so_fancy_but_fast_benchmark, on='customer_id', how='left')

In [None]:
# Print the (rows, columns) size of the merged table, then show its first rows.
print(merged.shape)
merged.head()

In [None]:
# Split the space-separated id strings back into lists and score them with
# MAP@12. A missing value is stringified first and so becomes a one-token list.
true_lists = [str(value).split() for value in merged['true']]
predicted_lists = [str(value).split() for value in merged['prediction']]
mapk(true_lists, predicted_lists, k=12)