In [None]:
import pandas as pd

In [None]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Average precision at k (AP@k) for one list of predictions.

    Only the first ``k`` entries of ``predicted`` are scored. A prediction
    counts as a hit when it is contained in ``actual`` and has not already
    appeared earlier in the truncated prediction list; every hit adds the
    precision at its rank. The sum is divided by ``min(len(actual), k)``.

    Parameters
    ----------
    actual : list
             The elements that should be predicted (order doesn't matter)
    predicted : list
                The ranked predictions (order does matter)
    k : int, optional
        The maximum number of predicted elements to score

    Returns
    -------
    score : double
            The average precision at k, or 0.0 when ``actual`` is empty

    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hit_count = 0.0
    precision_total = 0.0

    for rank, item in enumerate(top_k, start=1):
        # Skip misses, and repeats of an item that was already ranked higher.
        if item not in actual or item in top_k[:rank - 1]:
            continue
        hit_count += 1.0
        precision_total += hit_count / rank

    if not actual:
        return 0.0

    return precision_total / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k (MAP@k).

    Pairs up the entries of ``actual`` and ``predicted`` positionally,
    scores every pair with ``apk`` and returns the mean of those scores.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    per_query_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_query_scores.append(apk(relevant, ranked, k))
    return np.mean(per_query_scores)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    """Convert a Series of hex customer-id strings to integers.

    Only the last 16 characters of each string are kept before each value
    is passed through ``hex_id_to_int``.
    """
    last_16_chars = series.str[-16:]
    return last_16_chars.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the last 16 characters of ``str`` as a base-16 integer."""
    # NOTE: the parameter shadows the builtin ``str``; the name is kept so
    # that keyword callers keep working.
    hex_tail = str[-16:]
    return int(hex_tail, base=16)

def article_id_str_to_int(series):
    """Cast a Series of article ids to 32-bit integers."""
    return series.astype(np.int32)

def article_id_int_to_str(series):
    """Render article ids as strings with a single '0' prepended."""
    id_text = series.astype(str)
    return '0' + id_text

class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    ``fit`` records, for each column position, the values whose count is
    strictly greater than ``min_examples`` (in ``value_counts`` order).
    ``transform`` replaces each value with its position in that list via
    ``pd.Categorical(...).codes``; values outside the list get code -1.

    Parameters
    ----------
    min_examples : int, optional
        A value is kept as a category only if it occurs more than this many
        times in the fitted data.
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        # Filled by fit(): one list of kept categories per column position.
        self.categories = []

    def fit(self, X, y=None):
        """Learn the kept categories of every column of ``X``.

        ``y`` is accepted and ignored so the transformer can be called as
        ``fit(X, y)`` / ``fit_transform(X, y)``. Returns ``self``.
        """
        # Rebuild the list from scratch: appending to the previous list would
        # leave transform() using the categories of an earlier fit.
        self.categories = []
        for i in range(X.shape[1]):
            counts = X.iloc[:, i].value_counts()
            self.categories.append(counts[counts > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        """Return a new DataFrame (default index) holding the category codes.

        Columns are matched to the fitted categories by position; the output
        keeps the column names of ``X``.
        """
        data = {
            X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes
            for i in range(X.shape[1])
        }
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    """Mean AP@12 over positionally paired prediction / ground-truth lists."""
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    scores = [apk(gt, preds, k=12) for preds, gt in zip(list_of_preds, list_of_gts)]
    return np.mean(scores)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    """Score a submission CSV against the stored validation ground truth.

    Reads ``sub_csv`` and 'data/validation_ground_truth.parquet', splits the
    whitespace-separated ``prediction`` column of both, pairs the rows by
    position and returns the mean AP@12.

    Parameters
    ----------
    sub_csv : path of the submission CSV (must have a ``prediction`` column)
    skip_cust_with_no_purchases : bool, optional
        When true, rows whose ground-truth list is empty are left out of the
        mean instead of being scored.
    """
    submission = pd.read_csv(sub_csv)
    ground_truth = pd.read_parquet('data/validation_ground_truth.parquet')

    predicted_lists = submission.prediction.str.split()
    purchased_lists = ground_truth.prediction.str.split()

    scores = []
    for predicted, purchased in zip(predicted_lists, purchased_lists):
        if skip_cust_with_no_purchases and purchased == []:
            continue
        scores.append(apk(purchased, predicted, k=12))
    return np.mean(scores)

In [None]:
%%time

# Load the three input tables, in this order: transactions, customers, articles.
transactions, customers, articles = (
    pd.read_parquet(path)
    for path in (
        '../input/warmup/transactions_train.parquet',
        '../input/warmup/customers.parquet',
        '../input/warmup/articles.parquet',
    )
)

# sample = 0.05
# transactions = pd.read_parquet(f'data/transactions_train_sample_{sample}.parquet')
# customers = pd.read_parquet(f'data/customers_sample_{sample}.parquet')
# articles = pd.read_parquet(f'data/articles_train_sample_{sample}.parquet')

# Adding cv_data

In [None]:
# checkpoint1: column '0' becomes article_id (int32); columns '1'..'20' are cast to float16.
cv_dtypes = {'article_id': 'int32', **{str(i): 'float16' for i in range(1, 21)}}
cv_data = (
    pd.read_csv('../input/ddddaaaa/checkpoint1.csv')
    .drop('Unnamed: 0', axis=1)
    .rename(columns={"0": "article_id"})
    .astype(cv_dtypes)
)

# checkpoint2: same layout, but columns '1'..'20' are renamed to '21'..'40'
# so they do not clash with the checkpoint1 columns after merging.
cv_2_renames = {"0": "article_id", **{str(i): str(i + 20) for i in range(1, 21)}}
cv_2_dtypes = {'article_id': 'int32', **{str(i): 'float16' for i in range(21, 41)}}
cv_data_2 = (
    pd.read_csv('../input/aaaaad/checkpoint2.csv')
    .drop('Unnamed: 0', axis=1)
    .rename(columns=cv_2_renames)
    .astype(cv_2_dtypes)
)


# cv_data_3 = pd.read_csv('../input/sssssee/checkpoint3.csv')
# cv_data_3 = cv_data_3.drop('Unnamed: 0', axis=1).rename(columns={"0": "article_id", '1': '41', '2': '42', '3': '43', '4': '44',
#                                                               '5': '45', '6': '46', '7': '47', '8': '48', '9': '49', '10': '50',
#                                                               '11': '51', '12': '52', '13': '53', '14': '54', '15': '55', '16': '56',
#                                                               '17': '57', '18': '58', '19': '59', '20': '60'})\
# .astype({'article_id': 'int32', '41': 'float16', '42': 'float16', '43': 'float16', '44': 'float16', '45': 'float16', '46': 'float16',
#                '47': 'float16', '48': 'float16', '49': 'float16', '50': 'float16', '51': 'float16', '52': 'float16',
#                '53': 'float16', '54': 'float16', '55': 'float16', '56': 'float16', '57': 'float16', '58': 'float16',
#                '59': 'float16', '60': 'float16'})

# cv_data

In [None]:
# Left-join both cv feature tables onto articles; after each join every
# missing value in the result is replaced by 0.
articles = articles.merge(cv_data, on='article_id', how='left').fillna(0)
articles = articles.merge(cv_data_2, on='article_id', how='left').fillna(0)
# articles = pd.merge(articles, cv_data_3, on='article_id', how='left').fillna(0)

In [None]:
# articles

In [None]:
# Both cv tables have been merged into `articles`; drop both references
# (cv_data_2 was previously left alive) so their memory can be reclaimed.
del cv_data, cv_data_2

# continue

In [None]:
# Show the largest week number in `transactions`; the fold windows in the next cell are defined relative to it.
transactions.week.max()

In [None]:
# Each fold concatenates two 3-week windows that lie 50 weeks apart.
# The week numbers in the comments below hold when the last week is 104.
last_week = transactions.week.max()


def _week_window(frame, lower, upper):
    """Rows of ``frame`` whose week lies strictly between ``lower`` and ``upper``."""
    return frame[(upper > frame.week) & (frame.week > lower)]


# weeks: 49,50,51, 99,100,101
tr_fold_1 = pd.concat([
    _week_window(transactions, last_week - 56, last_week - 52),
    _week_window(transactions, last_week - 6, last_week - 2),
])
tr_fold_1_targ = transactions[transactions.week == 102]

# weeks: 50,51,52, 100,101,102
tr_fold_2 = pd.concat([
    _week_window(transactions, last_week - 55, last_week - 51),
    _week_window(transactions, last_week - 5, last_week - 1),
])
tr_fold_2_targ = transactions[transactions.week == 103]

# weeks: 51,52,53, 101,102,103
tr_fold_3 = pd.concat([
    _week_window(transactions, last_week - 54, last_week - 50),
    _week_window(transactions, last_week - 4, last_week),
])
tr_fold_3_targ = transactions[transactions.week == 104]

# weeks: 52,53,54, 102,103,104
tr_last = pd.concat([
    _week_window(transactions, last_week - 53, last_week - 49),
    _week_window(transactions, last_week - 3, last_week + 1),
])
tr_last_targ = transactions[transactions.week == 105]

In [None]:
# Week number immediately after the last week present in `transactions`.
test_week = transactions.week.max() + 1
# transactions = transactions[transactions.week > transactions.week.max() - 10]

In [None]:
# Drop the full transactions frame; the visible later cells only use the tr_fold_* / tr_last subsets.
del transactions

In [None]:
# Display the computed test week.
test_week

# Generating candidates

### Last purchase candidates

In [None]:
%%time

# customer_id -> array of the distinct weeks in which that customer has rows,
# once for the fold-3 history and once for the final ("last") history.
c2weeks, c2weeks_last = (
    history.groupby('customer_id')['week'].unique()
    for history in (tr_fold_3, tr_last)
)

In [None]:
# c2weeks

In [None]:
%%time

def _shift_to_next_week(customer_weeks, final_week):
    """Build {customer_id: {week: next listed week}} from per-customer week arrays.

    Each week maps to the entry that follows it in the customer's array;
    the last entry maps to ``final_week``.
    """
    shifted = {}
    for customer, week_list in customer_weeks.items():
        week_map = dict(zip(week_list[:-1], week_list[1:]))
        week_map[week_list[-1]] = final_week
        shifted[customer] = week_map
    return shifted


c2weeks2shifted_weeks = _shift_to_next_week(c2weeks, 104)

c2weeks2shifted_weeks_last = _shift_to_next_week(c2weeks_last, 105)

In [None]:
# Independent copies of the two history frames; their `week` column is
# overwritten in the next cell.
candidates_last_purchase, candidates_last_purchase_last = tr_fold_3.copy(), tr_last.copy()

In [None]:
%%time

# Replace every row's week by the shifted week looked up for that
# (customer, week) pair.
candidates_last_purchase.week = [
    c2weeks2shifted_weeks[customer][wk]
    for customer, wk in zip(tr_fold_3['customer_id'], tr_fold_3['week'])
]

candidates_last_purchase_last.week = [
    c2weeks2shifted_weeks_last[customer][wk]
    for customer, wk in zip(tr_last['customer_id'], tr_last['week'])
]

In [None]:
# candidates_last_purchase[candidates_last_purchase['customer_id']==272412481300040]

In [None]:
# tr_fold_3[tr_fold_3['customer_id']==272412481300040]

### Bestsellers candidates

In [None]:
# Mean price per (week, article_id) for each history frame.
mean_price, mean_price_last = (
    history.groupby(['week', 'article_id'])['price'].mean()
    for history in (tr_fold_3, tr_last)
)

In [None]:
# mean_price

In [None]:
def _weekly_bestseller_ranks(history):
    """Dense rank of articles by weekly row count, keeping the first 12 rows per week."""
    weekly_counts = history.groupby('week')['article_id'].value_counts()
    dense_ranks = weekly_counts.groupby('week').rank(method='dense', ascending=False)
    return dense_ranks.groupby('week').head(12).rename('bestseller_rank').astype('int8')


sales = _weekly_bestseller_ranks(tr_fold_3)

sales_last = _weekly_bestseller_ranks(tr_last)

In [None]:
# sales

In [None]:
def _bestsellers_for_following_week(ranks, prices):
    """Join ranks with mean prices and move each row to the following week."""
    joined = pd.merge(ranks, prices, on=['week', 'article_id']).reset_index()
    # A week's bestsellers are offered as candidates for the week after it.
    joined.week += 1
    return joined


bestsellers_previous_week = _bestsellers_for_following_week(sales, mean_price)

bestsellers_previous_week_last = _bestsellers_for_following_week(sales_last, mean_price_last)

In [None]:
def _first_row_per_customer_week(history):
    """First row of every (week, customer_id) group, without article_id and price."""
    first_rows = history.groupby(['week', 'customer_id']).head(1)
    return first_rows.drop(columns=['article_id', 'price']).copy()


unique_transactions = _first_row_per_customer_week(tr_fold_3)


unique_transactions_last = _first_row_per_customer_week(tr_last)

In [None]:
# unique_transactions

In [None]:
# Pair every (week, customer) row with the bestseller rows of the same week.
candidates_bestsellers = unique_transactions.merge(bestsellers_previous_week, on='week')

candidates_bestsellers_last = unique_transactions_last.merge(bestsellers_previous_week_last, on='week')

In [None]:
# One row per customer, stamped with the week that is to be predicted:
# 104 for the fold-3 (validation) frame, 105 for the final frame.
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = 104


test_set_transactions_last = unique_transactions_last.drop_duplicates('customer_id').reset_index(drop=True)
# Must target the *_last frame: assigning 105 to test_set_transactions would
# overwrite its week 104 and leave this frame with its historical weeks.
test_set_transactions_last.week = 105

In [None]:
# test_set_transactions

In [None]:
# Bestseller candidates for the predicted week of each frame.
candidates_bestsellers_test_week = test_set_transactions.merge(
    bestsellers_previous_week, on='week'
)

candidates_bestsellers_test_week_last = test_set_transactions_last.merge(
    bestsellers_previous_week_last, on='week'
)

In [None]:
# Stack historical and predicted-week candidates; bestseller_rank is dropped
# here and joined back onto the combined data in a later cell.
candidates_bestsellers = pd.concat(
    [candidates_bestsellers, candidates_bestsellers_test_week]
).drop(columns='bestseller_rank')

candidates_bestsellers_last = pd.concat(
    [candidates_bestsellers_last, candidates_bestsellers_test_week_last]
).drop(columns='bestseller_rank')

# Combining transactions and candidates / negative examples

In [None]:
# Real transactions are the positive examples.
tr_fold_3['purchased'] = 1
tr_last['purchased'] = 1

# tr_fold_3_targ was produced by boolean indexing, so setting a column on it
# in place triggers SettingWithCopyWarning; build a new frame instead.
tr_fold_3_targ = tr_fold_3_targ.assign(purchased=1)

In [None]:
# Candidate rows carry no `purchased` value, so after the concat they are NaN
# and become the negative examples (0).
# The column is reassigned rather than filled through `data.purchased` with
# inplace=True: that chained form does not modify `data` under pandas
# Copy-on-Write.
data = pd.concat([tr_fold_3, candidates_last_purchase, candidates_bestsellers])
data['purchased'] = data['purchased'].fillna(0)

data_last = pd.concat([tr_last, candidates_last_purchase_last, candidates_bestsellers_last])
data_last['purchased'] = data_last['purchased'].fillna(0)

In [None]:
# del transactions
# del candidates_last_purchase
# del candidates_bestsellers

In [None]:
# Keep the first row of every (customer, article, week) combination.
dedup_keys = ['customer_id', 'article_id', 'week']

data = data.drop_duplicates(dedup_keys)

data_last = data_last.drop_duplicates(dedup_keys)

In [None]:
# val_data
# tr_fold_3_targ

In [None]:
# data

### Add bestseller information

In [None]:
# Attach bestseller_rank to every row whose (week, article_id) appears in the
# bestseller table; other rows get NaN from the left join.
rank_columns = ['week', 'article_id', 'bestseller_rank']

data = data.merge(
    bestsellers_previous_week[rank_columns],
    on=['week', 'article_id'],
    how='left',
)

data_last = data_last.merge(
    bestsellers_previous_week_last[rank_columns],
    on=['week', 'article_id'],
    how='left',
)

In [None]:
# Drop the earliest week of each frame, then mark rows without a bestseller
# rank with the sentinel 999.
# `assign` builds a new frame, so the fill neither writes into a filtered
# slice nor relies on chained inplace=True (which has no effect under pandas
# Copy-on-Write).
data = (
    data[data.week != data.week.min()]
    .assign(bestseller_rank=lambda frame: frame['bestseller_rank'].fillna(999))
)


data_last = (
    data_last[data_last.week != data_last.week.min()]
    .assign(bestseller_rank=lambda frame: frame['bestseller_rank'].fillna(999))
)

In [None]:
# Add article and customer attributes to every row (left joins keep all rows).
data = (
    data
    .merge(articles, on='article_id', how='left')
    .merge(customers, on='customer_id', how='left')
)

data_last = (
    data_last
    .merge(articles, on='article_id', how='left')
    .merge(customers, on='customer_id', how='left')
)

In [None]:
# Order rows by (week, customer_id) so that each customer's rows within a
# week are contiguous, and renumber the index.
data = data.sort_values(['week', 'customer_id']).reset_index(drop=True)

data_last = data_last.sort_values(['week', 'customer_id']).reset_index(drop=True)

# continue

In [None]:
# Week 104 of `data` is held out as the validation set; week 105 of
# `data_last` holds the rows that are scored for the submission.
dedup_keys = ['customer_id', 'article_id', 'sales_channel_id']

is_holdout = data.week == 104
train = data[~is_holdout]
test = data[is_holdout].drop_duplicates(dedup_keys).copy()

is_holdout_last = data_last.week == 105
train_last = data_last[~is_holdout_last]
test_last = data_last[is_holdout_last].drop_duplicates(dedup_keys).copy()

In [None]:
def _group_sizes(frame):
    """Number of rows per (week, customer_id) group, as a NumPy array."""
    return frame.groupby(['week', 'customer_id'])['article_id'].count().values


train_baskets = _group_sizes(train)
# Wrapped in an outer list because it is passed as `eval_group`, alongside a
# one-element `eval_set`.
test_baskets = [list(_group_sizes(test))]



train_baskets_last = _group_sizes(train_last)

In [None]:
# del data

In [None]:
import numpy as np

# 'week' followed by the column names '1' .. '40'.
s = ['week'] + [str(i) for i in range(1, 41)]
# s = []

columns_to_use = [
    'article_id', 'product_type_no', 'graphical_appearance_no',
    'colour_group_code', 'perceived_colour_value_id',
    'perceived_colour_master_id', 'department_no', 'index_code',
    'index_group_no', 'section_no', 'garment_group_no',
    'FN', 'Active', 'club_member_status', 'fashion_news_frequency',
    'age', 'postal_code', 'bestseller_rank',
] + s

In [None]:
%%time

# Feature matrices restricted to the model's input columns, plus the label.
train_X = train.loc[:, columns_to_use]
train_y = train.loc[:, 'purchased']

test_X = test.loc[:, columns_to_use]

X_test_last = test_last.loc[:, columns_to_use]

In [None]:
# Label the validation candidates with the purchases of the target week.
# The target pairs are de-duplicated *before* the join, so the left merge
# yields exactly one row per row of `test`, in the same order. That order
# must be kept: `test_baskets` (passed as `eval_group`) counts consecutive
# rows of `test` per customer, so sorting test_y by `purchased` would
# misalign the groups.
target_pairs = tr_fold_3_targ[['article_id', 'customer_id', 'purchased']].drop_duplicates()
test_y = pd.merge(test.drop('purchased', axis=1), target_pairs, on=['article_id', 'customer_id'], how='left')
test_y['purchased'] = test_y['purchased'].fillna(0)

assert len(test_y) == len(test), "labelled validation frame must have one row per candidate"
print(test_y['purchased'].value_counts())

In [None]:
# Validation features and labels, both taken from test_y.
X_TEST, Y_TEST = test_y[columns_to_use], test_y['purchased']

In [None]:
# Display the validation feature matrix built from test_y.
X_TEST

# Model training

In [None]:
# Display the feature matrix (test_last[columns_to_use]) that is scored for the submission.
X_test_last

In [None]:
from lightgbm.sklearn import LGBMRanker

In [None]:
# LambdaRank objective with DART boosting; MAP is the reported metric and
# feature importances are measured by gain.
ranker_params = dict(
    objective="lambdarank",
    metric="map",
    boosting_type="dart",
    n_estimators=10,
    importance_type='gain',
    verbose=10,
)
ranker = LGBMRanker(**ranker_params)

In [None]:
%%time

# `group` / `eval_group` give the number of consecutive rows that belong to
# each (week, customer) query.
validation_pair = (X_TEST, Y_TEST.values)

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
    eval_set=[validation_pair],
    eval_group=test_baskets,
    eval_metric='map',
)

In [None]:
# Print every feature with its share of the total importance, largest first.
importances = ranker.feature_importances_
total_importance = importances.sum()
for i in importances.argsort()[::-1]:
    print(columns_to_use[i], importances[i] / total_importance)

# Calculate predictions

In [None]:
# X_test_last

In [None]:
%time

test_last['preds'] = ranker.predict(X_test_last)

c_id2predicted_article_ids = test_last \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week_last[bestsellers_previous_week_last.week == bestsellers_previous_week_last.week.max()]['article_id'].tolist()

# submission

In [None]:
# Load the sample submission; its `prediction` column is overwritten in the next cell.
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [None]:
%%time
# For every customer in the submission: model-ranked articles first, then the
# latest bestsellers as padding, truncated to 12 items.
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    ranked = c_id2predicted_article_ids.get(c_id, []) + bestsellers_last_week
    # Repeats would waste one of the 12 slots; dict.fromkeys removes them
    # while keeping the first-seen order.
    unique_ranked = list(dict.fromkeys(ranked))
    preds.append(unique_ranked[:12])

# Article ids are written as strings with a '0' prepended, separated by spaces.
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [None]:
# Write the submission next to the notebook as '<sub_name>.csv', without the index column.
sub_name = 'basic_model_submission'
output_path = f'{sub_name}.csv'
sub.to_csv(output_path, index=False)