In [None]:
import sys
# Make the parent directory (where the DressipiChallenge package is expected to live) importable.
sys.path.append('..')
# NOTE(review): absolute path; presumably only valid on the original author's machine/container — confirm.
sys.path.append('/Dataset')
import xgboost as xgb
import numpy as np
import pandas as pd
import os
from DressipiChallenge.Pipeline.xgboost.xgboost_tuning import XGB_hypertune, Real, Categorical, Integer
from DressipiChallenge.Pipeline.xgboost.xgboost_utils import XGB_train, XGB_tune_test
# Parenthesised: a trailing comma in a bare `from ... import a, b,` list is a SyntaxError.
from DressipiChallenge.Pipeline.gradient_boosting_utils import (
    fit_models,
    XGB_rerank,
    load_xgboost_train_df,
    load_xgboost_test_df,
    load_attributes,
    XGB_insert_session_feature,
    create_submission_XGB,
)
from DressipiChallenge.Pipeline.utils import create_mapping, get_mapped_sessions_to_recommend, get_items_to_exclude
from DressipiChallenge.Pipeline.matrices_creation import create_URM
from DressipiChallenge.Pipeline.data_splitting import train_val_split
from DressipiChallenge.Pipeline.data_extraction import get_dataframes
from DressipiChallenge.Recommenders.NonPersonalizedRecommender import TopPop
from DressipiChallenge.Recommenders.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
from DressipiChallenge.Recommenders.GraphBased.P3alphaRecommender import P3alphaRecommender

In [None]:
# Load every raw table in a single call.
(
    item_features_df,
    train_sessions_df,
    train_purchases_df,
    test_sessions_df,
    candidate_items_df,
) = get_dataframes()

# Candidate generators and their hyperparameters, listed in the same order.
model_classes = [P3alphaRecommender, TopPop, ItemKNNCFRecommender]

models_hyp = [
    {'topK': 479, 'alpha': 1.1764856470188576, 'normalize_similarity': True},
    {},
    {'shrink': 500, 'similarity': 'asymmetric', 'feature_weighting': 'none', 'topK': 495, 'normalize': True},
]

config_dict = {
    'models_hyp': models_hyp,
    'is_content_based': [False] * 3,
    'model_classes': model_classes,
}

# Deliberately tiny budget (5 boosting rounds, 1 trial, 2 folds).
num_boost_round = 5

tune_test_kwargs = dict(
    item_features_df=item_features_df,
    config_dict=config_dict,
    train_purchases_df=train_purchases_df,
    num_boost_round=num_boost_round,
    train_sessions_df=train_sessions_df,
    num_trials=1,
    num_folds=2,
)
xgb_model, features_to_drop, reranked_df = XGB_tune_test(**tune_test_kwargs)

In [None]:
def print_if_false(x):
    """Print the session id of a group in which no row has a truthy ``target``.

    ``x`` is expected to expose ``target`` and ``session_id`` columns; the id
    printed is the one found on the group's first row. Always returns ``None``.
    """
    contains_target = any(x.target.values)
    if contains_target:
        return
    print(x.session_id.values[0])

# Walk the reranked candidates session by session and print the ids of the
# sessions whose candidate list contains no target row.
reranked_df.groupby('session_id').apply(print_if_false)


In [None]:
# Spot check: rows of session 1 (hard-coded id) that are flagged as the target.
reranked_df[(reranked_df.session_id == 1) & (reranked_df.target == True)]

In [None]:
# TRAINING

item_features_df, train_sessions_df, train_purchases_df, test_sessions_df, candidate_items_df = get_dataframes()

# Validation window, defined once so the purchase split and the view filter
# below cannot drift apart.
val_ts_start = '2021-05-01'
val_ts_end = '2021-06-01'

train_views_purch_df, val_purch_df = train_val_split(train_sessions_df, train_purchases_df,
                                           n_sets=1,
                                           ts_start=val_ts_start, ts_end=val_ts_end,
                                           return_discarded=False)
# create mapping
item_mapping = create_mapping(item_features_df['item_id'])

train_session_mapping = create_mapping(train_views_purch_df['session_id'])

val_session_mapping = create_mapping(val_purch_df['session_id'])

val_sessions_arr = get_mapped_sessions_to_recommend(
    val_purch_df, val_session_mapping)

# Items never purchased in the validation window are excluded from the candidates.
candidates_val_ids = np.unique(val_purch_df['item_id'].values)
items_to_ignore_val = get_items_to_exclude(item_features_df, candidates_val_ids)
mapped_items_to_ignore_val = [item_mapping[elem] for elem in items_to_ignore_val]

# Replace raw ids with mapped ids. assign() builds a new frame instead of
# writing into whatever object train_val_split returned, which avoids
# SettingWithCopyWarning if that object is a slice.
val_purch_df = val_purch_df.assign(
    session_id=val_purch_df['session_id'].map(val_session_mapping),
    item_id=val_purch_df['item_id'].map(item_mapping),
)

# Views that fall inside the validation window: start inclusive, end exclusive.
val_views_df = train_sessions_df[
    (train_sessions_df.date >= val_ts_start) & (train_sessions_df.date < val_ts_end)][['session_id', 'item_id', 'date']]

# val_views_df keeps its raw ids on purpose: per the original author's note,
# create_URM does the mapping. (This note used to be a bare triple-quoted
# string, which Jupyter echoed as the cell's output.)

In [None]:
# Display the unique item ids purchased in the validation split.
candidates_val_ids

In [None]:
# Display the validation purchases ordered by date (returns a sorted copy; val_purch_df is not modified).
val_purch_df.sort_values('date')

In [None]:
# Display the training interactions ordered by date (returns a sorted copy; the frame is not modified).
train_views_purch_df.sort_values('date')

In [None]:

candidates_df_path = "./Dataset/xgb_candidates/candidates_train_df.parquet"

models = []
models_hyp = []

# The recommenders are only built and fitted when no cached candidate file exists.
if not os.path.exists(candidates_df_path):
    # create URM_train
    URM_train = create_URM(train_views_purch_df, train_session_mapping, item_mapping)
    URM_val = create_URM(val_views_df, val_session_mapping, item_mapping)

    # pre optimized models paired with their best hyperparameters
    pretuned_recommenders = [
        (P3alphaRecommender,
         {'topK': 479, 'alpha': 1.1764856470188576, 'normalize_similarity': True}),
        (TopPop,
         {}),
        (ItemKNNCFRecommender,
         {'shrink': 500, 'similarity': 'asymmetric', 'feature_weighting': 'none', 'topK': 495, 'normalize': True}),
    ]
    for recommender_cls, best_params in pretuned_recommenders:
        models.append(recommender_cls(URM_train))
        models_hyp.append(best_params)

    # fit models on URM_train
    fit_models(models, models_hyp, mapped_items_to_ignore_val)

    # After fitting, point every model at the validation-session matrix.
    for model in models:
        model.set_URM_train(URM_val)

# generate candidates
# NOTE(review): when the cache file exists `models` is empty here; presumably
# load_xgboost_train_df reads the cached parquet in that case — confirm.
candidates_df = load_xgboost_train_df(
    session_ids=val_sessions_arr,
    val_purchases=val_purch_df,
    models=models,
    cutoff=100,
)

In [None]:
# Display the candidate frame produced by load_xgboost_train_df.
candidates_df

In [None]:
# add feature columns
# The validation-session mapping is what gets passed as `train_session_mapping`:
# the candidates above were generated for the validation sessions.
session_attributes_train_df, item_attributes_df = load_attributes(
    train_session_mapping=val_session_mapping,
    item_mapping=item_mapping,
)

session_attributes_train_df

In [None]:
# Item-level features are currently disabled (XGB_insert_item_feature is not among this notebook's imports):
# candidates_df = XGB_insert_item_feature(candidates_df, item_attributes_df)
# Attach the session-level attributes to every candidate row.
# NOTE(review): candidates_df is overwritten, so re-running this cell applies the insertion to an
# already-enriched frame; whether that is harmless depends on XGB_insert_session_feature — confirm.
candidates_df = XGB_insert_session_feature(candidates_df, session_attributes_train_df)
candidates_df

In [None]:
# Spot check of a single (session, item) candidate row; both ids are hard-coded.
candidates_df[(candidates_df.session_id == 0) & (candidates_df.item_id == 9556)]

In [None]:
# Split the label column off the feature frame.
# Guarded so the cell can be re-run: once 'target' has been dropped, a second
# run would otherwise fail with a KeyError on candidates_df[['target']]; in
# that case the target_df from the first run is reused.
if 'target' in candidates_df.columns:
    target_df = candidates_df[['target']]
    candidates_df = candidates_df.drop(columns='target')

# Hyperparameter search (single trial). The second return value is later
# passed to XGB_train as num_boost_round.
xgb_hyperparams, iteration = XGB_hypertune(
    candidates_df=candidates_df, target_df=target_df, num_trials=1)

In [None]:
# First training pass on every candidate feature, using the tuned hyperparameters and
# the boosting-round count returned by XGB_hypertune.
xgb_model = XGB_train(candidates_df=candidates_df, target_df=target_df, xgb_hyperparams=xgb_hyperparams,
                      num_boost_round=iteration)

In [None]:
# Per-feature importance as reported by the trained booster.
feature_importance = xgb_model.get_score()
print("Feature importance: " + str(feature_importance))

# Remove useless columns and retrain
# Keep every feature that appears in the importance dict plus the two id columns.
# (Presumably get_score() omits features the booster never used — see xgboost docs.)
cols_to_keep = [*feature_importance.keys(), 'session_id', 'item_id']

to_drop = []
for column_name in candidates_df.columns.to_list():
    if column_name not in cols_to_keep:
        to_drop.append(column_name)

candidates_df = candidates_df.drop(columns=to_drop)

In [None]:
# Second training pass, now on the pruned candidate frame; replaces the xgb_model from the first pass.
xgb_model = XGB_train(candidates_df=candidates_df, target_df=target_df, xgb_hyperparams=xgb_hyperparams,
                      num_boost_round=iteration)

In [None]:
# Rerank the validation candidates with the retrained booster (cutoff=100)
# and persist the result so it can be reloaded without recomputing.
reranked_df = XGB_rerank(candidates_df=candidates_df, xgb_model=xgb_model, cutoff=100)
reranked_df.to_parquet('./Dataset/xgb_candidates/reranked_df.parquet')
reranked_df

In [None]:
# Reload the reranked candidates written by the previous cell (lets later cells run without reranking again).
reranked_df = pd.read_parquet('./Dataset/xgb_candidates/reranked_df.parquet')

In [None]:
# Spot check: reranked rows of session 0 (hard-coded id).
reranked_df[reranked_df.session_id == 0]

In [None]:
# GENERATE SUBMISSION

# Views and purchases stacked into one frame, ordered by session and then by
# date, with a fresh 0..n-1 index.
train_set_df = (
    pd.concat([train_sessions_df, train_purchases_df])
    .sort_values(by=['session_id', 'date'])
    .reset_index(drop=True)
)

# Session mapping rebuilt on the full training set (overwrites the one built for the split).
train_session_mapping = create_mapping(train_set_df['session_id'])

# Test side: mapping and mapped session array for the sessions to recommend.
test_set_df = test_sessions_df
test_session_mapping = create_mapping(test_set_df['session_id'])
test_sessions_arr = get_mapped_sessions_to_recommend(test_set_df, test_session_mapping)

# The name candidates_val_ids is reused here for the *test* candidate items.
candidates_val_ids = candidate_items_df['item_id'].values
items_to_ignore = get_items_to_exclude(item_features_df, candidates_val_ids)
mapped_items_to_ignore = [item_mapping[raw_item_id] for raw_item_id in items_to_ignore]

candidates_df_path = "./Dataset/xgb_candidates/candidates_test_df.parquet"

In [None]:
# Display the combined views + purchases training frame.
train_set_df

In [None]:
# Display the test sessions frame (alias of test_sessions_df).
test_set_df

In [None]:

models = []
models_hyp = []

# The recommenders are only built and fitted when no cached test-candidate file exists.
if not os.path.exists(candidates_df_path):

    URM_all = create_URM(train_set_df, train_session_mapping, item_mapping)
    URM_test = create_URM(test_set_df, test_session_mapping, item_mapping)

    # Same three recommenders and hyperparameters as in the validation run,
    # this time fitted on the full training matrix.
    pretuned_recommenders = [
        (P3alphaRecommender,
         {'topK': 479, 'alpha': 1.1764856470188576, 'normalize_similarity': True}),
        (TopPop,
         {}),
        (ItemKNNCFRecommender,
         {'shrink': 500, 'similarity': 'asymmetric', 'feature_weighting': 'none', 'topK': 495, 'normalize': True}),
    ]
    for recommender_cls, best_params in pretuned_recommenders:
        models.append(recommender_cls(URM_all))
        models_hyp.append(best_params)

    fit_models(models, models_hyp, mapped_items_to_ignore)

    # WARNING: only works for models with an item-item similarity matrix
    for model in models:
        model.set_URM_train(URM_test)

# NOTE(review): when the cache file exists `models` is empty here; presumably
# load_xgboost_test_df reads the cached parquet in that case — confirm.
sub_candidates_df = load_xgboost_test_df(
    session_ids=test_sessions_arr,
    models=models,
    cutoff=100,
)

In [None]:
# Display the candidate frame generated for the test sessions.
sub_candidates_df

In [None]:
# generate predictions with pretrained XGBoost model
# NOTE(review): this replaces the in-memory xgb_model trained earlier in the notebook.
# No visible cell writes model.json; presumably one of the pipeline helpers saves it — confirm.
xgb_model = xgb.Booster()
xgb_model.load_model("./Dataset/xgb_model/model.json")

In [None]:
# add feature columns
# NOTE(review): in the training section load_attributes is unpacked into two values
# (session attributes, item attributes); here its result is bound to a single name.
# Presumably the return shape depends on which mapping arguments are passed — confirm.
session_attributes_test_df = load_attributes(test_session_mapping=test_session_mapping)
session_attributes_test_df

In [None]:
# Attach the session-level attributes to every test candidate row (overwrites sub_candidates_df).
sub_candidates_df = XGB_insert_session_feature(
    sub_candidates_df, session_attributes_test_df)


In [None]:
# Drop the calendar columns from the submission features.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
sub_candidates_df = sub_candidates_df.drop(columns=['year', 'month'], errors='ignore')

In [None]:

# Keep only the columns the booster was trained on.
# Depends on `cols_to_keep`, which is defined in the feature-importance cell of
# the training section; that cell must have been run in this kernel.
to_drop = []
for column_name in sub_candidates_df.columns.to_list():
    if column_name not in cols_to_keep:
        to_drop.append(column_name)

sub_candidates_df = sub_candidates_df.drop(columns=to_drop)

In [None]:
# Spot check: submission candidates of session 0 (hard-coded id).
sub_candidates_df[(sub_candidates_df.session_id==0)]

In [None]:
# Debug knob: set to an int N to rerank only the sessions whose mapped id is < N.
# None reranks every test session, which the submission created right after
# this cell needs. The previous hard-coded `session_id < 3` filter limited the
# submission to three sessions.
RERANK_SESSION_LIMIT = None

if RERANK_SESSION_LIMIT is None:
    sessions_to_rerank_df = sub_candidates_df
else:
    sessions_to_rerank_df = sub_candidates_df[sub_candidates_df.session_id < RERANK_SESSION_LIMIT]

reranked_df = XGB_rerank(candidates_df=sessions_to_rerank_df, xgb_model=xgb_model, cutoff=100)
reranked_df

In [None]:
# Build the submission from the reranked test candidates; the item and test-session mappings are
# passed along, presumably so the helper can translate mapped ids back to raw ids — confirm.
create_submission_XGB(reranked_df=reranked_df, item_mapping=item_mapping, session_mapping=test_session_mapping)

In [None]:
# Manual scoring of the candidates, session by session, to inspect raw booster output.
#
# Work on a new frame ordered by session_id (stable sort, so the row order
# inside each session is untouched). groupby() yields groups in sorted key
# order, and the flattened predictions below are assigned positionally, so
# they only line up with the rows if the frame is ordered the same way.
candidates_df_1 = candidates_df.sort_values('session_id', kind='mergesort')

# NOTE(review): progress_apply only exists after tqdm.pandas() has been called;
# presumably one of the imported pipeline modules registers it — confirm.
predictions = candidates_df_1.groupby('session_id').progress_apply(
    lambda x: xgb_model.predict(
        xgb.DMatrix(
            x.drop(columns=['session_id', 'item_id']),
            nthread=-1,
            missing=np.nan,  # the np.NaN alias was removed in NumPy 2.0
        )
    )
)

print(predictions)

# Flatten the per-session prediction arrays into one list, in group order.
scores = []
for a in predictions.values:
    scores.extend(a)

# Scores are stored as predicted (not negated); the descending order is applied
# when the frame is sorted afterwards.
candidates_df_1['score'] = scores
candidates_df_1

In [None]:
# Cast session ids to integers (in place on candidates_df_1), then order the
# rows by session ascending and, within each session, by score descending.
candidates_df_1['session_id'] = candidates_df_1['session_id'].astype('int')

sort_columns = ['session_id', 'score']
sort_ascending = [True, False]
reranked_df = candidates_df_1.sort_values(by=sort_columns, ascending=sort_ascending)
reranked_df

In [None]:
# Spot check: manually reranked rows of session 0 (hard-coded id).
reranked_df[reranked_df.session_id == 0]

In [None]:
# Keep at most the first 100 rows of every session, in the frame's current row order
# (the rows were sorted by score, descending, within each session in an earlier cell).
reranked_df = reranked_df.groupby('session_id').head(100)
reranked_df