# Preprocessing for sales recommendations

## Loading the data

In [None]:
import numpy as np
from mba.data import (
    get_ffp_train_df,
    get_ffp_rollout_df,
    get_reviews_train_df_fpath,
    get_reviews_rollout_df_fpath,
)

In [None]:
# Raw input frame for the pipeline below, as returned by mba.data.get_ffp_train_df().
raw_tdf = get_ffp_train_df()

In [None]:
raw_tdf

## Building the pipeline

In [None]:
from mba.shared import (
    Column,
    NUMERIC_COLUMNS,
    CATEGORICAL_COLUMNS,
    ORDINAL_COLUMNS,
    FeatureGroup,
    FEATURE_GROUPS,
    ContextKey,
)
from mba.pipeline import build_pipeline

In [None]:
# Construct the project pipeline from mba.pipeline; no arguments are passed, so its defaults apply.
pline = build_pipeline()

In [None]:
pline

In [None]:
# Fit the pipeline on the raw frame and transform it in the same call.
# The result (tdf) is the frame handed to pycaret's setup() further down.
tdf = pline.fit_transform(
    X=raw_tdf,
    verbose=True,
    context={
        # Path returned by get_reviews_train_df_fpath(), handed to the
        # pipeline through its context dict under ContextKey.REVIEWS_FPATH.
        ContextKey.REVIEWS_FPATH: get_reviews_train_df_fpath(),
    },
)

In [None]:
tdf

## Play with pycaret

In [None]:
from pycaret.classification import (
    setup,
    compare_models,
    create_model,
    tune_model,
    blend_models,
    predict_model,
    finalize_model,
    save_model,
    load_model,
    get_metrics,
    add_metric,
)
from sklearn.metrics import (
    accuracy_score, roc_auc_score, recall_score, precision_score, f1_score,
    confusion_matrix,
)

In [None]:
from imblearn.over_sampling import RandomOverSampler, SMOTENC

In [None]:
# Initialise the pycaret classification experiment on the transformed frame.
clf_handle = setup(
    data = tdf,
    target = Column.BUYER_FLAG,  # label column to predict
    train_size=0.8,  # 80% of rows for training; the rest is pycaret's hold-out set
    session_id=42,  # fixed random seed so runs are repeatable
    numeric_features=NUMERIC_COLUMNS,
    categorical_features=CATEGORICAL_COLUMNS,
    # FEATURE_GROUPS is iterated once for the group contents and once for its
    # keys, so group_features and group_names stay in the same order.
    group_features=[FEATURE_GROUPS[k] for k in FEATURE_GROUPS],
    group_names=[k for k in FEATURE_GROUPS],
    normalize=True,
    remove_perfect_collinearity=True,
    data_split_stratify=True,
    silent=True,  # do not wait for interactive confirmation of inferred dtypes
    fix_imbalance=True,
    # NOTE(review): SMOTENC's categorical_features historically expects column
    # indices or a boolean mask; CATEGORICAL_COLUMNS looks like a list of column
    # names, and pycaret may resample after encoding - confirm this matches the
    # data SMOTENC actually receives.
    fix_imbalance_method=SMOTENC(
        categorical_features=CATEGORICAL_COLUMNS,
    ),
)

In [None]:
# Per-sample weights used by revenue_score() and opportunity_cost() below.
# NOTE(review): units/currency and the origin of these figures are not visible
# in this notebook - confirm with the business case.
TP_REVENUE = 32.7  # added for every true positive
FP_REVENUE = -6.05  # added for every false positive (negative, so it lowers the total)
FN_COST = -32.7  # added for every false negative; same magnitude as TP_REVENUE


def p_count(y_true, y_pred):
    """Count the actual positives (label ``1``) in ``y_true``.

    The input is converted with ``np.asarray`` so plain lists work as well as
    NumPy arrays, and the count is done in NumPy rather than with the builtin
    ``sum``.

    Args:
        y_true: Actual labels.
        y_pred: Unused; accepted so the function has the
            ``score_func(y_true, y_pred)`` signature passed to ``add_metric``.

    Returns:
        Number of entries equal to ``1``, as a Python ``int``.
    """
    return int(np.count_nonzero(np.asarray(y_true) == 1))

def n_count(y_true, y_pred):
    """Count the actual negatives (label ``0``) in ``y_true``.

    The input is converted with ``np.asarray`` so plain lists work as well as
    NumPy arrays, and the count is done in NumPy rather than with the builtin
    ``sum``.

    Args:
        y_true: Actual labels.
        y_pred: Unused; accepted so the function has the
            ``score_func(y_true, y_pred)`` signature passed to ``add_metric``.

    Returns:
        Number of entries equal to ``0``, as a Python ``int``.
    """
    return int(np.count_nonzero(np.asarray(y_true) == 0))

def tp(y_true, y_pred):
    """Count true positives: predicted ``1`` where the actual label is ``1``.

    Both inputs are converted with ``np.asarray`` and compared position by
    position, so plain lists work and pandas index labels play no part in
    the comparison.

    Args:
        y_true: Actual labels.
        y_pred: Predicted labels, in the same order as ``y_true``.

    Returns:
        Number of matching positions, as a Python ``int``.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    return int(np.count_nonzero((predicted == 1) & (actual == 1)))

def fp(y_true, y_pred):
    """Count false positives: predicted ``1`` where the actual label is ``0``.

    Both inputs are converted with ``np.asarray`` and compared position by
    position, so plain lists work and pandas index labels play no part in
    the comparison.

    Args:
        y_true: Actual labels.
        y_pred: Predicted labels, in the same order as ``y_true``.

    Returns:
        Number of matching positions, as a Python ``int``.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    return int(np.count_nonzero((predicted == 1) & (actual == 0)))

def tn(y_true, y_pred):
    """Count true negatives: predicted ``0`` where the actual label is ``0``.

    Both inputs are converted with ``np.asarray`` and compared position by
    position, so plain lists work and pandas index labels play no part in
    the comparison.

    Args:
        y_true: Actual labels.
        y_pred: Predicted labels, in the same order as ``y_true``.

    Returns:
        Number of matching positions, as a Python ``int``.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    return int(np.count_nonzero((predicted == 0) & (actual == 0)))

def fn(y_true, y_pred):
    """Count false negatives: predicted ``0`` where the actual label is ``1``.

    Both inputs are converted with ``np.asarray`` and compared position by
    position, so plain lists work and pandas index labels play no part in
    the comparison.

    Args:
        y_true: Actual labels.
        y_pred: Predicted labels, in the same order as ``y_true``.

    Returns:
        Number of matching positions, as a Python ``int``.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    return int(np.count_nonzero((predicted == 0) & (actual == 1)))

def revenue_score(y_true, y_pred):
    """Total revenue implied by a set of predictions.

    Every true positive contributes ``TP_REVENUE`` and every false positive
    contributes ``FP_REVENUE``; true and false negatives contribute nothing.

    Args:
        y_true: Actual labels.
        y_pred: Predicted labels.

    Returns:
        ``tp * TP_REVENUE + fp * FP_REVENUE``.
    """
    from_hits = tp(y_true, y_pred) * TP_REVENUE
    from_false_alarms = fp(y_true, y_pred) * FP_REVENUE
    return from_hits + from_false_alarms

def opportunity_cost(y_true, y_pred):
    """Revenue of the predictions with missed positives charged as well.

    Same sum as ``revenue_score`` (true positives weighted by ``TP_REVENUE``,
    false positives by ``FP_REVENUE``), plus ``FN_COST`` for every false
    negative.

    Args:
        y_true: Actual labels.
        y_pred: Predicted labels.

    Returns:
        ``tp * TP_REVENUE + fp * FP_REVENUE + fn * FN_COST``.
    """
    realized = tp(y_true, y_pred) * TP_REVENUE + fp(y_true, y_pred) * FP_REVENUE
    missed = fn(y_true, y_pred) * FN_COST
    return realized + missed

In [None]:
# Register the custom scorers with pycaret so they show up in score grids
# and can be used for sorting/optimising.
# Each row: (metric id, display name, score function, greater_is_better).
_CUSTOM_METRICS = (
    ('p_count', 'P', p_count, True),
    ('n_count', 'N', n_count, True),
    ('revenue_score', 'Total Revenue', revenue_score, True),
    ('opportunity_cost', 'Opportunity Cost', opportunity_cost, True),
    ('tp', 'TP', tp, True),
    ('fp', 'FP', fp, False),
    ('tn', 'TN', tn, True),
    ('fn', 'FN', fn, False),
)

for metric_id, display_name, scorer, higher_is_better in _CUSTOM_METRICS:
    add_metric(
        id=metric_id,
        name=display_name,
        score_func=scorer,
        target='pred',
        greater_is_better=higher_is_better,
        multiclass=False,
    )

In [None]:
# Cross-validate pycaret's available classifiers and keep the three best,
# ranked by the custom 'revenue_score' metric registered above.
top3 = compare_models(n_select=3, sort='revenue_score')

In [None]:
def _sanity_check_all_ones_revenue(P, N):
    """Revenue obtained when every sample is predicted ``1``.

    With all-ones predictions each of the ``P`` actual positives is a true
    positive and each of the ``N`` actual negatives is a false positive.

    Args:
        P: Number of actual positives.
        N: Number of actual negatives.

    Returns:
        ``P * TP_REVENUE + N * FP_REVENUE``.
    """
    from_positives = P * TP_REVENUE
    from_negatives = N * FP_REVENUE
    return from_positives + from_negatives

In [None]:
# Baseline revenue for an "always predict 1" model.
# NOTE(review): the four counts are hard-coded - presumably copied from an
# earlier score grid or confusion matrix; confirm they match the current data.
_sanity_check_all_ones_revenue(P = 136 + 157, N=268+2639)

In [None]:
# Train and cross-validate a Gradient Boosting Classifier ('gbc' is pycaret's model ID).
gbc = create_model('gbc')

In [None]:
# No data argument is given, so pycaret scores the model on its hold-out set.
gbc_res = predict_model(gbc)

In [None]:
# Hyperparameter search for gbc with pycaret's default search library,
# selecting the candidate with the best 'revenue_score'.
sktuned_gbc = tune_model(
    gbc,
    fold=8,  # cross-validation folds
    n_iter=10,  # number of candidate configurations tried
    optimize='revenue_score',
)

In [None]:
sktuned_gbc

In [None]:
# Score the tuned gbc on the hold-out set (no data argument given).
sktuned_gbc_res = predict_model(sktuned_gbc)

## AdaBoost

In [None]:
# Train and cross-validate an AdaBoost classifier ('ada' is pycaret's model ID).
ada = create_model('ada')

In [None]:
# No data argument is given, so pycaret scores the model on its hold-out set.
ada_res = predict_model(ada)

In [None]:
# Hyperparameter search for ada with pycaret's default search library,
# selecting the candidate with the best 'revenue_score'.
sktuned_ada = tune_model(
    ada,
    fold=10,  # cross-validation folds
    n_iter=15,  # number of candidate configurations tried
    optimize='revenue_score',
)

In [None]:
sktuned_ada

In [None]:
# Score the tuned ada on the hold-out set (no data argument given).
sktuned_ada_res = predict_model(sktuned_ada)

## LGBM

In [None]:
# Train and cross-validate a LightGBM classifier ('lightgbm' is pycaret's model ID).
lgbm = create_model('lightgbm')

In [None]:
# No data argument is given, so pycaret scores the model on its hold-out set.
lgbm_res = predict_model(lgbm)

In [None]:
# Hyperparameter search for lgbm with pycaret's default search library,
# selecting the candidate with the best 'revenue_score'.
sktuned_lgbm = tune_model(
    lgbm,
    fold=10,  # cross-validation folds
    n_iter=10,  # number of candidate configurations tried
    optimize='revenue_score',
)

In [None]:
# Score the tuned lgbm on the hold-out set (no data argument given).
sktuned_lgbm_res = predict_model(sktuned_lgbm)

## ET

In [None]:
# Train and cross-validate an Extra Trees classifier ('et' is pycaret's model ID).
et = create_model('et')

In [None]:
# No data argument is given, so pycaret scores the model on its hold-out set.
et_res = predict_model(et)

In [None]:
# Hyperparameter search for et with pycaret's default search library,
# selecting the candidate with the best 'revenue_score'.
sktuned_et = tune_model(
    et,
    fold=10,  # cross-validation folds
    n_iter=10,  # number of candidate configurations tried
    optimize='revenue_score',
)

In [None]:
# Score the tuned et on the hold-out set (no data argument given).
sktuned_et_res = predict_model(sktuned_et)

## Blending models

In [None]:
# Combine the three models selected by compare_models() into one voting ensemble.
blender_top3 = blend_models(top3)

In [None]:
# Score the blended ensemble on the hold-out set (no data argument given).
blender_res = predict_model(blender_top3)

In [None]:
blender_res

In [None]:
# Tune the first of the three models returned by compare_models(),
# again optimising 'revenue_score'.
sktuned_m0 = tune_model(
    top3[0],
    fold=8,  # cross-validation folds
    n_iter=10,  # number of candidate configurations tried
    optimize='revenue_score',
)

In [None]:
# Second tuning run for gbc, this time searching with Optuna instead of the
# default search library; otherwise the same folds, budget and target metric
# as sktuned_gbc.
tuned_gbc = tune_model(
    gbc,
    fold=8,  # cross-validation folds
    n_iter=10,  # number of candidate configurations tried
    optimize='revenue_score',
    search_library='optuna',
    # NOTE(review): pycaret documents early stopping as ignored for estimators
    # without partial_fit - confirm whether this has any effect for gbc.
    early_stopping='Hyperband',
)

In [None]:
tuned_gbc

In [None]:
# Score the Optuna-tuned gbc on the hold-out set (no data argument given).
tuned_gbc_res = predict_model(tuned_gbc)