In [None]:
!aws codeartifact login --tool pip --domain cirrus-ml-ds-domain --domain-owner 813736554012 --repository cirrus-ml-ds-shared-repo
!pip install awswrangler --quiet
!pip install lightgbm --quiet
!pip install category_encoders --quiet
!pip install imbalanced-learn --quiet


In [None]:
import boto3
import awswrangler as wr
import pandas as pd
from lightgbm import LGBMClassifier
import numpy as np

boto3.setup_default_session(region_name="eu-north-1")

In [None]:
# Load the whole mock `master` table from Athena (SELECT * with no filter).
df = wr.athena.read_sql_query('SELECT * FROM "customerone_mock_data_rl"."master";', 
                              database="customerone_mock_data_rl",
                              workgroup="dev-athena-workgroup",
                             )
# Alternative workgroup kept for reference (currently unused):
#                               workgroup="dev-ds-athena-workgroup",
# Show the loaded frame as the cell output.
df

In [None]:
# 74 columns
# [print(f"\n** {col}:\n{df[col].value_counts(dropna=False)} \n") for col in df.columns];

# 0- configurations

In [None]:
target_col_name = 'tgt_xsell_cust_voice_to_fixed'


# 1- sample_training_df()

In [None]:
# Keep only rows with a known target; reuse `target_col_name` from the configuration
# cell instead of repeating the column name as a literal.
df_training = df[df[target_col_name].notnull()]

In [None]:
df_training

In [None]:
# drop_invalid_features


# 6 & 8- splitting

In [None]:
# {{model}}.training.splitting_params:
#   splitter: customerone.pipelines.telco.test_train_splitter.OutOfTimeSplitter
#   splitter_args: # Arguments to be passed into the splitter, see the splitter docs for details
#     n_splits: 1 # The n_splits argument controls the number of iterations
#     test_size: 0.2
#     date_col: "current_dt"
#     verbose: 1
#     random_state: 42
#   group_col_name: "customer_id" # column to be used as the group vector, if the selected splitter uses the group parameter. See the splitter docs for details
#   iteration_col_name: iteration_id # Name of the iteration ID column to be added
#   split_col_name: split # Name of the TRAIN/TEST/etc. split column

# {{model}}.training.subsplitting_params:
#   splitter: sklearn.model_selection.StratifiedShuffleSplit
#   splitter_args: # Arguments to be passed into the splitter, see the splitter docs for details
#     test_size: 0.1
#     random_state: 42 # n_splits is fixed to be 1 for sub-splitters. Any higher value will be ignored
#   source_split: TRAIN # The set to split
#   target_splits: # The two sets to split the source set into. Must be two targets
#     - TRAIN
#     - CAL

In [None]:
# Added to Sagemaker

from utils_split import make_splits, split_data, make_subsplit

In [None]:
# Added to Sagemaker
# NOTE(review): `df_training_idx` is not defined in any cell visible in this part of
# the notebook, so this cell fails on a fresh kernel (Restart & Run All). The shapes
# recorded in the following cells suggest it is `df_training` plus two extra columns
# (presumably `split` and `iteration_id`, still empty) — TODO confirm and restore the
# cell that builds it.
split_train_test = make_splits(df_training_idx, target_col_name)

In [None]:
print(f"{df_training.shape}, {df_training_idx.shape}, {split_train_test.shape}")
# (5896, 74), (5896, 76), (5896, 76)

In [None]:
split_train_test['split'].value_counts(dropna=False)
# TRAIN    4654
# TEST     1242
# Name: split, dtype: int64

In [None]:
df_training_idx['split'].value_counts(dropna=False)
# NaN    5896
# Name: split, dtype: int64

In [None]:
split_train_test['iteration_id'].value_counts(dropna=False)


In [None]:
# Added to Sagemaker

# Sub-split settings: the TRAIN split is divided again into TRAIN and CAL.
splitter_args = dict(test_size=0.1, random_state=42)
subsplit_params = {
    "iteration_col_name": "iteration_id",
    "split_col_name": "split",
    "source_split": "TRAIN",
    "target_splits": ["TRAIN", "CAL"],
}

split_train_test_cal = make_subsplit(
    split_train_test,
    subsplit_params,
    splitter_args,
    target_col_name,
)
split_train_test_cal

In [None]:
# split_train_test['split'].value_counts(dropna=False)
# TRAIN    4654
# TEST     1242
split_train_test_cal['split'].value_counts(dropna=False)

# 7- feature types

In [None]:
# added to sagemaker
from utils_determine_feature_type import determine_feature_data_types

# Spine configuration handed to determine_feature_data_types().
spine_params_determine_feature_data_types = {
    'keys': ['customer_id'],
    'date_column': 'current_dt',
    'product_holdings_filter': {'product_category': 'fixedbroadband'},
    'is_deepsell': 'N',
}

# Target configuration handed to determine_feature_data_types().
target_params_determine_feature_data_types = {
    'target_variable_column': 'tgt_xsell_cust_voice_to_fixed',
    'lead_time_window': '1d',
    'target_window': '45d',
    'product_activation_filter': {'product_sub_category': 'voice'},
    'campaign_keys': ['customer_id'],
    'campaign_filter': {
        'campaign_name': [
            "C-452-O-06 Korsförsäljning Telia Life 2.0 - Sälja mobilt",
            "b2c_cross-sell_pp_crossSellPpToBb",
            "b2c_cross-sell_pp_crossSellPpToBb_oldContent",
            "b2c_cross-sell_pp_crossSellPpToBb_REM1",
            "C-700-O-03 Cross-sell Mobile to Broadband customers (TM only)",
            "C-700-O-01 Cross-sell Mobile to Broadband customers (A)",
            "C-752-O GEOF 2021 - X-sell PP",
            "C-652-O Black Friday Erbjudande 2019  Mobilt till BB-kund - activity 1",
            "C-700-O-02 Cross-sell Mobile to Broadband customers (B)",
            "b2c_cross-sell_Pp_PpToBb_default",
            "C-752-O GEOF 2021 - Xsell PP",
            "b2c_cross-sell_Pp_PpToBb_simOnly",
            "b2c_cross-sell_Pp_PpToBb_samS215G",
            "b2c_cross-sell_Pp_PpToBb_iphone12Mini",
            "b2c_cross-sell_Pp_PpToBb_iphoneSE",
            "b2c_cross-sell_Pp_PpToBb_iphone12",
            "b2c_cross-sell_Pp_PpToBb_samS20FE5G",
            "b2c_cross-sell_Pp_PpToBb_sonyXp10lll",
            "b2c_cross-sell_Pp_PpToBb_default_short_8pm",
            "b2c_cross-sell_Pp_PpToBb_default_8pm",
        ],
        'customer_actioned_flg_column': {'Email': 'actioned_ind'},
    },
}

# Classify the training columns; the result is indexed below by 'numeric' and 'categorical'.
feature_dict = determine_feature_data_types(
    df_training,
    spine_params_determine_feature_data_types,
    target_params_determine_feature_data_types,
)
print(
    f"len(feature_dict)={len(feature_dict)}, \n"
    f"len(feature_dict['numeric'])={len(feature_dict['numeric'])}, \n"
    f"len(feature_dict['categorical'])={len(feature_dict['categorical'])}"
)
feature_dict
# actual data: len(categorical)=7 , len(numerical)=150

# 10- preprocessing_pipeline_step

In [None]:
#                 node(
#                     func=preprocessing_pipeline_step,
#                     inputs={
#                         "params": f"{param_str}.preprocessing",
#                         "categorical": f"{model_name}.in_memory.categorical_features",
#                         "numerical": f"{model_name}.in_memory.numerical_features",
#                     },
#                     outputs=f"{catalog_str}.preprocessing_step",
#                     name=f"define_preprocessing",
#                     tags=["training: model"],
#                 )
            

# categorical_features = {'categorical': {'impute': {'class': 'customerone_lib.common.pkgs.predictive_modeling.preprocessing.PandasSimpleImputer',
#                                                     'args': {'strategy': 'most_frequent'}},
#                                         'encoder': {'class': 'category_encoders.target_encoder.TargetEncoder'}}}

# numerical_features = {'numerical':{'imputer': {'class': 'customerone_lib.common.pkgs.predictive_modeling.preprocessing.PandasSimpleImputer',
#                                                'args': {'strategy': 'mean'}}}}

# params_preprocessing = {'categorical': {'impute': PandasSimpleImputer(strategy='most_frequent'),
#                                         'encoder': TargetEncoder()}, 
#                         'numerical': {'imputer': PandasSimpleImputer()}}

%load_ext autoreload 
%autoreload 2
from utiles_preprocessing import preprocessing_pipeline_step


preprocessing_step = preprocessing_pipeline_step(feature_dict['categorical'], feature_dict['numeric'])
preprocessing_step

# before vs after imputation?!

# 11 & 2 estimator

In [None]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
# from sklearn.pipeline import Pipeline

def model_estimator(**kwargs):
    """Build the default training pipeline: SMOTE over-sampling followed by LightGBM.

    Called without arguments this returns a pipeline with the two steps
    `('over_sampler', SMOTE(random_state=42))` and `('estimator', LGBMClassifier())`.
    One additional step can be spliced in by passing all three keyword
    arguments listed below; any other keyword arguments are ignored.

    Args:
        index: Position at which the extra step is inserted (`list.insert`
            semantics, so `0` puts it first).
        step_name: Name of the extra step inside the pipeline.
        step: The transformer/estimator object for the extra step.

    Returns:
        An unfitted `Pipeline` with the default steps plus the optional extra step.

    Raises:
        TypeError: If keyword arguments are given but `index`, `step_name` or
            `step` is missing. The original code failed later with an opaque
            `list.insert(None, ...)` error in that case.
    """
    steps = [
        ('over_sampler', SMOTE(random_state=42)),
        ('estimator', LGBMClassifier()),
    ]

    if kwargs:
        missing = [key for key in ('index', 'step_name', 'step') if key not in kwargs]
        if missing:
            raise TypeError(
                f"model_estimator() is missing keyword argument(s) {missing}; "
                "pass 'index', 'step_name' and 'step' together to add a step."
            )
        # Build the full step list first so the pipeline is never mutated after construction.
        steps.insert(kwargs['index'], (kwargs['step_name'], kwargs['step']))

    return Pipeline(steps=steps)

In [None]:
# estimator_params = {'pipeline': 'imblearn.pipeline.Pipeline', 
#                     'args': {'steps': {'over_sampler': {'class': 'imblearn.over_sampling.SMOTE', 
#                                                         'args': {'random_state': 42}}, 
#                                        'estimator': {'class': 'lightgbm.LGBMClassifier'}}}}
add_step = {'step':preprocessing_step, 'index': 0, 'step_name':'preprocessing'}
estimator = model_estimator(**add_step)
estimator

In [None]:
#   estimator = Pipeline(steps=[('preprocessing',
#                  ColumnTransformer(transformers=[('categorical',
#                                                   Pipeline(steps=[('impute',
#                                                                    PandasSimpleImputer(strategy='most_frequent')),
#                                                                   ('encoder',
#                                                                    TargetEncoder())]),
#                                                   ['dmgrphc_b_gender_typ',
#                                                    'dmgrphc_b_habits_desc_txt',
#                                                    'dmgrphc_b_age_bucket_txt',
#                                                    'dmgrphc_b_city_txt',
#                                                    'dmgrphc_b_lifestyle_desc_txt',
#                                                    'rev_m_bill_shock_eom_total_bill_amt...
#                                                    'product_holding_mobilebroadbandsubscription_active_30_to_60_days_avg',
#                                                    'product_holding_fixedbroadband_active_0_to_30_over_30_to_60_days_avg',
#                                                    'ci_d_sum_cmpl_over_contacts_in_0_30_days',
#                                                    'product_holding_fixedbroadband_closed_0_to_30_over_30_to_60_days_avg',
#                                                    'num_distinct_campaign_last_30_days', ...])])),
#                 ('over_sampler', SMOTE(random_state=42)),
#                 ('estimator', LGBMClassifier())])   


# node(
#     func=model_estimator,
#     inputs={
#         "params": f"{param_str}.estimator",
#     },
#     outputs=f"{model_name}.in_memory.estimator",
#     name=f"define_estimator",
#     tags=["training: model"],
# ),
# node(
#     func=partial(add_step, step_name="preprocessing"),
#     inputs={
#         "pipeline": f"{model_name}.in_memory.estimator",
#         "step": f"{catalog_str}.preprocessing_step"
#     },
#     outputs=f"{catalog_str}.estimator",
#     name=f"add_preprocessing_step_to_estimator",
#     tags=["training: model"],
# )
        
        
# {{model}}.training.estimator:
#   pipeline: imblearn.pipeline.Pipeline
#   args:
#     steps:
#       over_sampler: 
#         class: imblearn.over_sampling.SMOTE
#         args:
#           random_state: 42
#       estimator: 
#         class: lightgbm.LGBMClassifier        


# 12- tune_parameters

In [None]:
# line features is already added to sagemaker!!

%load_ext autoreload 
%autoreload 2
from utils_tune_parameters import tune_parameters


# Hyper-parameter search configuration passed to tune_parameters().
# Because the estimator is a pipeline, every key in `param_distributions` is
# prefixed with the step name: '{step_name}__{parameter}'.
tuning_params = {'tuner': 'sklearn.model_selection.RandomizedSearchCV', 
                 'tuner_args': {'scoring': 'roc_auc', 'verbose': 2, 'n_iter': 10, 
                                'param_distributions': {'preprocessing__categorical__encoder__min_samples_leaf': [100, 500, 1000, 5000, 10000],
                                                         # smoothing values must be floats
                                                         'preprocessing__categorical__encoder__smoothing': [1.0, 5.0, 10.0, 25.0], 
                                                         # 'not minority' acts as a 'passthrough' for SMOTE
                                                         'over_sampler__sampling_strategy': [0.5, 0.35, 0.25, 0.15, 0.1, 'not minority'], 
                                                         'estimator__boosting_type': ['gbdt', 'dart'], 
                                                         'estimator__num_leaves': [20, 31, 50, 100], 
                                                         'estimator__min_child_samples': [100, 250, 500, 1000], 
                                                         'estimator__min_child_weight': [0.01, 0.001], 
                                                         'estimator__max_depth': [5, 10, 15, -1], 
                                                         'estimator__learning_rate': [0.2, 0.15, 0.1, 0.05, 0.01], 
                                                         'estimator__n_estimators': [50, 100, 150], 
                                                         'estimator__colsample_bytree': [0.8, 1.0], 
                                                         'estimator__subsample': [0.8, 1.0], 
                                                         'estimator__subsample_freq': [1], 
                                                         'estimator__reg_alpha': [0.0, 0.1],
                                                         'estimator__reg_lambda': [0.0, 0.1],
                                                         'estimator__min_split_gain': [0.0, 0.1],
                                                         # empty list [] acts as None: all classes get weight one
                                                         'estimator__class_weight': ["balanced", []],
                                                        }}, 
                 # Cross-validation splitter used inside the search, and its arguments.
                 'cv': 'test_train_splitter.OutOfTimeSplitter', 
                 'cv_args': {'n_splits': 3, 'test_size': 0.45, 'date_col': 'current_dt', 'random_state': 42, 'verbose': 1}}

# All model input columns: categorical features first, then numeric ones.
features = feature_dict['categorical'] + feature_dict['numeric']

# Splitter settings plus the names of the group / iteration / split columns.
# NOTE(review): presumably these must match what make_splits() used to build
# `split_train_test_cal` — TODO confirm.
splitting_params = {'splitter': 'test_train_splitter.OutOfTimeSplitter', 
                    'splitter_args': {'n_splits': 1, 'test_size': 0.2, 'date_col': 'current_dt', 'verbose': 1, 'random_state': 42}, 
                    'group_col_name': 'customer_id', 
                    'iteration_col_name': 'iteration_id', 
                    'split_col_name': 'split'}

# NOTE(review): `target` repeats the value of `target_col_name` from the configuration
# cell; later cells pass `target`, this cell passes `target_col_name`. Consider keeping one.
target = 'tgt_xsell_cust_voice_to_fixed'

# Returns the tuned parameter set(s) and the best tuning score.
tuned_parameters, best_tuning_score = tune_parameters(split_train_test_cal, estimator, tuning_params, features, splitting_params, target_col_name)


In [None]:
# node(
#     func=tune_parameters,
#     inputs={
#         "data": f"{catalog_str}.split_train_test_cal",
#         "estimator": f"{catalog_str}.estimator",
#         "tuning_params": f"{param_str}.tuning_params",
#         "features": f"{model_name}.in_memory.features",
#         "splitting_params": f"{param_str}.splitting_params",
#         "target_col_name": f"{param_str}.target",
#     },
#     outputs=[
#         f"{catalog_str}.tuned_parameters",
#         f"{catalog_str}.best_tuning_score"
#     ],
#     name=f"tune_model_params_for_training",
#     tags=["training: model"],
# )
    
    
# # No features are specified for this model as they are found automatically in feature selection
# {{model}}.training.tuning_params:
#   tuner: sklearn.model_selection.RandomizedSearchCV
#   tuner_args:
#     scoring: roc_auc
#     verbose: 2
#     n_iter: 10
#     # if estimator is a sklearn/imblearn pipeline, then the param_distributions 
#     # needs to be prefixed with step name, e.g '{step_name}__{parameter}'
#     param_distributions:
#       preprocessing__categorical__encoder__min_samples_leaf: [100, 500, 1000, 5000, 10000]
#       preprocessing__categorical__encoder__smoothing: [1.0, 5.0, 10.0, 25.0] # float is needed
#       # 'not minority' acts as a 'passthrough' for SMOTE
#       over_sampler__sampling_strategy: [0.5, 0.35, 0.25, 0.15, 0.1, "not minority"]
#       estimator__boosting_type: ["gbdt", "dart"]
#       estimator__num_leaves: [20, 31, 50, 100]
#       estimator__min_child_samples: [100, 250, 500, 1000]
#       estimator__min_child_weight: [0.01, 0.001]
#       estimator__max_depth: [5, 10, 15, -1]
#       estimator__learning_rate: [0.2, 0.15, 0.1, 0.05, 0.01]
#       estimator__n_estimators: [50, 100, 150]
#       estimator__colsample_bytree: [0.8, 1.0]
#       estimator__subsample: [0.8, 1.0]
#       estimator__subsample_freq: [1]
#       estimator__reg_alpha: [0.0, 0.1]
#       estimator__reg_lambda: [0.0, 0.1]
#       estimator__min_split_gain: [0.0, 0.1]
#       estimator__class_weight: ["balanced", []] # empty list [] acts as None, all classes gets weight one
#   cv: customerone.pipelines.telco.test_train_splitter.OutOfTimeSplitter
#   cv_args:
#     n_splits: 3
#     test_size: 0.45
#     date_col: current_dt
#     random_state: 42
#     verbose: 1    

        
        
   

In [None]:
tuned_parameters

In [None]:
best_tuning_score

# 13- fit_model_for_training

In [None]:
%load_ext autoreload 
%autoreload 2
from utils_train_model import train_model


# Only the first tuned parameter set is used below, so refuse to continue when
# there is more than one. The original built the ValueError without `raise`,
# which made this check a silent no-op.
if len(tuned_parameters) > 1:
    raise ValueError("Sarah: There are more than one tuned_parameters available!!!! Can't start train_model()")
fit_models = train_model(split_train_test_cal, tuned_parameters[0], features, splitting_params, target_col_name, estimator)
fit_models


In [None]:
# node(
#     func=train_model,
#     inputs={
#         "data": f"{catalog_str}.split_train_test_cal",
#         "estimator": f"{catalog_str}.estimator",
#         "model_params": f"{catalog_str}.tuned_parameters",
#         "features": f"{model_name}.in_memory.features",
#         "splitting_params": f"{param_str}.splitting_params",
#         "target_col_name": f"{param_str}.target",
#     },
#     outputs=f"{catalog_str}.fit_models",
#     name=f"fit_model_for_training",
#     tags=["training: model"],
# )

In [None]:
len(fit_models)

# 14- calibrate classifiers

In [None]:
from imblearn.pipeline import Pipeline as imblearn_pipeline
from sklearn.pipeline import Pipeline as sklearn_pipeline
from sklearn.calibration import CalibratedClassifierCV

def calibrate_classifiers(data, trained_models, splitting_params, calibration_params, target_col_name):
    """Calibrate prefit sklearn-compatible models, one per fold, with `CalibratedClassifierCV`.

    `splitting_params` may contain the following key value pairs:
        `iteration_col_name`: Name of the column holding the fold id.
            Default is `iteration_id`
        `split_col_name`: Name of the column holding the TRAIN/TEST/CAL split.
            Default is `split`

    `calibration_params` may contain the following optional key value pairs:
        `method`: Calibration method, `sigmoid` or `isotonic`. Default is `sigmoid`
        `calibration_split_name`: Split on which the model is calibrated.
            Default is `CAL`

    Args:
        data: A dataframe containing features, target, fold and split information
        trained_models: Mapping from fold id to a dict describing the prefit model.
            The keys `fitted_model` and `feature_col_names` are read here.
        splitting_params: Describes where fold and splitting information are located
        calibration_params: Describes which method and split to use; `None` means
            all defaults
        target_col_name: Name of the target column used to fit the calibrator

    Returns:
        The same `trained_models` mapping that was passed in, updated in place:
        every fold's entry gains a `calibrated_model` key holding the fitted
        calibrator for that fold.
    """
    calibration_params = {} if calibration_params is None else calibration_params
    iteration_col_name = splitting_params.get("iteration_col_name", "iteration_id")
    split_col_name = splitting_params.get("split_col_name", "split")

    calibration_method = calibration_params.get("method", "sigmoid")
    calibration_split_name = calibration_params.get("calibration_split_name", "CAL")

    # The target column name is forwarded explicitly so the per-fold helper does
    # not depend on a notebook-level global with the same name.
    calibrated_by_fold = (
        data.groupby(iteration_col_name)
        .apply(
            _calibrate_models_per_fold,
            trained_models,
            calibration_method,
            calibration_split_name,
            split_col_name,
            target_col_name,
        )
        .to_dict()
    )

    for iteration_id, calibrator in calibrated_by_fold.items():
        trained_models[iteration_id]['calibrated_model'] = calibrator
    print(f"len(trained_models) = {len(trained_models)}")
    print(f"trained_models.keys = {trained_models.keys()}")
    return trained_models


def _calibrate_models_per_fold(partition, trained_models, calibration_method, calibration_split_name, split_col_name, target_col_name=None):
    """Calibrate the prefit model of a single fold with `CalibratedClassifierCV`.

    Side effect: when the fitted model is a pipeline, its last step is replaced
    in place by the fitted calibrator, so the pipeline itself produces calibrated
    scores afterwards. Running this twice on the same pipeline therefore wraps
    the already calibrated step again.

    Args:
        partition: Rows of one fold; `partition.name` is the fold id
        trained_models: Mapping from fold id to the prefit model dict
        calibration_method: `sigmoid` or `isotonic`
        calibration_split_name: Value of the split column marking calibration rows
        split_col_name: Name of the split column
        target_col_name: Name of the target column. When `None`, the
            `target_col_name` entry of the model dict is used instead.

    Returns:
        The fitted calibrator. In the pipeline case it wraps only the final
        estimator and expects features already transformed by the first step.

    Raises:
        ValueError: If no target column name can be determined or the fold has
            no rows in the calibration split.
    """
    iteration_id = partition.name
    prefit_model = trained_models[iteration_id]

    fitted_model = prefit_model.get("fitted_model")
    feature_col_names = prefit_model.get("feature_col_names")
    _check_is_fitted(fitted_model)

    if target_col_name is None:
        target_col_name = prefit_model.get("target_col_name")
    if target_col_name is None:
        raise ValueError(
            f"No target column name given for iteration {iteration_id!r} and the "
            "model entry has no 'target_col_name'."
        )

    data_cal = partition.loc[partition[split_col_name] == calibration_split_name]
    if data_cal.empty:
        raise ValueError(
            f"Iteration {iteration_id!r} has no rows where {split_col_name!r} == "
            f"{calibration_split_name!r}; cannot calibrate."
        )
    X_cal = data_cal[feature_col_names]
    y_cal = data_cal[target_col_name].astype(int)

    # The estimator is passed positionally because the keyword was renamed from
    # `base_estimator` to `estimator` in newer scikit-learn releases.
    if isinstance(fitted_model, (imblearn_pipeline, sklearn_pipeline)):
        step_name, final_estimator = fitted_model.steps[-1]
        preprocessor = fitted_model.steps[0][1]
        calibrator = CalibratedClassifierCV(final_estimator, cv="prefit", method=calibration_method)
        # Only the first step is applied to the calibration features here; any
        # intermediate step (e.g. the over-sampler) is skipped.
        calibrated_model = calibrator.fit(preprocessor.transform(X_cal), y_cal)
        fitted_model.steps[-1] = (step_name, calibrator)
    else:
        calibrator = CalibratedClassifierCV(fitted_model, method=calibration_method, cv="prefit")
        calibrated_model = calibrator.fit(X_cal, y_cal)

    return calibrated_model


def _check_is_fitted(fitted_model):
    """Checks whether the model has been fitted already."""
    if fitted_model is None:
        raise NotFittedError("This estimator has not been fitted yet.")

In [None]:
print(f"Before calibrate_classifiers(): \nlen(fit_models) = {len(fit_models)}, keys={len(fit_models[0].keys())}, and keys are: \n {fit_models[0].keys()}")

In [None]:
calibration_params = {'method': 'sigmoid', 'calibration_split_name': 'CAL'}

calibrated_model = calibrate_classifiers(split_train_test_cal, fit_models, splitting_params, calibration_params, target)


In [None]:
print(f"After calibrate_classifiers(): \nlen(fit_models) = {len(fit_models)}, keys={len(fit_models[0].keys())}, and keys are: \n {fit_models[0].keys()}")

In [None]:
calibrated_model

In [None]:
# node(
#     func=calibrate_classifiers,
#     inputs={
#         "data": f"{catalog_str}.split_train_test_cal",
#         "trained_models": f"{catalog_str}.fit_models",
#         "splitting_params": f"{param_str}.splitting_params",
#         "calibration_params": f"{param_str}.calibration_params",
#     },
#     outputs=f"{catalog_str}.calibrated_model",
#     name=f"calibrate_model_for_training",
#     tags=["training: model"],
# )

# 16- get_model_predictions

# <i>Note: calibrated_model is the same object as fit_models</i>


In [None]:
def get_model_predictions(data, trained_models, splitting_params, inference_params, target_col_name):
    """Make predictions for each fold given a dictionary of trained models.

    Predictions are saved into a new column named target + `prediction_suffix`.
    If the fitted model has a `predict_proba` method, prediction scores are
    stored as well: for up to two classes the score of the last class goes into
    target + `prediction_proba_suffix`; for more than two classes one column per
    class is added, named target + `prediction_proba_suffix` + "_" + class.

    `inference_params` may contain the following key value pairs:
        `prediction_suffix`: (Optional) Suffix appended to target to form the
            prediction column name. Default is `_pred`
        `prediction_proba_suffix`: (Optional) Suffix appended to target to form
            the prediction score column name. Default is `_pred_score`

    `splitting_params` may contain the following key value pairs:
        `iteration_col_name`: Name of the column holding the fold id.
            Default is `iteration_id`

    Args:
        data: Data containing fold id and features
        trained_models: Mapping from fold id to a model dict with the keys
            `fitted_model`, `feature_col_names` and (normally) `target_col_name`
        splitting_params: Describes in which column the iteration id resides
        inference_params: (Optional) Suffixes to use for the prediction columns
        target_col_name: Fallback target name, used only for folds whose model
            dict has no `target_col_name` entry

    Returns:
        The result of applying the per-fold prediction to every fold: the input
        rows with the prediction column(s) added. `data` itself is not modified.
    """
    iteration_col_name = splitting_params.get("iteration_col_name", "iteration_id")

    inference_params = inference_params if inference_params else {}
    prediction_suffix = inference_params.get("prediction_suffix", "_pred")
    prediction_proba_suffix = inference_params.get("prediction_proba_suffix", "_pred_score")

    print(f"  iteration_col_name={iteration_col_name}, \n  inference_params={inference_params}\n  prediction_suffix={prediction_suffix} \n  prediction_proba_suffix={prediction_proba_suffix} \n")

    return data.groupby(iteration_col_name).apply(
        _make_predictions_per_fold,
        trained_models,
        prediction_suffix,
        prediction_proba_suffix,
        target_col_name,
    )


def _make_predictions_per_fold(partition, trained_models, prediction_suffix, prediction_proba_suffix, target_col_name=None):
    """Make predictions for one fold and return them on a copy of the fold's rows.

    Args:
        partition: Rows of one fold; `partition.name` is the fold id
        trained_models: Mapping from fold id to the model dict
        prediction_suffix: Suffix for the prediction column
        prediction_proba_suffix: Suffix for the prediction score column(s)
        target_col_name: Fallback target name when the model dict does not
            provide `target_col_name`

    Returns:
        A copy of `partition` with the prediction column and, if the model has
        `predict_proba`, the prediction score column(s) added.

    Raises:
        ValueError: If neither the model dict nor the argument provides a
            target column name.
    """
    iteration_id = partition.name
    model = trained_models[iteration_id]
    fitted_model = model.get("fitted_model")
    feature_col_names = model.get("feature_col_names")

    # The name stored with the model wins (as before); the argument is only a fallback.
    stored_target = model.get('target_col_name')
    if stored_target is not None:
        target_col_name = stored_target
    if target_col_name is None:
        raise ValueError(
            f"No target column name available for iteration {iteration_id!r}: the model "
            "entry has no 'target_col_name' and none was passed in."
        )

    # pandas does not support mutating the group passed to groupby.apply,
    # so all new columns are written to a copy.
    result = partition.copy()
    features = partition[feature_col_names]

    result[target_col_name + prediction_suffix] = fitted_model.predict(features)

    if hasattr(fitted_model, "predict_proba"):
        pred_score_col_name = target_col_name + prediction_proba_suffix
        prediction_score = fitted_model.predict_proba(features)
        classes = fitted_model.classes_

        if len(classes) > 2:
            # Multi-class: one score column per class.
            for i, category in enumerate(classes):
                result[pred_score_col_name + "_" + str(category)] = prediction_score[:, i]
        else:
            # Binary: keep only the score of the last class.
            result[pred_score_col_name] = prediction_score[:, -1]

    return result

In [None]:
calibrated_model[0].keys()

In [None]:
inference_params = {'prediction_suffix': '_pred'}

# calibrated_model is the same as fited_models

predictions = get_model_predictions(split_train_test_cal, calibrated_model, splitting_params, inference_params, target)
predictions

In [None]:
# node(
#     func=get_model_predictions,
#     inputs={
#         "data": f"{catalog_str}.split_train_test_cal",
#         "trained_models": f"{catalog_str}.calibrated_model",
#         "splitting_params": f"{param_str}.splitting_params",
#         "inference_params": f"{param_str}.inference_params",
#     },
#     outputs=f"{catalog_str}.predictions",
#     name=f"prediction_for_training",
#     tags=["training: predictions"],
# )

# 17- get_model_performance

In [None]:
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, f1_score, precision_score, recall_score, confusion_matrix


# pylint: disable=too-many-locals
def get_model_performance(data, metrics_params, splitting_params, target_col_name, inference_params):
# def get_model_performance(data: pd.DataFrame, metrics_params: Mapping[str, Any], splitting_params: Mapping[str, Any], target_col_name: str, inference_params: Mapping[str, Any] = None,) -> pd.DataFrame:
    """Get model performance.

    Saves train/test metrics specified in the parameters file, for individual
    folds and overall to PAI and returns them in a dictionary.

    `metrics_params` expects the following key value pairs:
        `metrics`: sklearn performance metrics
        `tracker`: tracker name for logging metrics

    `splitting_params` expects the following key value pairs:
        `iteration_col_name`: Name of column in which the fold id is to be saved
        `split_col_name`: name of column in which train/test split is given

    Args:
        data: Dataframe containing features, targets, predictions, fold and
            split information
        metrics_params: contains information about metrics to compute and the tracker
            for logging the metrics.
        splitting_params: Describes where fold and splitting info are located
        target_col_name: Name of target column
        inference_params: (Opt) Contains the prediction suffixes.

    Returns:
        Dictionary containing train and test metrics for each fold and for
        overall data.

    """

    iteration_col_name = splitting_params.get("iteration_col_name", "iteration_id")
    split_col_name = splitting_params.get("split_col_name", "split")

    inference_params = inference_params if inference_params else {}

    prediction_suffix = inference_params.get("prediction_suffix", "_pred")
    prediction_proba_suffix = inference_params.get(
        "prediction_proba_suffix", "_pred_score"
    )

    prediction_col_name = target_col_name + prediction_suffix
    pred_score_col_name = target_col_name + prediction_proba_suffix

    model_metrics = {}
    scorers = []

    metric_params = metrics_params["metrics"]
    tracker_name = metrics_params.get("tracker")
    
#     print(f"\titeration_col_name= {iteration_col_name}, \n\tsplit_col_name = {split_col_name}, \n\tinference_params={inference_params},  \n\tprediction_suffix={prediction_suffix}, \n\tprediction_proba_suffix={prediction_proba_suffix} , \n\tpred_score_col_name={pred_score_col_name}, \n\tmetric_params={metric_params}, \n\ttracker_name={tracker_name}")
    
    """Computes the value of the metric for train and test sets."""
    train_index = data[split_col_name] == "TRAIN"
    test_index = data[split_col_name] == "TEST"

    y_train = data.loc[train_index, target_col_name]
    y_pred_train = data.loc[train_index, prediction_col_name]

    y_test = data.loc[test_index, target_col_name]
    y_pred_test = data.loc[test_index, prediction_col_name]
    

    scorer_name = 'roc_auc_score'
#     print(f"\n--------------------- {scorer_name}")
#     roc_auc_score(y_true, y_score, *, average='macro', sample_weight=None, max_fpr=None, multi_class='raise', labels=None)
    train_metric_value_roc = roc_auc_score(y_train.values.astype(int), y_pred_train.values.astype(int))
    test_metric_value_roc = roc_auc_score(y_test.values.astype(int), y_pred_test.values.astype(int))
#     print(f"train_metric_value={train_metric_value_roc}, \t test_metrics_value={test_metric_value_roc}")
    scorers.append(f"train_{scorer_name}")
    scorers.append(f"test_{scorer_name}")
    model_metrics = _append_float_metric(model_metrics,train_metric_value_roc,test_metric_value_roc,target_col_name,"overall",scorer_name)
    
    
    scorer_name = 'balanced_accuracy_score'
#     print(f"\n--------------------- {scorer_name}")
#     balanced_accuracy_score(y_true, y_pred, *, sample_weight=None, adjusted=False)
    train_metric_value_balanced = balanced_accuracy_score(y_train.values.astype(int), y_pred_train.values.astype(int))
    test_metric_value_balanced = balanced_accuracy_score(y_test.values.astype(int), y_pred_test.values.astype(int))
#     print(f"train_metric_value={train_metric_value_balanced}, \t test_metric_value={test_metric_value_balanced}")
    scorers.append(f"train_{scorer_name}")
    scorers.append(f"test_{scorer_name}")
    model_metrics = _append_float_metric(model_metrics,train_metric_value_balanced,test_metric_value_balanced,target_col_name,"overall",scorer_name)
  

    scorer_name = 'f1_score'
#     print(f"\n--------------------- {scorer_name}")
#     f1_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn')
    train_metric_value_f1 = f1_score(y_train.values.astype(int), y_pred_train.values.astype(int))
    test_metric_value_f1 = f1_score(y_test.values.astype(int), y_pred_test.values.astype(int))
#     print(f"train_metric_value={train_metric_value_f1}, \t test_metric_value={test_metric_value_f1}")
    scorers.append(f"train_{scorer_name}")
    scorers.append(f"test_{scorer_name}")
    model_metrics = _append_float_metric(model_metrics,train_metric_value_balanced,test_metric_value_balanced,target_col_name,"overall",scorer_name)
    
    
    scorer_name = 'precision_score'
#     print(f"\n--------------------- {scorer_name}")
#     precision_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn')
    train_metric_value_precision_score = precision_score(y_train.values.astype(int), y_pred_train.values.astype(int))
    test_metric_value_precision_score = precision_score(y_test.values.astype(int), y_pred_test.values.astype(int))
#     print(f"train_metric_value={train_metric_value_precision_score}, \t test_metric_value={test_metric_value_precision_score}")
    scorers.append(f"train_{scorer_name}")
    scorers.append(f"test_{scorer_name}")
    model_metrics = _append_float_metric(model_metrics,train_metric_value_balanced,test_metric_value_balanced,target_col_name,"overall",scorer_name)

    
    scorer_name = 'recall_score'
#     print(f"\n--------------------- {scorer_name}")
#     recall_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn')
    train_metric_value_recall_score = recall_score(y_train.values.astype(int), y_pred_train.values.astype(int))
    test_metric_value_recall_score = recall_score(y_test.values.astype(int), y_pred_test.values.astype(int))
#     print(f"train_metric_value={train_metric_value_recall_score}, \t test_metric_value={test_metric_value_recall_score}")
    scorers.append(f"train_{scorer_name}")
    scorers.append(f"test_{scorer_name}")
    model_metrics = _append_float_metric(model_metrics,train_metric_value_balanced,test_metric_value_balanced,target_col_name,"overall",scorer_name)
    
    
    scorer_name = 'confusion_matrix'
#     print(f"\n--------------------- {scorer_name}")
#     confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None)
    train_metric_value_confusion_matrix = confusion_matrix(y_train.values.astype(int), y_pred_train.values.astype(int))
    test_metric_value_confusion_matrix = confusion_matrix(y_test.values.astype(int), y_pred_test.values.astype(int))
#     print(f"train_metric_value={train_metric_value_confusion_matrix}, \t test_metric_value={test_metric_value_confusion_matrix}")
    score_names = ["true_negative","false_positive","false_negative","true_positive",]
    for name in score_names:
        scorers.append(f"train_{name}")
        scorers.append(f"test_{name}")
    model_metrics = _append_confusion_matrix(model_metrics,train_metric_value_confusion_matrix, test_metric_value_confusion_matrix, target_col_name,"overall",)

    print(f"\n\nmodel_metrics = {model_metrics['overall_tgt_xsell_cust_voice_to_fixed']}")
    print(f"\nscorers = {scorers}")
    print(f"\n\nmodel_metrics = {len(model_metrics['overall_tgt_xsell_cust_voice_to_fixed'])}")
    print(f"\nscorers = {len(scorers)}")    

#     result = (pd.DataFrame.from_dict(model_metrics, orient="index", columns=scorers)
#               .reset_index()
#               .rename(columns={"index": "group"}))
# #     tracking.get_tracker(tracker_name).log_artifacts({"results": result})
#     return result
    return "code is not complete!! "


def _append_float_metric(metrics_dict, train_metric_value, test_metric_value, target_col_name, key_name, scorer_name,):
    metrics_dict = _add_value_to_dict(
        train_metric_value, f"{key_name}_{target_col_name}", metrics_dict
    )
    metrics_dict[f"{key_name}_{target_col_name}"].append(test_metric_value)

    return metrics_dict


def _add_value_to_dict(value, key_name, metric_dict):
    """Checks if a key is in the dictionary and adds the required value."""
    if key_name in metric_dict:
        metric_dict[key_name].append(value)
    else:
        metric_dict[key_name] = [value]

    return metric_dict

def _append_confusion_matrix( metrics_dict, train_metric_value, test_metric_value, target_col_name, key_name):
    if len(train_metric_value) == 2:
    # if train_metric_value.shape[0] == 2:
        i =0 
        for train_value, test_value in zip(
            # train_metric_value.ravel(), test_metric_value.ravel()
            train_metric_value, test_metric_value
        ):
#             print(f"i={i}")
#             i = i + 1
#             print("train_value", train_value)
#             print("test_value", test_value)
            metrics_dict = _add_value_to_dict(
                train_value, f"{key_name}_{target_col_name}", metrics_dict
            )
            metrics_dict[f"{key_name}_{target_col_name}"].append(test_value)

    else:
        raise ValueError(
            "Performance table doesn't support confusion matrices for multi-class "
            "classification"
        )

    return metrics_dict

In [None]:
# Metric configuration: a tracker name plus the dotted paths of the sklearn
# metrics of interest.
metrics_params = {'tracker': 'exp', 
                  'metrics': ['sklearn.metrics.roc_auc_score', 
                              'sklearn.metrics.balanced_accuracy_score', 
                              'sklearn.metrics.f1_score', 
                              'sklearn.metrics.precision_score', 
                              'sklearn.metrics.recall_score', 
                              'sklearn.metrics.confusion_matrix']}
# Suffix appended to the target column name to locate the prediction column.
inference_params = {'prediction_suffix': '_pred'}

# Evaluate `predictions` and display the outcome as the cell output.
performance = get_model_performance(predictions, metrics_params, splitting_params, target, inference_params)
performance

In [None]:
# node(
#     func=get_model_performance,
#     inputs={
#         "data": f"{catalog_str}.predictions",
#         "metrics_params": f"{param_str}.metrics_params",
#         "splitting_params": f"{param_str}.splitting_params",
#         "target_col_name": f"{param_str}.target",
#         "inference_params": f"{param_str}.inference_params",
#     },
#     outputs=f"{catalog_str}.performance",
#     name=f"performance_for_training",
#     tags=["training: performance"],
# )

# Inference

In [None]:
import re

def to_pandas(data, pandas_params, spine_params, target_col, data_dictionary, features, ref_date):
    """Select the export columns of a master table and filter its rows.

    Pandas port of a Spark-based export step. The input frame is never
    modified; a filtered/selected frame is returned.

    Args:
        data: Master table as a pandas DataFrame.
        pandas_params: Mapping read for ``ref_period`` and ``include_target``.
        spine_params: Mapping read for ``keys`` (a column name or list of
            column names) and ``date_column``.
        target_col: Name of the target column (may be falsy to skip target
            handling).
        data_dictionary: Optional object providing ``get_features()`` and
            ``get_target_switch_variable()``. Takes precedence over
            ``features`` when truthy.
        features: Optional list of feature column names.
        ref_date: Optional ``yyyy-mm-dd`` string. When given together with a
            date column, only rows whose date equals that day (at midnight)
            are kept.

    Returns:
        DataFrame restricted to date, key, feature and (optionally) target
        columns that exist in ``data``, in that order and without duplicates,
        with the row filters applied. If neither ``data_dictionary`` nor
        ``features`` is given, all columns are kept.

    Raises:
        AttributeError: If both ``ref_date`` and ``ref_period`` are given.
        ValueError: If ``ref_date`` or the date column cannot be parsed.
        TypeError: If ``ref_date`` has an unsupported type.
    """
    ref_period = pandas_params.get("ref_period")
    include_target = pandas_params.get("include_target")
    key_col = to_list(spine_params.get('keys'))
    date_col = spine_params.get('date_column')

    if ref_date and ref_period:
        raise AttributeError(
            "Cannot specify both ref_date and ref_period at "
            "the same time. Specify only one argument."
        )

    target_switch_col = None

    if data_dictionary or features:
        if data_dictionary:
            feat_col = data_dictionary.get_features()
            target_switch_col = data_dictionary.get_target_switch_variable()
        else:
            feat_col = features

        export_cols = []
        if date_col:
            export_cols.append(date_col)
        if key_col:
            export_cols.extend(key_col)
        export_cols.extend(feat_col)
        if include_target and target_col:
            export_cols.append(target_col)

        # Ignore requested columns that are absent from the frame, and drop
        # repeated names, while keeping the requested column order.
        available_cols = set(data.columns)
        export_cols = [col for col in dict.fromkeys(export_cols) if col in available_cols]

        data = data[export_cols]

    if ref_date and date_col:
        try:
            if not re.match(r"\d{4}-\d{2}-\d{2}", ref_date):
                raise ValueError("Date format should follow yyyy-mm-dd.")

            # Midnight of the reference day; any time component is discarded.
            ref_day = pd.to_datetime(ref_date).normalize()
            # Compare through a temporary Series instead of writing a helper
            # column, so neither the caller's frame nor an existing column is touched.
            row_dates = pd.to_datetime(data[date_col], format='%Y-%m-%d %H:%M:%S')
            data = data[row_dates == ref_day]
        except (ValueError, TypeError) as exception:
            error_cls = TypeError if isinstance(exception, TypeError) else ValueError
            raise error_cls("'ref_date' type casting failed") from exception

    if include_target and target_col:
        # Keep only rows that actually carry a target value.
        data = data[data[target_col].notnull()]

    if include_target and target_switch_col:
        # Mirrors the Spark original `data.filter(~f.col(target_switch_col))`:
        # rows where the switch flag is set are removed.
        # NOTE(review): untested path in this notebook - confirm the flag semantics.
        data = data[~data[target_switch_col].astype(bool)]

    if ref_period and date_col:
        # TODO: port `filter_ref_window(data, ref_period, date_col)`; until then
        # make it visible that the reference window is not applied.
        import warnings

        warnings.warn(
            "ref_period filtering is not implemented in this notebook port; "
            "data is returned without the reference-window filter."
        )

    return data


def to_list(item):
    """
    Wraps an item to a list or returns the existing list

    Args:
        item (Union[Any, List[Any]]): a list or a single object

    Returns:
        item wrapped in a list, if item is not a list. Otherwise the
        original list is returned
    """

    return [item] if not isinstance(item, list) else item

In [None]:
# Inference export: the target column is not included in the exported frame.
pandas_params = {'include_target': False}
# Spine description; to_pandas reads only 'keys' and 'date_column' from it.
spine_params = {'keys': ['customer_id'], 
                'date_column': 'current_dt', 
                'product_holdings_filter': {'exact_match': {'product_category': 'fixedbroadband'}}, 
                'is_deepsell': 'N'}
# Reference day used to filter the rows of the master table.
ref_date = '2021-06-30'
# No data dictionary, so the `features` list (defined in an earlier cell) selects the columns.
data_dictionary = None
inference_master_table = to_pandas(df, pandas_params, spine_params, target, data_dictionary, features, ref_date)
inference_master_table


In [None]:
# node(
#     func=spark_utils.to_pandas,
#     inputs={
#         "data": f"{model_name}.master.mst_tbl",
#         "pandas_params": f"{param_str}.inference_spark_to_pandas",
#         "spine_params": f"params:{model_name}.master.spine",
#         "target_params": f"params:{model_name}.master.target",
#         "features": f"{model_name}.in_memory.features_to_export",
#         "ref_date": "params:ref_date",
#     },
#     outputs=f"{model_name}.in_memory.inference_master_table",
#     name=f"master_table_to_pandas_for_inference",
#     tags=["inference: CDL to edge"],
# ),

# attach_live_iter_split_col

In [None]:
def attach_live_iter_split_col(data, splitting_params):
# def attach_live_iter_split_col(data: pd.DataFrame, splitting_params: Mapping[str, Any]) -> pd.DataFrame:
    """Adds fold and split columns to any dataframe.

    The fold is set to 0, split is set to "LIVE". This is useful for live data from
    e.g. a UI.

    `splitting_params` expects the following keys:
        `iteration_col_name`: Name of column in which the fold id is to be saved
            (defaults to "iteration_id")
        `split_col_name`: Name of column in which the TRAIN/TEST split is saved
            (defaults to "split")

    Args:
        data: Model input data containing spine, features and target
        splitting_params: Dictionary with splitting configuration

    Returns:
        A new pandas dataframe with a fold and split column attached. The
        input dataframe is left unchanged.
    """
    iteration_col_name = splitting_params.get("iteration_col_name", "iteration_id")
    split_col_name = splitting_params.get("split_col_name", "split")

    # Work on a copy: the docstring promises a new frame, and assigning into a
    # filtered slice of another frame would raise SettingWithCopyWarning.
    live_data = data.copy()
    live_data[iteration_col_name] = 0
    live_data[split_col_name] = "LIVE"

    return live_data

In [None]:
# Reference copy of the splitting configuration, kept commented out; the live
# `splitting_params` variable is presumably defined in an earlier cell - verify.
# splitting_params = {'splitter': 'customerone.pipelines.telco.test_train_splitter.OutOfTimeSplitter', 
#                     'splitter_args': {'n_splits': 1, 
#                                       'test_size': 0.2, 
#                                       'date_col': 'current_dt', 
#                                       'verbose': 1, 
#                                       'random_state': 42}, 
#                     'group_col_name': 'customer_id', 
#                     'iteration_col_name': 'iteration_id', 
#                     'split_col_name': 'split'}

# Tag the inference table with iteration id 0 and split "LIVE", then display it.
inference_master_table_live = attach_live_iter_split_col(inference_master_table, splitting_params)
inference_master_table_live

In [None]:
# node(
#     func=attach_live_iter_split_col,
#     inputs={
#         "data": f"{model_name}.in_memory.inference_master_table",
#         "splitting_params": f"{param_str}.splitting_params",
#     },
#     outputs=f"{model_name}.inference.inference_master_table",
#     name=f"add_iteration_id",
#     tags=["inference: CDL to edge"],
# )

# inference predictions 

In [None]:
# Training-time call, kept for comparison:
# predictions = get_model_predictions(split_train_test_cal, calibrated_model, splitting_params, inference_params, target)


# Same prediction step applied to the LIVE-tagged inference table; the result
# is only displayed as the cell output, not stored in a variable.
get_model_predictions(inference_master_table_live, calibrated_model, splitting_params, inference_params, target)


In [None]:
                # Reference pipeline wiring for the inference prediction step, commented out
                # like the other `node(...)` snippets in this notebook: `node`, `add_run_id`,
                # `model_name` and `param_str` are not defined here, and the pasted
                # indentation made the cell fail with an IndentationError.
                # node(
                #     func=get_model_predictions,
                #     inputs={
                #         "data": f"{model_name}.inference.inference_master_table",
                #         "trained_models": f"{model_name}.in_memory.inf_model",
                #         "splitting_params": f"{param_str}.splitting_params",
                #         "inference_params": f"{param_str}.inference_params",
                #     },
                #     outputs=f"{model_name}.in_memory.inf_predictions",
                #     name=f"prediction",
                #     tags=["inference: predictions"],
                # ),
                # node(
                #     func=add_run_id,
                #     inputs={
                #         "data": f"{model_name}.in_memory.inf_predictions",
                #         "run_id": f"{model_name}.training.run_id",
                #     },
                #     outputs=f"{model_name}.inference.inf_predictions",
                #     name="add_run_id_to_predictions",
                #     tags=["inference: predictions"],
                # )