## Imports

In [173]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [174]:
import os
import os.path as op
import shutil

# standard third party imports
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

# impute missing values
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import KNNImputer, IterativeImputer, SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler

In [175]:
# standard code-template imports
from ta_lib.core.api import (
    create_context, get_dataframe, get_feature_names_from_column_transformer, string_cleaning,
    get_package_path, display_as_tabs, save_pipeline, load_pipeline, initialize_environment,
    load_dataset, save_dataset, pd_read_from_gs, DEFAULT_ARTIFACTS_PATH, setanalyse
)
from ta_lib.core.io import convert_delta_to_pandas
from ta_lib.core.api import tracker, start_experiment, update_data_run_id, init_mlflow
import ta_lib.eda.api as eda
from xgboost import XGBRegressor
from ta_lib.regression.api import SKLStatsmodelOLS
from sklearn.ensemble import RandomForestClassifier
from ta_lib.regression.api import RegressionComparison, RegressionReport
from ta_lib.classification.api import ClassificationComparison, ClassificationReport, confusion_matrix_by_feature, SKLStatsmodelLogit 
# from ta_lib.classification.api import metrics_classfication
import ta_lib.reports.api as reports
from ta_lib.data_processing.api import Outlier,WoeBinningTransformer 

initialize_environment(debug=False, hide_warnings=True)

# Initialization

In [176]:
artifacts_folder = DEFAULT_ARTIFACTS_PATH

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [177]:
config_path = op.join('conf', 'config.yml')
context = create_context(config_path)
#Rabi/scoring/notebooks/python/Feature Catalog - features_decision.csv
decision_feature_path = op.join(os.getcwd(),'Feature Catalog - features_decision.csv')
vif_feature_path = op.join(os.getcwd(),'vif_features.csv')

In [178]:
index_keys = ['pop_prospect_household_id','pop_prospect_household_member_no','pop_cust_id']
ignore_cols = ['train_test_ind']
target_col = [f"target_{context.config['model_dev']['target']}"][0]

In [179]:
effort_id = context.config["model_dev"]['effort_id']
usecase = context.config["model_dev"]['use_case']
cohort = context.config["model_dev"]['cohort']
target = context.config['model_dev']['target']
selection_dt = context.config['model_dev']['selection_date']
run_date = context.config['model_dev']['run_date']

In [180]:
def feature_dec_variables(file_path):
    df_var=pd.read_csv(file_path)
    df_var['feature_name'] = df_var['table']+'_'+df_var['feature'].str.replace('-','_').str.replace('_+','_').str.lower()
    dec_dict = dict(df_var[df_var.use_flag==1][['feature_name','dtype']].values)
    return dec_dict

### ONLY FOR VIF FEATURES EXECUTE THE BELOW CODE

In [181]:
def vif_features_variables(file_path):
    df_var=pd.read_csv(file_path)
    df_var['feature_name'] = df_var['variables'].str.replace('-','_').str.replace('_+','_').str.lower()
    dec_dict = list(df_var[df_var.use_flag==1]['feature_name'].values)
    return dec_dict

## Experiment Tracker (ML Flow)

In [343]:
exp_name=context.config["model_dev"]["expt_name"].format(use_case=usecase,cohort=cohort,target=target)
run_name=context.config["model_dev"]["run_name"]
artifact_store=context.config["model_tracker"]["artifact_store"]["uri"]
mlflow_uri=context.config["model_tracker"]["backend_store"]["uri"]

client, expt_id = init_mlflow(exp_name, mlflow_uri, artifact_store)
with tracker.start_run(experiment_id=expt_id,run_name = run_name) as run:
    run_info = run.to_dictionary()
    run_id = run_info["info"]["run_id"]
    print(run_id)
    for key_ in context.config["model_dev"]:
        try:
            key_, value_ = key_, context.config["model_dev"][key_].format(**context.config["model_dev"], run_id=run_id)
        except Exception as e:
            key_, value_ = key_, context.config["model_dev"][key_]
        tracker.log_param(key_, value_)

5bf280668a12456d9bb4b2d61c5360b0


In [344]:
tracker.start_run(run_id=run_id, experiment_id=expt_id, run_name=run_name)

<ActiveRun: >

In [345]:
dec_dict = vif_features_variables(vif_feature_path)
feat_dec = feature_dec_variables(decision_feature_path)

## 3.1 Read the Train and Test Data

In [346]:
print(effort_id,usecase,cohort,target)

20704 uc2_postcards active opt


In [270]:
train_test_sampled = load_dataset(context,'model_data',effort_id=effort_id,use_case=usecase,run_date=run_date,cohort=cohort,target=target)#convert_delta_to_pandas(context.config["model_dev"]['gcs_bucket_name'], context.config["data_catalog"]['datasets']["train"]["uri"].format(**context.config["model_dev"]),"/home/jupyter/pch/notebooks/python/raj")

In [271]:
scoring_on_validation = context.config["model_dev"]["scoring_on_validation"]
if scoring_on_validation:
    validation_selection_date = context.config["model_dev"]["validation_selection_date"]
    validation_effort_id = context.config["model_dev"]["validation_effort_id"]
    validation = load_dataset(context,'validation_data',effort_id=validation_effort_id,use_case=usecase,run_date=run_date,cohort=cohort,target=target)

Invalid arguments provided - {'target'} :: Ignoring them ..


In [347]:
train_test_sampled_clean = (
    train_test_sampled
    # while iterating on testing, it's good to copy the dataset(or a subset)
    # as the following steps will mutate the input dataframe. The copy should be
    # removed in the production code to avoid introducing perf. bottlenecks.
    .copy()
    #.change_type('customer_demo_cust_state_province_cd_first',str)
    # set dtypes : nothing to do here
    .passthrough()

    .clean_names(remove_special=True)
)

In [348]:
if scoring_on_validation:
    print("Validation")
    validation_clean = (
        validation
        # while iterating on testing, it's good to copy the dataset(or a subset)
        # as the following steps will mutate the input dataframe. The copy should be
        # removed in the production code to avoid introducing perf. bottlenecks.
        .copy()
        #.change_type('customer_demo_cust_state_province_cd_first',str)
        # set dtypes : nothing to do here
        .passthrough()

        .clean_names(remove_special=True)
    )

Validation


In [None]:
online_features=set()
for col in feat_dec.keys():
    if col.startswith("online"):
        online_features.add(col)

In [349]:
train_test_sampled_clean["customer_demo_gender_cd_first"].value_counts()

f    50415
m    12023
u     2375
Name: customer_demo_gender_cd_first, dtype: int64

In [350]:
train_test_sampled_clean["customers_online_gender_first"].value_counts()

f       37150
m        8911
_ukn     3187
Name: customers_online_gender_first, dtype: int64

In [351]:
train_test_sampled_clean["prospects_gender_cd_first"].value_counts()

f       51337
m       12102
u        4474
_ukn     3108
0        1568
Name: prospects_gender_cd_first, dtype: int64

In [352]:
list_cols=list(train_test_sampled_clean.columns)
train_test_sampled_clean['selection_dt'] = pd.to_datetime(selection_dt)
if 'customer_demo_custid_create_dt_first' in list_cols:
    train_test_sampled_clean['tenure'] = ((train_test_sampled_clean['selection_dt'] - pd.to_datetime(train_test_sampled_clean['customer_demo_custid_create_dt_first'],infer_datetime_format=True)).dt.days).apply(lambda x: max(x,0))

if 'prospects_first_effort_dt_first' in list_cols:
    train_test_sampled_clean['days_since_first_effort'] =((train_test_sampled_clean['selection_dt'] - pd.to_datetime(train_test_sampled_clean['prospects_first_effort_dt_first'],infer_datetime_format=True)).dt.days).apply(lambda x: max(x,0))
    
if "customer_demo_gender_cd_first" in list_cols:
    train_test_sampled_clean["gender"] = train_test_sampled_clean['customer_demo_gender_cd_first'] 
    train_test_sampled_clean.loc[train_test_sampled_clean["gender"].isin(["m","f","u"])==False, "gender"] = np.nan
    train_test_sampled_clean.drop("customer_demo_gender_cd_first", axis=1, inplace=True)
    if "customers_online_gender_first" in list_cols:
        train_test_sampled_clean["gender"] = train_test_sampled_clean["gender"].fillna(train_test_sampled_clean['customers_online_gender_first'])
        train_test_sampled_clean.loc[train_test_sampled_clean["gender"].isin(["m","f","u"])==False, "gender"] = np.nan
        train_test_sampled_clean.drop("customers_online_gender_first", axis=1, inplace=True)
        if "prospects_gender_cd_first" in list_cols:
            train_test_sampled_clean["gender"] = train_test_sampled_clean["gender"].fillna(train_test_sampled_clean['prospects_gender_cd_first'])
            train_test_sampled_clean.drop("prospects_gender_cd_first", axis=1, inplace=True)
            
train_test_sampled_clean.loc[train_test_sampled_clean["gender"].isin(["m","f","u"])==False, "gender"] = "_ukn"

train_test_sampled_clean.drop(['customer_demo_custid_create_dt_first','prospects_first_effort_dt_first'],axis=1,inplace=True)

In [353]:
if scoring_on_validation:
    list_cols=list(validation_clean.columns)
    validation_clean['selection_dt'] = pd.to_datetime(context.config["model_dev"]["validation_selection_date"])
    if 'customer_demo_custid_create_dt_first' in list_cols:
        validation_clean['tenure'] = ((validation_clean['selection_dt'] - pd.to_datetime(validation_clean['customer_demo_custid_create_dt_first'],infer_datetime_format=True)).dt.days).apply(lambda x: max(x,0))
    
    if 'prospects_first_effort_dt_first' in list_cols:
        validation_clean['days_since_first_effort'] =((validation_clean['selection_dt'] - pd.to_datetime(validation_clean['prospects_first_effort_dt_first'],infer_datetime_format=True)).dt.days).apply(lambda x: max(x,0))
    
    if "customer_demo_gender_cd_first" in list_cols:
        validation_clean["gender"] = validation_clean['customer_demo_gender_cd_first'] 
        validation_clean.loc[validation_clean["gender"].isin(["m","f","u"])==False, "gender"] = np.nan
        validation_clean.drop("customer_demo_gender_cd_first", axis=1, inplace=True)
        if "customers_online_gender_first" in list_cols:
            validation_clean["gender"] = validation_clean["gender"].fillna(validation_clean['customers_online_gender_first'])
            validation_clean.loc[validation_clean["gender"].isin(["m","f","u"])==False, "gender"] = np.nan
            validation_clean.drop("customers_online_gender_first", axis=1, inplace=True)
            if "prospects_gender_cd_first" in list_cols:
                validation_clean["gender"] = validation_clean["gender"].fillna(validation_clean['prospects_gender_cd_first'])
                validation_clean.drop("prospects_gender_cd_first", axis=1, inplace=True)

    validation_clean.loc[validation_clean["gender"].isin(["m","f","u"])==False, "gender"] = "_ukn"

    validation_clean.drop(['customer_demo_custid_create_dt_first','prospects_first_effort_dt_first'],axis=1,inplace=True)

In [354]:
train_test_sampled_clean["gender"].value_counts()

f       64443
_ukn    15746
m       15013
u        4405
Name: gender, dtype: int64

In [355]:
len(online_features),len(dec_dict)

(85, 933)

In [356]:
imp_feature=set(dec_dict)

In [357]:
len(imp_feature)

933

In [358]:
len(imp_feature)

933

In [359]:
feature_cols = list(setanalyse(list(train_test_sampled_clean.columns),list(imp_feature),simplify=False)['A^B'])
#feature_cols = feature_cols + ["tenure", "days_since_first_effort", "gender"]

In [360]:
train_test_sampled_clean.shape

(99607, 3457)

In [361]:
train_test_sampled_clean[target_col].value_counts(normalize=True)

0    0.969852
1    0.030148
Name: target_opt, dtype: float64

In [362]:
validation_clean.fillna(0,inplace=True)

In [363]:
if scoring_on_validation:
    print(validation_clean.shape)
    print(validation_clean[target_col].value_counts(normalize=True))

(31151, 3485)
0.0    0.927193
1.0    0.072807
Name: target_opt, dtype: float64


In [364]:
train = train_test_sampled_clean[train_test_sampled_clean.train_test_ind=='train']
train_X = train.drop(ignore_cols+[target_col],axis=1)

#select features
train_X = train_X
train_y = train[index_keys+[target_col]]

In [365]:
test = train_test_sampled_clean[train_test_sampled_clean.train_test_ind=='test']
test_X = test.drop(ignore_cols+[target_col],axis=1)

test_X = test_X
test_y = test[index_keys+[target_col]]

In [366]:
outlier_transformer = Outlier(method='median') #we can use percentile, actual value, median methods as well.
print(train_X.shape)
train_X = outlier_transformer.fit_transform(train_X)
print(train_X.shape)

(79740, 3455)
(79740, 3455)


In [367]:
train_X.set_index(index_keys,inplace=True)
train_y.set_index(index_keys,inplace=True)

In [368]:
test_X.set_index(index_keys,inplace=True)
test_y.set_index(index_keys,inplace=True)

In [369]:
if scoring_on_validation:
    valid_X = validation_clean.drop([target_col],axis=1)
    valid_y = validation_clean[index_keys+[target_col]]
    valid_X.set_index(index_keys,inplace=True)
    valid_y.set_index(index_keys,inplace=True)

In [370]:
# null with 0
# pay given order -9999 or something
# tru tar and ohe

## Ignoring 95% null values

###Flagging the sparse variables

In [371]:
#train_X = train_X.drop(nulls_95_cols,axis=1, errors='ignore')
#test_X = test_X.drop(nulls_95_cols,axis=1,errors='ignore')

In [372]:
#only for inactive
#feature_cols = list(set(feature_cols).difference(set(days_since_features).union(set(rfm_features))))

In [373]:
len(feature_cols)

931

In [374]:
train_X=train_X[feature_cols]
test_X=test_X[feature_cols]

In [375]:
if scoring_on_validation:
    valid_X = valid_X[feature_cols]

In [376]:
# collecting different types of columns for transformations
cat_columns = train_X.select_dtypes('object').columns
num_columns = train_X.select_dtypes('number').columns

In [377]:
cat_columns

Index(['gender'], dtype='object')

In [378]:
#only for inactive
import re
days_since_features = [i for i in num_columns if 'days_since' in i or 'days_between' in i]
days_since_features.append('tenure')
rfm_features = [i for i in num_columns if re.search('total|last|pr.[0-9]{1,2}',i) or 'growth' in i]
cust_id_cols = [i for i in num_columns if 'cust_id' in i]
remaining_num = list(set(num_columns) - (set(days_since_features).union(rfm_features).union(cust_id_cols)))

In [379]:
cat_columns = ['gender']

In [380]:
#cat_columns = ['gender']

In [381]:
cat_col_counts = []
for col in cat_columns:
    temp_df = pd.concat([train_X[col].value_counts(), train_X[col].value_counts(normalize=True)], axis=1)
    temp_df.reset_index(inplace=True)
    temp_df.columns = ["level", "#", "%"]
    cat_col_counts.append((col, temp_df))
display_as_tabs(cat_col_counts)

In [306]:
#cat_columns = cat_columns + ["customers_online_age_range_first", "gender", "customers_online_registration_type_first"]

In [382]:
len(online_features)

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


85

In [383]:
#enc_columns = cat_columns+remaining_num

tgt_enc = Pipeline([
    ('target_encoding', TargetEncoder(return_df=False)),
])

one_hot_enc = Pipeline([
    ('onehot_encoding', OneHotEncoder(drop = 'first')),
])

scaler = Pipeline([
    ('scaler', MinMaxScaler()),
])



features_transformer = ColumnTransformer([
     
    #('remaining',SimpleImputer(strategy='constant',fill_value = '_ukn'),['customer_demo_cust_state_province_cd_first']),
    ## numeric columns
    ('rfm_features', SimpleImputer(strategy='constant',fill_value=0), rfm_features),
    
    ('days', SimpleImputer(strategy='constant',fill_value= -1440), days_since_features),
    
    ('tgt_enc_sim_impt', tgt_enc, list(set(cat_columns))),
    
    #('min_max_scaling' , scaler , rfm_features + days_since_features )
    
])


In [384]:
train_X = get_dataframe(
    features_transformer.fit_transform(train_X, train_y), 
    get_feature_names_from_column_transformer(features_transformer)
)

In [385]:
# As part of EDA analysis this can be modified accordingle
curated_columns = list(set(train_X.columns)-set(ignore_cols))

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [386]:
# save the feature pipeline
save_pipeline(features_transformer, op.abspath(op.join(artifacts_folder, f'01_features_{run_id}.joblib')))

# 4 Modelling

In [387]:
# Custom Transformations like these can be utilised
def _custom_data_transform(df, cols2keep=None):
    """Transformation to drop some columns in the data
    
    Parameters
    ----------
        df - pd.DataFrame
        cols2keep - columns to keep in the dataframe
    """
    cols2keep = cols2keep or []
    if len(cols2keep):
        return (df
                .select_columns(cols2keep))
    else:
        return df

In [388]:
cols_select=FunctionTransformer(_custom_data_transform, kw_args={'cols2keep':curated_columns})
train_X=cols_select.fit_transform(train_X)

In [389]:
save_pipeline(cols_select, op.abspath(op.join(artifacts_folder, f'02_curated_columns_{run_id}.joblib')))

In [390]:
train_X.shape

(79740, 931)

###FEATURE SELECTION FROM RANDOM FOREST MODEL

In [391]:
fe_pipeline = RandomForestClassifier(max_depth = 5 , random_state = 0).fit(train_X,train_y)
importance = fe_pipeline.feature_importances_
feat_imp=pd.DataFrame({'features': train_X.columns,'importances':importance})
top100=feat_imp.nlargest(100,'importances')
selected_features_cols = top100.features.tolist()

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


In [392]:
train_X = train_X[selected_features_cols]


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


### OTHER SELECTION METHODS

In [393]:
# Feature Eliminator
#from ta_lib.data_processing.api import FeatureSelector
#from sklearn.linear_model import Lasso
#from sklearn.feature_selection import SelectFromModel
#fe_pipeline = FeatureSelector(Lasso(alpha=0.01, random_state = 0),selection_type="regularization")
#fe_pipeline = FeatureSelector(RandomForestClassifier(max_depth = 5, random_state = 0),selection_type="recursion",selection_params={'forward': False, 'k_features': 100, 'verbose': False})
#fe_pipeline = SelectFromModel(RandomForestClassifier(max_depth = 5 , random_state = 0),max_features = 100)
#fe_pipeline.fit(train_X,train_y)
#train_X = fe_pipeline.transform(train_X)

In [394]:
train_X.shape

(79740, 100)

In [395]:
# cols = list(train_X.columns)
# vif = eda.calc_vif(train_X)
# while max(vif.VIF) > 15:
#     #removing the largest variable from VIF
#     cols.remove(vif[(vif.VIF==vif.VIF.max())].variables.tolist()[0])
#     vif = eda.calc_vif(train_X[cols])

In [396]:
# reg_vars = vif.query('VIF < 15').variables
# reg_vars = list(reg_vars)

In [397]:
# saving the list of relevant columns
save_pipeline(fe_pipeline, op.abspath(op.join(artifacts_folder, f'03_feature_selection_{run_id}.joblib')))

## Validation Transfomations

In [398]:
test_X = get_dataframe(
    features_transformer.transform(test_X), 
    get_feature_names_from_column_transformer(features_transformer)
)
test_X = cols_select.transform(test_X)

#for model based selection
test_X = test_X[selected_features_cols]

#test_X = fe_pipeline.transform(test_X)

In [399]:
#for model based selection
if scoring_on_validation:
    valid_X = get_dataframe(
        features_transformer.transform(valid_X), 
        get_feature_names_from_column_transformer(features_transformer)
    )
    valid_X = cols_select.transform(valid_X)
    valid_X = valid_X[selected_features_cols]

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [400]:
# if scoring_on_validation:
#     valid_X = get_dataframe(
#         features_transformer.transform(valid_X), 
#         get_feature_names_from_column_transformer(features_transformer)
#     )
#     valid_X = cols_select.transform(valid_X)
#     valid_X = fe_pipeline.transform(valid_X)
    

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [401]:
#automl

In [402]:
# from tigerml.automl import AutoML

# classifier_config_dict = {

# #     Classifiers
#     'sklearn.ensemble.RandomForestClassifier': {
#         'n_estimators': [1000, 500, 750],
#         'criterion': ["gini", "entropy"],
#         'max_features': np.arange(0.05, 1.01, 0.05),
#         'min_samples_split': np.arange(50, 600, 50),
#         'min_samples_leaf': np.arange(50, 600, 50),
#         'bootstrap': [True, False],
#         'class_weight': ['balanced']
#     },

#     'sklearn.linear_model.LogisticRegression': {
#         'penalty': ["l1", "l2"],
#         'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.],
#         'dual': [True, False],
#         'class_weight': ['balanced']
#     },
# }

# clf = AutoML(name='automl_test',
#     x_train=train_X,
#     y_train=train_y[target_col],
#     x_test=test_X,
#     y_test=test_y[target_col],
#     task='classification',
#     data_type='structured',
#     template='Classifier',
#     search_space=classifier_config_dict,generations=2)

# clf.fit()
# print(clf.optimiser.fitted_pipeline_)
# print(clf.optimiser.get_top_n_pipelines(1))
# pipeline_obj=clf.get_trained_pipeline( clf.optimiser.get_top_n_pipelines(1)['pipeline_str'])

In [403]:
## random forest model
# from sklearn.ensemble import RandomForestClassifier
# clf = RandomForestClassifier(max_depth=5, random_state=0)
# clf.fit(train_X, train_y)
# save_pipeline(clf, op.abspath(op.join(artifacts_folder, f'04_model_pipeline_rf_{run_id}.joblib')))

In [404]:
def custom_report_classification(model='', train_X='', train_y='', test_X='', test_y='', refit=False, shap=False, remove_metrics=[],remove_plots=[],run_id='',validation=False,name = 'rf'):
    class_linear_report = ClassificationReport(model=model, x_train=train_X, y_train=train_y, x_test= test_X, y_test= test_y, refit=refit)
    for metric in remove_metrics:
        class_linear_report.remove_metric(metric)
    for plot_name in remove_plots:
        class_linear_report.remove_eval_plot(plot_name)
    if validation:
        validation = 'validation'
    else:
        validation = ''
    class_linear_report.get_report(include_shap=shap, file_path=f'classification_model_report_{name}_{validation}_{run_id}')
    return class_linear_report

In [405]:
remove_metrics = ['accuracy','balanced_accuracy','precision','recall']
remove_plots = ['precision_recall','confusion_matrix','threshold_curve']

# class_linear_report=custom_report_classification(model=clf, 
#                              train_X=train_X, 
#                              train_y=train_y, 
#                              test_X=test_X, 
#                              test_y=test_y, 
#                              refit=False,
#                              validation=False,name = 'rf',
#                              remove_metrics=remove_metrics,
#                             remove_plots=remove_plots,
#                             shap=True,
#                             run_id=run_id
#                             )
# if scoring_on_validation:
#     class_linear_report_valid=custom_report_classification(model=clf, 
#                              train_X=train_X, 
#                              train_y=train_y, 
#                              test_X=valid_X, 
#                              test_y=valid_y, 
#                              refit=False,
#                              validation=True,name = 'rf',
#                              remove_metrics=remove_metrics,
#                             remove_plots=remove_plots,
#                             shap=True,
#                             run_id=run_id
#                             )

In [406]:
# yhat_train=clf.predict_proba(train_X)[:,1]#.reshape(-1,1)
# yhat_test=clf.predict_proba(test_X)[:,1]#.reshape(-1,1)
# if scoring_on_validation:
#     yhat_valid=clf.predict_proba(valid_X)[:,1]#.reshape(-1,1)

# from ta_lib.classification.evaluation import metrics_class
# train_metrics=metrics_class(train_y[target_col].values,yhat_train)
# test_metrics=metrics_class(test_y[target_col].values,yhat_test)
# if scoring_on_validation:
#     valid_metrics=metrics_class(valid_y[target_col].values,yhat_valid)
    
# temp=class_linear_report.evaluator.gains_table()
# top3deciles_df=temp['test'][0].nlargest(3,'lift')
# top3deciles_df.index = ['decile_1_test','decile_2_test','decile_3_test']
# top3deciles_test=dict(top3deciles_df['lift'])

# top3deciles_df_train=temp['train'][0].nlargest(3,'lift')
# top3deciles_df_train.index = ['decile_1_train','decile_2_train','decile_3_train']
# top3deciles_train=dict(top3deciles_df_train['lift'])

# if scoring_on_validation:
    
#     temp_valid=class_linear_report_valid.evaluator.gains_table()
#     top3deciles_df_valid=temp_valid['test'][0].nlargest(3,'lift')
#     top3deciles_df_valid.index = ['decile_1_valid','decile_2_valid','decile_3_valid']
#     top3deciles_valid=dict(top3deciles_df_valid['lift'])
#     validation_scored = validation_clean[index_keys]
#     validation_scored[f"{target}_prob"] = yhat_valid
#     validation_scored.to_csv(f'validation_scored_{run_id}.csv',index=False)

In [407]:

# data = valid_y.copy()
# # data['pred'] = yhat_valid
# data.to_csv('rf_valid.csv')


In [408]:
# tracker.log_artifact('/home/jupyter/pch/notebooks/python/rf_valid.csv', 'model_objects')

In [409]:
# if scoring_on_validation:
#     for key_,val_ in valid_metrics.items():
#         tracker.log_metric('rf_'+key_+'_valid',val_)
#     for key_,val_ in top3deciles_valid.items():
#         tracker.log_metric('rf_'+key_+'_valid',val_)
#     tracker.log_metrics(top3deciles_valid)
#     tracker.log_artifact(op.join(os.getcwd(),f'validation_scored_{run_id}.csv'), 'validations_score')

In [410]:
# for key_,val_ in train_metrics.items():
#     tracker.log_metric('rf_'+key_+'_train',val_)

# for key_,val_ in test_metrics.items():
#     tracker.log_metric('rf_'+key_+'_test',val_)

# for key_,val_ in top3deciles_train.items():
#     tracker.log_metric('rf_'+key_,val_)
    
# for key_,val_ in top3deciles_test.items():
#     tracker.log_metric('rf_'+key_,val_)
    

In [411]:
# from sklearn.linear_model import LogisticRegression
# lasso = LogisticRegression(solver = 'saga' , penalty = 'l1' , max_iter = 10000 , C = 0.1 ,random_state = 0 )
# lasso.fit(train_X,train_y)
# save_pipeline(lasso, op.abspath(op.join(artifacts_folder, f'04_model_pipeline_lasso_{run_id}.joblib')))

In [412]:
# class_linear_report=custom_report_classification(model=lasso, 
#                              train_X=train_X, 
#                              train_y=train_y, 
#                              test_X=test_X, 
#                              test_y=test_y, 
#                              refit=False,
#                              validation=False,name = 'lasso',
#                              remove_metrics=remove_metrics,
#                             remove_plots=remove_plots,
#                             shap=True,
#                             run_id=run_id
#                             )
# if scoring_on_validation:
#     class_linear_report_valid=custom_report_classification(model=lasso, 
#                              train_X=train_X, 
#                              train_y=train_y, 
#                              test_X=valid_X, 
#                              test_y=valid_y, 
#                              refit=False,
#                              validation=True,name = 'lasso',
#                              remove_metrics=remove_metrics,
#                             remove_plots=remove_plots,
#                             shap=True,
#                             run_id=run_id
#                             )

In [413]:
# yhat_train=lasso.predict_proba(train_X)[:,1]#.reshape(-1,1)
# yhat_test=lasso.predict_proba(test_X)[:,1]#.reshape(-1,1)
# if scoring_on_validation:
#     yhat_valid=lasso.predict_proba(valid_X)[:,1]#.reshape(-1,1)

# from ta_lib.classification.evaluation import metrics_class
# train_metrics=metrics_class(train_y[target_col].values,yhat_train)
# test_metrics=metrics_class(test_y[target_col].values,yhat_test)
# if scoring_on_validation:
#     valid_metrics=metrics_class(valid_y[target_col].values,yhat_valid)
    
# temp=class_linear_report.evaluator.gains_table()
# top3deciles_df=temp['test'][0].nlargest(3,'lift')
# top3deciles_df.index = ['decile_1_test','decile_2_test','decile_3_test']
# top3deciles_test=dict(top3deciles_df['lift'])

# top3deciles_df_train=temp['train'][0].nlargest(3,'lift')
# top3deciles_df_train.index = ['decile_1_train','decile_2_train','decile_3_train']
# top3deciles_train=dict(top3deciles_df_train['lift'])

# if scoring_on_validation:
    
#     temp_valid=class_linear_report_valid.evaluator.gains_table()
#     top3deciles_df_valid=temp_valid['test'][0].nlargest(3,'lift')
#     top3deciles_df_valid.index = ['decile_1_valid','decile_2_valid','decile_3_valid']
#     top3deciles_valid=dict(top3deciles_df_valid['lift'])
#     validation_scored = validation_clean[index_keys]
#     validation_scored[f"{target}_prob"] = yhat_valid
#     validation_scored.to_csv(f'validation_scored_{run_id}.csv',index=False)

In [414]:
# data = valid_y.copy()
# data['pred'] = yhat_valid
# data.to_csv('lasso_valid.csv')

# tracker.log_artifact('/home/jupyter/pch/notebooks/python/lasso_valid.csv', 'model_objects')

In [415]:
# for key_,val_ in train_metrics.items():
#     tracker.log_metric(key_+'_train',val_)

# for key_,val_ in test_metrics.items():
#     tracker.log_metric(key_+'_test',val_)

# for key_,val_ in top3deciles_train.items():
#     tracker.log_metric(key_,val_)
    
# for key_,val_ in top3deciles_test.items():
#     tracker.log_metric(key_,val_)

In [73]:
!pip install lightgbm


Collecting lightgbm
  Downloading lightgbm-3.1.1-py2.py3-none-manylinux1_x86_64.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 3.8 MB/s eta 0:00:01
Installing collected packages: lightgbm
Successfully installed lightgbm-3.1.1


In [416]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
xgb = LGBMClassifier(learning_rate=0.005, max_depth=15, min_child_weight=60,
               num_leaves=10, random_state=42)#xgb = XGBClassifier(n_estimators = 10 , max_depth = 5 , eta = 0.5 , random_state = 42)
xgb.fit(train_X,train_y)
save_pipeline(xgb, op.abspath(op.join(artifacts_folder, f'04_model_pipeline_xgb_{run_id}.joblib')))

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


In [417]:
feature_imp = xgb.feature_importances_
feature_imp=pd.DataFrame({'features': train_X.columns,'importances':xgb.feature_importances_})
#selected_features_cols = top100.features.tolist()
feature_imp.to_csv("important_feature.csv")

In [418]:
yhat_train=xgb.predict_proba(train_X)[:,1]#.reshape(-1,1)
yhat_test=xgb.predict_proba(test_X)[:,1]#.reshape(-1,1)
if scoring_on_validation:
    yhat_valid=xgb.predict_proba(valid_X)[:,1]#.reshape(-1,1)

from ta_lib.classification.evaluation import metrics_class
train_metrics=metrics_class(train_y[target_col].values,yhat_train)
test_metrics=metrics_class(test_y[target_col].values,yhat_test)
if scoring_on_validation:
    valid_metrics=metrics_class(valid_y[target_col].values,yhat_valid)

print('train:',train_metrics)
print('test:',test_metrics)
print('valid:',valid_metrics)

train: {'auc_roc': 0.83, 'auc_prec_recall': 0.18, 'log_loss': 0.12, 'f1_score': 0.0}
test: {'auc_roc': 0.82, 'auc_prec_recall': 0.18, 'log_loss': 0.12, 'f1_score': 0.0}
valid: {'auc_roc': 0.55, 'auc_prec_recall': 0.09, 'log_loss': 0.27, 'f1_score': 0.0}


In [419]:
class_linear_report=custom_report_classification(model=xgb, 
                             train_X=train_X, 
                             train_y=train_y, 
                             test_X=test_X, 
                             test_y=test_y, 
                             refit=False,
                             validation=False,name = 'xgb',
                             remove_metrics=remove_metrics,
                            remove_plots=remove_plots,
                            shap=True,
                            run_id=run_id
                            )
if scoring_on_validation:
    class_linear_report_valid=custom_report_classification(model=xgb, 
                             train_X=train_X, 
                             train_y=train_y, 
                             test_X=valid_X, 
                             test_y=valid_y, 
                             refit=False,
                             validation=True,name = 'xgb',
                             remove_metrics=remove_metrics,
                            remove_plots=remove_plots,
                            shap=True,
                            run_id=run_id
                            )

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
NPY_ARRAY_UPDATEIFCOPY, NPY_ARRAY_INOUT_ARRAY, and NPY_ARRAY_INOUT_FARRAY are deprecated, use NPY_WRITEBACKIFCOPY, NPY_ARRAY_INOUT_ARRAY2, or NPY_ARRAY_INOUT_FARRAY2 respectively instead, and call PyArray_ResolveWritebackIfCopy before the array is deallocated, i.e. before the last call to Py_DECREF.
UPDATEIFCOPY detected in array_dealloc.  Required call to PyArray_ResolveWritebackIfCopy or PyArray_DiscardWritebackIfCopy is missing
2021-01-27 10:34:12 - ERROR - tigerml.model_eval.plotters.interpretation::inner::169 - Cannot compute get_feature_importances. Error - No loop matching the specified signature and casting was found for ufunc true_divide
Traceback (most recent call last):
  File "/home/jupyter/Rabi/pch/src/ta_lib/_vendor/tigerml/core/utils/_l

                                                

Nesting Layouts within a HoloMap makes it difficult to access your data or control how it appears; we recommend calling .collate() on the HoloMap in order to follow the recommended nesting structure shown in the Composing Data user guide (https://goo.gl/2YS8LJ)


                                               

NPY_ARRAY_UPDATEIFCOPY, NPY_ARRAY_INOUT_ARRAY, and NPY_ARRAY_INOUT_FARRAY are deprecated, use NPY_WRITEBACKIFCOPY, NPY_ARRAY_INOUT_ARRAY2, or NPY_ARRAY_INOUT_FARRAY2 respectively instead, and call PyArray_ResolveWritebackIfCopy before the array is deallocated, i.e. before the last call to Py_DECREF.
UPDATEIFCOPY detected in array_dealloc.  Required call to PyArray_ResolveWritebackIfCopy or PyArray_DiscardWritebackIfCopy is missing
2021-01-27 10:34:35 - ERROR - tigerml.model_eval.plotters.interpretation::inner::169 - Cannot compute get_feature_importances. Error - No loop matching the specified signature and casting was found for ufunc true_divide
Traceback (most recent call last):
  File "/home/jupyter/Rabi/pch/src/ta_lib/_vendor/tigerml/core/utils/_lib.py", line 167, in inner
    return func(*args, **kwargs)
  File "/home/jupyter/Rabi/pch/src/ta_lib/_vendor/tigerml/model_eval/plotters/interpretation.py", line 822, in get_feature_importances
    feature_importances_, features_ = vizer.

                                                

Nesting Layouts within a HoloMap makes it difficult to access your data or control how it appears; we recommend calling .collate() on the HoloMap in order to follow the recommended nesting structure shown in the Composing Data user guide (https://goo.gl/2YS8LJ)


                                               

In [420]:
temp=class_linear_report.evaluator.gains_table()
top3deciles_df=temp['test'][0].nlargest(3,'lift')
top3deciles_df.index = ['decile_1_test','decile_2_test','decile_3_test']
top3deciles_test=dict(top3deciles_df['lift'])

top3deciles_df_train=temp['train'][0].nlargest(3,'lift')
top3deciles_df_train.index = ['decile_1_train','decile_2_train','decile_3_train']
top3deciles_train=dict(top3deciles_df_train['lift'])

if scoring_on_validation:
    
    temp_valid=class_linear_report_valid.evaluator.gains_table()
    top3deciles_df_valid=temp_valid['test'][0].nlargest(3,'lift')
    top3deciles_df_valid.index = ['decile_1_valid','decile_2_valid','decile_3_valid']
    top3deciles_valid=dict(top3deciles_df_valid['lift'])
    validation_scored = validation_clean[index_keys]
    validation_scored[f"{target}_prob"] = yhat_valid
    validation_scored.to_csv(f'validation_scored_{run_id}.csv',index=False)

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [421]:
train_data = train_X.copy()
train_data['predicted_value'] = yhat_train
train_data['actual_value'] = np.concatenate(train_y.values)

test_data = test_X.copy()
test_data['predicted_value'] = yhat_test
test_data['actual_value'] = np.concatenate(test_y.values)

valid_data = valid_X.copy()
valid_data['predicted_value'] = yhat_valid
valid_data['actual_value'] = np.concatenate(valid_y.values)

train_data.to_csv(f'train_data_{run_id}.csv',index=False)
test_data.to_csv(f'test_data_{run_id}.csv',index=False)
if scoring_on_validation:
    valid_data.to_csv(f'valid_data_{run_id}.csv',index=False)

In [422]:
tracker.log_artifact(op.join(os.getcwd(),f'train_data_{run_id}.csv'), 'validations_score')
tracker.log_artifact(op.join(os.getcwd(),f'test_data_{run_id}.csv'), 'validations_score')
tracker.log_artifact(op.join(os.getcwd(),f'valid_data_{run_id}.csv'), 'validations_score')

In [423]:
for key_,val_ in train_metrics.items():
    tracker.log_metric(key_+'_train',val_)

for key_,val_ in test_metrics.items():
    tracker.log_metric(key_+'_test',val_)

for key_,val_ in top3deciles_train.items():
    tracker.log_metric(key_,val_)
    
for key_,val_ in top3deciles_test.items():
    tracker.log_metric(key_,val_)


In [424]:
top3deciles_train

{'decile_1_train': 4.425216316440049,
 'decile_2_train': 3.250927070457355,
 'decile_3_train': 2.6260129103145173}

In [425]:
top3deciles_test

{'decile_1_test': 4.532847117321248,
 'decile_2_test': 3.142677482730654,
 'decile_3_test': 2.5116162145786722}

In [426]:
if scoring_on_validation:
    print(valid_metrics)
    print(top3deciles_valid)

{'auc_roc': 0.55, 'auc_prec_recall': 0.09, 'log_loss': 0.27, 'f1_score': 0.0}
{'decile_1_valid': 1.6490829207255102, 'decile_2_valid': 1.1985981478178729, 'decile_3_valid': 1.1609090232609303}


## Experiment Loging and Finishing (ML Flow)

In [427]:
# artifacts logging
files = [i for i in os.listdir(artifacts_folder) if i.endswith('joblib') and run_id in i]
reports = [i for i in os.listdir(os.getcwd()) if i.endswith('html') and run_id in i]
tracker.log_artifact(op.join(os.getcwd(),f'important_feature.csv'), 'feature importance')
for file_ in files:
    tracker.log_artifact(op.join(artifacts_folder,file_), 'model_objects')

for report_ in reports:
    tracker.log_artifact(op.join(os.getcwd(),report_), 'reports')

if scoring_on_validation:
    for key_,val_ in valid_metrics.items():
        tracker.log_metric(key_+'_valid',val_)
    tracker.log_metrics(top3deciles_valid)
    tracker.log_artifact(op.join(os.getcwd(),f'validation_scored_{run_id}.csv'), 'validations_score')

In [428]:
tracker.end_run(status='FINISHED')

In [262]:
#Rabi/scoring/notebooks/python/04_model_pipeline_xgb_4d91f2f463624da3af0b8414b1e8e88a (1).joblib
model_path=op.join(os.getcwd(),'04_model_pipeline_xgb_4d91f2f463624da3af0b8414b1e8e88a (1).joblib')

In [264]:
import joblib
mo=joblib.load(model_path)

In [265]:
mo.get_params

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


<bound method LGBMModel.get_params of LGBMClassifier(learning_rate=0.005, max_depth=15, min_child_weight=60,
               num_leaves=10, random_state=42)>