#### Module Summary

> This module scores a dataset with the fitted logit model after implementation. It handles the data pre-processing and data conversion required before scoring.
>
> **Input Files**
> 1. Scoring Data (csv)
> 2. Scoring pickle file - model metadata (pickle)
>
> **Output Files**
> Scored Output with ML Score (csv)

#### 5.1 Import Modules and Parameters

In [1]:
import pandas as pd, numpy as np
import warnings, time, pickle, math, statistics, os
import logit_config as cfg
from pdLogit.data_processing import *

# Silence every warning category for the rest of the session
warnings.filterwarnings("ignore")

# Locations: the working directory comes from the config module; the data and
# out folders are derived from it
wpath = cfg.wpath
fpath, outpath = wpath + '/data', wpath + '/out'

# Data settings: scoring file name plus the response variable, identifier
# columns and drop list defined in the config module
score_fname = 'test_data.csv'
resp_var, id_varlist, drop_varlist = cfg.resp_var, cfg.id_varlist, cfg.drop_varlist

# Columns that must never be treated as predictors (identifiers + response)
non_pred_varlist = id_varlist + [resp_var]

#### 5.2 Import Data

In [6]:
# Read the scoring dataset from the data folder and report its dimensions
score_fpath = f'{fpath}/{score_fname}'
scoring_data = pd.read_csv(score_fpath)
print(f'Scoring Data Shape: {scoring_data.shape}')

Scoring Data Shape: (228, 32)


In [7]:
# Load the model metadata dictionary from the out folder.
# NOTE(review): pickle.load can execute arbitrary code while deserialising -
# only load a metadata file that comes from a trusted source.
with open(f"{outpath}/model_metadata.pickle", 'rb') as metadata_file:
    pickle_dict = pickle.load(metadata_file)

# Unpack the entries that the scoring steps below rely on
(dev_dtypes_dict, imp_df, c_class_df,
 f_class_df, logit_model_obj, model_approach) = (
    pickle_dict[key]
    for key in ('dev_dtypes_dict', 'imp_df', 'c_class_df',
                'f_class_df', 'model', 'model_approach')
)

# The two encoders are only read when the model was built on dummy variables
if model_approach == 'dummy_vars':
    d_ord_enc, d_oh_enc = pickle_dict['d_ord_enc'], pickle_dict['d_oh_enc']

#### 5.3 Data Pre-processing

In [8]:
# Clean the raw scoring data with the project helper (per the original note it
# removes leading and trailing blanks; drop_varlist is passed through to it)
scoring_data = pre_process_data(scoring_data, drop_varlist)

# Get Model Variables
# Model parameters are named 'L_<variable>' (WOE) or 'L_<variable>_<category>'
# (dummy variables); 'const' is the intercept and is not a data column.
param_names = [x for x in logit_model_obj.params.index.tolist() if x != 'const']
if model_approach == 'woe':
    model_varlist = [x[2:] for x in param_names]
elif model_approach == 'dummy_vars':
    # Strip only the trailing '_<category>' token. The previous str.replace
    # removed every occurrence of that token, which corrupted variable names
    # that happen to contain it.
    model_varlist = [x.rsplit('_', 1)[0][2:] for x in param_names]
else:
    raise ValueError(f'Unsupported model_approach: {model_approach!r}')

# De-duplicate while keeping the parameter order so that the column order is
# reproducible between runs (a set has no stable order for strings)
model_varlist = list(dict.fromkeys(model_varlist))

# Convert Boolean Variables to Character
bool_varlist = [x for x in scoring_data.columns if scoring_data[x].dtype.kind == 'b']
for var in bool_varlist:
    scoring_data[var] = scoring_data[var].astype(str)

# Create Variable Lists (character = object dtype at development time)
char_varlist = [x for x in model_varlist if dev_dtypes_dict[x] == object]
num_varlist = [x for x in model_varlist if x not in char_varlist]

# Keep Required Columns only; take an explicit copy because later cells assign
# into scoring_X and must not operate on a slice of scoring_data
keep_varlist = id_varlist + model_varlist
scoring_X = scoring_data[keep_varlist].copy()

print(f'Scoring Data Shape: {scoring_X.shape}')

Scoring Data Shape: (228, 10)


In [9]:
# Blank out character levels that have no bin in the character class table
for col in char_varlist:
    known_bins = c_class_df.loc[c_class_df['VAR_NAME'] == col, 'VAR_BINS']
    is_unseen = ~scoring_X[col].isin(known_bins)
    scoring_X.loc[is_unseen, col] = np.nan

# Cast every retained column to the dtype recorded at development time
dev_dtypes_for_scoring = {name: dev_dtypes_dict[name] for name in scoring_X.columns}
scoring_X = scoring_X.astype(dev_dtypes_for_scoring)

# Fill missing values with the project helper, driven by the imputation table
scoring_X = impute_missing_values(scoring_X, imp_df)

In [10]:
# Numeric Variables: replace each raw value with the interval it falls into
f_class_df = f_class_df[['VAR_NAME', 'VAR_BINS', 'LN_ODDS', 'bin_left', 'bin_right']]
for col in num_varlist:

    col_bins = f_class_df[f_class_df['VAR_NAME'] == col]

    # Distinct left/right bin boundaries in ascending order
    bin_edges = sorted(set(col_bins['bin_left'].tolist() + col_bins['bin_right'].tolist()))

    # Right-closed intervals (pd.cut default); precision=10 controls how the
    # interval labels are rounded
    binned = pd.cut(scoring_X[col].astype(float), bin_edges, precision=10)

    # Drop the raw column and re-add it (as the last column) holding the bins
    scoring_X = scoring_X.drop(columns=[col])
    scoring_X[col] = binned

In [11]:
# Character Variables: swap each level for the FINE_BIN_NUM of its bin
c_class_df = c_class_df[c_class_df['VAR_NAME'].isin(char_varlist)]
for col in char_varlist:
    bin_lookup = c_class_df.loc[c_class_df['VAR_NAME'] == col, ['VAR_BINS', 'FINE_BIN_NUM']]

    # Join on the string form of the level, then keep only the bin number
    # under the original column name
    scoring_X[col] = scoring_X[col].astype(str)
    scoring_X = (
        scoring_X
        .merge(bin_lookup, left_on=col, right_on='VAR_BINS', how='left')
        .drop([col, 'VAR_BINS'], axis=1)
        .rename(columns={'FINE_BIN_NUM': col})
    )

#### 5.4 WOE Value Conversion

In [12]:
if model_approach == 'woe':

    # Swap every model variable's bin label for its LN_ODDS value, stored in a
    # new column named L_<variable>
    for col in model_varlist:

        woe_lookup = (
            f_class_df.loc[f_class_df['VAR_NAME'] == col, ['VAR_BINS', 'LN_ODDS']]
            .assign(VAR_BINS=lambda df: df['VAR_BINS'].astype(str))
        )

        # Both join keys are compared as strings
        scoring_X[col] = scoring_X[col].astype(str)
        scoring_X = (
            scoring_X
            .merge(woe_lookup, left_on=col, right_on='VAR_BINS', how='left')
            .drop([col, 'VAR_BINS'], axis=1)
            .rename(columns={'LN_ODDS': 'L_'+col})
        )

    model_varlist_L = [f'L_{name}' for name in model_varlist]
    scoring_X_enc_df = scoring_X[model_varlist_L]

    # Count the encoded columns that still contain missing values
    nmiss_score_df = (
        scoring_X_enc_df.isnull().sum()
        .rename_axis('feature')
        .reset_index(name='nmiss')
    )
    nmiss_score = len(nmiss_score_df.loc[nmiss_score_df['nmiss'] > 0])
    print(f'Variables having Missing Values: {nmiss_score}')

Variables having Missing Values: 0


#### 5.5 Dummy Variable Creation

In [None]:
if model_approach == 'dummy_vars':

    varList = [x for x in scoring_X.columns if x not in non_pred_varlist]

    # Ordinal Encoder Preprocessor
    # Keep only the mappings for columns present in the scoring data, then map
    # each column; values without a mapping fall back to the most common code
    # of that column's mapping. A per-column loop also copes with an empty dict.
    d_ord_enc = {k: v for k, v in d_ord_enc.items() if k in scoring_X.columns}
    for col, mapping in d_ord_enc.items():
        scoring_X[col] = scoring_X[col].map(mapping).fillna(statistics.mode(mapping.values()))

    # One-Hot Encoding to Create Dummy Variables
    # The encoder needs every column it was fitted on; columns absent from the
    # scoring data are filled with the placeholder 'd0'
    # (NOTE(review): presumably the base level - confirm against the training code).
    # The filler frame reuses scoring_X's index so the concat aligns row by row.
    missing_oh_cols = [x for x in d_oh_enc.feature_names_in_ if x not in scoring_X.columns]
    oh_extra_cols_df = pd.DataFrame(
        {x: ['d0']*scoring_X.index.size for x in missing_oh_cols},
        index=scoring_X.index,
    )
    oh_input_df = pd.concat([scoring_X, oh_extra_cols_df], axis=1).drop(id_varlist, axis=1)[list(d_oh_enc.feature_names_in_)]

    # transform() returns a sparse matrix or a dense array depending on how the
    # encoder was configured; densify only when needed
    oh_encoded = d_oh_enc.transform(oh_input_df)
    if hasattr(oh_encoded, 'toarray'):
        oh_encoded = oh_encoded.toarray()
    scoring_X_enc_df_all = pd.DataFrame(oh_encoded, index=oh_input_df.index)
    # Fixed: the method is get_feature_names_out (was the typo et_feature_names_out)
    scoring_X_enc_df_all.columns = ['L_'+x for x in d_oh_enc.get_feature_names_out().tolist()]

    # Finalise Model Variables: keep only the dummies that are model parameters
    model_varlist = [x for x in logit_model_obj.params.index.tolist() if x != 'const']
    scoring_X_enc_df = scoring_X_enc_df_all[model_varlist]

    # Check for Missing Values
    nmiss_score_df = pd.DataFrame(scoring_X_enc_df.isnull().sum().rename('nmiss').rename_axis('feature')).reset_index()
    nmiss_score = nmiss_score_df[nmiss_score_df['nmiss'] > 0].index.size
    print(f'Variables having Missing Values: {nmiss_score}')

#### 5.6 Scoring

In [13]:
# Work on an explicit copy so the assignments below do not write into a slice
# of scoring_X
scoring_X_enc_df = scoring_X_enc_df.copy()

# Add Intercept Column
scoring_X_enc_df['const'] = 1

# Score
# For a model fitted without a formula, predict() matches columns to
# parameters by position, not by name. Select the columns in the exact order of
# the fitted parameters so that each coefficient meets its own variable.
param_names = logit_model_obj.params.index.tolist()
scoring_X_enc_df['Prediction'] = logit_model_obj.predict(scoring_X_enc_df[param_names])

# Odds = p / (1 - p); a prediction of exactly 1 gets a small offset in the
# denominator to avoid division by zero.
scoring_X_enc_df['odds'] = scoring_X_enc_df['Prediction'].apply(lambda x: x/(1-x) if x < 1 else x/(1-x+0.00001))

# Scaled score = 500 - 30*log2(100) - 30*log2(odds), rounded to an integer.
# NOTE(review): an odds value of 0 (Prediction == 0) makes math.log10 raise
# ValueError; the scaling constants 500 / 30 / 100 are taken as given.
scoring_X_enc_df['scaled_score'] = scoring_X_enc_df['odds'].apply(lambda x: int(np.round(500-30*(math.log10(100)/math.log10(2))-(30*math.log10(x)/math.log10(2)))))

# Keep only the score columns, then attach the identifier columns
scoring_X_enc_df = scoring_X_enc_df.drop(param_names, axis=1)

scoring_data_out = pd.concat([scoring_data[id_varlist], scoring_X_enc_df], axis=1)
scoring_data_out.head()

Unnamed: 0,id,Prediction,odds,scaled_score
0,842517.0,0.249749,0.332888,348
1,84300903.0,0.978462,45.429622,136
2,84348301.0,0.96014,24.087493,163
3,84358402.0,0.980711,50.841791,131
4,843786.0,0.043532,0.045513,434
