#### Module Summary
> This module performs fine classing on the user-edited coarse classing data and creates the summarised fine classing output. It also converts the original development and validation data to their respective WOE bucket values or dummy variables.
> 
> **Input Files**
> 1. Development and Validation data with reduced features as obtained from the 02_coarse_classing module (pickle)
> 2. User modified coarse classing data (xlsx)
> 3. Feature Reduction Summary (xlsx)
>
> **Output Files**
> 1. Development & Validation data with WOE values/ dummy features (pickle)
> 2. Model metadata - will be required for scoring (pickle)
> 3. Fine Classing Data (xlsx)

<h4> 3.1 Import Modules and Parameters</h4>

In [1]:
# Project configuration module. `cfg` is not referenced directly in this
# notebook — presumably the exec'd script below reads it; confirm before removing.
import logit_config as cfg

# Run the shared import/parameter script inside this notebook's namespace.
# Names used by later cells but never defined here (e.g. `pd`, `pickle`, `copy`,
# `outpath`, `dev_fname`, `fine_classing`) presumably come from this script.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left to the garbage collector.
with open('module_imports.py') as _imports_file:
    exec(_imports_file.read())

<h4> 3.2 Import Data</h4>

In [2]:
# Load the model data pickle and take private deep copies of the development
# and validation frames so later cells never touch the objects held in the dict.
# NOTE(review): pickle.load can execute arbitrary code — only load pickles
# produced by this pipeline.
with open(f"{outpath}/model_data.pickle", 'rb') as f:
    model_data_pckl_dict = pickle.load(f)
dev_data, val_data = (
    copy.deepcopy(model_data_pckl_dict[data_key])
    for data_key in ('dev_data_out2', 'val_data_out2')
)

# Load the model metadata pickle (updated and re-saved at the end of the notebook).
with open(f"{outpath}/model_metadata.pickle", 'rb') as f:
    model_metadata_pckl_dict = pickle.load(f)

# Read the manually edited coarse classing workbook, skipping the two rows above
# the header and ignoring columns pandas auto-named 'Unnamed...'.
coarse_classing_path = f"{outpath}/{dev_fname.split('.')[0]}_coarse_classing.xlsx"
coarse_classing_df = pd.read_excel(
    coarse_classing_path,
    sheet_name='coarse_classing_result',
    skiprows=2,
    usecols=lambda header: 'Unnamed' not in header,
)

# Drop fully empty rows; the index is reset so that row positions and row
# labels coincide for the header clean-up below.
coarse_classing_df = coarse_classing_df.dropna(how='all', axis=0).reset_index(drop=True)

# A row is a repeated header when every cell equals its own column name.
header_values = coarse_classing_df.columns.tolist()
dup_header_rowlist = [
    row_pos
    for row_pos in range(coarse_classing_df.index.size)
    if (coarse_classing_df.iloc[row_pos] == header_values).sum() == len(header_values)
]
coarse_classing_df = coarse_classing_df.drop(dup_header_rowlist)

# The count columns are converted to numeric dtypes.
count_cols = ['TOT_ACTS', 'COUNT_RESP', 'COUNT_NON_RESP']
coarse_classing_df[count_cols] = coarse_classing_df[count_cols].apply(pd.to_numeric)

<h4> 3.3 Fine Classing</h4>

In [3]:
%%time

# Run fine classing on the edited coarse classing table.
# NOTE: dev_data / val_data are rebound to the frames returned by
# fine_classing, so this cell is not idempotent — re-run the data import cell
# before executing it a second time.
fine_classing_outputs = fine_classing(dev_data, val_data, coarse_classing_df)
fine_classing_df, dev_data, val_data = fine_classing_outputs
fine_classing_df.head()

100%|██████████| 18/18 [00:04<00:00,  3.97it/s]

CPU times: user 4.45 s, sys: 108 ms, total: 4.56 s
Wall time: 4.61 s





Unnamed: 0,VAR_NAME,BIN_NUM,VAR_BINS,bin_left,bin_right,TOT_ACTS,ROWP_TOT,COUNT_RESP,PER_RESP,COUNT_NON_RESP,PER_NON_RESP,RAW_ODDS,LN_ODDS,INFO_VAL,CH_SQ,RESP_RATE
0,smoothness_mean,1,"(-inf, 0.07466]",-inf,0.07466,18,0.052786,0,0.0,18,0.083333,0.0,0.0,0.0,10.416667,0.0
1,smoothness_mean,2,"(0.07466, 0.07948]",0.07466,0.07948,17,0.049853,2,0.016,15,0.069444,4.340278,1.467938,0.078453,4.53649,0.117647
2,smoothness_mean,3,"(0.07948, 0.08223]",0.07948,0.08223,17,0.049853,2,0.016,15,0.069444,4.340278,1.467938,0.078453,4.53649,0.117647
3,smoothness_mean,4,"(0.08223, 0.0842]",0.08223,0.0842,17,0.049853,4,0.032,13,0.060185,1.880787,0.63169,0.017804,1.261702,0.235294
4,smoothness_mean,5,"(0.0842, 0.08546]",0.0842,0.08546,17,0.049853,2,0.016,15,0.069444,4.340278,1.467938,0.078453,4.53649,0.117647


<h4> 3.4 Convert Variable Values - WOE Approach</h4>

In [4]:
%%time

def _bins_to_woe(data, col, woe_lookup):
    """Replace the bin labels in ``data[col]`` with their ``LN_ODDS`` values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``col`` values match ``BIN_NUM`` once both are cast to ``str``.
    col : str
        Name of the variable to convert.
    woe_lookup : pandas.DataFrame
        Two columns for this variable: ``BIN_NUM`` (already ``str``) and ``LN_ODDS``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``col`` removed and ``'L_' + col`` appended as the last
        column. Rows whose bin has no entry in ``woe_lookup`` receive NaN. The
        input frame is not modified. As with any column-on-column merge, the
        result carries a fresh default index.
    """
    merged = data.astype({col: str}).merge(
        woe_lookup, left_on=col, right_on='BIN_NUM', how='left'
    )
    return merged.drop(columns=[col, 'BIN_NUM']).rename(columns={'LN_ODDS': 'L_' + col})


def _count_missing(data):
    """Return a ``feature`` / ``nmiss`` frame holding the null count of every column."""
    return pd.DataFrame(data.isnull().sum().rename('nmiss')).rename_axis('feature').reset_index()


# Copy Original Data
dev_data_woe = copy.deepcopy(dev_data)
val_data_woe = copy.deepcopy(val_data)

# Replace Original Data with WOE Values
for col in fine_classing_df['VAR_NAME'].drop_duplicates().tolist():
    # Single .loc selection + astype builds an independent lookup table; the
    # previous filter-then-assign pattern wrote into a slice of fine_classing_df
    # and could raise SettingWithCopyWarning.
    tmp_woe = (
        fine_classing_df
        .loc[fine_classing_df['VAR_NAME'] == col, ['BIN_NUM', 'LN_ODDS']]
        .astype({'BIN_NUM': str})
    )
    # Same encoding for both samples, always using the lookup of this variable.
    dev_data_woe = _bins_to_woe(dev_data_woe, col, tmp_woe)
    val_data_woe = _bins_to_woe(val_data_woe, col, tmp_woe)

# Check and Replace Missing Values
# Development data is only checked and reported here; it is not imputed.
nmiss_dev_df = _count_missing(dev_data_woe)
print(f"Development Variables having missing values: {nmiss_dev_df[nmiss_dev_df['nmiss']>0].index.size}")

# Validation columns with missing values are filled with the first mode of the
# same column in the development data, then the counts are recomputed.
nmiss_val_df = _count_missing(val_data_woe)
val_missing_cols = nmiss_val_df[nmiss_val_df['nmiss']>0]['feature'].tolist()
nmiss_mode_df = dev_data_woe[val_missing_cols].mode()
val_data_woe = val_data_woe.fillna(
    {missing_col: nmiss_mode_df[missing_col].iloc[0] for missing_col in val_missing_cols}
)
nmiss_val_df = _count_missing(val_data_woe)
print(f"Validation Variables having missing values: {nmiss_val_df[nmiss_val_df['nmiss']>0].index.size}")

Development Variables having missing values: 0
Validation Variables having missing values: 0
CPU times: user 266 ms, sys: 8.22 ms, total: 275 ms
Wall time: 301 ms


In [5]:
# Export WOE Data: register both WOE-encoded frames on the model data
# dictionary (it is written to disk by the pickle save cell further down).
model_data_pckl_dict.update({
    'dev_data_woe': dev_data_woe,
    'val_data_woe': val_data_woe,
})

<h4> 3.5 Convert Variable Values - Dummy Approach</h4>

In [6]:
%%time

# Create Dummy Variables: the helper returns the two encoded frames followed by
# two encoder objects, which are stored in the metadata by a later cell.
dummy_outputs = create_dummy_features(dev_data, val_data, resp_var, id_varlist)
dev_data_dummy, val_data_dummy, d_ord_enc, d_oh_enc = dummy_outputs

# Check for Missing Values: one row per column with its null count.
nmiss_dev_df = (
    dev_data_dummy.isnull().sum()
    .rename('nmiss')
    .to_frame()
    .rename_axis('feature')
    .reset_index()
)
print(f"Development Variables having missing values: {nmiss_dev_df[nmiss_dev_df['nmiss']>0].index.size}")

nmiss_val_df = (
    val_data_dummy.isnull().sum()
    .rename('nmiss')
    .to_frame()
    .rename_axis('feature')
    .reset_index()
)
print(f"Validation Variables having missing values: {nmiss_val_df[nmiss_val_df['nmiss']>0].index.size}")

364 Dummy Variables Created
Starting Correlation Check with 60% cut-off
Variables Dropped: 10. Updated Development Data Shape: (341, 356)
Development Variables having missing values: 0
Validation Variables having missing values: 0
CPU times: user 382 ms, sys: 15.9 ms, total: 397 ms
Wall time: 449 ms


In [7]:
# Update Pickle Model Data with the dummy-encoded development / validation frames
model_data_pckl_dict.update({
    'dev_data_dummy': dev_data_dummy,
    'val_data_dummy': val_data_dummy,
})

# Update Pickle Metadata with the two encoder objects returned by create_dummy_features
model_metadata_pckl_dict.update({
    'd_ord_enc': d_ord_enc,
    'd_oh_enc': d_oh_enc,
})

<h4> 3.6 Export Fine Classing Data</h4>

In [11]:
# Update Pickle Metadata with the coarse and fine classing tables
model_metadata_pckl_dict.update({
    'c_class_df': coarse_classing_df,
    'f_class_df': fine_classing_df,
})

# Save Model Data first, then Model Metadata; both files are overwritten in
# place under `outpath` (the same paths they were loaded from).
pickle_targets = (
    ('model_data.pickle', model_data_pckl_dict),
    ('model_metadata.pickle', model_metadata_pckl_dict),
)
for pickle_name, payload in pickle_targets:
    with open(f'{outpath}/{pickle_name}', 'wb') as f:
        pickle.dump(payload, f)

In [9]:
%%time

# Export Fine Classing Output: the bin edge helper columns are dropped before
# the table is handed to the exporter.
fine_classing_export_df = fine_classing_df.drop(columns=['bin_left', 'bin_right'])
fine_classing_xlsx_name = f"{dev_fname.split('.')[0]}_fine_classing.xlsx"
export_fine_classing_data(fine_classing_export_df, outpath, fine_classing_xlsx_name)

CPU times: user 14.6 s, sys: 1.95 s, total: 16.6 s
Wall time: 14.2 s
