In [1]:
import os
import numpy as np
import pandas as pd
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from seldonian.utils.io_utils import save_json

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Helpful reference: https://machinelearningmastery.com/imbalanced-classification-of-good-and-bad-credit/

# Path to the german.csv file on your filesystem.
# Set the GERMAN_CREDIT_CSV environment variable to point somewhere else
# without editing this cell; otherwise the original location is used.
f_orig = os.environ.get(
    "GERMAN_CREDIT_CSV",
    "/Users/ahoag/beri/code/datasets/german_credit/german.csv",
)

In [3]:
# Names given to the columns of the header-less german.csv, in file order.
# The last entry, "credit_rating", is the label column.
columns_orig = [
    "account_status",
    "months",
    "credit_history",
    "purpose",
    "credit_amount",
    "savings_accounts",
    "employment_since",
    "installment_rate",
    "personal_status",
    "other_debtors",
    "present_residence_since",
    "property",
    "age_yrs",
    "other_installment_plans",
    "housing",
    "num_existing_credits",
    "job",
    "num_people_liable",
    "telephone",
    "foreign_worker",
    "credit_rating",
]

def make_seldonian_dataset(input_path: str, output_path_data: str, output_path_metadata: str) -> None:
    """Encode the German credit CSV and write a data CSV plus a metadata JSON.

    Steps:

    1. Read the header-less CSV, naming its columns from the module-level
       ``columns_orig`` list.
    2. Collapse ``personal_status`` into a two-valued ``sex`` column ("F"/"M").
    3. One-hot encode the categorical features and scale the numerical
       features to zero mean and unit variance.
    4. Label-encode ``credit_rating``. ``LabelEncoder`` numbers the classes in
       sorted order, so the smallest raw label becomes 0; the intent is
       0: good credit and 1: bad credit (assumes the raw labels are
       1 = good, 2 = bad -- confirm against your copy of the dataset).
    5. Save the encoded frame as a CSV without header or index, and save a
       metadata JSON listing the column names, the label column and the
       sensitive columns ("F" and "M").

    :param input_path: The path to the original dataset in CSV format
    :type input_path: str

    :param output_path_data: The filename for saving the reformatted dataset file
    :type output_path_data: str

    :param output_path_metadata: The filename for saving the reformatted metadata file
    :type output_path_metadata: str

    :raises ValueError: if ``personal_status`` contains a value other than
        A91-A95, since the sex of that row could not be determined.
    """
    df = pd.read_csv(input_path, header=None, names=columns_orig)

    # split into inputs and outputs
    X = df.drop(columns=["credit_rating"])
    y = df["credit_rating"]

    # A91, A93 and A94 are male and A92, A95 are female.
    # Map every code explicitly: treating "anything that is not female" as
    # male would silently mislabel unknown codes or missing values, and this
    # column becomes the sensitive attribute.
    status_to_sex = {"A91": "M", "A92": "F", "A93": "M", "A94": "M", "A95": "F"}
    sex = X["personal_status"].map(status_to_sex)
    unmapped = sex.isna()
    if unmapped.any():
        bad_codes = sorted(X.loc[unmapped, "personal_status"].astype(str).unique())
        raise ValueError(
            f"Unexpected personal_status value(s) {bad_codes}; "
            f"expected one of {sorted(status_to_sex)}"
        )

    # Overwrite the column in place (keeps its position, and therefore the
    # order of the encoded output columns) and rename it to sex.
    X = X.assign(personal_status=sex).rename(columns={"personal_status": "sex"})

    # Numerical features are scaled; every other column is one-hot encoded.
    # Deriving the categorical set as "everything that is not numeric"
    # guarantees no column is left out and then silently dropped by
    # ColumnTransformer's default remainder="drop". sort=False keeps the
    # original column order.
    num_ix = X.select_dtypes(include="number").columns
    cat_ix = X.columns.difference(num_ix, sort=False)

    # sparse_threshold=0 forces a dense array. OneHotEncoder emits sparse
    # output, and with the default threshold the stacked result becomes a
    # scipy sparse matrix whenever its density is low enough, which
    # pd.DataFrame(X, columns=...) below cannot consume.
    ct = ColumnTransformer(
        [("c", OneHotEncoder(), cat_ix), ("n", StandardScaler(), num_ix)],
        sparse_threshold=0,
    )

    # Apply transformation
    X = ct.fit_transform(X)

    # label encode the target variable to have the classes 0 and 1
    y = LabelEncoder().fit_transform(y)

    # Get names after one-hot encoding (prefixed "c__" / "n__" by transformer)
    output_columns = ct.get_feature_names_out()

    # Make an output dataframe to save from X and y
    outdf = pd.DataFrame(X, columns=output_columns)

    # Change name of the two one-hot encoded sex columns to M and F
    outdf = outdf.rename(columns={"c__sex_F": "F", "c__sex_M": "M"})

    # Add label column into final dataframe
    outdf["credit_rating"] = y

    # Save final dataframe; the column names are stored in the metadata file
    # instead of a CSV header row.
    outdf.to_csv(output_path_data, index=False, header=False)
    print(f"Saved data file to: {output_path_data}")
    print()

    # Save metadata json file
    metadata_dict = {
        "regime": "supervised_learning",
        "sub_regime": "classification",
        "columns": list(outdf.columns),
        "label_column": "credit_rating",
        "sensitive_columns": ["F", "M"],
    }

    with open(output_path_metadata, "w") as outfile:
        json.dump(metadata_dict, outfile, indent=2)
    print(f"Saved metadata file to: {output_path_metadata}")

In [4]:
# Build the encoded data CSV and its metadata JSON from the raw file.
# The output paths are relative, so both files are written to the current
# working directory.
make_seldonian_dataset(
    input_path=f_orig,
    output_path_data="german_loan_numeric_forseldonian.csv",
    output_path_metadata="metadata_german_loan.json",
)

Saved data file to: german_loan_numeric_forseldonian.csv

Saved metadata file to: metadata_german_loan.json
