# Simple preprocessing pipeline

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict, learning_curve,\
train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.inspection import permutation_importance
from sklearn.metrics import plot_confusion_matrix, classification_report, precision_recall_curve
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDRegressor, SGDClassifier, Ridge, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.compose import make_column_selector

## pipeline stuff

from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn import set_config; set_config(display='diagram')

def_df = pd.read_csv("../raw_data/defaulter_data_13364.csv", index_col=[0])
pay_df = pd.read_csv("../raw_data/payer_data_41940.csv", index_col=[0])
def_df['default'] = 1
pay_df['default'] = 0

df = pd.concat([def_df, pay_df])

y = df['default']

X = df.drop(columns=['default'])

cat_vars = ['B_30', 
            'B_38', 
            'D_114', 
            'D_116', 
            'D_117', 
            'D_120', 
            'D_126', 
            'D_63', 
            'D_64', 
            'D_66', 
            'D_68']

#drop columns if they correlate > 95% with others

X_corr = X.corr()

X_corr = X_corr.unstack().reset_index() # Unstack correlation matrix 
X_corr.columns = ['feature_1','feature_2', 'correlation_all'] # rename columns
X_corr.sort_values(by="correlation_all",ascending=False, inplace=True) # sort by correlation
X_corr = X_corr[X_corr['feature_1'] != X_corr['feature_2']] # Remove self correlation
X_corr = X_corr.drop_duplicates(subset='correlation_all')

red_features = list(X_corr[abs(X_corr['correlation_all'])>=.95]['feature_1']) ## abs so we also consider the negative corrs

X_red = X.drop(columns=red_features) ## dropping the highly correlated columns

## checking whether the high correlations are gone
X_red_corr = X_red.corr()
X_red_corr = X_red_corr.unstack().reset_index() # Unstack correlation matrix 
X_red_corr.columns = ['feature_1','feature_2', 'correlation_all'] # rename columns
X_red_corr.sort_values(by="correlation_all",ascending=False, inplace=True) # sort by correlation
X_red_corr = X_red_corr[X_red_corr['feature_1'] != X_red_corr['feature_2']] # Remove self correlation
X_red_corr = X_red_corr.drop_duplicates(subset='correlation_all')

print(f'{len(red_features)} of features are removed') ## we removed 13 columns

drop columns with nans if in __both__ groups > 80% nans

def_nans = def_df.isna().sum()/len(def_df) 

def_nans_80 = def_nans[def_nans >= 0.8].index

pay_nans = pay_df.isna().sum()/len(pay_df)

pay_nans_80 = pay_nans[pay_nans>=0.8].index

nans_80 = [feature for feature in pay_nans_80 if feature in def_nans_80]

## check whether features were already removed
red_features_nan = [feature for feature in nans_80 if feature not in red_features] 

X_red = X_red.drop(columns=red_features_nan)

dropped_columns = red_features + red_features_nan

df_dropped=df.drop(columns=dropped_columns)

In [24]:
df_dropped.shape

(55304, 163)

In [None]:
red_cat_vars = [var for var in cat_vars if var not in dropped_columns] ## categorical variables that are left

In [None]:
# impute mean/most frequent value for other nans (specific to group?)
# robustscale all numerical values

num_imputer = SimpleImputer(strategy="mean") ## replace with KNNIMputer
num_scaler = RobustScaler()


num_pipe = make_pipeline(num_imputer, num_scaler)

In [None]:
#make_column_selector(red_cat_vars)

In [None]:
cat_imputer = SimpleImputer(strategy="most_frequent") ## replace with KNNimputer on one neighbour, after transforming to numericals
#cat_encoder = CustomOHE() ## does not work with numpy.arrays in COlumn_transformer yet
cat_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
cat_pipe = make_pipeline(cat_imputer, cat_encoder)
#preprocessing_pipe = make_pipeline

Beware of the Dummy trap. (Check)

In [None]:
num_vars = [feature for feature in X_red.columns[2:] if feature not in cat_vars] ## exclude dates and IDs (first two columns)

In [None]:
len(num_vars)

In [None]:
len(X_red.columns[1:])

In [None]:
X_red.index=X_red['customer_ID']
X_red_new=X_red.drop(columns=['customer_ID','S_2'])

In [None]:
y.index=X_red['customer_ID']

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_red_new, y, test_size=0.2, random_state=42)

In [None]:
preprocessor = ColumnTransformer([
    ('num_pip', num_pipe, num_vars),
    ('cat_pip', cat_pipe, red_cat_vars)],
    remainder='passthrough'
)

In [None]:
new_data=pd.DataFrame(preprocessor.fit_transform(X_train))

In [None]:
from sklearn.model_selection import cross_validate

lr=LogisticRegression()
lr.fit(new_data,y_train)

In [None]:
X_test_tran=preprocessor.transform(X_test)
y_pre=lr.predict(X_test_tran)

In [None]:
type(y_test)

In [None]:
y_pre_df=pd.DataFrame(data=y_pre,columns=['prediction'])

y_test_df=y_test.to_frame()

y_test_df.columns=['target']

In [None]:
y_test_df.index=y_test.index
y_pre_df.index=X_test.index

In [None]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [None]:
amex_metric(y_test_df,y_pre_df) 