# Simple preprocessing pipeline

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from scipy import stats

from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict, learning_curve,\
train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.inspection import permutation_importance
from sklearn.metrics import plot_confusion_matrix, classification_report, precision_recall_curve
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDRegressor, SGDClassifier, Ridge, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.compose import make_column_selector

## pipeline stuff

from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn import set_config; set_config(display='diagram')

In [3]:
from sklearn.base import TransformerMixin, BaseEstimator


class CustomOHE(TransformerMixin, BaseEstimator):

    def __init__(self):
        pass

    def fit(self, X):
        X_dummified = X.astype(str)
        X_dummified = X_dummified.applymap(lambda x: x if x not in ["nan", "NaN", "NAN", "Nan", "-1", "-1.0"] else float("nan"))
        X_dummified = pd.get_dummies(X_dummified)
        self.columns = X_dummified.columns
        return self

    def transform(self, X):
        X_dummified = X.astype(str)
        X_dummified = X_dummified.applymap(lambda x: x if x not in ["nan", "NaN", "NAN", "Nan", "-1", "-1.0"] else float("nan"))
        X_dummified = pd.get_dummies(X_dummified)
        # Only keep columns that are computed in the fit() method
        # Drop new dummy columns if new category appears in the test set that were never seen in train set
        X_dummified_reindexed = X_dummified.reindex(columns=self.columns, fill_value=0)
        return X_dummified_reindexed

In [4]:
def_df = pd.read_csv("/home/slawa/code/code-rep0/projects/data/defaulter_data_13364.csv", index_col=[0])
pay_df = pd.read_csv("/home/slawa/code/code-rep0/projects/data/payer_data_41940.csv", index_col=[0])
def_df['default'] = 1
pay_df['default'] = 0

df = pd.concat([def_df, pay_df])

In [5]:
y = df['default']

X = df.drop(columns=['default'])

In [6]:
cat_vars = ['B_30', 
            'B_38', 
            'D_114', 
            'D_116', 
            'D_117', 
            'D_120', 
            'D_126', 
            'D_63', 
            'D_64', 
            'D_66', 
            'D_68']

drop columns if they correlate > 95% with others

In [7]:
X_corr = X.corr()

In [8]:
X_corr = X_corr.unstack().reset_index() # Unstack correlation matrix 
X_corr.columns = ['feature_1','feature_2', 'correlation_all'] # rename columns
X_corr.sort_values(by="correlation_all",ascending=False, inplace=True) # sort by correlation
X_corr = X_corr[X_corr['feature_1'] != X_corr['feature_2']] # Remove self correlation
X_corr = X_corr.drop_duplicates(subset='correlation_all')

In [9]:
red_features = list(X_corr[abs(X_corr['correlation_all'])>=.95]['feature_1']) ## abs so we also consider the negative corrs

In [10]:
X_red = X.drop(columns=red_features) ## dropping the highly correlated columns

In [11]:
## checking whether the high correlations are gone
X_red_corr = X_red.corr()
X_red_corr = X_red_corr.unstack().reset_index() # Unstack correlation matrix 
X_red_corr.columns = ['feature_1','feature_2', 'correlation_all'] # rename columns
X_red_corr.sort_values(by="correlation_all",ascending=False, inplace=True) # sort by correlation
X_red_corr = X_red_corr[X_red_corr['feature_1'] != X_red_corr['feature_2']] # Remove self correlation
X_red_corr = X_red_corr.drop_duplicates(subset='correlation_all')

In [12]:
X_red_corr

Unnamed: 0,feature_1,feature_2,correlation_all
12045,D_74,D_58,0.927332
6471,B_13,B_12,0.921825
457,B_2,B_33,0.913250
728,S_3,S_7,0.903899
28001,D_131,D_132,0.891850
...,...,...,...
9398,B_20,B_2,-0.779728
5981,S_8,S_15,-0.783457
23888,B_39,B_17,-0.805295
11791,D_73,D_108,-0.851429


In [13]:
len(red_features) ## we removed 13 columns

13

drop columns with nans if in __both__ groups > 80% nans

In [14]:
def_nans = def_df.isna().sum()/len(def_df) 

In [15]:
def_nans_80 = def_nans[def_nans >= 0.8].index

In [16]:
pay_nans = pay_df.isna().sum()/len(pay_df)

In [17]:
pay_nans_80 = pay_nans[pay_nans>=0.8].index

In [18]:
nans_80 = [feature for feature in pay_nans_80 if feature in def_nans_80]

In [19]:
## check whether features were already removed
red_features_nan = [feature for feature in nans_80 if feature not in red_features] 

In [20]:
X_red = X_red.drop(columns=red_features_nan)

In [21]:
dropped_columns = red_features + red_features_nan

In [22]:
red_cat_vars = [var for var in cat_vars if var not in dropped_columns] ## categorical variables that are left

In [24]:
# impute mean/most frequent value for other nans (specific to group?)
# robustscale all numerical values

num_imputer = SimpleImputer(strategy="mean") ## replace with KNNIMputer
num_scaler = RobustScaler()


num_pipe = make_pipeline(num_imputer, num_scaler)

In [26]:
#make_column_selector(red_cat_vars)

<sklearn.compose._column_transformer.make_column_selector at 0x7f346b8ea310>

In [48]:
cat_imputer = SimpleImputer(strategy="most_frequent") ## replace with KNNimputer on one neighbour, after transforming to numericals
#cat_encoder = CustomOHE() ## does not work with numpy.arrays in COlumn_transformer yet
cat_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
cat_pipe = make_pipeline(cat_imputer, cat_encoder)
#preprocessing_pipe = make_pipeline

Beware of the Dummy trap. (Check)

In [49]:
num_vars = [feature for feature in X_red.columns[2:] if feature not in cat_vars] ## exclude dates and IDs (first two columns)

In [50]:
len(num_vars)

150

In [51]:
len(X_red.columns[1:])

161

In [52]:
preprocessor = ColumnTransformer([
    ('num_pip', num_pipe, num_vars),
    ('cat_pip', cat_pipe, red_cat_vars)],
    remainder='passthrough'
)

In [55]:
pd.DataFrame(preprocessor.fit_transform(X_red))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,185,186,187,188,189,190,191,192,193,194
0,-0.427817,1.390477,0.219605,0.633174,1.419042,0.376803,0.112298,-0.04552,1.241837,0.866629,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6ba461c93869797c49b0f34c29274e50915466eda02a82...,2017-04-29
1,-0.587316,-0.024268,0.004738,85.887104,0.474849,-0.752905,0.035997,0.312615,1.176238,-0.015821,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6ea315e6f219bc513964121907331f96e5b194127ebee3...,2017-09-10
2,-0.58707,0.365948,0.005095,85.474799,0.730048,-0.628761,0.12569,0.0,-0.115859,5.867669,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6f105fec0ab833183a6a7aa3e39e79a6adf88a7cabe9df...,2017-03-26
3,-0.641399,-0.027197,0.214508,0.403981,0.927122,0.463563,0.095218,0.0,-0.12613,5.873947,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6f105fec0ab833183a6a7aa3e39e79a6adf88a7cabe9df...,2017-04-26
4,-0.565503,0.374308,0.003313,86.646623,-0.24895,-0.861436,0.113565,0.0,3.716587,4.884302,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6f105fec0ab833183a6a7aa3e39e79a6adf88a7cabe9df...,2017-05-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55299,0.279935,-0.028833,-0.004014,-0.765108,3.532037,-0.502783,-0.050771,0.0,0.0,0.866629,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,ee9e5e4581c6949e605e528fe831f61f781b82d1dc9360...,2017-11-04
55300,0.267095,-0.037582,0.002415,0.116283,0.015608,-0.138171,0.026206,0.0,0.0,0.866629,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,ee9e5e4581c6949e605e528fe831f61f781b82d1dc9360...,2017-12-12
55301,0.271628,-0.016493,-0.001598,0.424582,-0.025929,0.05312,-0.008016,0.0,0.0,0.866629,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,ee9e5e4581c6949e605e528fe831f61f781b82d1dc9360...,2018-01-18
55302,0.266345,-0.016073,-0.00416,-0.560179,-0.0644,0.670352,-0.065443,0.0,0.0,0.866629,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,ee9e5e4581c6949e605e528fe831f61f781b82d1dc9360...,2018-02-03
