# Simple preprocessing pipeline

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [84]:
from scipy import stats

from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict, learning_curve,\
train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.inspection import permutation_importance
from sklearn.metrics import plot_confusion_matrix, classification_report, precision_recall_curve
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDRegressor, SGDClassifier, Ridge, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.compose import make_column_selector

## pipeline stuff

from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn import set_config; set_config(display='diagram')

In [3]:
def_df = pd.read_csv("/home/slawa/code/code-rep0/projects/data/defaulter_data_13364.csv", index_col=[0])
pay_df = pd.read_csv("/home/slawa/code/code-rep0/projects/data/payer_data_41940.csv", index_col=[0])
def_df['default'] = 1
pay_df['default'] = 0

df = pd.concat([def_df, pay_df])

In [4]:
y = df['default']

X = df.drop(columns=['default'])

In [75]:
cat_vars = ['B_30', 
            'B_38', 
            'D_114', 
            'D_116', 
            'D_117', 
            'D_120', 
            'D_126', 
            'D_63', 
            'D_64', 
            'D_66', 
            'D_68']

drop columns if they correlate > 95% with others

In [5]:
X_corr = X.corr()

In [7]:
X_corr = X_corr.unstack().reset_index() # Unstack correlation matrix 
X_corr.columns = ['feature_1','feature_2', 'correlation_all'] # rename columns
X_corr.sort_values(by="correlation_all",ascending=False, inplace=True) # sort by correlation
X_corr = X_corr[X_corr['feature_1'] != X_corr['feature_2']] # Remove self correlation
X_corr = X_corr.drop_duplicates(subset='correlation_all')

In [66]:
red_features = list(X_corr[abs(X_corr['correlation_all'])>=.95]['feature_1']) ## abs so we also consider the negative corrs

In [67]:
X_red = X.drop(columns=red_features) ## dropping the highly correlated columns

In [68]:
## checking whether the high correlations are gone
X_red_corr = X_red.corr()
X_red_corr = X_red_corr.unstack().reset_index() # Unstack correlation matrix 
X_red_corr.columns = ['feature_1','feature_2', 'correlation_all'] # rename columns
X_red_corr.sort_values(by="correlation_all",ascending=False, inplace=True) # sort by correlation
X_red_corr = X_red_corr[X_red_corr['feature_1'] != X_red_corr['feature_2']] # Remove self correlation
X_red_corr = X_red_corr.drop_duplicates(subset='correlation_all')

In [69]:
X_red_corr

Unnamed: 0,feature_1,feature_2,correlation_all
12045,D_74,D_58,0.927332
6471,B_13,B_12,0.921825
457,B_2,B_33,0.913250
728,S_3,S_7,0.903899
28001,D_131,D_132,0.891850
...,...,...,...
9398,B_20,B_2,-0.779728
5981,S_8,S_15,-0.783457
23888,B_39,B_17,-0.805295
11791,D_73,D_108,-0.851429


In [30]:
len(red_features) ## we removed 13 columns

13

drop columns with nans if in __both__ groups > 80% nans

In [33]:
def_nans = def_df.isna().sum()/len(def_df) 

In [44]:
def_nans_80 = def_nans[def_nans >= 0.8].index

In [39]:
pay_nans = pay_df.isna().sum()/len(pay_df)

In [42]:
pay_nans_80 = pay_nans[pay_nans>=0.8].index

In [45]:
nans_80 = [feature for feature in pay_nans_80 if feature in def_nans_80]

In [70]:
## check whether features were already removed
red_features_nan = [feature for feature in nans_80 if feature not in red_features] 

In [60]:
X_red = X_red.drop(columns=red_features_nan)

In [71]:
dropped_columns = red_features + red_features_nan

In [76]:
red_cat_vars = [var for var in cat_vars if var not in dropped_columns] ## categorical variables that are left

In [78]:
# impute mean/most frequent value for other nans (specific to group?)
# robustscale all numerical values

num_imputer = SimpleImputer(strategy="mean")
num_scaler = RobustScaler()


num_pipe = make_pipeline(num_imputer, num_scaler)

In [79]:
num_pipe

In [85]:
make_column_selector(red_cat_vars)

<sklearn.compose._column_transformer.make_column_selector at 0x7fa11ffff670>

In [80]:
cat_imputer = SimpleImputer(strategy="most_frequent")
cat_encoder = OneHotEncoder(sparse=False,dtype=int, handle_unknown='ignore')
#preprocessing_pipe = make_pipeline

In [99]:
X_red[red_cat_vars].dtypes

B_30     float64
B_38     float64
D_114    float64
D_116    float64
D_117    float64
D_120    float64
D_126    float64
D_63      object
D_64      object
D_68     float64
dtype: object

In [104]:
tmp = X_red[red_cat_vars].astype(str)

Beware of the Dummy Trap.

In [126]:
tmp1 = tmp.applymap(lambda x: x if x not in ["nan", "NaN", "NAN", "Nan", "-1", "-1.0"] else float("nan"))

In [127]:
tmp1

Unnamed: 0,B_30,B_38,D_114,D_116,D_117,D_120,D_126,D_63,D_64,D_68
0,0.0,2.0,0.0,0.0,3.0,0.0,1.0,CO,U,3.0
1,0.0,2.0,1.0,0.0,2.0,1.0,1.0,CO,O,6.0
2,0.0,2.0,1.0,0.0,,0.0,1.0,CO,O,6.0
3,0.0,2.0,1.0,0.0,,0.0,1.0,CO,O,6.0
4,0.0,2.0,1.0,0.0,,0.0,1.0,CO,O,6.0
...,...,...,...,...,...,...,...,...,...,...
41935,0.0,1.0,1.0,0.0,4.0,0.0,1.0,CR,O,6.0
41936,0.0,1.0,1.0,0.0,4.0,0.0,1.0,CR,O,6.0
41937,0.0,1.0,1.0,0.0,4.0,0.0,1.0,CR,O,6.0
41938,0.0,1.0,1.0,0.0,4.0,0.0,1.0,CR,O,6.0


In [128]:
pd.get_dummies(tmp1)

Unnamed: 0,B_30_0.0,B_30_1.0,B_30_2.0,B_38_1.0,B_38_2.0,B_38_3.0,B_38_4.0,B_38_5.0,B_38_6.0,B_38_7.0,...,D_64_O,D_64_R,D_64_U,D_68_0.0,D_68_1.0,D_68_2.0,D_68_3.0,D_68_4.0,D_68_5.0,D_68_6.0
0,1,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41935,1,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
41936,1,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
41937,1,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
41938,1,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [92]:
X[cat_vars].dtypes

B_30     float64
B_38     float64
D_114    float64
D_116    float64
D_117    float64
D_120    float64
D_126    float64
D_63      object
D_64      object
D_66     float64
D_68     float64
dtype: object

In [93]:
X[red_cat_vars].dtypes

B_30     float64
B_38     float64
D_114    float64
D_116    float64
D_117    float64
D_120    float64
D_126    float64
D_63      object
D_64      object
D_68     float64
dtype: object

In [None]:
Pipeline([
    ('num_pip', num_pipe),
    ('my_name_for_scaler', StandardScaler())
])