# Simple preprocessing pipeline

In [143]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [79]:
from scipy import stats

from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict, learning_curve,\
train_test_split, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.inspection import permutation_importance
from sklearn.metrics import plot_confusion_matrix, classification_report, precision_recall_curve
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDRegressor, SGDClassifier, Ridge, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.compose import make_column_selector

## pipeline stuff

from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn import set_config; set_config(display='diagram')

In [3]:
from sklearn.base import TransformerMixin, BaseEstimator


class CustomOHE(TransformerMixin, BaseEstimator):

    def __init__(self):
        pass

    def fit(self, X):
        X_dummified = X.astype(str)
        X_dummified = X_dummified.applymap(lambda x: x if x not in ["nan", "NaN", "NAN", "Nan", "-1", "-1.0"] else float("nan"))
        X_dummified = pd.get_dummies(X_dummified)
        self.columns = X_dummified.columns
        return self

    def transform(self, X):
        X_dummified = X.astype(str)
        X_dummified = X_dummified.applymap(lambda x: x if x not in ["nan", "NaN", "NAN", "Nan", "-1", "-1.0"] else float("nan"))
        X_dummified = pd.get_dummies(X_dummified)
        # Only keep columns that are computed in the fit() method
        # Drop new dummy columns if new category appears in the test set that were never seen in train set
        X_dummified_reindexed = X_dummified.reindex(columns=self.columns, fill_value=0)
        return X_dummified_reindexed

In [4]:
def_df = pd.read_csv("/home/slawa/code/code-rep0/projects/data/defaulter_data_13364.csv", index_col=[0])
pay_df = pd.read_csv("/home/slawa/code/code-rep0/projects/data/payer_data_41940.csv", index_col=[0])
def_df['default'] = 1
pay_df['default'] = 0

df = pd.concat([def_df, pay_df])

In [5]:
y = df['default']

X = df.drop(columns=['default'])

Discarding columns if too highly correlated with other or too many Nans

In [6]:
cat_vars = ['B_30', 
            'B_38', 
            'D_114', 
            'D_116', 
            'D_117', 
            'D_120', 
            'D_126', 
            'D_63', 
            'D_64', 
            'D_66', 
            'D_68']

drop columns if they correlate > 95% with others

In [7]:
X_corr = X.corr()

In [8]:
X_corr = X_corr.unstack().reset_index() # Unstack correlation matrix 
X_corr.columns = ['feature_1','feature_2', 'correlation_all'] # rename columns
X_corr.sort_values(by="correlation_all",ascending=False, inplace=True) # sort by correlation
X_corr = X_corr[X_corr['feature_1'] != X_corr['feature_2']] # Remove self correlation
X_corr = X_corr.drop_duplicates(subset='correlation_all')

In [9]:
red_features = list(X_corr[abs(X_corr['correlation_all'])>=.95]['feature_1']) ## abs so we also consider the negative corrs

In [10]:
X_red = X.drop(columns=red_features) ## dropping the highly correlated columns

In [11]:
## checking whether the high correlations are gone
X_red_corr = X_red.corr()
X_red_corr = X_red_corr.unstack().reset_index() # Unstack correlation matrix 
X_red_corr.columns = ['feature_1','feature_2', 'correlation_all'] # rename columns
X_red_corr.sort_values(by="correlation_all",ascending=False, inplace=True) # sort by correlation
X_red_corr = X_red_corr[X_red_corr['feature_1'] != X_red_corr['feature_2']] # Remove self correlation
X_red_corr = X_red_corr.drop_duplicates(subset='correlation_all')

In [12]:
X_red_corr

Unnamed: 0,feature_1,feature_2,correlation_all
12045,D_74,D_58,0.927332
6471,B_13,B_12,0.921825
457,B_2,B_33,0.913250
728,S_3,S_7,0.903899
28001,D_131,D_132,0.891850
...,...,...,...
9398,B_20,B_2,-0.779728
5981,S_8,S_15,-0.783457
23888,B_39,B_17,-0.805295
11791,D_73,D_108,-0.851429


In [13]:
len(red_features) ## we removed 13 columns

13

drop columns with nans if in __both__ groups > 80% nans

In [14]:
nan_threshold= 0.8 ## adjust the hardcoded values

In [15]:
def_nans = def_df.isna().sum()/len(def_df) 

In [16]:
def_nans_80 = def_nans[def_nans >= 0.8].index

In [17]:
pay_nans = pay_df.isna().sum()/len(pay_df)

In [18]:
pay_nans_80 = pay_nans[pay_nans>=0.8].index

In [19]:
nans_80 = [feature for feature in pay_nans_80 if feature in def_nans_80]

In [20]:
## check whether features were already removed
red_features_nan = [feature for feature in nans_80 if feature not in red_features] 

In [21]:
X_red = X_red.drop(columns=red_features_nan)

In [22]:
dropped_columns = red_features + red_features_nan

In [62]:
str_vars + cat_vars

['D_63',
 'D_64',
 'B_30',
 'B_38',
 'D_114',
 'D_116',
 'D_117',
 'D_120',
 'D_126',
 'D_63',
 'D_64',
 'D_66',
 'D_68']

Building the pipeline

In [115]:
num_vars = [feature for feature in X_red.columns[2:] if feature not in cat_vars] ## exclude dates and IDs (first two columns)
str_vars = [feature for feature in X_red.columns[2:] if not pd.api.types.is_numeric_dtype(X_red[feature])] ## columns that are not numeric at all 
red_cat_vars = [feature for feature in cat_vars if feature not in dropped_columns] ## remaining categorical variables that have no string values

mark rows in categorical columns with values of -1 as NaNs

In [24]:
#X_red[red_cat_vars] = X_red[red_cat_vars].applymap(lambda x: np.nan if x in [-1,-1.0, "-1.0", "-1"] else x)

In [46]:
def nan_imp(X): ## imputes nan values for alternative values signifying nans
    nan_list = [-1,-1.0, "-1.0", "-1"] 
    return X.applymap(lambda x: np.nan if x in nan_list else x) ## perhaps subfunctions for arrays

In [116]:
# impute mean/most frequent value for other nans (specific to group?)
# robustscale all numerical values

num_scaler = RobustScaler()
num_imputer = SimpleImputer(strategy='mean')
#num_imputer = KNNImputer(n_neighbors=2) ## KNNIMputer is computationally demanding
## should come AFTER SCALING

num_pipe = make_pipeline(num_scaler, num_imputer)

str_trans = OrdinalEncoder() # is only needed if one wants to do knnimputer

nan_trans = FunctionTransformer(nan_imp)
cat_imputer = SimpleImputer(strategy="most_frequent") ## replace with KNNimputer on one neighbour, after transforming to numericals
#cat_imputer = KNNImputer(n_neighbors=1) # introducing it did not improve performance, but is computationally demanding
cat_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore') ## what happens to the old columns?
cat_pipe = make_pipeline(nan_trans, cat_imputer, cat_encoder)
str_pipe = make_pipeline(nan_trans, str_trans, cat_imputer, cat_encoder)

Beware of the Dummy trap. OneHotEncoder automatically deletes one column.

In [102]:
str_pipe

In [117]:
preprocessor = ColumnTransformer([
    ('num_pip', num_pipe, num_vars),
    ('cat_pip', cat_pipe, red_cat_vars), 
    ('str_pip', str_pipe, str_vars)],
    remainder='drop' ## all columns not in num_vars and red_cat_vars are dropped.
)

In [118]:
preprocessor

In [119]:
X_pp = pd.DataFrame(preprocessor.fit_transform(X_red))

In [105]:
X_red[red_cat_vars].nunique().sum() ## number of unqie values in all cat. values
## should equal the number of new onehotencoded columns, but it does not


40

In [None]:
# X_red_cat = X_red[red_cat_vars]
# X_dummified = X_red_cat.astype(str)
# X_dummified = X_dummified.applymap(lambda x: x if x not in ["nan", "NaN", "NAN", "Nan", "-1", "-1.0"] else float("nan"))
# X_dummified = pd.get_dummies(X_dummified)
# ## here I additionally make -1s to NaNs, thus 40, not 43 new columns come out. 
# #there only 33 come out ... dummy trap?

In [108]:
# defaulters = pd.DataFrame(def_df['customer_ID'].unique())
# payers = pd.DataFrame(pay_df['customer_ID'].unique())
# defaulters['default']=1
# payers['default']=0

# default_per_customer = pd.concat([defaulters, payers])

# y_unique = default_per_customer['default']

#ynp.r_[def_df['customer_ID'].unique(), pay_df['customer_ID'].unique()].shape

Building the simple model and putting it into the pipe

In [120]:
mod = LogisticRegression() 
## such a model treats all rows as independent despite them being from the same person
results = cross_validate(mod, X_pp, y, cv = 5, scoring=['accuracy', 'recall', 'f1'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [114]:
results

{'fit_time': array([1.9299469 , 1.79605651, 1.71983409, 2.16600227, 1.70930767]),
 'score_time': array([0.02362657, 0.02491212, 0.02906871, 0.02416158, 0.03222513]),
 'test_accuracy': array([0.84305217, 0.83636199, 0.84739174, 0.85064642, 0.83218807]),
 'test_recall': array([0.67190423, 0.57575758, 0.62813318, 0.63000374, 0.59431138]),
 'test_f1': array([0.67417417, 0.6297054 , 0.66547761, 0.67091633, 0.63116057])}

In [106]:
results

{'fit_time': array([1.96157551, 1.93135667, 1.73852777, 1.78588533, 1.70706511]),
 'score_time': array([0.03117967, 0.0251298 , 0.02462554, 0.02541804, 0.02456045]),
 'test_accuracy': array([0.83916463, 0.838803  , 0.84359461, 0.85353946, 0.839783  ]),
 'test_recall': array([0.63262252, 0.60007482, 0.61728395, 0.64197531, 0.62649701]),
 'test_f1': array([0.65529936, 0.64275696, 0.65606362, 0.67933492, 0.65390625])}

In [111]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)

In [113]:
pp_pred_pipe = make_pipeline(preprocessor, mod)

In [114]:
pp_pred_pipe

In [115]:
pp_pred_pipe.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [116]:
import pickle

In [118]:
pickle.dump(pp_pred_pipe, open('pp_pred_pipe', 'wb'))

In [120]:
#loaded_model = pickle.load(open('pp_pred_pipe', 'rb'))

In [121]:
#loaded_model