# Simple preprocessing pipeline

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from scipy import stats

from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict, learning_curve,\
train_test_split, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.inspection import permutation_importance
from sklearn.metrics import plot_confusion_matrix, classification_report, precision_recall_curve
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDRegressor, SGDClassifier, Ridge, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.compose import make_column_selector

## pipeline stuff

from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn import set_config; set_config(display='diagram')

In [3]:
import warnings 
# Silence every warning category for the rest of the session. Note that this
# also hides the convergence and deprecation warnings raised by the libraries
# used below.
warnings.filterwarnings('ignore')

In [6]:
# Load both cohorts (the first CSV column is the stored row index) and label
# them: 1 for defaulters, 0 for payers.
def_df = pd.read_csv(
    "/home/slawa/code/code-rep0/projects/data/defaulter_data_13364.csv",
    index_col=[0],
).assign(default=1)
pay_df = pd.read_csv(
    "/home/slawa/code/code-rep0/projects/data/payer_data_41940.csv",
    index_col=[0],
).assign(default=0)

# Stack defaulters on top of payers; the original row labels are kept.
df = pd.concat((def_df, pay_df), axis=0)

In [5]:
# Separate features and target; both receive a fresh 0..n-1 index so that
# they stay aligned row by row.
X = df.drop('default', axis=1).reset_index(drop=True)

y = df.loc[:, 'default'].reset_index(drop=True)

Discard columns that are too highly correlated with another column or that contain too many NaNs

In [6]:
# Columns that this notebook treats as categorical.
cat_vars = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]

Drop columns whose absolute correlation with another column is at least 0.95

In [7]:
# Pairwise correlations of the numeric/boolean columns only. Selecting the
# dtypes explicitly keeps this cell working on pandas >= 2.0, where
# DataFrame.corr() raises on string columns (e.g. customer_ID) instead of
# silently skipping them.
X_corr = X.select_dtypes(include=['number', 'bool']).corr()

In [8]:
# Turn the square correlation matrix into one row per (feature_1, feature_2) pair.
X_corr = X_corr.unstack().reset_index()
X_corr.columns = ['feature_1', 'feature_2', 'correlation_all']
X_corr = (
    X_corr
    # strongest positive correlations first
    .sort_values(by="correlation_all", ascending=False)
    # a feature always correlates perfectly with itself
    .loc[lambda pairs: pairs['feature_1'] != pairs['feature_2']]
    # keep one row per distinct correlation value (removes the mirrored pair)
    .drop_duplicates(subset='correlation_all')
)

In [9]:
## the absolute value makes strongly negative correlations count as well
red_features = X_corr.loc[X_corr['correlation_all'].abs() >= 0.95, 'feature_1'].tolist()

In [10]:
## remove the highly correlated columns collected in red_features
X_red = X.drop(red_features, axis='columns')

In [11]:
## checking whether the high correlations are gone
# Only numeric/boolean columns are correlated, so the cell also runs on
# pandas >= 2.0, where DataFrame.corr() raises on string columns instead of
# skipping them.
X_red_corr = X_red.select_dtypes(include=['number', 'bool']).corr()
# One row per (feature_1, feature_2) pair.
X_red_corr = X_red_corr.unstack().reset_index()
X_red_corr.columns = ['feature_1', 'feature_2', 'correlation_all']
X_red_corr = (
    X_red_corr
    # strongest positive correlations first
    .sort_values(by="correlation_all", ascending=False)
    # drop every feature's correlation with itself
    .loc[lambda pairs: pairs['feature_1'] != pairs['feature_2']]
    # keep one row per distinct correlation value (removes the mirrored pair)
    .drop_duplicates(subset='correlation_all')
)

In [12]:
X_red_corr

Unnamed: 0,feature_1,feature_2,correlation_all
12045,D_74,D_58,0.927332
6471,B_13,B_12,0.921825
457,B_2,B_33,0.913250
728,S_3,S_7,0.903899
28001,D_131,D_132,0.891850
...,...,...,...
9398,B_20,B_2,-0.779728
5981,S_8,S_15,-0.783457
23888,B_39,B_17,-0.805295
11791,D_73,D_108,-0.851429


In [13]:
len(red_features) ## we removed 13 columns

13

Drop columns in which at least 80% of the values are NaN in __both__ groups (defaulters and payers)

In [14]:
# Minimum share of missing values per group at which a column becomes a drop
# candidate; the cells below compare against it with `>=`.
nan_threshold= 0.8 ## change the cut-off here; the `_80` suffixes of the names below are only labels

In [15]:
# Share of missing values per column in the defaulter group.
def_nans = def_df.isna().sum().div(len(def_df))

In [16]:
# Defaulter columns whose missing share reaches the threshold.
def_nans_80 = def_nans.loc[def_nans.ge(nan_threshold)].index

In [17]:
# Share of missing values per column in the payer group.
pay_nans = pay_df.isna().sum().div(len(pay_df))

In [18]:
# Payer columns whose missing share reaches the threshold.
pay_nans_80 = pay_nans.loc[pay_nans.ge(nan_threshold)].index

In [19]:
# Columns that cross the threshold in the payer group AND in the defaulter
# group (payer column order is kept).
nans_80 = [
    column
    for column in pay_nans_80
    if column in def_nans_80
]

In [20]:
## keep only the NaN-heavy columns that the correlation filter has not removed already
red_features_nan = [
    column
    for column in nans_80
    if column not in red_features
]

In [21]:
# Rebuild X_red from the untouched X instead of dropping from X_red itself:
# the result is the same on the first run, but the cell can now be re-executed
# without a KeyError for columns that are already gone. red_features_nan never
# overlaps red_features, so the combined drop is safe.
X_red = X.drop(columns=red_features + red_features_nan)

In [22]:
# Every column removed so far: correlation filter first, then NaN filter.
dropped_columns = [*red_features, *red_features_nan]

Building the pipeline

In [152]:
## Feature columns start at position 2; the first two columns (IDs and dates) are never used as features.
## numerical variables: every feature column that is not declared categorical
num_vars = [
    column
    for column in X_red.columns[2:]
    if column not in cat_vars
]
## feature columns whose dtype is not numeric at all
str_vars = [
    column
    for column in X_red.columns[2:]
    if not pd.api.types.is_numeric_dtype(X_red[column])
]
## categorical variables that were neither dropped above nor stored as non-numeric columns
red_cat_vars = [
    column
    for column in cat_vars
    if column not in dropped_columns + str_vars
]

Mark values of -1 in categorical columns as NaNs

In [24]:
#X_red[red_cat_vars] = X_red[red_cat_vars].applymap(lambda x: np.nan if x in [-1,-1.0, "-1.0", "-1"] else x)

In [122]:
# def nan_imp(X): ## imputes nan values for alternative values signifying nans
#     nan_list = [-1,-1.0, "-1.0", "-1"] 
#     return X.applymap(lambda x: np.nan if x in nan_list else x) ## perhaps subfunctions for arrays

In [147]:
# Numerical branch: fill NaNs with the column mean, then scale robustly.
# (A KNNImputer(n_neighbors=2) was considered instead of the mean imputer; it
# is computationally demanding and would have to come AFTER scaling.)
num_imputer = SimpleImputer(strategy='mean')
num_scaler = RobustScaler()
num_pipe = Pipeline(steps=[
    ('simpleimputer', num_imputer),
    ('robustscaler', num_scaler),
])

# Categorical branch: fill NaNs with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
# (A KNNImputer(n_neighbors=1) on ordinal-encoded values was tried as well: it
# did not improve performance but is computationally demanding.)
cat_imputer = SimpleImputer(strategy="most_frequent")
cat_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
cat_pipe = Pipeline(steps=[
    ('simpleimputer', cat_imputer),
    ('onehotencoder', cat_encoder),
])

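Beware of the dummy variable trap: `OneHotEncoder` only drops one column per feature when its `drop` parameter is set (e.g. `drop='first'`); as configured above it keeps every category.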

# A self-contained imputer for alternative NaN encodings

In [124]:
import pandas as pd
import numpy as np

def alt_nan_imp(X, cat_vars=None, alt_nan_values=(-1, -1.0, "-1.0", "-1")):
    """Replace alternative missing-value codes in the categorical columns of ``X``.

    The frame is modified in place (existing callers rely on that) and is also
    returned, so the function can be chained or wrapped in a transformer.
    Calling it repeatedly on the same frame is safe.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to clean. Only columns that are listed in ``cat_vars`` are touched.
    cat_vars : iterable of str, optional
        Names of the categorical columns. Defaults to the list used throughout
        this notebook, which keeps the function self-contained.
    alt_nan_values : iterable, optional
        Cell values that stand for "missing".

    Returns
    -------
    pandas.DataFrame
        The same object as ``X``.
    """
    if cat_vars is None:
        cat_vars = ['B_30',
                    'B_38',
                    'D_114',
                    'D_116',
                    'D_117',
                    'D_120',
                    'D_126',
                    'D_63',
                    'D_64',
                    'D_66',
                    'D_68']

    alt_nan_list = list(alt_nan_values)

    cat_columns = [column for column in X.columns if column in cat_vars]

    # Nothing to clean: leave the frame untouched.
    if not cat_columns:
        return X

    def to_missing(value):
        # Cells that already hold pd.NA must be skipped: `pd.NA in [...]`
        # compares with `==`, which yields pd.NA and cannot be turned into a
        # bool (TypeError). This is what made a second call on the same frame fail.
        if value is pd.NA:
            return value
        return pd.NA if value in alt_nan_list else value

    selected = X[cat_columns]
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; look the
    # method up on the class so a column that happens to be called "map"
    # cannot shadow it.
    if hasattr(pd.DataFrame, "map"):
        cleaned = selected.map(to_missing)
    else:
        cleaned = selected.applymap(to_missing)

    X[cat_columns] = cleaned
    return X

alt_nan_imp(X_red)

In [153]:
# Route numerical and categorical columns through their own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pip', num_pipe, num_vars),
        ('cat_pip', cat_pipe, red_cat_vars),
    ],
    ## every column that is in neither num_vars nor red_cat_vars is dropped
    remainder='drop',
)

In [154]:
# Replace the alternative NaN codes in the categorical columns of X (in place).
alt_nan_imp(X)
# Fit on the full frame: the transformer only uses the columns listed in
# num_vars / red_cat_vars and drops the rest (remainder='drop').
preprocessor.fit(X)

In [32]:
# Fit and apply the preprocessing on the reduced frame; the resulting array is
# wrapped in a DataFrame with default integer labels.
X_pp = pd.DataFrame(data=preprocessor.fit_transform(X_red))

In [34]:
# Re-attach the customer IDs (aligned on the shared row index) for grouping.
X_pp = X_pp.assign(customer_ID=X_red['customer_ID'])

In [36]:
# One row per customer: the mean of every preprocessed feature.
X_pp_avg = X_pp.groupby(by='customer_ID').agg('mean')

In [38]:
# Target as a one-column frame so the customer ID can be added next to it.
y_ID = y.to_frame()

In [39]:
# Attach the customer IDs (aligned on the shared row index).
y_ID = y_ID.assign(customer_ID=X_red['customer_ID'])

In [40]:
## one label per customer (actually, this data is just in train_labels)
y_unique = y_ID.groupby(by='customer_ID').agg('mean').astype(int)

In [41]:
# Hold out 30% of the customers for validation. The fixed seed makes the split,
# and every score computed from it, reproducible across kernel restarts.
X_pp_avg_train, X_pp_avg_val, y_unique_train, y_unique_val = train_test_split(
    X_pp_avg, y_unique, test_size=0.3, random_state=42
)

In [34]:
X_red[red_cat_vars].nunique().sum() ## number of unique values across all categorical variables
## NOTE(review): the encoder in this notebook is created without a `drop=` argument, so this should equal the number of one-hot encoded columns directly (no column is removed per feature) -- confirm against the transformed shape.


43

Building the simple model and putting it into the pipe

In [35]:
#mod = LogisticRegression() 
## such a model treats all rows as independent despite them being from the same person
#results = cross_validate(mod, X_pp, y, cv = 5, scoring=['accuracy', 'recall', 'f1'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [36]:
results

{'fit_time': array([2.77106905, 1.99161935, 2.06714058, 2.42431951, 3.8680737 ]),
 'score_time': array([0.02546883, 0.02748156, 0.02438974, 0.03599572, 0.03139949]),
 'test_accuracy': array([0.84061116, 0.84187686, 0.85200253, 0.85200253, 0.8358047 ]),
 'test_recall': array([0.63598952, 0.59259259, 0.63711186, 0.6285073 , 0.58196108]),
 'test_f1': array([0.65853186, 0.6442953 , 0.67539163, 0.67240344, 0.63134389])}

In [37]:
#X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)

## Train_test_split needs to be on preprocessed data


In [38]:
#pp_pred_pipe = make_pipeline(preprocessor, mod)

In [39]:
#pp_pred_pipe

In [40]:
# Row-level baseline: preprocessing + logistic regression in one pipeline.
# The pipeline is built in this cell because its earlier definition only exists
# in commented-out cells, which makes a fresh "Run All" fail with a NameError.
# max_iter is raised because the default iteration budget was exhausted (see
# the recorded convergence warnings).
pp_pred_pipe = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))
pp_pred_pipe.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Trying pickling

In [41]:
# # import pickle

# # pickle.dump(pp_pred_pipe, open('pp_pred_pipe', 'wb'))

# # loaded_model = pickle.load(open('../pickles/pp_pred_pipe', 'rb'))



# sample = X_red.sample(1)

# loaded_model.predict_proba(sample)[0][1] * 100

# Creating custom amex scoring metric

In [50]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return 0.5 * (normalized weighted Gini + share of defaults captured in the top 4%).

    Labels and predictions are paired by position. Both inputs are converted
    to fresh single-column frames with a 0-based index, so inputs that carry
    different or shuffled indexes (e.g. slices of a validation split) are not
    mis-aligned by ``pd.concat``.

    Parameters
    ----------
    y_true : array-like, Series or DataFrame
        Binary targets, 1 meaning default. For 2-D input the first column is used.
    y_pred : array-like, Series or DataFrame
        Predicted scores, higher meaning more likely to default. For 2-D input
        the first column is used.

    Returns
    -------
    float
        Mean of the normalized weighted Gini coefficient and the capture rate.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same number of rows.

    Notes
    -----
    Rows with target 0 get weight 20, all other rows weight 1. If ``y_true``
    contains no positive row the divisions below yield NaN.
    """

    def _as_column(values, name: str) -> pd.DataFrame:
        # Positional single-column frame; any index on the input is discarded.
        if isinstance(values, pd.DataFrame):
            values = values.iloc[:, 0]
        column = np.asarray(values)
        if column.ndim > 1:
            column = column[:, 0]
        return pd.DataFrame({name: column})

    y_true = _as_column(y_true, 'target')
    y_pred = _as_column(y_pred, 'prediction')
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, got {len(y_true)} and {len(y_pred)}"
        )

    def _ranked(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> pd.DataFrame:
        # Rows ordered from highest to lowest prediction, with the class weights attached.
        ranked = (pd.concat([y_true, y_pred], axis='columns')
                  .sort_values('prediction', ascending=False))
        ranked['weight'] = np.where(ranked['target'] == 0, 20, 1)
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Share of all positives that sit inside the top 4% of the total weight.
        ranked = _ranked(y_true, y_pred)
        four_pct_cutoff = int(0.04 * ranked['weight'].sum())
        within_cutoff = ranked.loc[ranked['weight'].cumsum() <= four_pct_cutoff]
        return (within_cutoff['target'] == 1).sum() / (ranked['target'] == 1).sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Weighted area between the cumulative-positives curve and the random baseline.
        ranked = _ranked(y_true, y_pred)
        random_share = (ranked['weight'] / ranked['weight'].sum()).cumsum()
        weighted_pos = ranked['target'] * ranked['weight']
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random_share) * ranked['weight']).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Normalize by the Gini of a perfect ranking (the targets used as predictions).
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [43]:
from sklearn.metrics import make_scorer

In [44]:
# amex_metric ranks rows by predicted score, so the scorer has to pass it
# predict_proba output rather than the hard class labels from predict().
# NOTE: `needs_proba` is the spelling for the scikit-learn generation this
# notebook targets (it still imports plot_confusion_matrix); newer releases
# use `response_method="predict_proba"` instead.
amex_metric_scorer = make_scorer(amex_metric, needs_proba=True)

In [45]:
## scoring metrics that can be handed to cross validation via `scoring=`
scorings = dict(
    recall='recall',
    f1='f1',
    amex=amex_metric_scorer,
)

# Averaging preprocessed X and y

In [46]:
# Re-attach the customer IDs (aligned on the shared row index) for grouping.
X_pp = X_pp.assign(customer_ID=X_red['customer_ID'])

In [47]:
# One row per customer: the mean of every preprocessed feature.
X_avg_pp = X_pp.groupby(by='customer_ID').agg('mean')

In [48]:
# Target as a one-column frame so the customer ID can be added next to it.
y_ID = y.to_frame()

In [49]:
# Attach the customer IDs (aligned on the shared row index).
y_ID = y_ID.assign(customer_ID=X_red['customer_ID'])

In [50]:
## one label per customer (actually, this data is just in train_labels)
y_unique = y_ID.groupby(by='customer_ID').agg('mean').astype(int)

In [99]:
y_unique

Unnamed: 0_level_0,default
customer_ID,Unnamed: 1_level_1
000919ba92d9a04c28e1e49f6cd855ca36e1df7c79cc0583b60734da25265fb1,0
00158cf08fcf7ec058529dd71b4cff04ce89314e79840b76a183afa0ae941c34,0
0018753794d55ff4ab24aaa7f8e65d504fd1efbf04e369f166dc2f2158513462,1
0044d8693a5c204d5e22297b11e566d9de1c1610899d20f46abff9a163f78b3e,0
00597bc3d552264d841bd1a52cfaf3ebe40755f96d85a5282695654cee8af21f,1
...,...
ffbdeb593c97bc39bf6228e7236796cf5dcd530ef73014604e6f4a77506c0e02,0
ffbff903bdd4104397101a428a4c5c4daa4d28e26f87166965f780a4127a7a0b,0
ffd4ac1fe3746d586e00906a56ae8a26fdad358f5b478912fe82a974ab302f34,0
ffd6a301b0f94dec47dda9fec844cdc640a04bb38e8707a3e10fd9dc09069d3d,1


# Trying out various models on averaged data

In [51]:
# mod = LogisticRegression() 
# ## such a model treats all rows as independent despite them being from the same person
# results = cross_validate(mod, X_avg_pp, y_unique['default'], cv = 5, scoring=scorings)

In [52]:
# results

In [53]:
# cross_val_predict(mod, X_avg_pp, y_unique['default'], cv = 5, scoring=scorings, method='predict_proba')

In [54]:
# # mod = LogisticRegression(penalty='l1',solver='liblinear') 
# ## such a model treats all rows as independent despite them being from the same person
# results = cross_validate(mod, X_avg_pp, y_unique, cv = 5, scoring=scorings)

In [55]:
# results

In [56]:
from sklearn.linear_model import Perceptron, PassiveAggressiveClassifier

In [57]:
# mod_perc = Perceptron(penalty='l1')

In [58]:
# results_perc = cross_validate(mod_perc, X_avg_pp, y_unique, cv = 5, scoring=scorings)
# results_perc['test_recall'].mean()

In [59]:
# results

In [60]:
# mod_agg = PassiveAggressiveClassifier()
# results_agg = cross_validate(mod_agg, X_avg_pp, y_unique, cv = 5, scoring=scorings)
# results_agg

In [61]:
from sklearn.neighbors import KNeighborsClassifier

In [62]:
# # knn = KNeighborsClassifier(n_neighbors=5)
# results_knn = cross_validate(knn, X_avg_pp, y_unique, cv = 5, scoring=scorings)
# results_knn['test_recall'].mean()

In [63]:
# results_knn

In [64]:
from sklearn.ensemble import GradientBoostingClassifier

In [65]:
# mod_gbc = GradientBoostingClassifier()

In [66]:
# results_gbc = cross_validate(mod_gbc, X_avg_pp, y_unique, cv = 5, scoring=scorings)
# results_gbc

In [67]:
# results_gbc

# Trying out with predict_proba

In [68]:
# pp_pred_pipe_gbc = make_pipeline(preprocessor, mod_gbc)

In [69]:
# pp_pred_pipe_gbc.fit(X_train, y_train);

In [70]:
# y_pred = pp_pred_pipe_gbc.predict_proba(X_val)[:,1]

In [71]:
# amex_metric(y_val, y_pred)

Prepare for pickle that will be used on new data

In [72]:
# mod_gbc_ws = GradientBoostingClassifier(warm_start=True) ## makes the model incrementally trainable
# pp_pred_pipe_gbc_ws = make_pipeline(preprocessor, mod_gbc_ws)
# pp_pred_pipe_gbc_ws.fit(X, y);

In [73]:
# import pickle

# pickle.dump(pp_pred_pipe_gbc_ws, open('../pickles/pp_pred_pipe_gbc_ws.pkl', 'wb'))

# # loaded_model = pickle.load(open('../pickles/pp_pred_pipe', 'rb'))



# sample = X_red.sample(1)

# loaded_model.predict_proba(sample)[0][1] * 100

In [74]:
# mod = LogisticRegression(penalty='l1',solver='liblinear') 
# pp_pred_pipe_log = make_pipeline(preprocessor, mod)
# pp_pred_pipe_log.fit(X_train, y_train);

In [75]:
# y_pred = pp_pred_pipe_log.predict_proba(X_val)[:,1]

In [76]:
# amex_metric(y_val, y_pred)

In [77]:
# mod = LogisticRegression(penalty='l1',solver='liblinear', warm_start=True) 

In [78]:
# pp_pred_pipe_log = make_pipeline(preprocessor, mod)
# pp_pred_pipe_log.(X_train, y_train);

# more efficient way to drop NANs

In [79]:
# X.dropna(axis=1, thresh=int(0.2*len(X))) 
# ## however, here you cannot specify that a column must have 80% NaNs in BOTH groups

# New [Light gradient boosting model](https://github.com/microsoft/LightGBM)
Todo: combine with [Optuna](https://optuna.org/)

In [155]:
import lightgbm as lgb

In [156]:
# LightGBM classifier using the 'goss' boosting type, with tree depth capped at 5.
model_lgb = lgb.LGBMClassifier(boosting_type='goss', max_depth=5)

In [157]:
# Train on the per-customer averaged features; y_unique_train is the
# single-column label frame produced by the train/validation split.
model_lgb.fit(X_pp_avg_train, y_unique_train)

In [53]:
model_lgb

In [48]:
# Keep column 1 of predict_proba: the probability of the positive class (default = 1).
y_pred = model_lgb.predict_proba(X_pp_avg_val)[:,1]

In [51]:
amex_metric(y_unique_val, y_pred)

0.6791119303974019

In [163]:
# Chain the column preprocessing and the LightGBM model so that raw rows can
# be passed in directly.
# NOTE: model_lgb is the same estimator object that was fitted above; fitting
# the pipeline refits that object.
pipe_lgb = make_pipeline(preprocessor, model_lgb)

In [164]:
alt_nan_imp(X)

In [165]:
pipe_lgb.fit(X, y)

In [166]:
import pickle

# Persist the fitted pipeline. The context manager guarantees that the file
# handle is flushed and closed even if pickling fails.
with open('../pickles/pp_pred_pipe_gbc_new1.pkl', 'wb') as pickle_file:
    pickle.dump(pipe_lgb, pickle_file)

In [133]:
# pipe_lgb starts with the ColumnTransformer, so it needs raw rows with the
# original column names, not the already preprocessed per-customer averages.
# The row-level hold-out is created here because the cells below read
# X_val / y_val and no active cell defines them.
# NOTE: pipe_lgb was fitted on all of X, so these rows were seen during
# training and the resulting score is optimistic.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
y_pred = pipe_lgb.predict_proba(X_val)[:,1]

In [137]:
# Customer IDs of the validation rows, used below to label the predictions.
ids = X_val['customer_ID']

In [141]:
y_val.reset_index(drop=True)

0        0
1        1
2        0
3        1
4        0
        ..
16587    1
16588    1
16589    0
16590    0
16591    0
Name: default, Length: 16592, dtype: int64

In [139]:
ids.reset_index(drop=True)

0        97f7393af2d28cf58a1a2a34010c8dcd1a814c1d22bc27...
1        e033bd1ab915c24d654cf61330683b0c189b8c2a79798e...
2        e5d562860cb6e65aba15852aaff58714a273aa6d8f5417...
3        2427d8b86f1cf895ab720418e34601ce12d8ec07a3c6b5...
4        c3a0fb8ed913223af1429502287bdc27e889a08b30db69...
                               ...                        
16587    ab1aafcc1971e2855930a15f91b833be1d2c1575b934d7...
16588    4ba2250dee222fde6f214a4707f23543d3c37bb465061d...
16589    336b17848ca1a3ad552eccb8b6351585ad90e870dbc622...
16590    f2285229dc890336d3b5b714a8c777de887f6193bbadf5...
16591    7844b7248f01085196a798202e955ccddca1b2c79feeca...
Name: customer_ID, Length: 16592, dtype: object

In [154]:
pd.DataFrame(y_pred, columns=['prediction']).set_index(ids)

Unnamed: 0_level_0,prediction
customer_ID,Unnamed: 1_level_1
97f7393af2d28cf58a1a2a34010c8dcd1a814c1d22bc272322b907d6ddc546c9,0.001806
e033bd1ab915c24d654cf61330683b0c189b8c2a79798eb0d5607e511fdcf5d9,0.520150
e5d562860cb6e65aba15852aaff58714a273aa6d8f5417a7de5b792304681e19,0.491241
2427d8b86f1cf895ab720418e34601ce12d8ec07a3c6b54dee02173cf3a8bf69,0.905360
c3a0fb8ed913223af1429502287bdc27e889a08b30db6949432b274386fa38dd,0.019803
...,...
ab1aafcc1971e2855930a15f91b833be1d2c1575b934d7525b953c6f919035ff,0.760624
4ba2250dee222fde6f214a4707f23543d3c37bb465061dac49eea6b5ab929c9f,0.697798
336b17848ca1a3ad552eccb8b6351585ad90e870dbc622e1f0b9553f31f23954,0.035275
f2285229dc890336d3b5b714a8c777de887f6193bbadf5fb56b406fc9e8a8818,0.009938


In [124]:
amex_metric(y_val, y_pred)

0.8084525740632692

In [125]:
# Customer-level split with the default test size; the fixed seed makes the
# split, and the score computed from it, reproducible across kernel restarts.
X_avg_pp_train, X_avg_pp_val, y_avg_train, y_avg_val = train_test_split(
    X_avg_pp, y_unique, random_state=42
)

In [143]:
# Fresh classifier trained on the customer-level averages; fit() returns the
# estimator itself, so creation and training can be chained.
model_lgb = lgb.LGBMClassifier(
    max_depth=5,
    boosting_type='goss',
).fit(X_avg_pp_train, y_avg_train)

In [127]:
# Keep column 1 of predict_proba: the probability of the positive class (default = 1).
y_pred = model_lgb.predict_proba(X_avg_pp_val)[:,1]

In [128]:
amex_metric(y_avg_val, y_pred)

0.695073714956872

In [52]:
pipe_lgb

NameError: name 'pipe_lgb' is not defined