# Original Data Import and Merging

Same as in 01_data_combine_and_clean but with filtered transaction data

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns',100)

loans_base_df = pd.read_csv('./csv_files/loans_merged_filtered_transactions.csv')


# Column Transformer

In [31]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

num_var = [ #'amount_order','k_symbol_LEASING',
            #'k_symbol_Other', 'k_symbol_POJISTNE',
            #'k_symbol_SIPO', 'k_symbol_UVER',
            'amount_sum',
            'amount_mean', 'amount_std', 
            'balance_min', 'balance_max', 'balance_mean', 'balance_std',
            'type_PRIJEM_sum', 'type_VYBER_sum',
            'type_VYDAJ_sum', 'operation_Other_sum', 'operation_PREVOD NA UCET_sum',
            'operation_PREVOD Z UCTU_sum', 'operation_VKLAD_sum',
            'operation_VYBER_sum', 'operation_VYBER KARTOU_sum',
            #'k_symbol_DUCHOD_sum', 
            'k_symbol_Other_sum', 'k_symbol_POJISTNE_sum',
            'k_symbol_SANKC. UROK_sum', 'k_symbol_SIPO_sum', 'k_symbol_SLUZBY_sum',
            'k_symbol_UROK_sum', 
            #'k_symbol_UVER_sum', 
            'A4', 'A5', 'A6', 'A7', 'A8',
            'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16']

ord_var = ['gender','frequency','type_y']
date_var = ['date_x','date_y', 'date_min', 'date_max','birth_date']
drop_var = ['amount_x','duration','payments','loan_id','account_id','district_id','type_x','issued']
#pass_col = 

loans_base_df = loans_base_df.replace([np.inf, -np.inf], np.nan)

target_raw = loans_base_df['status'].copy()
target_raw[(target_raw == 'A') | (target_raw == 'C')] = 0
target_raw[(target_raw == 'B')  | (target_raw == 'D')] = 1


#Build a column transforrner

#numeric_transformer = SimpleImputer(missing_values = np.nan, strategy='median')
 
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values = np.nan, strategy='median')),
    ('scaler', StandardScaler())])    
    
binary_transformer = OneHotEncoder(handle_unknown='ignore')

ordinal_transformer = OneHotEncoder(categories  = [ ['M','F'],
                                                     ['POPLATEK PO OBRATU','POPLATEK TYDNE','POPLATEK MESICNE'],
                                                     ['No Card','junior','classic','gold']])

preprocessor = ColumnTransformer(transformers = [ ('numerical', numeric_transformer, num_var),
                                                  ('ordinal',ordinal_transformer, ord_var)],
                                                  #('date','passthrough', date_var)],
                                                   remainder = 'drop')

X = loans_base_df.drop(columns = 'status')
y = target_raw 


#Save X, Y

#loans_base_df.to_csv('loan_merged_original.csv')


X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2, random_state = 42)


In [32]:
X_train_transformed = preprocessor.fit_transform(X_train)

X_val_transformed = preprocessor.fit_transform(X_val)

y_val = y_val.astype(bool)


In [33]:

ord_var_list = ['M','F'] +  ['POPLATEK PO OBRATU','POPLATEK TYDNE','POPLATEK MESICNE'] + ['No Card','junior','classic','gold']

transformed_columns = num_var +  ord_var_list # + date_var

print(X_train_transformed.shape)
print(len(transformed_columns))

X_train_transform_df = pd.DataFrame(X_train_transformed, columns = transformed_columns)

X_train_transform_df.head()

(545, 44)
44


Unnamed: 0,amount_sum,amount_mean,amount_std,balance_min,balance_max,balance_mean,balance_std,type_PRIJEM_sum,type_VYBER_sum,type_VYDAJ_sum,operation_Other_sum,operation_PREVOD NA UCET_sum,operation_PREVOD Z UCTU_sum,operation_VKLAD_sum,operation_VYBER_sum,operation_VYBER KARTOU_sum,k_symbol_Other_sum,k_symbol_POJISTNE_sum,k_symbol_SANKC. UROK_sum,k_symbol_SIPO_sum,k_symbol_SLUZBY_sum,k_symbol_UROK_sum,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,M,F,POPLATEK PO OBRATU,POPLATEK TYDNE,POPLATEK MESICNE,No Card,junior,classic,gold
0,0.948068,0.998033,1.086245,-0.11569,0.853546,1.277341,0.399259,0.988251,1.795386,0.697803,0.783667,-0.819272,2.214806,-0.929416,1.124719,1.058555,1.041797,-0.214299,-0.071571,-0.7666,-0.105292,0.783667,-0.340258,-0.543242,1.242068,1.602471,0.270864,-0.151634,-1.37618,-0.7264,-0.895419,-0.792573,-0.500862,-0.435657,-0.420331,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,-0.066088,-1.190396,-1.196385,-0.11569,-1.165943,-0.992643,-1.45894,-0.112171,-0.734089,0.104868,0.119617,1.733027,-0.530665,0.339445,-0.382313,-0.147024,-0.223849,4.601558,-0.071571,0.60119,0.406631,0.119617,-0.405992,-0.454369,-0.033829,0.068361,0.270864,0.916992,0.591623,-0.382954,-0.841526,-0.72224,0.321206,-0.341746,-0.343146,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,-0.489164,-0.252414,0.034337,-0.71509,-0.162213,-0.952224,0.778157,-0.512892,2.014846,-0.860702,-0.39173,-0.819272,-0.530665,-0.03847,-0.320253,-0.147024,-0.433401,-0.214299,0.697025,-0.7666,-0.470951,-0.39173,0.167111,-1.19498,-1.373521,-1.246589,-0.629263,-1.576469,1.593049,0.909387,0.990845,0.88604,-0.890262,0.17285,0.083187,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,0.867363,0.699664,0.823531,-0.013229,0.59812,-0.339217,0.072504,0.82033,1.115708,0.823854,-0.094028,2.1425,1.99728,-0.901308,0.523921,-0.147024,0.661869,-0.214299,-0.071571,2.815221,4.526394,-0.094028,-0.587681,-0.543242,-0.352803,0.28752,-0.629263,-0.151634,-0.815382,-0.341464,0.128553,0.023289,0.148139,-0.468767,-0.453821,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-0.702571,-0.844159,-0.883131,0.089233,-0.943938,-0.543851,-1.21503,-0.710423,-0.734089,-0.644221,-0.527622,-0.08615,-0.530665,-0.225357,-0.717954,-0.147024,-0.676052,-0.214299,-0.071571,-0.484337,-0.324688,-0.527622,2.675973,-1.19498,-1.373521,-1.246589,-0.629263,-1.576469,1.593049,2.344639,-1.434351,-1.46308,2.008608,2.707004,2.716704,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [34]:
X_train_transform_df.isna().sum()

amount_sum                      0
amount_mean                     0
amount_std                      0
balance_min                     0
balance_max                     0
balance_mean                    0
balance_std                     0
type_PRIJEM_sum                 0
type_VYBER_sum                  0
type_VYDAJ_sum                  0
operation_Other_sum             0
operation_PREVOD NA UCET_sum    0
operation_PREVOD Z UCTU_sum     0
operation_VKLAD_sum             0
operation_VYBER_sum             0
operation_VYBER KARTOU_sum      0
k_symbol_Other_sum              0
k_symbol_POJISTNE_sum           0
k_symbol_SANKC. UROK_sum        0
k_symbol_SIPO_sum               0
k_symbol_SLUZBY_sum             0
k_symbol_UROK_sum               0
A4                              0
A5                              0
A6                              0
A7                              0
A8                              0
A9                              0
A10                             0
A11           

# Build and Run Model

In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import eli5


lr_model = LogisticRegression(random_state = 42)

lr_model.fit(X_train_transform_df, y_train.astype(bool))

#eli5.show_weights(rf_model, feature_names = transformed_columns)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [36]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

param_grid = {
    'penalty' : ['l1', 'l2'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['liblinear']
}

grid_search = RandomizedSearchCV(lr_model, param_grid, cv=5, verbose=1, random_state=42, n_iter=300, scoring='balanced_accuracy')
grid_search.fit(X_train_transform_df, y_train.astype(bool))

print(("best RF from grid search: %.3f"
       % grid_search.score(X_val_transformed, y_val)))

Fitting 5 folds for each of 40 candidates, totalling 200 fits


The total space of parameters 40 is smaller than n_iter=300. Running 40 iterations. For exhaustive searches, use GridSearchCV.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Liblinear failed to converge, increase the number of iterations.
Liblinear failed to converge, increase the number of iterations.
Liblinear failed to converge, increase the number of iterations.


best RF from grid search: 0.746


[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:   23.0s finished


In [37]:
results  = pd.DataFrame(grid_search.cv_results_)

y_val.sum()

16

In [38]:
results.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_solver,param_penalty,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
27,0.010797,0.000902,0.001531,0.000308,liblinear,l2,29.7635,"{'solver': 'liblinear', 'penalty': 'l2', 'C': ...",0.698024,0.739691,0.666667,0.687715,0.734536,0.705326,0.0279,1
29,0.01379,0.001234,0.001582,4.3e-05,liblinear,l2,78.476,"{'solver': 'liblinear', 'penalty': 'l2', 'C': ...",0.698024,0.739691,0.666667,0.68256,0.734536,0.704296,0.028618,2
33,0.017336,0.001739,0.001471,2.3e-05,liblinear,l2,545.559,"{'solver': 'liblinear', 'penalty': 'l2', 'C': ...",0.698024,0.739691,0.666667,0.68256,0.734536,0.704296,0.028618,2
31,0.015116,0.00168,0.001477,3.6e-05,liblinear,l2,206.914,"{'solver': 'liblinear', 'penalty': 'l2', 'C': ...",0.698024,0.739691,0.666667,0.68256,0.734536,0.704296,0.028618,2
39,0.023341,0.003047,0.001596,3.5e-05,liblinear,l2,10000.0,"{'solver': 'liblinear', 'penalty': 'l2', 'C': ...",0.692869,0.739691,0.666667,0.68256,0.734536,0.703265,0.028917,5
26,0.282498,0.054632,0.001434,0.000477,liblinear,l1,29.7635,"{'solver': 'liblinear', 'penalty': 'l1', 'C': ...",0.692869,0.739691,0.666667,0.68256,0.734536,0.703265,0.028917,5
28,0.587484,0.328194,0.00165,4.5e-05,liblinear,l1,78.476,"{'solver': 'liblinear', 'penalty': 'l1', 'C': ...",0.692869,0.739691,0.666667,0.68256,0.734536,0.703265,0.028917,5
30,0.807343,0.257636,0.001695,0.000125,liblinear,l1,206.914,"{'solver': 'liblinear', 'penalty': 'l1', 'C': ...",0.692869,0.739691,0.666667,0.68256,0.734536,0.703265,0.028917,5
34,0.532054,0.288256,0.001228,0.000187,liblinear,l1,1438.45,"{'solver': 'liblinear', 'penalty': 'l1', 'C': ...",0.692869,0.739691,0.666667,0.68256,0.734536,0.703265,0.028917,5
35,0.013064,0.001395,0.001109,2.9e-05,liblinear,l2,1438.45,"{'solver': 'liblinear', 'penalty': 'l2', 'C': ...",0.692869,0.739691,0.666667,0.68256,0.734536,0.703265,0.028917,5


In [39]:
y_pred_val = grid_search.predict(X_val_transformed)


from sklearn.metrics import confusion_matrix

x = confusion_matrix(y_val, y_pred_val)

tn = x[0,0]  
fp = x[0,1] 
fn = x[1,0] 
tp = x[1,1]

print([tp,fp])
print([fn,tn])



[8, 1]
[8, 120]


In [40]:
eli5.show_weights(grid_search.best_estimator_, feature_names = transformed_columns)

Weight?,Feature
+3.691,k_symbol_SANKC. UROK_sum
+2.362,A15
+1.822,operation_VYBER_sum
+1.630,type_VYDAJ_sum
+1.492,amount_std
+1.234,balance_std
+0.902,A16
… 11 more positive …,… 11 more positive …
… 14 more negative …,… 14 more negative …
-0.917,A13


In [41]:
#Apply Model to other accounts, then see how they cluster...

grid_search.best_params_

{'solver': 'liblinear', 'penalty': 'l2', 'C': 29.763514416313132}

# Apply Model To Remaining Customers

In [42]:
account_df = pd.read_csv("./csv_files/accounts_merged_all_transactions.csv")

account_transformed = preprocessor.fit_transform(account_df)

account_transform_df = pd.DataFrame(account_transformed, columns = transformed_columns)

In [43]:
account_transform_df['account_id'] = account_df['account_id']

In [44]:
account_transform_df.shape

(4500, 45)

In [45]:
loans_base_df.shape

(682, 58)

In [46]:
(~account_transform_df['account_id'].isin(loans_base_df['account_id'])).sum()

3818

In [47]:
account_df.isna().sum()

account_id                         0
district_id                        0
frequency                          0
date                               0
amount_order                     742
k_symbol_LEASING                 742
k_symbol_Other                   742
k_symbol_POJISTNE                742
k_symbol_SIPO                    742
k_symbol_UVER                    742
date_min                           0
date_max                           0
amount_sum                         0
amount_mean                        0
amount_std                         0
balance_min                        0
balance_max                        0
balance_mean                       0
balance_std                        0
type_PRIJEM_sum                    0
type_VYBER_sum                     0
type_VYDAJ_sum                     0
operation_Other_sum                0
operation_PREVOD NA UCET_sum       0
operation_PREVOD Z UCTU_sum        0
operation_VKLAD_sum                0
operation_VYBER_sum                0
o

In [48]:
account_no_loan_df = account_transform_df[(~account_transform_df['account_id'].isin(loans_base_df['account_id']))]

account_no_loan_df = account_no_loan_df.drop(columns='account_id')

account_no_loan_df.head()

Unnamed: 0,amount_sum,amount_mean,amount_std,balance_min,balance_max,balance_mean,balance_std,type_PRIJEM_sum,type_VYBER_sum,type_VYDAJ_sum,operation_Other_sum,operation_PREVOD NA UCET_sum,operation_PREVOD Z UCTU_sum,operation_VKLAD_sum,operation_VYBER_sum,operation_VYBER KARTOU_sum,k_symbol_Other_sum,k_symbol_POJISTNE_sum,k_symbol_SANKC. UROK_sum,k_symbol_SIPO_sum,k_symbol_SLUZBY_sum,k_symbol_UROK_sum,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,M,F,POPLATEK PO OBRATU,POPLATEK TYDNE,POPLATEK MESICNE,No Card,junior,classic,gold
0,-0.764355,-1.039177,-1.078609,0.241096,-1.080353,-1.363367,-1.175905,-0.780242,-0.446157,-0.763416,-0.711272,-0.324241,-0.017876,-0.791302,-0.733753,-0.3084,-0.73318,-0.177685,-0.085004,-0.094362,-0.01641,-0.711272,-0.554164,0.596904,-0.520953,-0.806169,-0.667309,-0.526598,-0.199264,-0.412591,-0.012606,-0.06899,0.426891,-0.479531,-0.4603,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,-0.824695,-0.80301,-0.824016,0.241096,-0.498533,-0.639317,-0.311688,-0.811965,-0.446157,-0.864195,-0.943883,-0.532854,-0.382912,-0.566646,-0.77712,-0.3084,-0.765341,1.34397,-0.085004,-0.713509,-0.69543,-0.943883,-0.484618,0.742767,0.595782,-0.350477,-0.667309,0.160169,-0.902388,-0.156985,0.530235,0.438841,-0.134555,-0.446989,-0.424173,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,-0.782576,-0.960475,-1.031684,0.156333,-0.967813,-0.983555,-1.005238,-0.783186,-0.446157,-0.801456,-0.786157,-0.319623,0.033382,-0.829804,-0.772303,-0.3084,-0.868751,-0.177685,-0.085004,-0.375369,-0.293048,-0.786157,-0.450415,1.297046,0.530092,0.105214,-0.667309,0.160169,-0.568531,-0.573946,0.530235,0.382416,0.68602,-0.402856,-0.397701,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,-0.921149,-0.935717,-1.026603,0.071571,-1.039404,-0.763526,-1.037719,-0.924732,-0.446157,-0.954134,-1.062528,-0.707361,-0.172732,-0.830113,-0.818927,-0.3084,-0.920199,-0.177685,-0.085004,-0.545602,-0.670281,-1.062528,-0.587387,-0.511655,-0.323882,0.33306,-0.667309,-0.183214,-0.877096,-0.354533,0.150246,0.048564,0.124574,-0.47563,-0.461291,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
5,-0.559317,-0.77551,-0.88589,0.198715,-0.538143,-0.139854,-0.712401,-0.551504,-0.446157,-0.559644,0.114799,0.229405,0.367025,-0.829649,-0.681145,-0.3084,-0.856645,-0.177685,-0.085004,0.557725,0.134483,0.114799,-0.411124,-0.074066,0.464401,0.33306,1.180055,1.877088,0.063775,-0.734548,-0.175458,-0.247671,0.426891,-0.414298,-0.398628,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [49]:
y_acc_pred_val = grid_search.best_estimator_.predict(account_no_loan_df)


In [50]:
no_loans_df = account_no_loan_df[y_acc_pred_val==1]

preapp_loans_df = account_no_loan_df[y_acc_pred_val==0]

print(no_loans_df.shape)
print(preapp_loans_df.shape)


(219, 44)
(3599, 44)


In [51]:
X_zero = no_loans_df
X_one = preapp_loans_df

A = (X_one.describe().T - X_zero.describe().T)

print(A.sort_values(by ='mean', ascending=False))

                               count      mean       std        min       25%  \
balance_min                   3380.0  2.060904 -2.343362  16.087533  2.164415   
operation_PREVOD NA UCET_sum  3380.0  0.825980  0.657067   0.000000  0.234103   
k_symbol_SIPO_sum             3380.0  0.397528 -0.147118   0.000000  0.059054   
operation_VYBER KARTOU_sum    3380.0  0.296369  0.862388   0.000000  0.000000   
balance_mean                  3380.0  0.175567  0.047696   0.268151  0.033908   
operation_PREVOD Z UCTU_sum   3380.0  0.126377  0.232420   0.000000  0.000000   
POPLATEK MESICNE              3380.0  0.125725 -0.166207   0.000000  0.000000   
A13                           3380.0  0.103048  0.055520   0.000000  0.061128   
classic                       3380.0  0.102555  0.151817   0.000000  0.000000   
A12                           3380.0  0.099163  0.044559   0.000000  0.189994   
k_symbol_POJISTNE_sum         3380.0  0.084347  0.438153   0.000000  0.000000   
A9                          