# Simple preprocessing pipeline

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from scipy import stats

from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict, learning_curve,\
train_test_split, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.inspection import permutation_importance
from sklearn.metrics import plot_confusion_matrix, classification_report, precision_recall_curve
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDRegressor, SGDClassifier, Ridge, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.compose import make_column_selector

## pipeline stuff

from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn import set_config; set_config(display='diagram')


In [3]:
def_df = pd.read_csv("/home/slawa/code/code-rep0/projects/data/defaulter_data_13364.csv", index_col=[0])
pay_df = pd.read_csv("/home/slawa/code/code-rep0/projects/data/payer_data_41940.csv", index_col=[0])
def_df['default'] = 1
pay_df['default'] = 0

df = pd.concat([def_df, pay_df])

In [4]:
y = df['default'].reset_index(drop=True)

X = df.drop(columns=['default']).reset_index(drop=True)

Discarding columns if too highly correlated with other or too many Nans

In [5]:
cat_vars = ['B_30', 
            'B_38', 
            'D_114', 
            'D_116', 
            'D_117', 
            'D_120', 
            'D_126', 
            'D_63', 
            'D_64', 
            'D_66', 
            'D_68']

drop columns if they correlate > 95% with others

In [6]:
X_corr = X.corr()

In [7]:
X_corr = X_corr.unstack().reset_index() # Unstack correlation matrix 
X_corr.columns = ['feature_1','feature_2', 'correlation_all'] # rename columns
X_corr.sort_values(by="correlation_all",ascending=False, inplace=True) # sort by correlation
X_corr = X_corr[X_corr['feature_1'] != X_corr['feature_2']] # Remove self correlation
X_corr = X_corr.drop_duplicates(subset='correlation_all')

In [8]:
red_features = list(X_corr[abs(X_corr['correlation_all'])>=.95]['feature_1']) ## abs so we also consider the negative corrs

In [9]:
X_red = X.drop(columns=red_features) ## dropping the highly correlated columns

In [10]:
## checking whether the high correlations are gone
X_red_corr = X_red.corr()
X_red_corr = X_red_corr.unstack().reset_index() # Unstack correlation matrix 
X_red_corr.columns = ['feature_1','feature_2', 'correlation_all'] # rename columns
X_red_corr.sort_values(by="correlation_all",ascending=False, inplace=True) # sort by correlation
X_red_corr = X_red_corr[X_red_corr['feature_1'] != X_red_corr['feature_2']] # Remove self correlation
X_red_corr = X_red_corr.drop_duplicates(subset='correlation_all')

In [11]:
X_red_corr

Unnamed: 0,feature_1,feature_2,correlation_all
12045,D_74,D_58,0.927332
6471,B_13,B_12,0.921825
457,B_2,B_33,0.913250
728,S_3,S_7,0.903899
28001,D_131,D_132,0.891850
...,...,...,...
9398,B_20,B_2,-0.779728
5981,S_8,S_15,-0.783457
23888,B_39,B_17,-0.805295
11791,D_73,D_108,-0.851429


In [12]:
len(red_features) ## we removed 13 columns

13

drop columns with nans if in __both__ groups > 80% nans

In [13]:
nan_threshold= 0.8 ## adjust the hardcoded values

In [14]:
def_nans = def_df.isna().sum()/len(def_df) 

In [15]:
def_nans_80 = def_nans[def_nans >= 0.8].index

In [16]:
pay_nans = pay_df.isna().sum()/len(pay_df)

In [17]:
pay_nans_80 = pay_nans[pay_nans>=0.8].index

In [18]:
nans_80 = [feature for feature in pay_nans_80 if feature in def_nans_80]

In [19]:
## check whether features were already removed
red_features_nan = [feature for feature in nans_80 if feature not in red_features] 

In [20]:
X_red = X_red.drop(columns=red_features_nan)

In [21]:
dropped_columns = red_features + red_features_nan

Building the pipeline

In [22]:
num_vars = [feature for feature in X_red.columns[2:] if feature not in cat_vars] ## exclude dates and IDs (first two columns)
str_vars = [feature for feature in X_red.columns[2:] if not pd.api.types.is_numeric_dtype(X_red[feature])] ## columns that are not numeric at all 
red_cat_vars = [feature for feature in cat_vars if feature not in dropped_columns] ## remaining categorical variables that have no string values

mark rows in categorical columns with values of -1 as NaNs

In [23]:
#X_red[red_cat_vars] = X_red[red_cat_vars].applymap(lambda x: np.nan if x in [-1,-1.0, "-1.0", "-1"] else x)

In [24]:
def nan_imp(X): ## imputes nan values for alternative values signifying nans
    nan_list = [-1,-1.0, "-1.0", "-1"] 
    return X.applymap(lambda x: np.nan if x in nan_list else x) ## perhaps subfunctions for arrays

In [25]:
# impute mean/most frequent value for other nans (specific to group?)
# robustscale all numerical values

num_scaler = RobustScaler()
num_imputer = SimpleImputer(strategy='mean')
#num_imputer = KNNImputer(n_neighbors=2) ## KNNIMputer is computationally demanding
## should come AFTER SCALING

num_pipe = make_pipeline(num_scaler, num_imputer)

str_trans = OrdinalEncoder() # is only needed if one wants to do knnimputer

nan_trans = FunctionTransformer(nan_imp)
cat_imputer = SimpleImputer(strategy="most_frequent") ## replace with KNNimputer on one neighbour, after transforming to numericals
#cat_imputer = KNNImputer(n_neighbors=1) # introducing it did not improve performance, but is computationally demanding
cat_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore') ## what happens to the old columns?
cat_pipe = make_pipeline(nan_trans, cat_imputer, cat_encoder)
str_pipe = make_pipeline(nan_trans, str_trans, cat_imputer, cat_encoder)

Beware of the Dummy trap. OneHotEncoder automatically deletes one column.

In [26]:
str_pipe

In [27]:
preprocessor = ColumnTransformer([
    ('num_pip', num_pipe, num_vars),
    ('cat_pip', cat_pipe, red_cat_vars), 
    ('str_pip', str_pipe, str_vars)],
    remainder='drop' ## all columns not in num_vars and red_cat_vars are dropped.
)

In [28]:
preprocessor

In [29]:
preprocessor.fit(X_red)

In [30]:
X_pp = pd.DataFrame(preprocessor.transform(X_red))

In [31]:
X_pp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,189,190,191,192,193,194,195,196,197,198
0,-0.432532,1.390477,0.219561,0.633174,1.052237,0.377275,0.112275,0.107377,1.289006,0.861310,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.590602,-0.024268,0.004731,85.887104,0.417020,-0.753076,0.035995,1.706369,1.245628,-0.012141,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.590359,0.365948,0.005088,85.474799,0.588708,-0.628862,0.125662,0.310612,0.391219,5.811345,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.644202,-0.027197,0.214465,0.403981,0.721292,0.464084,0.095199,0.310612,0.384427,5.817559,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.568984,0.374308,0.003306,86.646623,-0.069924,-0.861669,0.113541,0.310612,2.925453,4.838007,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55299,0.268886,-0.028833,-0.004019,-0.765108,2.473778,-0.502812,-0.050748,0.310612,0.467831,0.861310,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
55300,0.256161,-0.037582,0.002408,0.116283,0.108060,-0.137992,0.026207,0.310612,0.467831,0.861310,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
55301,0.260653,-0.016493,-0.001604,0.424582,0.080116,0.053407,-0.008005,0.310612,0.467831,0.861310,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
55302,0.255417,-0.016073,-0.004165,-0.560179,0.054235,0.670990,-0.065415,0.310612,0.467831,0.861310,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [32]:
X_red[red_cat_vars].nunique().sum() ## number of unqie values in all cat. values
## should equal the number of new onehotencoded columns - 1*number of features. The latter to avoid Dummy Trap.


43

Building the simple model and putting it into the pipe

In [33]:
mod = LogisticRegression() 
## such a model treats all rows as independent despite them being from the same person
results = cross_validate(mod, X_pp, y, cv = 5, scoring=['accuracy', 'recall', 'f1'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [34]:
results

{'fit_time': array([2.48946071, 1.93731499, 2.54018807, 2.37138557, 2.60667443]),
 'score_time': array([0.02555966, 0.02388167, 0.0252974 , 0.03677344, 0.04230881]),
 'test_accuracy': array([0.84061116, 0.84187686, 0.85200253, 0.85200253, 0.8358047 ]),
 'test_recall': array([0.63598952, 0.59259259, 0.63711186, 0.6285073 , 0.58196108]),
 'test_f1': array([0.65853186, 0.6442953 , 0.67539163, 0.67240344, 0.63134389])}

In [35]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)

In [36]:
pp_pred_pipe = make_pipeline(preprocessor, mod)

In [37]:
pp_pred_pipe

In [None]:
pp_pred_pipe.fit(X, y)

In [36]:
import pickle

In [118]:
pickle.dump(pp_pred_pipe, open('pp_pred_pipe', 'wb'))

In [37]:
loaded_model = pickle.load(open('../pickles/pp_pred_pipe', 'rb'))

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_2,R_1,S_3,D_41,B_3,D_42,...,D_130,D_131,D_132,D_133,R_28,D_139,D_140,D_142,D_144,D_145
0,6ba461c93869797c49b0f34c29274e50915466eda02a82...,2017-04-29,0.524390,0.331352,1.009814,0.009442,0.301375,0.007879,0.025821,0.144267,...,1.002085,0.006137,,0.001802,0.002142,1.008062,0.003595,0.090184,0.004446,0.365380
1,6ea315e6f219bc513964121907331f96e5b194127ebee3...,2017-09-10,0.463367,0.003464,0.818614,0.504045,0.218091,0.001362,0.014894,0.502401,...,0.002285,0.006148,,0.007019,0.005237,0.003997,0.001457,,0.000199,0.000804
2,6f105fec0ab833183a6a7aa3e39e79a6adf88a7cabe9df...,2017-03-26,0.463461,0.093902,0.818932,0.501653,0.240602,0.002078,0.027738,,...,1.004462,0.008238,,0.006285,0.007136,0.009838,0.009414,,0.004965,0.001735
3,6f105fec0ab833183a6a7aa3e39e79a6adf88a7cabe9df...,2017-04-26,0.442675,0.002785,1.005278,0.008112,0.257985,0.008379,0.023375,,...,1.008531,0.005823,,0.001023,0.007782,0.006977,0.007455,,0.007293,0.003186
4,6f105fec0ab833183a6a7aa3e39e79a6adf88a7cabe9df...,2017-05-26,0.471713,0.095840,0.817346,0.508451,0.154248,0.000736,0.026002,,...,1.001407,0.006686,,0.003718,0.009005,0.009866,0.007082,,0.000920,0.002958
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41935,ee9e5e4581c6949e605e528fe831f61f781b82d1dc9360...,2017-11-04,0.795172,0.002406,0.810826,0.001330,0.487755,0.002804,0.002469,,...,0.007231,0.009593,,0.000734,0.008338,0.008758,0.003598,,0.007223,0.009585
41936,ee9e5e4581c6949e605e528fe831f61f781b82d1dc9360...,2017-12-12,0.790260,0.000378,0.816546,0.006443,0.177583,0.004908,0.013492,,...,0.002935,0.001944,,0.002512,0.000008,0.006394,0.008207,,0.008766,0.005736
41937,ee9e5e4581c6949e605e528fe831f61f781b82d1dc9360...,2018-01-18,0.791994,0.005266,0.812976,0.008232,0.173920,0.006011,0.008592,,...,0.004754,0.008680,,0.003080,0.004641,0.004916,0.009980,,0.007456,0.006827
41938,ee9e5e4581c6949e605e528fe831f61f781b82d1dc9360...,2018-02-03,0.789973,0.005363,0.810696,0.002519,0.170526,0.009572,0.000368,,...,0.006846,0.005581,,0.007156,0.009519,0.006323,0.001534,,0.000794,0.000396


In [54]:
sample = X_red.sample(1)

In [56]:
loaded_model.predict_proba(sample)[0][1] * 100

8.899608798689334

# Creating custom amex scoring metric

In [176]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    
    ## TWEAK
    y_true = pd.DataFrame(y_true.reset_index(drop=True))
    y_pred = pd.DataFrame(y_pred)
    
    y_true = y_true.rename(columns={y_true.columns[0]:'target'})
    y_pred = y_pred.rename(columns={y_pred.columns[0]:'prediction'})
    ##
    
    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [177]:
from sklearn.metrics import make_scorer

In [178]:
amex_metric_scorer = make_scorer(amex_metric)

In [None]:
## dict of scoring metrics one might want to pass into cross validation
scorings = {'recall':'recall',
            'f1':'f1',
           'amex': amex_metric_scorer}

# Averaging preprocessed X and y

In [42]:
X_pp['customer_ID'] = X_red['customer_ID']

In [43]:
X_avg_pp = X_pp.groupby('customer_ID').mean()

In [48]:
y_ID = pd.DataFrame(y)

In [49]:
y_ID['customer_ID'] = X_red['customer_ID']

In [55]:
y_unique = y_ID.groupby('customer_ID').mean().astype(int) ## actually, this data is just in train_labels

# Trying out various models on averaged data

In [187]:
mod = LogisticRegression() 
## such a model treats all rows as independent despite them being from the same person
results = cross_validate(mod, X_avg_pp, y_unique['default'], cv = 5, scoring=scorings)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [188]:
results

{'fit_time': array([0.43967652, 0.26714873, 0.26808286, 0.2818532 , 0.9147768 ]),
 'score_time': array([0.04552531, 0.04289794, 0.03425288, 0.04941869, 0.05305338]),
 'test_recall': array([0.64473684, 0.66666667, 0.73362445, 0.6419214 , 0.62445415]),
 'test_amex': array([0.4532131 , 0.45552651, 0.45947708, 0.45125079, 0.45912858])}

In [190]:
mod = LogisticRegression(penalty='l1',solver='liblinear') 
## such a model treats all rows as independent despite them being from the same person
results = cross_validate(mod, X_avg_pp, y_unique, cv = 5, scoring=scorings)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [191]:
results

{'fit_time': array([0.95894814, 1.43016315, 0.89425135, 0.5155282 , 0.81968164]),
 'score_time': array([0.03467894, 0.05702305, 0.06101513, 0.04226947, 0.03156662]),
 'test_recall': array([0.73245614, 0.70614035, 0.75982533, 0.74235808, 0.68558952]),
 'test_f1': array([0.75565611, 0.7268623 , 0.75652174, 0.76576577, 0.71853547]),
 'test_amex': array([0.49373806, 0.48588011, 0.50026109, 0.51791194, 0.44430416])}

In [69]:
from sklearn.linear_model import Perceptron, PassiveAggressiveClassifier

In [192]:
mod_perc = Perceptron(penalty='l1')

In [193]:
results_perc = cross_validate(mod_perc, X_avg_pp, y_unique, cv = 5, scoring=scorings)
results_perc['test_recall'].mean()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.764778211905309

In [194]:
results

{'fit_time': array([0.95894814, 1.43016315, 0.89425135, 0.5155282 , 0.81968164]),
 'score_time': array([0.03467894, 0.05702305, 0.06101513, 0.04226947, 0.03156662]),
 'test_recall': array([0.73245614, 0.70614035, 0.75982533, 0.74235808, 0.68558952]),
 'test_f1': array([0.75565611, 0.7268623 , 0.75652174, 0.76576577, 0.71853547]),
 'test_amex': array([0.49373806, 0.48588011, 0.50026109, 0.51791194, 0.44430416])}

In [195]:
mod_agg = PassiveAggressiveClassifier()
results_agg = cross_validate(mod_agg, X_avg_pp, y_unique, cv = 5, scoring=scorings)
results_agg

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


{'fit_time': array([0.05657959, 0.06310368, 0.04209256, 0.08540249, 0.05321813]),
 'score_time': array([0.07544971, 0.09906197, 0.08540058, 0.0533917 , 0.03215933]),
 'test_recall': array([0.55701754, 0.47368421, 0.6069869 , 0.82969432, 0.60262009]),
 'test_f1': array([0.6195122 , 0.53071253, 0.58280922, 0.73076923, 0.62585034]),
 'test_amex': array([0.33738642, 0.25920546, 0.3194942 , 0.40785472, 0.35720282])}

In [82]:
from sklearn.neighbors import KNeighborsClassifier

In [199]:
knn = KNeighborsClassifier(n_neighbors=5)
results_knn = cross_validate(knn, X_avg_pp, y_unique, cv = 5, scoring=scorings)
results_knn['test_recall'].mean()

  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


0.47855282310579944

In [200]:
results_knn

{'fit_time': array([0.01227546, 0.00798273, 0.00844502, 0.00742483, 0.01128435]),
 'score_time': array([0.1573    , 0.14182854, 0.12138844, 0.12682533, 0.07112479]),
 'test_recall': array([0.47368421, 0.46929825, 0.51965066, 0.4628821 , 0.46724891]),
 'test_f1': array([0.5729443 , 0.57526882, 0.59649123, 0.54780362, 0.56613757]),
 'test_amex': array([0.34166025, 0.39677783, 0.34823032, 0.30214456, 0.33086334])}

In [92]:
from sklearn.ensemble import GradientBoostingClassifier

In [201]:
mod_gbc = GradientBoostingClassifier()

In [202]:
results_gbc = cross_validate(mod_gbc, X_avg_pp, y_unique, cv = 5, scoring=scorings)
results_gbc

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


{'fit_time': array([14.57703304, 15.44440269, 15.4460113 , 14.91057968, 14.98634648]),
 'score_time': array([0.01776814, 0.03420639, 0.01747346, 0.0174253 , 0.01749253]),
 'test_recall': array([0.71929825, 0.73245614, 0.75545852, 0.72052402, 0.68995633]),
 'test_f1': array([0.74545455, 0.74887892, 0.74089936, 0.74324324, 0.71011236]),
 'test_amex': array([0.46076064, 0.51150484, 0.47817671, 0.50152231, 0.4371964 ])}

In [95]:
results_gbc

{'fit_time': array([15.31042814, 15.58400297, 15.10994864, 14.88649392, 14.3548708 ]),
 'score_time': array([0.00783539, 0.00701785, 0.00785375, 0.00779939, 0.0067637 ]),
 'test_accuracy': array([0.87868852, 0.87540984, 0.86775956, 0.87650273, 0.85901639]),
 'test_recall': array([0.72368421, 0.73245614, 0.75545852, 0.72052402, 0.68995633]),
 'test_f1': array([0.74829932, 0.74553571, 0.74089936, 0.74492099, 0.71011236])}