In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Load the raw dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/bankdefaultsinrussia/dataset.csv')

In [None]:
# Replace the CSV header with short positional column names (45 in total).
# The order must match the column order of the loaded file.
df.columns = [
    'license', 'date', 'net_assets', 'ROA', 'liquid',
    'ibl', 'stocks', 'bond', 'oth_cap', 'sunk_retail_credit',
    'NI', 'organization_credit', 'sunk_organization_credit',
    'credit_portf', 'sunk_credit_portf',
    'organization_deposit', 'retail_deposit', 'security_tot', 'ROE',
    'retail_credit', 'reserv_credit_perc', 'zalog_credit_perc',
    'foreign_na_fr', 'retail_deposit_fr',
    'N3', 'N2', 'N1', 'capital', 'msk_spb',
    'INF_SA', 'NX_growth', 'micex_std', 'miacr_std', 'miacr_amount',
    'usd_rub_std_diff', 'micex_return', 'net_foreign_assets_diff',
    'net_gov_debt_diff', 'other_fin_debt_diff',
    'retail_debt_SA_DETREND_diff', 'stocks_capital_diff',
    'i_retail_spread_diff', 'usd_rub_return', 'miacr_diff',
    'default',
]

In [None]:
# Report the number of distinct license values in the dataset.
print(f"THERE ARE {df['license'].nunique()} UNIQUE BANKS IN THIS DATASET")

In [None]:
# One row per license carrying the largest `default` value recorded for it.
defaulted = df.groupby('license')['default'].max().to_frame().reset_index()
# Count licenses per flag value and print the count for flag == 1.
print(f'OUT OF WHICH {defaulted.groupby("default")["license"].count().to_dict()[1]} EVENTUALLY DEFAULTED')

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Parse the raw `date` column into pandas datetimes, stored in a new `DATE` column.
df['DATE'] =  pd.to_datetime(df.date)

In [None]:
# Calendar parts of DATE; DOF holds the day of the week as returned by pandas.
df['YEAR'] = df['DATE'].dt.year
df['MONTH'] = df['DATE'].dt.month
df['DOF'] = df['DATE'].dt.dayofweek

In [None]:
# Preview the derived calendar columns.
df[['YEAR', 'MONTH']].head()

In [None]:
# Encode year and month as a single sortable integer (e.g. 2015 and 3 -> 201503).
# Column arithmetic replaces the row-wise `apply`, which built a Series per row
# just to do one multiplication and one addition.
df['YEARMONTH'] = 100 * df['YEAR'] + df['MONTH']
df['YEARMONTH'].head()

In [None]:
# df.query('license == 20')['default']

In [None]:
# Licenses that have at least one row flagged default == 1.
defaulted = df.loc[df['default'] == 1, 'license'].unique()
# All rows belonging to those licenses, plotted as default flag vs. YEARMONTH.
_df = df[df['license'].isin(defaulted)]
plt.scatter(_df['YEARMONTH'], _df['default']);

In [None]:
from collections import Counter
# Count rows per license with Counter, then reshape into a frame with one row per
# license (column `index`) and its row count (column `cnt`), most frequent first.
freq = pd.DataFrame(Counter(df.license), index = ['cnt']).T.sort_values('cnt', ascending = False).reset_index()

In [None]:
# (rows, columns) of the full frame at this point.
df.shape

In [None]:
# Licenses with the fewest rows (freq is sorted by cnt, descending).
freq.tail()

In [None]:
from pylab import rcParams
rcParams['figure.figsize'] = 20, 10
# Hard-coded license ids; presumably copied from the top of `freq` -- TODO confirm.
_df = df[df.license.isin([1144., 2696., 1067., 3296., 2398., 3292., 2664., 2961., 2271.,
       1948., 2995., 1411., 2103., 2093., 2649., 3265., 2645., 2609.,
        702.,  704.])].sort_values('DATE')
# Draw one line per bank. A single plt.plot over the date-sorted rows of all
# banks would connect points of different banks into one zig-zag line.
for _license, _bank in _df.groupby('license'):
    plt.plot(_bank.DATE, _bank.net_assets, label=f'{_license:.0f}')
plt.xlabel('DATE')
plt.ylabel('net_assets')
plt.legend(title='license', ncol=2)
plt.title('NET ASSETS OF BANKS \nWITH MOST FREQUENT REPORTING');

In [None]:
# Keep only rows where both DATE and license are present; work on an independent copy.
df = df[df['DATE'].notna() & df['license'].notna()].copy(deep = True)

In [None]:
# Total number of rows (reports) available for each license.
df['N_REPORTS'] = df.groupby('license')['license'].transform('count')
# 1-based chronological position of each report within its license.
# Ranking by DATE (ties broken by row order) keeps the counter correct even when
# the file is not already sorted by date; a plain cumcount() would number the
# rows in file order, which later breaks the "last N reports" selection.
df['OCCURENCE'] = df.groupby('license')['DATE'].rank(method = 'first').astype(int)

In [None]:
# Inspect all rows of a single license (2696) as a spot check.
df[df.license == 2696.]

In [None]:
# Density of N_REPORTS for rows with default == 0 (blue) and default == 1 (red).
# kdeplot replaces the deprecated distplot(hist=False, kde=True); the colours are
# now actually applied and a legend tells the two curves apart. The loop variable
# no longer overwrites the `defaulted` array created earlier in the notebook.
colors = ['blue', 'red']
for _flag, _color in zip([0, 1], colors):
    sns.kdeplot(df.loc[df.default == _flag, 'N_REPORTS'], color = _color, label = f'default = {_flag}')
plt.legend()
plt.title('WE SEE THAT THERE IS BALANCE BETWEEN\n REPORTING FREQUENCY AND DEFAULT');

In [None]:
# Summary statistics of the highest OCCURENCE reached by each license.
df.groupby('license')['OCCURENCE'].max().describe()

In [None]:
# Order reports per license, then attach the date of the following report.
df = df.sort_values(['license', 'OCCURENCE'])
# Shift inside each license so the last report of a bank gets NaT instead of the
# first date of the next bank (a global shift(-1) leaks across the boundary and
# produces a bogus gap whenever the next bank starts later).
df['next_day'] = df.groupby('license')['DATE'].shift(-1)
# License id of the following row (differs from `license` at a bank boundary).
df['next_id'] = df.license.shift(-1)

In [None]:
# Days between a report and the row referenced by `next_day`.
df['GAP'] = (df['next_day'] - df['DATE']).dt.days

In [None]:
# Keep strictly positive gaps; everything else (negative, zero, missing) becomes 0.
df['GAP'] = df['GAP'].where(df['GAP'] > 0, 0)
df['GAP'].describe()

In [None]:
# Number of reports that follow this one for the same license
# (0 for the row with the highest OCCURENCE).
df['INV_OCCURENCCE'] = df.N_REPORTS - df.OCCURENCE

In [None]:
# Keep, for every license, the rows with fewer than 50 reports after them
# (i.e. at most the last 50 by OCCURENCE).
ndf = df.loc[df['INV_OCCURENCCE'] < 50]

In [None]:
# Per-license summary statistics of the positive gaps only.
df[df.GAP > 0].groupby('license')['GAP'].describe()

In [None]:
# Column labels of ndf, in frame order.
cols = ndf.columns.tolist()

In [None]:
# Preview the reduced frame.
ndf.head()

In [None]:
# Feature columns: everything positioned from 'net_assets' up to, but not
# including, 'default' in the column order of ndf.
featcols = cols[cols.index('net_assets') : cols.index('default')]
# One row per license: the mean of every feature over that license's rows in ndf.
tdf = ndf.groupby('license')[featcols].mean().reset_index()

In [None]:
# Target per license: the largest `default` value among its rows in ndf.
labels = ndf.groupby('license')['default'].max().to_dict()
tdf['Y'] = tdf['license'].map(labels)

In [None]:
# Sanity check: number of distinct licenses next to the frame shape
# (the row count should equal the license count after the per-license aggregation).
tdf.license.nunique(), tdf.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Hold out a test set (default 75/25 split). The seed is fixed so that every
# score reported further down is reproducible after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(tdf[featcols], tdf['Y'], random_state = 0)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Baseline: a single decision tree with default hyper-parameters and a fixed seed.
dt = DecisionTreeClassifier(random_state = 0)
dt.fit(X_train, y_train)

In [None]:
def get_scores(_yhat: np.array, _y_true: np.array, cl: str = 'CLASS_1') -> dict:

    """
    Summarise binary-classification quality for labels encoded as 0 / 1.

    Parameters
    ----------
    _yhat : array-like
        Predicted labels (0 or 1).
    _y_true : array-like
        True labels (0 or 1), same length as ``_yhat``.
    cl : str
        Display name of the positive class; it is embedded in the result keys.

    Returns
    -------
    dict
        Precision, recall and F1 for both classes, the harmonic mean of the two
        F1 scores (``F1_ALL``), class counts, the four confusion counts
        (``<true>_as_<predicted>``) and the share of positives in ``_y_true``.
        The key spellings are kept exactly as they were so existing tables do
        not change. A ratio whose denominator is zero is reported as ``nan``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of elements.
    """

    _yhat = np.asarray(_yhat).ravel()
    _y_true = np.asarray(_y_true).ravel()
    if _yhat.shape != _y_true.shape:
        raise ValueError(
            f"_yhat and _y_true must have the same length, got {_yhat.shape[0]} and {_y_true.shape[0]}"
        )

    def _ratio(_num, _den):
        # Undefined ratios become nan instead of raising or emitting warnings.
        return _num / _den if _den else float('nan')

    # Count the four outcomes directly. This always yields all four cells, even
    # when only one class is present (a label-inferred confusion matrix would
    # then be 1x1 and could not be indexed).
    _OO = int(np.sum((_y_true == 0) & (_yhat == 0)))  # true 0, predicted 0
    _OI = int(np.sum((_y_true == 0) & (_yhat == 1)))  # true 0, predicted 1
    _IO = int(np.sum((_y_true == 1) & (_yhat == 0)))  # true 1, predicted 0
    _II = int(np.sum((_y_true == 1) & (_yhat == 1)))  # true 1, predicted 1

    # FOR CLASS 1
    _per1 = _ratio(_II, _II + _OI)
    _rec1 = _ratio(_II, _II + _IO)

    # FOR CLASS 0
    _per0 = _ratio(_OO, _OO + _IO)
    _rec0 = _ratio(_OO, _OO + _OI)

    # F1 SCORES
    _f11 = _ratio(2 * _per1 * _rec1, _per1 + _rec1)
    _f10 = _ratio(2 * _per0 * _rec0, _per0 + _rec0)

    # F1 TOTAL (harmonic mean of the two per-class F1 scores)
    _ftot = _ratio(2 * _f11 * _f10, _f11 + _f10)

    _cc = Counter(_y_true.tolist())
    _res = {f"PRECICION_{cl}": _per1,
            f"RECALL_{cl}": _rec1,
            f"PRECISSION_NOT_{cl}": _per0,
            f"RECALL_NOT_{cl}": _rec0,
            f"F1_{cl}": _f11,
            f"F1_NOT_{cl}": _f10,
            f"F1_ALL": _ftot,
            f"NOT_{cl}": _cc[0],
            f"{cl}": _cc[1],
            "NOT_as_NOT": _OO,
            "NOT_as_CLS": _OI,
            "CLS_as_NOT": _IO,
            "CLS_as_CLS": _II,
            # Precision of a classifier that labels everything as positive.
            f"BENCHMARK PRECISION FOR {cl}": _ratio(_II + _IO, sum(_cc.values()))}
    return _res

In [None]:
# A NICE EXAMPLE OF OVERFITTING: THERE IS A LARGE DIFFERENCE BETWEEN THE TRAINING AND TEST SCORES,
# HENCE WE IGNORE THE dt MODEL HEREINAFTER.
# Columns of the resulting table: TESTING (scores on X_test) and TRAINING (scores on X_train).
pd.DataFrame([get_scores(dt.predict(X_test), y_test.values, cl = 'DEFAULTED'), get_scores(dt.predict(X_train), y_train.values, cl = 'DEFAULTED')], index = ['TESTING', 'TRAINING']).T

In [None]:
from datetime import datetime
from sklearn.model_selection import GridSearchCV
def search_grids(X, y, clf, params_grid, cros_val=5):
    """Run an exhaustive grid search and report how long it took.

    Parameters
    ----------
    X, y : training features and labels, passed straight to ``GridSearchCV.fit``.
    clf : the estimator whose hyper-parameters are searched.
    params_grid : dict mapping parameter names to lists of candidate values.
    cros_val : cross-validation setting forwarded as ``cv`` (default 5).
        Previously this argument was accepted but ignored.

    Returns
    -------
    tuple
        ``(best_params_, best_estimator_)`` of the fitted search.
    """
    grid_search = GridSearchCV(clf,
                               param_grid=params_grid,
                               cv=cros_val)
    start = datetime.now()
    grid_search.fit(X, y)
    elapsed = datetime.now() - start
    # total_seconds() also counts whole days; `.seconds` alone wraps around.
    print (f"TOOK {int(elapsed.total_seconds())} SECONDS")
    return  grid_search.best_params_, grid_search.best_estimator_

In [None]:
# Candidate hyper-parameter values for the decision-tree grid search.
tree_param_grid = dict(
    criterion=["gini", "entropy"],
    min_samples_split=[2, 4, 16],
    max_depth=[None, 2, 4, 16],
    min_samples_leaf=[5, 20],
    max_leaf_nodes=[None, 5, 20],
)

In [None]:
from pprint import pprint
# THE GRID COULD BE EXTENDED WITH THE FOLLOWING PARAMS (EXCEPT FOR tree_).
# Prints the attributes stored on the fitted tree, one nesting level deep.
pprint(dt.__dict__, depth = 1)

In [None]:
# Grid-search the decision tree; `prm` receives the best parameters and `tr` the
# best estimator returned by search_grids.
dtc = DecisionTreeClassifier(random_state = 0)
prm, tr = search_grids(X_train, y_train, dtc, tree_param_grid, cros_val = 3)

In [None]:

# Scores of the grid-searched tree on the test set (TESTING) and the training set (TRAINING).
pd.DataFrame([get_scores(tr.predict(X_test), y_test.values, cl = 'DEFAULTED'), get_scores(tr.predict(X_train), y_train.values, cl = 'DEFAULTED')], index = ['TESTING', 'TRAINING']).T

In [None]:
# predict the probability (instead of binary output) for later processing 
##################
#     EXAMPLE    #
##################
# rft.predict_proba(X_test)

In [None]:
# Class probabilities (instead of hard labels) from the grid-searched tree on the test set.
test_proba = tr.predict_proba(X_test)

In [None]:
# One column per class label, named after the entries of tr.classes_.
test_proba_df = pd.DataFrame(test_proba, columns = tr.classes_)

In [None]:
# Density of the predicted probability of class 1 on the test set.
# kdeplot replaces the deprecated distplot(hist=False); `bins` had no effect
# because no histogram was drawn.
sns.kdeplot(test_proba_df[1]);

In [None]:
# Bagging, Boosting, Stacking

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [None]:
#EXAMPLE (SMALL) PARAMETER GRID
#EXAMPLE (SMALL) PARAMETER GRID
# For classifiers 'auto' was just another name for 'sqrt' (and has since been
# removed from scikit-learn), so listing both searched the same setting twice.
# 'log2' makes the second candidate a genuinely different option.
rf_grid = {'bootstrap': [True, False],
             'max_depth': [10, 20, None],
             'max_features': ['sqrt', 'log2'],
             'min_samples_split': [2, 20],
             'n_estimators': [50, 100]}

In [None]:
#it can be extended with the following params: documentation: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# Show the attributes of an unfitted forest with default settings, to see which
# parameters exist.
r = RandomForestClassifier()
r.__dict__

In [None]:
# Grid-search the random forest; `rf` is the best estimator, `prm` its parameters
# (this rebinds `prm` from the decision-tree search).
rf1 = RandomForestClassifier(random_state = 0)
prm, rf = search_grids(X_train, y_train, rf1, rf_grid, cros_val = 3)

In [None]:
import h2o
from h2o.automl import H2OAutoML

# Start (or connect to) the H2O backend used by the AutoML cells below.
h2o.init()

In [None]:
# Independent copies of the feature frames with the target appended as `ycol`,
# so the H2O experiments cannot modify X_train / X_test.
XXtrain = X_train.copy(deep = True)
XXtrain['ycol'] = y_train.copy(deep = True)

XXtest = X_test.copy(deep = True)
XXtest['ycol'] = y_test.copy(deep = True)

In [None]:
# XXtrain.isna()

In [None]:
# Rename features to C0..C{n-1} by position; the last column (the target) becomes 'Cycol'.
XXtrain.columns = [f'C{i}' for i in range(XXtrain.shape[1]-1)]+['Cycol']

In [None]:
# Same positional renaming for the test frame, sized from the test frame itself
# so this cell does not silently depend on XXtrain having the same width.
XXtest.columns = [f'C{i}' for i in range(XXtest.shape[1]-1)]+['Cycol']

In [None]:
# Display the renamed test frame.
XXtest

In [None]:
# Convert both frames to H2OFrames; missing values are replaced with -1 first.
train = h2o.H2OFrame(python_obj=XXtrain.fillna(-1).to_dict('list'))
test = h2o.H2OFrame(python_obj=XXtest.fillna(-1).to_dict('list'))

In [None]:
# train

In [None]:

# Identify predictors and response
# NOTE: `x` and `y` are rebound here to column names (a list and a string).
x = train.columns
y = "Cycol"
x.remove(y)

# Mark the response as categorical in both frames.
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

# AutoML run limited to 20 models, with stacked ensembles and deep learning excluded.
aml = H2OAutoML(max_models=20, seed=1, exclude_algos = ["StackedEnsemble", "DeepLearning"])
aml.train(x=x, y=y, training_frame=train)

# Show the complete leaderboard.
lb = aml.leaderboard
lb.head(rows=lb.nrows)

In [None]:
# Model ids from the first leaderboard column, as a plain Python list in leaderboard order.
model_ids = list(aml.leaderboard['model_id'].as_data_frame().iloc[:,0])

In [None]:
# Fetch the second GBM of this AutoML run. The id used to be hard-coded with the
# timestamp of one particular run, which raises IndexError on any re-run; match
# on the stable prefix instead and fall back to the top leaderboard entry.
m = h2o.get_model(next((mid for mid in model_ids if mid.startswith("GBM_2_AutoML")), model_ids[0]))

In [None]:
# Predictions of the selected H2O model on the test frame.
m.predict(test)

In [None]:
# et1 = ExtraTreesClassifier(random_state = 0)
# prm, et = search_grids(X_train, y_train, et1, rf_grid, cros_val = 3)

In [None]:
# Reference models with default hyper-parameters (fixed seed): a random forest
# and an extra-trees ensemble.
rft = RandomForestClassifier(random_state = 0)
rft.fit(X_train, y_train)
ett = ExtraTreesClassifier(random_state = 0)
ett.fit(X_train, y_train)

In [None]:
# First individual tree of the grid-searched forest.
rf.estimators_[0]

In [None]:
# rft.estimators_

In [None]:
# Compare the grid-searched forest with the default-parameter ensembles.
# All rows are now scored on the same held-out test set (the default-parameter
# models used to be scored on the training data, which made them look perfect
# and the comparison meaningless). The 'GRIDSEARCH EXTT' row is dropped because
# its estimator `et` is never created: the search that would build it is
# commented out above.
pd.DataFrame([
    get_scores(rf.predict(X_test), y_test.values),
    get_scores(rft.predict(X_test), y_test.values),
    get_scores(ett.predict(X_test), y_test.values),
], index = ['GRIDSEARCH RF', 'BASE PARAMS RF', 'BASE PARAMS EXTT']).T

In [None]:
#WHY? 
# GridSearchCV default metric
#SEE https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
# TO UNDERSTAND WHAT YOU MIGHT NEED TO MAXIMIZE AS A RESULT OF THE SEARCH

In [None]:
# Larger decision-tree grid that also varies the cost-complexity pruning
# strength (ccp_alpha). This rebinds the `tree_param_grid` defined earlier.
tree_param_grid = {
              "ccp_alpha": [0.1, 0.2, 0.3],
              "criterion": ["gini", "entropy"],
              "min_samples_split": [2, 4, 16, 32],
              "max_depth": [None, 16, 24, 32],
              "min_samples_leaf": [2, 5, 20, 40],
              "max_leaf_nodes": [None, 5, 20, 30],
              }

In [None]:
# ## three trees

# t_clf1 = DecisionTreeClassifier(random_state =0)
# t_clf2 = DecisionTreeClassifier(random_state =0)
# t_clf3 = DecisionTreeClassifier(random_state =0)

# ############ SIMPLE EXAMPLE ############# #SEE ARGUMENT scoring
# gsc1 = GridSearchCV(t_clf1, scoring = 'f1', 
#                     param_grid=tree_param_grid, 
#                     cv=10)

# gsc2 = GridSearchCV(t_clf2, scoring = 'average_precision', 
#                     param_grid=tree_param_grid, 
#                     cv=10)

# gsc3 = GridSearchCV(t_clf3, scoring = 'recall', 
#                     param_grid=tree_param_grid, 
#                     cv=10)

# ### FITTING THOSE

# gsc1.fit(X_train, y_train)
# gsc2.fit(X_train, y_train)
# gsc3.fit(X_train, y_train)

In [None]:
from tensorflow import keras 

In [None]:
import optuna
import pandas as pd
from sklearn import linear_model
from sklearn import ensemble
from sklearn import datasets
from sklearn import model_selection

def objective(trial):
    """Optuna objective: mean 3-fold accuracy of a small dense network.

    The trial chooses the number of hidden blocks (``n_layers``), the hidden
    activation (``activation``) and the dropout rate (``dropout``); the
    parameter names are unchanged. Reads the notebook-level ``X_train`` and
    ``y_train``.

    What was broken before and is fixed here:
      * ``for layer in n_layers`` iterated over an int;
      * the layer width came from ``x``, which at this point of the notebook
        is a list of column names and has no ``.shape``;
      * ``keras.layers.RNN`` expects a cell object, not a width, and recurrent
        layers need a time axis that the one-row-per-bank features do not
        have, so plain Dense + Dropout blocks are used instead;
      * the model was compiled without a loss;
      * ``cross_val_score`` cannot clone a bare Keras model, so the folds are
        evaluated with an explicit loop and a fresh model per fold.

    Returns
    -------
    float
        Mean validation accuracy over the folds (the study maximises it).
    """

    n_layers = trial.suggest_int("n_layers", 1, 5)
    # Suggested once per trial: Optuna returns the same value for a repeated
    # parameter name inside one trial, so suggesting inside the loop added nothing.
    activation = trial.suggest_categorical("activation", ["sigmoid", "tanh", "relu"])
    dropout = trial.suggest_float('dropout', 0.1, 0.4)

    # NOTE(review): features are used unscaled and assumed to contain no NaN -- confirm.
    features = np.asarray(X_train, dtype="float32")
    targets = np.asarray(y_train).astype("int32")
    n_features = features.shape[1]
    hidden_units = max(n_features // 2, 1)
    n_epochs = 20

    def _build_model():
        # Fresh, untrained network for one fold.
        model = keras.models.Sequential()
        model.add(keras.layers.Dense(n_features, activation=activation))
        for _ in range(n_layers):
            model.add(keras.layers.Dense(hidden_units, activation=activation))
            model.add(keras.layers.Dropout(dropout))
        # Two output units, one per class, as in the original output layer.
        model.add(keras.layers.Dense(2, activation="softmax"))
        model.compile(optimizer="adam",
                      loss="sparse_categorical_crossentropy",
                      metrics=["accuracy"])
        return model

    folds = model_selection.StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
    fold_accuracy = []
    for fit_idx, val_idx in folds.split(features, targets):
        model = _build_model()
        model.fit(features[fit_idx], targets[fit_idx], epochs=n_epochs, verbose=0)
        _, fold_acc = model.evaluate(features[val_idx], targets[val_idx], verbose=0)
        fold_accuracy.append(fold_acc)

    accuracy = float(np.mean(fold_accuracy))
    return accuracy

# Maximise the value returned by `objective` over 100 trials.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

In [None]:
# Parameter values of the best trial.
study.best_params

In [None]:
# Rebinds `rf` (previously the grid-search winner) to a forest with hand-set
# parameters. NOTE(review): 188 / 20 presumably come from an earlier tuning run -- confirm.
rf = RandomForestClassifier(random_state = 0, n_estimators = 188, max_depth = 20)

In [None]:
# Fit the hand-configured forest on the training split.
rf.fit(X_train, y_train)

In [None]:
# Test-set scores of the hand-configured forest.
get_scores(rf.predict(X_test), y_test.values)

In [None]:
# pd.DataFrame([
#     get_scores(gsc1.best_estimator_.predict(X_test), y_test.values, 'DEAULTED'),
#     get_scores(gsc2.best_estimator_.predict(X_test), y_test.values, 'DEAULTED'),
#     get_scores(gsc3.best_estimator_.predict(X_test), y_test.values, 'DEAULTED'),], 
#     index = ['F1', 'AVG_PREC', 'RECALL']).T

In [None]:
# NOW APPARENTLY THERE IS DIFFERENCE BETWEEN THE SCORES. NOT THE WAY WE EXPECTED, 
# BUT IT NEEDS TO BE REMEMBERED THAT THE CV USES SCORES COMPUTED ON THE TRAINING DATASET TO PICK THE RESULT, AND THESE MAY NOT NECESSARILY TRANSLATE TO THE TEST DATASET

In [None]:
# HOWEVER PARAMS CHOSEN LOOK DIFFERENT

In [None]:
from sklearn.datasets import make_multilabel_classification

# Synthetic multi-label toy problem: 1000 samples, 2 features, 3 label columns.
# NOTE: this rebinds `y` (until now the H2O response column name) and defines `X`.
X, y = make_multilabel_classification(n_samples=1000, n_features=2, n_classes=3, n_labels=2, random_state=1)

# summarize dataset shape
print(X.shape, y.shape)

# summarize first few examples
for i in range(10):
	print(X[i], y[i])

In [None]:
# Unseeded decision tree for the multi-label toy example.
xt = DecisionTreeClassifier()

In [None]:
# Fit on the synthetic data; `y` has one column per label.
xt.fit(X, y)

In [None]:
# Predictions on the same synthetic data the tree was fitted on.
xt.predict(X)

In [None]:
from xgboost import XGBClassifier as xgb
from lightgbm import LGBMClassifier as lg
from catboost import CatBoostClassifier as cat

In [None]:
# Gradient-boosting classifiers with default settings and a fixed seed.
# `xgb`, `lg` and `cat` are aliases for the classifier classes, not the packages.
xx = xgb(random_state= 0)
lgbm = lg(random_state = 0)
ctt = cat(random_state = 0)

In [None]:
# Fit all three boosters on the bank training split (CatBoost with logging silenced).
xx.fit(X_train, y_train)
lgbm.fit(X_train, y_train)
ctt.fit(X_train, y_train, verbose = False)

In [None]:
# Test-set comparison of every fitted model. The class label passed to
# get_scores is spelled 'DEFAULTED' (it used to read 'DEAULTED'), so the row
# names match the tables produced earlier in the notebook.
pd.DataFrame([get_scores(tr.predict(X_test), y_test.values, 'DEFAULTED'), 
              get_scores(rft.predict(X_test), y_test.values, 'DEFAULTED'), 
              get_scores(ett.predict(X_test), y_test.values, 'DEFAULTED'), 
              get_scores(xx.predict(X_test), y_test.values, 'DEFAULTED'),
              get_scores(lgbm.predict(X_test), y_test.values, 'DEFAULTED'),
              get_scores(ctt.predict(X_test), y_test.values, 'DEFAULTED'),
             ],
             index = ['SINGLE_TREE', '100 RF', '100 EXTT', 'XGBOOST', 'LIGHTGBM', 'CAT']).T

In [None]:

# Look up the constructor stored on the CatBoost classifier class.
cat.__dict__['__init__']

In [None]:
#NEXT POTENTIAL STEPS
#- META CLASSIFIER TRAINED ON VARIOUS TIMEFRAMES OF THE DATASET
#- FEATURE TRANSFORMATION USING DOMAIN KNOWLEDGE (RATIOS/ETC)

In [None]:
#FIN

In [None]:
#P.S ONE MORE ALGORITHM

In [None]:
# !pip install skope-rules

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from skrules import SkopeRules

In [None]:
# Rule-based classifier; feature names are passed so the rules are expressed
# in terms of the original column names.
rule = SkopeRules(random_state=0, feature_names = featcols)

In [None]:
# Fit the rule learner and score the prediction of its top 5 rules on the test set.
# The stray trailing comma is removed: it wrapped the result in a one-element
# tuple, so the cell displayed `({...},)` instead of the score dictionary.
rule.fit(X_train, y_train)
get_scores(rule.predict_top_rules(X_test, 5), y_test.values)

In [None]:
# Show the 5 best rules, the same number used by predict_top_rules above.
# The slice bound used to be a float (apparently a pasted score), which raises
# TypeError because list slices need integer bounds.
rule.rules_[:5]

In [None]:
# Apply one hard-coded rule to the per-bank table and count the distinct
# licenses it selects in each class of Y.
# NOTE(review): the thresholds look copied from a fitted rule of an earlier run -- confirm.
tdf.query('INF_SA > 0.009013020433485508 and net_gov_debt_diff > -37380.984375 and other_fin_debt_diff <= 73263.51953125').groupby('Y')['license'].nunique()

In [None]:
### PRODUCTION RANDOM FOREST

In [None]:
# cols = [1,2,3]

# [3,1,2]


In [None]:
class ProdRandomForestClassifier(RandomForestClassifier):
    """Random forest that re-aligns DataFrame inputs to the columns seen at fit time.

    When fitted on a DataFrame the column order is remembered in
    ``feature_columns_``. Any DataFrame passed to ``predict`` afterwards is
    aligned to that layout: unknown columns are dropped, missing columns are
    added and filled with 0, and the columns are put back in the fitted order.
    Non-DataFrame inputs (e.g. NumPy arrays) are passed through untouched.
    """

    def __init__(self, n_estimators=100, criterion='gini',
                 max_depth=None, min_samples_split=2,
                 min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                 max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0,
                 min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None,
                 random_state=None, verbose=0, warm_start=False, class_weight=None,
                 ccp_alpha=0.0, max_samples=None):
        # The default for max_features is spelled 'sqrt': for classifiers it is
        # what 'auto' meant, and 'auto' is rejected by newer scikit-learn.

        # Keyword arguments: scikit-learn deprecated, then disallowed, passing
        # these constructor parameters positionally.
        super().__init__(n_estimators=n_estimators, criterion=criterion,
                         max_depth=max_depth,
                         min_samples_split=min_samples_split,
                         min_samples_leaf=min_samples_leaf,
                         min_weight_fraction_leaf=min_weight_fraction_leaf,
                         max_features=max_features,
                         max_leaf_nodes=max_leaf_nodes,
                         min_impurity_decrease=min_impurity_decrease,
                         bootstrap=bootstrap, oob_score=oob_score,
                         n_jobs=n_jobs, random_state=random_state,
                         verbose=verbose, warm_start=warm_start,
                         class_weight=class_weight, ccp_alpha=ccp_alpha,
                         max_samples=max_samples)
        # min_impurity_split no longer exists in newer scikit-learn, so it is not
        # forwarded to the parent constructor. Storing it as an attribute keeps
        # the signature, get_params() and clone() working; releases that still
        # know the parameter read it from this attribute.
        self.min_impurity_split = min_impurity_split

    def preprocess_X(self, _X):

        """Align ``_X`` to the feature columns recorded by ``fit``.

        Returns ``_X`` unchanged when it is not a DataFrame or when no column
        layout has been recorded yet.
        """

        expected = getattr(self, 'feature_columns_', None)
        if expected is None or not isinstance(_X, pd.DataFrame):
            return _X

        # reindex drops columns that are not expected, adds the missing ones
        # filled with 0 and orders everything like `expected`. Values already
        # present (including NaN) are left untouched.
        return _X.reindex(columns=expected, fill_value=0)

    def fit(self, X, y):

        """Record the column layout of ``X`` (if it is a DataFrame) and fit the forest."""

        self.feature_columns_ = list(X.columns) if isinstance(X, pd.DataFrame) else None
        return super().fit(self.preprocess_X(X), y)


    def predict(self, X):

        """Predict class labels after aligning ``X`` to the fitted columns."""

        return super().predict(self.preprocess_X(X))

    
    

In [None]:
# Instantiate the production wrapper with its default parameters.
pRF = ProdRandomForestClassifier()

In [None]:
# Fit through the wrapper, which routes X through preprocess_X first.
pRF.fit(X_train, y_train)

In [None]:
# Smoke test: predict on the data the wrapper was fitted on.
pRF.predict(X_train)