In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import colors
import datetime

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, learning_curve
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier, BaggingClassifier
# explicitly require this experimental feature
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
# now you can import normally from ensemble
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import KFold, GridSearchCV
import xgboost as xgb, lightgbm as lgbm, catboost as catb

import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

matplotlib.rcParams.update({'font.size': 12})
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



In [None]:
def twohistplot(column, df, Xrangetoplot = None, Xbinstoplot = None, Yrangetoplot = None, Ybinstoplot = None, medianplot = True):
    """Draw two side-by-side density histograms of ``column`` split by 'Credit Default'.

    The left (green) panel shows rows where ``df['Credit Default'] == 0`` and the
    right (red) panel shows all other rows.

    Parameters
    ----------
    column : str
        Name of the column of ``df`` to plot.
    df : pandas.DataFrame
        Frame holding ``column`` and a 'Credit Default' column.
    Xrangetoplot, Xbinstoplot :
        ``range`` and ``bins`` passed to ``hist`` for the left panel.
    Yrangetoplot, Ybinstoplot :
        Same for the right panel; each falls back to the matching X value when
        left as ``None``.
    medianplot : bool
        When true, mark the mean (dashed line) and the median (solid line) on
        both panels.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    if Yrangetoplot is None:
        Yrangetoplot = Xrangetoplot
    if Ybinstoplot is None:
        Ybinstoplot = Xbinstoplot

    # Select from the frame that was passed in (not a notebook-level global) so
    # the function plots whatever data the caller hands over.
    # Missing values are dropped: they cannot be binned and make the automatic
    # range detection fail when no explicit range is given.
    is_closed = df['Credit Default'] == 0
    X = df.loc[is_closed, column].dropna()
    Y = df.loc[~is_closed, column].dropna()

    fig, axs = plt.subplots(1, 2, figsize=(12, 5))
    axs[0].hist(X, bins = Xbinstoplot, density = True, range = Xrangetoplot, color = 'green', alpha = 0.3, log = False)
    axs[1].hist(Y, bins = Ybinstoplot, density = True, range = Yrangetoplot, color = 'red', alpha = 0.3, log = False)

    if medianplot:
        for ax, values in zip(axs, (X, Y)):
            ax.axvline(values.mean(), color='k', linestyle='dashed', linewidth=1)
            ax.axvline(values.median(), color='k', linestyle='solid', linewidth=1)

    axs[0].set_title('Credit closed on time')
    axs[1].set_title('Credit expired')
    for ax in axs:
        ax.set_xlabel(column)
        ax.grid(True)
    plt.show()
    

In [None]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    """Print the classification reports for the train and test predictions,
    followed by a cross-tabulation of the test labels against the test predictions.
    """
    labelled_pairs = (
        ('TRAIN', y_train_true, y_train_pred),
        ('TEST', y_test_true, y_test_pred),
    )
    for heading, truth, predicted in labelled_pairs:
        print(heading + '\n\n' + classification_report(truth, predicted))

    print('CONFUSION MATRIX\n')
    confusion = pd.crosstab(y_test_true, y_test_pred)
    print(confusion)

In [None]:
def balance_df_by_target(df, target_name, random_state=None):
    """Oversample the minority class of ``target_name`` and return the shuffled frame.

    The rows of the least frequent class are replicated
    ``int(major_count / minor_count) - 1`` extra times, so the two class counts
    end up within one multiple of the minority size of each other.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target column.
    target_name : str
        Name of the target column.
    random_state : int or None, optional
        Seed forwarded to the final shuffle; ``None`` keeps it non-deterministic.

    Returns
    -------
    pandas.DataFrame
        Shuffled frame. When rows were added the index is rebuilt
        (``ignore_index=True``); otherwise the original index labels are kept.
    """
    target_counts = df[target_name].value_counts()
    if target_counts.empty:
        # Nothing to balance (empty frame or all-missing target).
        return df.sample(frac=1, random_state=random_state)

    # idxmax/idxmin return the class *labels*; argmax/argmin return positions in
    # the sorted counts, which only coincide with the labels by accident.
    major_class_name = target_counts.idxmax()
    minor_class_name = target_counts.idxmin()

    disbalance_coeff = int(target_counts.loc[major_class_name] / target_counts.loc[minor_class_name]) - 1

    if disbalance_coeff > 0:
        minor_rows = df[df[target_name] == minor_class_name]
        # One concat instead of repeated DataFrame.append, which pandas 2.x removed.
        df = pd.concat([df] + [minor_rows] * disbalance_coeff, ignore_index=True)

    return df.sample(frac=1, random_state=random_state)

In [None]:
# Locations of the train and test CSV files under the Kaggle input directory.
TRAIN_DATASET_PATH = '/kaggle/input/credit-data/train (1).csv'
TEST_DATASET_PATH = '/kaggle/input/credit-data/test (1).csv'

# Загрузка данных

Data fields
* Home Ownership - домовладение
* Annual Income - годовой доход
* Years in current job - количество лет на текущем месте работы
* Tax Liens - налоговые обременения
* Number of Open Accounts - количество открытых счетов
* Years of Credit History - количество лет кредитной истории
* Maximum Open Credit - наибольший открытый кредит (максимальная сумма, которая когда-либо была доступна клиенту)
* Number of Credit Problems - количество проблем с кредитом
* Months since last delinquent - количество месяцев с последней просрочки платежа
* Bankruptcies - банкротства
* Purpose - цель кредита
* Term - срок кредита
* Current Loan Amount - текущая сумма кредита (сумма, которую еще предстоит выплатить клиенту)
* Current Credit Balance - текущий кредитный баланс (сумма, которую может тратить клиент с кредитного счета)
* Monthly Debt - ежемесячный долг
* Credit Score - баллы кредитного рейтинга
* Credit Default - факт невыполнения кредитных обязательств (0 - погашен вовремя, 1 - просрочка)

In [None]:
train = pd.read_csv(TRAIN_DATASET_PATH)

In [None]:
train.head()

In [None]:
train.shape

In [None]:
train.dtypes

In [None]:
train.isnull().sum()

# Data analysis

In [None]:
# Per-column overview of the training data: summary statistics for int64/float64
# columns, followed by the distinct values of every column.
temp = train
for item in temp.columns:
    column_values = temp[item]
    if column_values.dtype in ['int64', 'float64']:
        print(
            item, '\n',
            'Min value: ', column_values.min(), '\n',
            'Mean value: ', column_values.mean(), '\n',
            'Mode value: ', column_values.mode()[0], '\n',
            'Median value: ', column_values.median(), '\n',
            'Max value: ', column_values.max(), '\n', '\n',
        )
    # The distinct values are printed for every column, numeric or not.
    print(item, '\n', pd.unique(column_values), '\n', '\n')

temp = None

In [None]:
class DataPreprocessing:
    """Clean and integer-encode the raw credit data columns.

    Every ``*_corr`` method fixes a single column of the frame it receives, in
    place, and returns that same frame. ``data_correction`` is the usual entry
    point: it copies its input once and runs all corrections on the copy, so the
    caller's frame is left untouched.
    """

    def __init__(self):
        # Kept for backward compatibility; no method of this class uses it.
        self.av_X = pd.DataFrame()

    @staticmethod
    def _encode(column, codes):
        """Return ``column`` with every value found in ``codes`` replaced by its code.

        Values that are not keys of ``codes`` (including NaN and values that are
        already encoded) pass through unchanged. ``map`` is used instead of
        ``replace`` so the result does not depend on the implicit downcasting
        that ``replace`` performs on object columns.
        """
        return column.map(lambda value: codes.get(value, value))

    def home_ownership_corr(self, X):
        """Encode 'Home Ownership' categories as integers 1-4."""
        str_to_numbers = {'Own Home': 1, 'Home Mortgage': 2, 'Rent': 3, 'Have Mortgage': 4}
        X['Home Ownership'] = self._encode(X['Home Ownership'], str_to_numbers).astype(int)
        return X

    def purpose_corr(self, X):
        """Encode 'Purpose' categories as integers 1-15."""
        str_to_numbers = {'debt consolidation': 1, 'other': 2, 'home improvements': 3, 'take a trip': 4,
                          'buy a car': 5,'small business': 6, 'business loan': 7, 'wedding': 8,
                          'educational expenses': 9, 'buy house': 10, 'medical bills': 11, 'moving': 12,
                          'major purchase': 13, 'vacation': 14, 'renewable energy': 15}
        X['Purpose'] = self._encode(X['Purpose'], str_to_numbers).astype(int)
        return X

    def term_corr(self, X):
        """Encode 'Term' as 1 (short) or 2 (long)."""
        str_to_numbers = {'Short Term': 1, 'Long Term': 2}
        X['Term'] = self._encode(X['Term'], str_to_numbers).astype(int)
        return X

    def annual_income_corr(self, X):
        """Replace missing 'Annual Income' values with 0."""
        X['Annual Income'] = X['Annual Income'].fillna(0)
        return X

    def years_in_current_job_corr(self, X):
        """Encode 'Years in current job' as an integer 0-10; missing values become 0."""
        str_to_numbers = {'10+ years': 10, '9 years': 9, '8 years': 8, '7 years': 7, '6 years': 6,
                         '5 years': 5, '4 years': 4, '3 years': 3,'2 years': 2, '1 year': 1,
                         '< 1 year': 0}
        encoded = self._encode(X['Years in current job'], str_to_numbers)
        # Cast like the other encoded columns so the column never stays object/float.
        X['Years in current job'] = encoded.fillna(0).astype(int)
        return X

    def months_since_last_delinquent_corr(self, X):
        """Replace missing 'Months since last delinquent' with 0 and cast to int."""
        X['Months since last delinquent'] = X['Months since last delinquent'].fillna(0).astype(int)
        return X

    def bankruptcies_corr(self, X):
        """Replace missing 'Bankruptcies' with 0 and cast to int."""
        X['Bankruptcies'] = X['Bankruptcies'].fillna(0).astype(int)
        return X

    def credit_score_corr(self, X):
        """Divide 'Credit Score' values above 5000 by 10, fill missing with 0, cast to int."""
        too_high = X['Credit Score'] > 5000
        X.loc[too_high, 'Credit Score'] = X.loc[too_high, 'Credit Score'] / 10
        X['Credit Score'] = X['Credit Score'].fillna(0).astype(int)
        return X

    def current_loan_amount_corr(self, X):
        """Set 'Current Loan Amount' to 0 wherever it equals 99999999."""
        X.loc[X['Current Loan Amount'] == 9.9999999e+07, 'Current Loan Amount'] = 0
        return X

    def maximum_open_credit_corr(self, X):
        """Divide 'Maximum Open Credit' values of at least 1e9 by 10."""
        too_large = X['Maximum Open Credit'] >= 1000000000
        X.loc[too_large, 'Maximum Open Credit'] = X.loc[too_large, 'Maximum Open Credit'] / 10
        return X

    def data_correction(self, Y):
        """Run every column correction on a copy of ``Y`` and return the corrected copy.

        Working on a copy keeps the caller's frame unchanged and avoids writing
        into a slice of another frame (for example one returned by a split).
        """
        Y = Y.copy()
        corrections = (
            self.home_ownership_corr,
            self.purpose_corr,
            self.term_corr,
            self.annual_income_corr,
            self.years_in_current_job_corr,
            self.months_since_last_delinquent_corr,
            self.bankruptcies_corr,
            self.credit_score_corr,
            self.current_loan_amount_corr,
            self.maximum_open_credit_corr,
        )
        for correct in corrections:
            Y = correct(Y)
        return Y

In [None]:
# Run all column corrections on the training frame used for the exploratory
# plots below, then show how many nulls remain in each column.
datacorrection = DataPreprocessing()
train = datacorrection.data_correction(train)
# Individual corrections can also be applied one at a time, e.g.:
#train = datacorrection.annual_income_corr(train)
#train = datacorrection.years_in_current_job_corr(train)
train.isnull().sum()


In [None]:
train.shape

In [None]:
# Same per-column overview as before, now on the corrected training data:
# summary statistics for int64/float64 columns, then the distinct values.
temp = train
for item in temp.columns:
    column_values = temp[item]
    if column_values.dtype in ['int64', 'float64']:
        print(
            item, '\n',
            'Min value: ', column_values.min(), '\n',
            'Mean value: ', column_values.mean(), '\n',
            'Mode value: ', column_values.mode()[0], '\n',
            'Median value: ', column_values.median(), '\n',
            'Max value: ', column_values.max(), '\n', '\n',
        )
    # The distinct values are printed for every column, numeric or not.
    print(item, '\n', pd.unique(column_values), '\n', '\n')

temp = None

In [None]:
train.dtypes

In [None]:
# Scatter of 'Credit Score' against the default flag for the rows selected by dataindx.
plt.figure(figsize = (10,10))
dataindx = (train['Months since last delinquent'] >= 0)
sns.scatterplot(x = train.loc[dataindx, 'Credit Score'], y = train.loc[dataindx, 'Credit Default'])
# minorticks_on must actually be called, otherwise which='both' has no minor
# ticks to draw grid lines for.
plt.minorticks_on()
# Pass the on/off flag positionally: the keyword was renamed from `b` to
# `visible` in newer matplotlib releases, the positional form works in both.
plt.grid(True, which = 'both')
plt.show()


In [None]:
twohistplot('Months since last delinquent', train, [1,118], 118)
#twohistplot(column, df, Xrangetoplot, Xbinstoplot, Yrangetoplot, Ybinstoplot)

In [None]:
twohistplot('Years in current job', train, [0,10], 10)

In [None]:
twohistplot('Number of Open Accounts', train, [0,44], 44)

In [None]:
twohistplot('Years of Credit History', train, [0,60], 60)

In [None]:
twohistplot('Maximum Open Credit', train, [0,3000000], 500)

In [None]:
twohistplot('Maximum Open Credit', train, [3000000, 10000000], 100, [3000000, 10000000], 100, False)

In [None]:
twohistplot('Number of Credit Problems', train)

In [None]:
twohistplot('Current Loan Amount', train, [1,800000], 100, [1,800000], 100)

In [None]:
train.loc[(train['Current Loan Amount']>=900000)&(train['Credit Default']==0)&(train['Maximum Open Credit']>=0),'Current Loan Amount'].count()


In [None]:
twohistplot('Current Credit Balance', train, [0,1000000], 100, [0,1000000], 100)

In [None]:
twohistplot('Monthly Debt', train, [0,100000], 100, [0,100000], 100)

In [None]:
twohistplot('Credit Score', train, [600,760], 100, [600,760], 100)

In [None]:
train.loc[(train['Credit Score']>=0)&(train['Credit Score']<=300), 'Credit Score']

In [None]:
train.dtypes

# Reading the train and test data

In [None]:
# Name of the label column.
target_name = 'Credit Default'
# Reload the training data from disk (the `train` frame used for the plots above
# had the column corrections applied) and load the test data.
train = pd.read_csv(TRAIN_DATASET_PATH)
test = pd.read_csv(TEST_DATASET_PATH)

# Columns that are passed to the scaler in the next cell.
numeric_features = ['Annual Income', 'Number of Open Accounts', 
                    'Years of Credit History', 'Maximum Open Credit', 'Number of Credit Problems', 
                    'Months since last delinquent', 'Bankruptcies', 'Current Loan Amount', 
                    'Current Credit Balance', 'Monthly Debt', 'Credit Score']

# Data scaling

In [None]:
# Standardise the numeric columns of the training frame and keep the result as `train`.
# NOTE(review): the scaler is fit on every training row, i.e. before the
# train/validation split made further down, so validation rows influence the
# scaling statistics.
# NOTE(review): this runs before DataPreprocessing.data_correction is applied;
# that class compares against raw-unit thresholds and casts some of these columns
# to int, which does not fit standardised values. `test` is not transformed here.
# Confirm the intended order of scaling and correction.
scaler = StandardScaler()

train_norm = train.copy()
train_norm[numeric_features] = scaler.fit_transform(train_norm[numeric_features])

train = train_norm.copy()

# Splitting the data into train and validation sets

In [None]:
# Separate the features from the label, then hold out 33% of the rows for
# validation (shuffled, with a fixed seed).
X = train.drop(columns = target_name)
y = train[target_name]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.33, shuffle = True, random_state = 21)

In [None]:
# Clean first, scale second. The corrections compare against raw-unit values
# (e.g. Credit Score > 5000, Current Loan Amount == 99999999) and cast several
# columns to int, so they have to see the unscaled data. The split frames made
# above are therefore only used for their row indices here.
preprocessor = DataPreprocessing()
raw_features = pd.read_csv(TRAIN_DATASET_PATH).drop(columns = target_name)
corrected_features = preprocessor.data_correction(raw_features)
X_train = corrected_features.loc[X_train.index].copy()
X_valid = corrected_features.loc[X_valid.index].copy()
# Re-read the test file so this cell gives the same result when it is re-run.
test = preprocessor.data_correction(pd.read_csv(TEST_DATASET_PATH))

# Fit the scaler on the training rows only, then apply the same transform to
# the validation and test frames so the models see one consistent feature scale.
scaler = StandardScaler().fit(X_train[numeric_features])
for frame in (X_train, X_valid, test):
    frame[numeric_features] = scaler.transform(frame[numeric_features])

# Target variable balancing

In [None]:
# Re-attach the label to the training features, balance the classes with
# balance_df_by_target, and display the resulting class counts.
df_for_balancing = pd.concat([X_train, y_train], axis=1)
df_balanced = balance_df_by_target(df_for_balancing, target_name)

df_balanced[target_name].value_counts()


In [None]:
X_train = df_balanced.drop(columns=target_name)
y_train = df_balanced[target_name]

# A random forest classifier.

In [None]:
#RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, 
#min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, 
#min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, 
#random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
# Fit the random forest, then report its score on the train and validation sets.
RFclf = RandomForestClassifier(n_estimators=60, max_depth=7, random_state=21)
RFclf.fit(X_train, y_train)

y_train_pred = RFclf.predict(X_train)
rf_train_score = RFclf.score(X_train, y_train)
print('Train - ', rf_train_score)

y_valid_pred = RFclf.predict(X_valid)
rf_valid_score = RFclf.score(X_valid, y_valid)
print('Validation - ', rf_valid_score)

In [None]:
get_classification_report(y_train, y_train_pred, y_valid, y_valid_pred)

# A Bagging classifier.

In [None]:
#BaggingClassifier(base_estimator=None, n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, 
#bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0)
# Fit the bagging ensemble, then report its score on the train and validation sets.
Bgclf = BaggingClassifier(n_estimators=115, random_state=21)
Bgclf.fit(X_train, y_train)

y_train_pred = Bgclf.predict(X_train)
bg_train_score = Bgclf.score(X_train, y_train)
print('Train - ', bg_train_score)

y_valid_pred = Bgclf.predict(X_valid)
bg_valid_score = Bgclf.score(X_valid, y_valid)
print('Validation - ', bg_valid_score)

In [None]:
get_classification_report(y_train, y_train_pred, y_valid, y_valid_pred)

# Gradient Boosting for classification

In [None]:
#GradientBoostingClassifier(*, loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, 
#criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
#max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=None, 
#max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, 
#n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)
# Fit gradient boosting, then report its score on the train and validation sets.
GBclf = GradientBoostingClassifier(learning_rate=0.05, n_estimators=115, random_state=21)
GBclf.fit(X_train, y_train)

y_train_pred = GBclf.predict(X_train)
gb_train_score = GBclf.score(X_train, y_train)
print('Train - ', gb_train_score)

y_valid_pred = GBclf.predict(X_valid)
gb_valid_score = GBclf.score(X_valid, y_valid)
print('Validation - ', gb_valid_score)

In [None]:
get_classification_report(y_train, y_train_pred, y_valid, y_valid_pred)

# Histogram-based Gradient Boosting Classification Tree.

In [None]:
#HistGradientBoostingClassifier(loss='auto', *, learning_rate=0.1, max_iter=100, max_leaf_nodes=31, 
#max_depth=None, min_samples_leaf=20, l2_regularization=0.0, max_bins=255, categorical_features=None, 
#monotonic_cst=None, warm_start=False, early_stopping='auto', scoring='loss', validation_fraction=0.1, 
#n_iter_no_change=10, tol=1e-07, verbose=0, random_state=None)
# Fit histogram-based gradient boosting, then report its score on the train and validation sets.
HGBclf = HistGradientBoostingClassifier(learning_rate=0.05, max_iter=90, max_depth=9, random_state=21)
HGBclf.fit(X_train, y_train)

y_train_pred = HGBclf.predict(X_train)
hgb_train_score = HGBclf.score(X_train, y_train)
print('Train - ', hgb_train_score)

y_valid_pred = HGBclf.predict(X_valid)
hgb_valid_score = HGBclf.score(X_valid, y_valid)
print('Validation - ', hgb_valid_score)

In [None]:
get_classification_report(y_train, y_train_pred, y_valid, y_valid_pred)

# Light GBM

In [None]:
# Fit LightGBM with default hyper-parameters, then report its score on the train and validation sets.
lgbmclf = lgbm.LGBMClassifier(random_state=21)
lgbmclf.fit(X_train, y_train)

y_train_pred = lgbmclf.predict(X_train)
lgbm_train_score = lgbmclf.score(X_train, y_train)
print('Train - ', lgbm_train_score)

y_valid_pred = lgbmclf.predict(X_valid)
lgbm_valid_score = lgbmclf.score(X_valid, y_valid)
print('Validation - ', lgbm_valid_score)

In [None]:
get_classification_report(y_train, y_train_pred, y_valid, y_valid_pred)

# Yandex CatBoost

In [None]:
# Fit CatBoost (training log silenced), then report its score on the train and validation sets.
catbclf = catb.CatBoostClassifier(silent=True, random_state=21)
catbclf.fit(X_train, y_train)

y_train_pred = catbclf.predict(X_train)
catb_train_score = catbclf.score(X_train, y_train)
print('Train - ', catb_train_score)

y_valid_pred = catbclf.predict(X_valid)
catb_valid_score = catbclf.score(X_valid, y_valid)
print('Validation - ', catb_valid_score)

In [None]:
get_classification_report(y_train, y_train_pred, y_valid, y_valid_pred)

# Stack of estimators with a final classifier.

In [None]:
#StackingClassifier(estimators, final_estimator=None, *, cv=None, stack_method='auto', n_jobs=None, 
#passthrough=False, verbose=0)
"""Stclf = StackingClassifier([('RFclf', RFclf), ('lgbmclf', lgbmclf), ('catbclf', catbclf), ('Bgclf', Bgclf), ('GBclf', GBclf), ('HGBclf', HGBclf)]).fit(X_train, y_train) 
y_train_pred = Stclf.predict(X_train)
print('Train - ',Stclf.score(X_train, y_train))
y_valid_pred = Stclf.predict(X_valid)
print('Validation - ',Stclf.score(X_valid, y_valid))"""

In [None]:
"""get_classification_report(y_train, y_train_pred, y_valid, y_valid_pred)"""

# Soft Voting/Majority Rule classifier for unfitted estimators.

In [None]:
#VotingClassifier(estimators, *, voting='hard', weights=None, n_jobs=None, flatten_transform=True, verbose=False)
"""Vclf = StackingClassifier([('RFclf', RFclf), ('lgbmclf', lgbmclf), ('catbclf', catbclf), ('Bgclf', Bgclf), ('GBclf', GBclf), ('HGBclf', HGBclf)]).fit(X_train, y_train) 
print('Train - ',Vclf.score(X_train, y_train))
y_pred_valid = Vclf.predict(X_valid)
print('Validation - ',Vclf.score(X_valid, y_valid))"""

In [None]:
"""get_classification_report(y_train, y_train_pred, y_valid, y_valid_pred)"""

# Test predictions using the best model with max f1-score=0.53: GBclf

In [None]:
submit = pd.read_csv('/kaggle/input/credit-data/sample_submission (1).csv')
submit.head()

In [None]:
predictions = GBclf.predict(test)
predictions

In [None]:
submit['Credit Default'] = predictions
submit.head()

In [None]:
submit.to_csv('GradientBoostingClasssubmit.csv', index=False)