In [136]:
import time
from eli5.sklearn import PermutationImportance
from eli5 import show_weights
import numpy as np
import pandas as pd
import catboost as cb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from functools import reduce
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from sklearn.model_selection import train_test_split, cross_val_score
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from scipy.stats import skew, kurtosis
from typing import List, Optional
import optuna
import missingno as msno
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings('ignore')


Описание источников данных:

* train.csv - пары "заявка - целевая переменная", для этой выборки нужно собрать признаки и обучить модель;
* test.csv - пары "заявки - прогнозное значение", для этой выборки нужно собрать признаки и построить прогнозы;
* bki.csv - данные БКИ о предыдущих кредитах клиента;
* client_profile.csv - клиентский профиль, некоторые знания, которые есть у компании о клиенте;
* payments.csv - история платежей клиента;
* applications_history.csv - история предыдущих заявок клиента.

In [137]:
DATASET_PATH_TRAIN = '/kaggle/input/geekbrains-competitive-data-analysis/train.csv'
DATASET_PATH_TEST = '/kaggle/input/geekbrains-competitive-data-analysis/test.csv'
DATASET_PATH_SAMPLE_SUBMIT = '/kaggle/input/geekbrains-competitive-data-analysis/sample_submit.csv'
DATASET_PATH_CLIENT_PROFILE = '/kaggle/input/geekbrains-competitive-data-analysis/client_profile.csv'
DATASET_PATH_BKI = '/kaggle/input/geekbrains-competitive-data-analysis/bki.csv'
DATASET_PATH_PAYMENTS = '/kaggle/input/geekbrains-competitive-data-analysis/payments.csv'
DATASET_PATH_APPLICATIONS_HISTORY = '/kaggle/input/geekbrains-competitive-data-analysis/applications_history.csv'

In [138]:
def catboost_cross_validation(params, X, y, cv, categorical= None):
   
    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])
    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]}rows, {X.shape[1]} cols")
    if categorical:
        categorical = list(set(categorical) & set(X.columns))
        X[categorical] = X[categorical].astype(str)
    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y.loc[train_idx], y.loc[valid_idx]
        model = cb.CatBoostClassifier(**params)
        model.fit(
             x_train, y_train, categorical,
             eval_set=[(x_train, y_train), (x_valid, y_valid)]               
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold{fold+1}, Valid_score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)
        
    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds
    

In [139]:
def create_client_profile_features(X:pd.DataFrame, copy: bool=True):
    if copy:
        X = X.copy()
    X["RATIO_ANNUITY_TO_AGE"] = X["AMOUNT_ANNUITY"] / X["AGE"]
    X["RATIO_CREDIT_TO_AGE"] = X["AMOUNT_CREDIT"] / X["AGE"]
    X["RATIO_SALARY_TO_AGE"] = X["TOTAL_SALARY"] / X["AGE"]
    X["RATIO_AGE_TO_EXPERIENCE"] = X["AGE"] / X["DAYS_ON_LAST_JOB"]
    X["RATIO_CAR_TO_EXPERIENCE"] = X["OWN_CAR_AGE"] / X["DAYS_ON_LAST_JOB"]
    X["RATIO_CAR_TO_AGE"] = X["OWN_CAR_AGE"] / X["AGE"]
    X['DAYS_ON_LAST_JOB'].replace(365243, np.nan, inplace= True)
    X['NEW_CREDIT_TO_ANNUITY_RATIO'] = X['AMOUNT_CREDIT'] / X['AMOUNT_ANNUITY']
    X['NEW_STATUS'] = X['GENDER'] + X['FAMILY_STATUS'].astype(str)
    bki_flags = [flag for flag in X.columns if "amt_req_credit_bureau" in flag]   
       
    X["espected_monthly_loss_1"] = X["EXTERNAL_SCORING_RATING_1"] * X["AMOUNT_ANNUITY"]   
   
    
    return X

In [141]:
df_train = pd.read_csv(DATASET_PATH_TRAIN, sep = ',')
df_train.head()

In [142]:
df_train['APPLICATION_NUMBER'].value_counts()

In [143]:
df_test = pd.read_csv(DATASET_PATH_TEST, sep = ',')
df_test.head()

In [144]:
df_test['APPLICATION_NUMBER'].value_counts()

In [145]:
client_profile = pd.read_csv(DATASET_PATH_CLIENT_PROFILE, sep = ',')
client_profile.head()

In [146]:
client_profile.columns

In [147]:
client_profile['APPLICATION_NUMBER'].value_counts()

In [148]:
bki = pd.read_csv(DATASET_PATH_BKI, sep = ',')
bki.head()

In [149]:
bki['APPLICATION_NUMBER'].value_counts()

In [150]:
bki.columns

In [151]:
payments = pd.read_csv(DATASET_PATH_PAYMENTS, sep = ',')
payments.head()

In [152]:
payments['APPLICATION_NUMBER'].value_counts()

In [153]:
payments.columns

In [154]:
applications_history = pd.read_csv(DATASET_PATH_APPLICATIONS_HISTORY, sep = ',')
applications_history.head()

In [155]:
applications_history['APPLICATION_NUMBER'].value_counts()

In [156]:
applications_history['APPLICATION_NUMBER'].value_counts()

In [157]:
applications_history.columns

In [158]:
print("df_train = {} rows, {} cols".format(*df_train.shape))
print("df_test = {} rows, {} cols".format(*df_test.shape))
print("client_profile = {} rows, {} cols".format(*client_profile.shape))
print("bki = {}rows, {} cols".format(*bki.shape))
print("payments = {} rows, {} cols".format(*payments.shape))
print("applications_history = {} rows, {} cols".format(*applications_history.shape))

In [159]:
data = pd.concat([df_train, df_test], axis=0)
data = data.reset_index(drop=True)
# client_profile = create_client_profile_features(client_profile)
data=data.merge(client_profile, how='left', on="APPLICATION_NUMBER")
data.head()

In [160]:
data.tail(7)

In [161]:
data.describe()

# Обзор переменных

In [162]:
data.dtypes

## Анализ числовых признаков

In [163]:
numerical_features = data.select_dtypes(include=[np.number])
print(f"count of numeric_features {numerical_features.shape[1]}")

numerical_features.columns

In [164]:
numerical_features.head()

## Дескретные признаки

In [165]:
discrete_feature = [
    feature for feature in numerical_features
    if len(data[feature].unique())<25 and feature not in ["APPLICATION_NUMBER"]
]

print(f"Discrete Variables Count: {len(discrete_feature)}")

In [166]:
discrete_feature

In [167]:
data[discrete_feature]

## Непрерывные признаки

In [168]:
continuous_feature = [
    feature for feature in numerical_features
    if feature not in discrete_feature + ["APPLICATION_NUMBER"]]

print(f"Continuous Feature Count {len(continuous_feature)}")

In [169]:
continuous_feature

In [170]:
data[continuous_feature]

## Категориальные признаки

In [171]:
categorical_features = data.select_dtypes(include=[np.object])
print(f"Categorical Feature Count {categorical_features.shape[1]}")
categorical_features.head(n=2)

In [172]:
data["APPLICATION_NUMBER"].value_counts()

Вывод: Тренировочный дата сет имеет: 110 093 строк, 3 столбца, тестовый датасет имеет 165 141 строк, 2 столбца, client_profile имеет 250000 строк, 24 столбца, bki имеет 945234 строк, 17 столбцов, payments имеет 1 023 932 строк, 8 столбцов, applications_history имеет 1 670 214 rows, 26 cols. Строк в тестовом дата сете больше чем в тренеровочном. Для тренеровки данных имеет смысл объединить данные дата сэты. Также имеет смысл объединить тренеровочный, тестовый датасеты и client_profile так, как они стуктурированны в отличие от bki, payments, applications_history, которые имеют по несколько значений APPLICATION_NUMBER. 

Объединееный дата сет имеет 30 числовых столбцов 11 из которых дескретные, 18 непрерывных и 1 APPLICATION_NUMBER несет на себе функцию порядкового номера и не имеес смыслового значения, перед обучением его нужно будет удалить. Также датасет имеет 5 категориальных признаков.

# EDA

## 1. Обзор целевой переменной

In [173]:
data['TARGET'].value_counts()

In [174]:
data['TARGET'].hist(figsize=(10, 6), bins=20, grid=False);

In [175]:
data['TARGET'].isna().sum()

In [176]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
plt.suptitle("Target Distribution Analysis", size=14)
print(f"Mean-target: {round(df_train.TARGET.mean(), 4)}")

sns.countplot(df_train.TARGET, ax=axes[0], palette="cubehelix")
sns.violinplot(df_train.TARGET, df_train.index, ax=axes[1], palette="cubehelix")
sns.stripplot(df_train.TARGET, df_train.index, jitter=True, ax=axes[1], color="black", alpha=0.05)

axes[0].set_xlabel("Target", fontsize=14)
axes[0].set_ylabel("Counts", fontsize=14)
axes[1].set_xlabel("Target", fontsize=14)
axes[1].set_ylabel("Index", fontsize=14)

Сделаем первые выводы о целевой переменной:

* мы будем решать задачи бинарной классификации с дисбалансом целевой переменной. Количество клиентов, которые не вернули кредит, намного меньше, чем клиентов, которые платят вовремя. При этом, дисбаланс не является критическим, а является вполне рабочей ситуацией. Какие-то манипуляции с данными для выравнивания баланса классов, в качестве приоритетных гипотез, рассматриваться не будут.

* Целевая переменная распределена неравномерно по индексам датафрейма. Начиная с 20 000 индекса и по 80 000, колличество 1 увеличивается.

## Обработка пропусков

In [177]:
data.isna().sum()

In [178]:
def check_missings(df: pd.DataFrame) -> pd.DataFrame:
    """
    Функция для вычисления среднего и общего числа пропусков.

    Parameters
    ----------
    df: pandas.core.DataFrame
        Набор данных для вычисления статистики.

    Returns
    -------
    result: pandas.core.DataFrame
        Датафрейм со статистикой распределения пропусков.

    """
    na = df.isnull().sum()
    result = pd.DataFrame({
        "Total": na,
        "Percent": 100*na/df.shape[0],
        "Types": df.dtypes
    })
    result = result[result["Total"] != 0]
    print(f"Total NA-values = {na.sum()}")
    return result.T

In [179]:
train_missing = check_missings(data)
train_missing.loc['Percent']

In [180]:
msno.bar(data)

In [181]:
msno.matrix(data)

Проанализировав пропуски, мы видим, что пропуски распределены равномерно. Больше всего пропусков в OWN_CAR_AGE -           72.374779, меньше всего в GENDER,CHILDRENS, TOTAL_SALARY, AMOUNT_CREDIT, AMOUNT_ANNUITY, EDUCATION_LEVEL, FAMILY_STATUS, REGION_POPULATION, AGE, DAYS_ON_LAST_JOB, FLAG_PHONE, FLAG_EMAIL - 18.718254.

## 3. Обработка выбросов

In [182]:
continuous_feature

In [183]:
for i in continuous_feature:
    plt.figure(figsize=(6, 3))

    sns.boxplot(data[i], whis=1.5)

    plt.xlabel(i)
    plt.show()
    

#     fig, axes = plt.subplots(figsize=(12, 8))
#     fig = sns.boxplot( data=i, palette="viridis")
#     fig.axis(ymin=0, ymax=800000);

In [184]:
data.loc[(data['DAYS_ON_LAST_JOB'] > 50000), 'DAYS_ON_LAST_JOB'] = data['DAYS_ON_LAST_JOB'].median()

In [185]:
data.loc[(data['DAYS_ON_LAST_JOB'] > 50000), 'DAYS_ON_LAST_JOB']

## Корреляция между числовыми признаками

In [186]:
numerical_features

In [187]:
train_correlations = numerical_features.corr()
train_correlations = train_correlations.values.flatten()
train_correlations = train_correlations[train_correlations != 1]

columns = numerical_features.columns.tolist()
columns.remove("TARGET")

plt.figure(figsize=(20,5))
sns.distplot(train_correlations, color="Blue", label="train")
plt.xlabel("Correlation values found in train (except 1)", size=14)
plt.title("Are there correlations between features?", size=14)
plt.legend(loc="best", fontsize=14)
plt.ylabel("Density", size=14)

Из данного графика видно, что линейная корреляция между признаками низкая. 

In [188]:
mask = data['TARGET'].isnull()
features_to_drop = ["APPLICATION_NUMBER", "TARGET"]

train, test = data.loc[~mask], data.loc[mask]

target, test_id = df_train['TARGET'], df_test['APPLICATION_NUMBER']
train = train.drop(features_to_drop, axis=1)
test = test.drop(features_to_drop, axis=1)

categorical = train.dtypes[train.dtypes == "object"].index.tolist()
train[categorical] = train[categorical].astype(str)
numerical = list(set(train.columns) - set(categorical))

train = train.replace(np.inf, np.nan)
train = train.replace(-np.inf, np.nan)

In [215]:
parameters = {
    "max_depth": [6, 8, 10, 15],
    "min_samples_leaf": [15, 25, 50, 100, 250]
}

model = RandomForestClassifier(
    n_estimators=100, n_jobs=6, random_state=27
)
grid = GridSearchCV(
    model,
    parameters,
    n_jobs=-1,
    verbose=2,
    scoring=make_scorer(roc_auc_score),
    cv=3,
)

In [216]:
%%time
grid.fit(
    train[numerical].fillna(train[numerical].median()),
    target
)

In [219]:
ntop = 10
importances = grid.best_estimator_.feature_importances_
idx = np.argsort(importances)[::-1][0:ntop]
feature_names = train[numerical].columns.values

plt.figure(figsize=(30,5))
sns.barplot(x=feature_names[idx], y=importances[idx], palette="viridis")
plt.title("What are the top important features to start with?", size=14)

Самыми сильными нелинейно зависимыми признаками оказались: EXTERNAL_SCORING_RATING_2, EXTERNAL_SCORING_RATING_3, EXTERNAL_SCORING_RATING_1,  AGE

## KFold

In [195]:
data = pd.concat([df_train, df_test], axis=0)
data = data.reset_index(drop=True)
client_profile = create_client_profile_features(client_profile)
data=data.merge(client_profile, how='left', on="APPLICATION_NUMBER")
mask = data['TARGET'].isnull()
features_to_drop = ["APPLICATION_NUMBER", "TARGET"]

train, test = data.loc[~mask], data.loc[mask]

target, test_id = df_train['TARGET'], df_test['APPLICATION_NUMBER']
train = train.drop(features_to_drop, axis=1)
test = test.drop(features_to_drop, axis=1)

categorical = train.dtypes[train.dtypes == "object"].index.tolist()
train[categorical] = train[categorical].astype(str)
numerical = list(set(train.columns) - set(categorical))

train = train.replace(np.inf, np.nan)
train = train.replace(-np.inf, np.nan)

In [197]:
features_to_drop = [
    'REGION_POPULATION',
    'FAMILY_SIZE',
    'AMT_REQ_CREDIT_BUREAU_WEEK',
    'AMT_REQ_CREDIT_BUREAU_YEAR',
    'FLAG_PHONE',
    'FLAG_EMAIL',
    'CHILDRENS',
    'AMT_REQ_CREDIT_BUREAU_DAY',
     'AMT_REQ_CREDIT_BUREAU_HOUR',
    'AMT_REQ_CREDIT_BUREAU_MON',
    'TOTAL_SALARY',    
       
]

In [None]:
X=train.drop(features_to_drop, axis=1)
y=target

In [None]:
def objective(trial):
    train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.3, random_state=42)

    param = {
         'n_estimators': trial.suggest_int("n_estimators", 1000, 10000),
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.1),
        "eval_metric": "AUC",
        "depth": trial.suggest_int("depth", 1, 12),
       
    }

   
        
    gbm = cb.CatBoostClassifier(**param)

    gbm.fit(train_x,
            train_y,
            categorical,
            eval_set=[(valid_x, valid_y)],
            verbose=0,
            early_stopping_rounds=100,
            )

    preds = gbm.predict_proba(valid_x.fillna(-1))[:, 1]
    roc_auc = roc_auc_score(valid_y, preds)
    return roc_auc

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

In [200]:
cb_params = {


'n_estimators': 1000,
 'eval_metric': 'AUC',
 'silent': True,
 'learning_rate': 0.06209259506553864,
 'depth': 3}



cv = KFold(n_splits=5, random_state=1, shuffle=True)
estimators, oof_preds=catboost_cross_validation(
    params=cb_params, X=train.drop(features_to_drop, axis=1), y=target, cv=cv, categorical=categorical)

In [208]:
oof_score=roc_auc_score(
target, oof_preds)
print(f"OOF-score = {round(oof_score, 5)}")

OOF-score = 0.73002

OOF-score = 0.72974








## Подготовка прогноза

In [209]:
y_pred=np.zeros(test.shape[0])
test[numerical]=test[numerical].astype(float)
test[categorical]=test[categorical].astype(str)

for estimator in estimators:
    y_pred += estimator.predict_proba(test.drop(features_to_drop, axis=1))[:, 1]

In [211]:
prediction = y_pred/len(estimators)

In [213]:
df_test = pd.read_csv(DATASET_PATH_TEST, sep = ',')
df_test["TARGET"] = prediction
df_test.drop("NAME_CONTRACT_TYPE", axis=1, inplace=True)
df_test

In [214]:
# df_test.to_csv('submit_46.csv', index=False)