In [1]:
import time
import numpy as np
import pandas as pd
import catboost as cb

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

pd.set_option("display.max_columns", 30)

## Useful Functions

In [81]:
def get_input(
    data_path: str,
    base_path: str = "geekbrains-competitive-data-analysis",
) -> pd.DataFrame:
    """
    Read a CSV file and print basic information about the dataset.

    Parameters
    ----------
    data_path: str
        File name, resolved relative to ``base_path``.

    base_path: str, optional, default = "geekbrains-competitive-data-analysis"
        Directory that contains the data files.
        Optional parameter; the default keeps the original hard-coded folder.

    Returns
    -------
    data: pandas.core.frame.DataFrame
        Loaded dataset as a pandas.DataFrame with lower-cased column names.

    """
    data = pd.read_csv(f"{base_path}/{data_path}")
    # Lower-case the header so later cells can address columns uniformly
    data.columns = [col.lower() for col in data.columns]
    print(f"{data_path}: shape = {data.shape[0]} rows, {data.shape[1]} cols")

    return data


def catboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Cross-validation for a CatBoost classifier.

    Parameters
    ----------
    params: dict
        Dictionary of model hyper-parameters.

    X: pandas.core.frame.DataFrame
        Feature matrix used to fit the model. It is not modified:
        the function works on a copy.

    y: pandas.core.frame.Series
        Target vector used to fit the model (any array-like of the
        same length as X is accepted).

    cv: KFold or StratifiedKFold generator.
        KFold / StratifiedKFold object that defines the
        cross-validation strategy.

    categorical: list of str, optional, default = None
        Names of the categorical features.
        Optional parameter; by default no feature is treated as categorical.

    Returns
    -------
    estimators: list
        List with the fitted model of every fold.

    oof_preds: np.array
        Vector of out-of-fold predictions.

    """
    cat_features = [] if categorical is None else list(categorical)
    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    # Copy before casting so the caller's frame keeps its original dtypes
    X = X.copy()
    if cat_features:
        X[cat_features] = X[cat_features].astype(str)

    # cv.split yields positional indices, so the target is addressed as a
    # plain array and the features through .iloc (not label-based .loc)
    y_values = np.asarray(y)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_values[train_idx], y_values[valid_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(
            x_train, y_train,
            cat_features=cat_features or None,
            eval_set=[(x_train, y_train), (x_valid, y_valid)]
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds


def catboost_hold_out_validation(params, X, y, split_params = [0.7, 0.2, 0.1], categorical = None):
    """
    Hold-out validation for a CatBoost classifier.

    Parameters
    ----------
    params: dict
        Dictionary of model hyper-parameters.

    X: pandas.core.frame.DataFrame
        Feature matrix used to fit the model. It is not modified:
        categorical columns are cast on a copy.

    y: pandas.core.frame.Series
        Target vector used to fit the model.

    split_params: List[float], optional, default = [0.7, 0.2, 0.1]
        Split fractions. The first value is the train share; when a third
        value is given, that share of all rows is carved out of the
        non-train remainder as a test part.
        Optional parameter, equal to [0.7, 0.2, 0.1] by default.

    categorical: list of str, optional, default = None
        Names of the categorical features.
        Optional parameter; by default no feature is treated as categorical.

    Returns
    -------
    estimator: catboost.core.CatBoostClassifier
        Fitted CatBoost classifier.

    test_prediction: np.array, optional
        Vector of predictions for the test part.
        Returned only when split_params contains 3 values.

    """
    cat_features = [] if categorical is None else list(categorical)
    if cat_features:
        # Same string cast as in catboost_cross_validation, done on a copy
        X = X.copy()
        X[cat_features] = X[cat_features].astype(str)

    with_test_part = len(split_params) == 3

    # One call splits X and y together, so rows and labels cannot drift apart
    x_train, x_valid, y_train, y_valid = train_test_split(
        X, y, train_size=split_params[0], random_state=27
    )

    if with_test_part:
        # Absolute number of test rows, taken from the non-train remainder
        test_size = int(split_params[2] * X.shape[0])

        x_valid, x_test, y_valid, y_test = train_test_split(
            x_valid, y_valid, test_size=test_size, random_state=72
        )

    model = cb.CatBoostClassifier(**params)
    model.fit(
        x_train, y_train,
        cat_features=cat_features or None,
        eval_set=[(x_train, y_train), (x_valid, y_valid)]
    )

    print("="*80)
    valid_score = roc_auc_score(y_valid, model.predict_proba(x_valid)[:, 1])
    print(f"Valid Score = {round(valid_score, 4)}")

    if with_test_part:
        test_prediction = model.predict_proba(x_test)[:, 1]
        test_score = roc_auc_score(y_test, test_prediction)
        print(f"Test Score = {round(test_score, 4)}")

        return model, test_prediction

    return model

In [152]:
def create_client_profile_features(X: pd.DataFrame, copy: bool = True) -> pd.DataFrame:
    """
    Build additional features from the client profile.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Feature matrix with the original client profile.

    copy: bool, optional, default = True
        Whether to work on a copy of X. When False, the new columns
        are added to X itself.
        Optional parameter, equal to True by default.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Client profile extended with the engineered features.

    """
    if copy:
        X = X.copy()

    # 365243 is replaced with NaN before the column is used as a denominator
    X["days_on_last_job"] = X["days_on_last_job"].replace(365243, np.nan)

    # Row-wise aggregates over every "amt_req_credit_bureau*" column
    bki_flags = [flag for flag in X.columns if "amt_req_credit_bureau" in flag]
    X["bki_requests_count"] = X[bki_flags].sum(axis=1)
    X["bki_kurtosis"] = X[bki_flags].kurtosis(axis=1)

    rating_cols = [
        "external_scoring_rating_1",
        "external_scoring_rating_2",
        "external_scoring_rating_3",
    ]
    rating_1, rating_2, rating_3 = (X[col] for col in rating_cols)

    X["external_scoring_prod"] = rating_1 * rating_2 * rating_3
    X["external_scoring_weighted"] = rating_1 * 2 + rating_2 * 1 + rating_3 * 3

    # Row-wise statistics of the three ratings; the numpy function is looked
    # up by name with getattr instead of building code for eval()
    for function_name in ["min", "max", "mean", "nanmedian", "var"]:
        feature_name = "external_scoring_rating_{}".format(function_name)
        X[feature_name] = getattr(np, function_name)(X[rating_cols], axis=1)

    # Ratios between the main financial figures
    X["ratio_credit_to_annuity"] = X["amount_credit"] / X["amount_annuity"]
    X["ratio_annuity_to_salary"] = X["amount_annuity"] / X["total_salary"]
    X["ratio_credit_to_salary"] = X["amount_credit"] / X["total_salary"]

    # Financial figures relative to age and time-based features
    X["ratio_annuity_to_age"] = X["amount_annuity"] / X["age"]
    X["ratio_credit_to_age"] = X["amount_credit"] / X["age"]
    X["ratio_salary_to_age"] = X["total_salary"] / X["age"]
    X["ratio_salary_to_experience"] = X["total_salary"] / X["days_on_last_job"]
    X["ratio_credit_to_experience"] = X["amount_credit"] / X["days_on_last_job"]
    X["ratio_annuity_to_experience"] = X["amount_annuity"] / X["days_on_last_job"]

    # Ratios between time-based features
    X["ratio_age_to_experience"] = X["age"] / X["days_on_last_job"]
    # NOTE(review): despite the "ratio_" name this is a product; it is kept
    # as-is so the feature values stay the same — confirm the intent
    X["ratio_salary_to_region_population"] = X["total_salary"] * X["region_population"]
    X["ratio_car_to_experience"] = X["own_car_age"] / X["days_on_last_job"]
    X["ratio_car_to_age"] = X["own_car_age"] / X["age"]

    # Credit amounts multiplied by each external score; the original author
    # describes these products as the expected loss
    X["expected_total_loss_1"] = rating_1 * X["amount_credit"]
    X["expected_total_loss_2"] = rating_2 * X["amount_credit"]
    X["expected_total_loss_3"] = rating_3 * X["amount_credit"]
    X["expected_monthly_loss_1"] = rating_1 * X["amount_annuity"]
    X["expected_monthly_loss_2"] = rating_2 * X["amount_annuity"]
    X["expected_monthly_loss_3"] = rating_3 * X["amount_annuity"]

    return X

## Base Tables

In [144]:
# Load the train and test application tables
train = get_input("train.csv")
test = get_input("test.csv")

# Stack both tables row-wise and renumber the rows 0..n-1 in one step
data = pd.concat([train, test], axis=0, ignore_index=True)
data.head(2)

train.csv: shape = 110093 rows, 3 cols
test.csv: shape = 165141 rows, 2 cols


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


Unnamed: 0,application_number,name_contract_type,target
0,123687442,Cash,0.0
1,123597908,Cash,1.0


## client_profile

In [145]:
# Read the raw client profile and extend it with the engineered features
client_profile = create_client_profile_features(
    get_input("client_profile.csv")
)
client_profile.head(2)

client_profile.csv: shape = 250000 rows, 24 cols


  overwrite_input=overwrite_input)


Unnamed: 0,application_number,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,...,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3
0,123666076,F,0,157500.0,270000.0,13500.0,Incomplete higher,Civil marriage,0.008068,8560,1549.0,,1,0,2.0,...,31.542056,18.399533,101.678502,174.306004,8.7153,5.526146,1270.71,,,88957.124333,63804.96656,183213.275945,4447.856217,3190.248328,9160.663797
1,123423688,F,0,270000.0,536917.5,28467.0,Secondary / secondary special,Married,0.020246,23187,,,0,0,2.0,...,23.155971,11.644456,,,,,5466.42,,,,237475.743779,431008.094056,,12590.802122,22851.755462


In [146]:
# Left join: every application row is kept, with or without a profile
data = pd.merge(
    left=data,
    right=client_profile,
    on="application_number",
    how="left",
)

## baseline

In [147]:
features_to_drop = ["application_number", "target"]

# Rows without a target are the ones to be predicted
mask = data["target"].isnull()
train, test = data.loc[~mask], data.loc[mask]

target, test_id = train["target"], test["application_number"]

# Ratio features can produce +/-inf; map them to NaN in BOTH parts so the
# model sees the same preprocessing at fit time and at prediction time
train = train.drop(features_to_drop, axis=1).replace([np.inf, -np.inf], np.nan)
test = test.drop(features_to_drop, axis=1).replace([np.inf, -np.inf], np.nan)

# Object-typed columns are treated as categorical, everything else as numeric
categorial = train.dtypes[train.dtypes == "object"].index.tolist()
numerical = [col for col in train.columns if col not in categorial]

## KFold

In [148]:
# Hyper-parameters passed unchanged to CatBoostClassifier
cb_params = dict(
    n_estimators=2000,
    learning_rate=0.01,
    loss_function="Logloss",
    eval_metric="AUC",
    task_type="CPU",
    max_bin=20,
    verbose=10,
    max_depth=6,
    l2_leaf_reg=10,
    early_stopping_rounds=50,
    thread_count=6,
    random_seed=42,
)

# Shuffled 5-fold split with a fixed seed
cv = KFold(n_splits=5, shuffle=True, random_state=1234123)

estimators, oof_preds = catboost_cross_validation(
    cb_params, train, target, cv, categorical=categorial
)

Tue Sep 22 19:11:16 2020, Cross-Validation, 110093 rows, 52 cols
0:	test: 0.5867378	test1: 0.5812295	best: 0.5812295 (0)	total: 39ms	remaining: 1m 17s
10:	test: 0.7001402	test1: 0.6925714	best: 0.6925714 (10)	total: 461ms	remaining: 1m 23s
20:	test: 0.7035652	test1: 0.6963165	best: 0.6963165 (20)	total: 864ms	remaining: 1m 21s
30:	test: 0.7047232	test1: 0.6979904	best: 0.6979904 (30)	total: 1.25s	remaining: 1m 19s
40:	test: 0.7054465	test1: 0.6992489	best: 0.6994170 (38)	total: 1.64s	remaining: 1m 18s
50:	test: 0.7069468	test1: 0.7006301	best: 0.7006301 (50)	total: 2.07s	remaining: 1m 19s
60:	test: 0.7081454	test1: 0.7013650	best: 0.7013650 (60)	total: 2.44s	remaining: 1m 17s
70:	test: 0.7090323	test1: 0.7018906	best: 0.7018906 (70)	total: 2.82s	remaining: 1m 16s
80:	test: 0.7095323	test1: 0.7027972	best: 0.7027972 (80)	total: 3.21s	remaining: 1m 16s
90:	test: 0.7093081	test1: 0.7030943	best: 0.7032636 (89)	total: 3.6s	remaining: 1m 15s
100:	test: 0.7099472	test1: 0.7032162	best: 0.703

910:	test: 0.7437403	test1: 0.7212308	best: 0.7213223 (909)	total: 37.3s	remaining: 44.6s
920:	test: 0.7438893	test1: 0.7211859	best: 0.7213223 (909)	total: 37.7s	remaining: 44.2s
930:	test: 0.7442164	test1: 0.7211960	best: 0.7213223 (909)	total: 38.1s	remaining: 43.8s
940:	test: 0.7444673	test1: 0.7211900	best: 0.7213223 (909)	total: 38.7s	remaining: 43.5s
950:	test: 0.7446176	test1: 0.7213102	best: 0.7213341 (944)	total: 39.1s	remaining: 43.1s
960:	test: 0.7447767	test1: 0.7213274	best: 0.7213536 (959)	total: 39.5s	remaining: 42.7s
970:	test: 0.7449901	test1: 0.7212787	best: 0.7213536 (959)	total: 39.9s	remaining: 42.3s
980:	test: 0.7452039	test1: 0.7213356	best: 0.7213536 (959)	total: 40.3s	remaining: 41.9s
990:	test: 0.7454340	test1: 0.7214053	best: 0.7214053 (990)	total: 40.7s	remaining: 41.4s
1000:	test: 0.7456573	test1: 0.7214711	best: 0.7215842 (998)	total: 41.1s	remaining: 41s
1010:	test: 0.7459362	test1: 0.7215059	best: 0.7215842 (998)	total: 41.5s	remaining: 40.6s
1020:	test

670:	test: 0.7367153	test1: 0.7252754	best: 0.7253008 (668)	total: 34.5s	remaining: 1m 8s
680:	test: 0.7370070	test1: 0.7253924	best: 0.7253924 (680)	total: 35.1s	remaining: 1m 7s
690:	test: 0.7372993	test1: 0.7255150	best: 0.7255245 (688)	total: 35.6s	remaining: 1m 7s
700:	test: 0.7376482	test1: 0.7255921	best: 0.7255921 (700)	total: 36.3s	remaining: 1m 7s
710:	test: 0.7378369	test1: 0.7255575	best: 0.7256163 (707)	total: 36.8s	remaining: 1m 6s
720:	test: 0.7380053	test1: 0.7253046	best: 0.7256163 (707)	total: 37.4s	remaining: 1m 6s
730:	test: 0.7382912	test1: 0.7256792	best: 0.7257447 (728)	total: 38.1s	remaining: 1m 6s
740:	test: 0.7386629	test1: 0.7258531	best: 0.7258531 (740)	total: 38.7s	remaining: 1m 5s
750:	test: 0.7389277	test1: 0.7260086	best: 0.7260086 (750)	total: 39.5s	remaining: 1m 5s
760:	test: 0.7391130	test1: 0.7259355	best: 0.7260086 (750)	total: 40.1s	remaining: 1m 5s
770:	test: 0.7393684	test1: 0.7260851	best: 0.7260851 (770)	total: 40.7s	remaining: 1m 4s
780:	test:

670:	test: 0.7354908	test1: 0.7292261	best: 0.7292261 (670)	total: 29.1s	remaining: 57.6s
680:	test: 0.7356919	test1: 0.7294032	best: 0.7294757 (679)	total: 29.5s	remaining: 57.1s
690:	test: 0.7360473	test1: 0.7298147	best: 0.7298304 (689)	total: 29.9s	remaining: 56.6s
700:	test: 0.7363391	test1: 0.7296541	best: 0.7298603 (691)	total: 30.4s	remaining: 56.3s
710:	test: 0.7366030	test1: 0.7296896	best: 0.7298603 (691)	total: 30.8s	remaining: 55.8s
720:	test: 0.7367948	test1: 0.7296802	best: 0.7298603 (691)	total: 31.2s	remaining: 55.3s
730:	test: 0.7370834	test1: 0.7297406	best: 0.7298603 (691)	total: 31.9s	remaining: 55.3s
740:	test: 0.7373554	test1: 0.7299262	best: 0.7299262 (740)	total: 32.5s	remaining: 55.2s
750:	test: 0.7376026	test1: 0.7295805	best: 0.7299262 (740)	total: 33s	remaining: 54.8s
760:	test: 0.7377268	test1: 0.7297522	best: 0.7299262 (740)	total: 33.4s	remaining: 54.4s
770:	test: 0.7380686	test1: 0.7297362	best: 0.7299262 (740)	total: 33.8s	remaining: 53.8s
780:	test: 0

50:	test: 0.7053054	test1: 0.7007605	best: 0.7007605 (50)	total: 2.27s	remaining: 1m 26s
60:	test: 0.7070427	test1: 0.7018562	best: 0.7019658 (59)	total: 2.71s	remaining: 1m 26s
70:	test: 0.7085961	test1: 0.7025069	best: 0.7025069 (70)	total: 3.13s	remaining: 1m 25s
80:	test: 0.7092150	test1: 0.7029684	best: 0.7029684 (80)	total: 3.61s	remaining: 1m 25s
90:	test: 0.7097728	test1: 0.7031326	best: 0.7032270 (87)	total: 4.02s	remaining: 1m 24s
100:	test: 0.7103111	test1: 0.7032649	best: 0.7033486 (96)	total: 4.4s	remaining: 1m 22s
110:	test: 0.7103708	test1: 0.7031034	best: 0.7035129 (105)	total: 4.88s	remaining: 1m 23s
120:	test: 0.7106000	test1: 0.7035103	best: 0.7035129 (105)	total: 5.29s	remaining: 1m 22s
130:	test: 0.7121750	test1: 0.7040354	best: 0.7040354 (130)	total: 5.75s	remaining: 1m 22s
140:	test: 0.7131190	test1: 0.7047516	best: 0.7047516 (140)	total: 6.18s	remaining: 1m 21s
150:	test: 0.7142429	test1: 0.7052819	best: 0.7054496 (148)	total: 6.66s	remaining: 1m 21s
160:	test: 

970:	test: 0.7441174	test1: 0.7201158	best: 0.7201716 (949)	total: 41.5s	remaining: 44s
980:	test: 0.7442351	test1: 0.7201732	best: 0.7202780 (978)	total: 41.9s	remaining: 43.5s
990:	test: 0.7443714	test1: 0.7202290	best: 0.7202780 (978)	total: 42.3s	remaining: 43s
1000:	test: 0.7446108	test1: 0.7202334	best: 0.7203413 (993)	total: 42.7s	remaining: 42.6s
1010:	test: 0.7449463	test1: 0.7202366	best: 0.7203413 (993)	total: 43.1s	remaining: 42.1s
1020:	test: 0.7450313	test1: 0.7202386	best: 0.7203802 (1018)	total: 43.4s	remaining: 41.7s
1030:	test: 0.7451084	test1: 0.7201846	best: 0.7203835 (1026)	total: 43.8s	remaining: 41.2s
1040:	test: 0.7453075	test1: 0.7202619	best: 0.7203835 (1026)	total: 44.2s	remaining: 40.7s
1050:	test: 0.7454887	test1: 0.7204104	best: 0.7205286 (1048)	total: 44.6s	remaining: 40.3s
1060:	test: 0.7456734	test1: 0.7203883	best: 0.7205286 (1048)	total: 45s	remaining: 39.9s
1070:	test: 0.7458104	test1: 0.7204701	best: 0.7205286 (1048)	total: 45.5s	remaining: 39.5s
10

770:	test: 0.7385541	test1: 0.7245926	best: 0.7245926 (770)	total: 31.6s	remaining: 50.3s
780:	test: 0.7388865	test1: 0.7244511	best: 0.7245926 (770)	total: 31.9s	remaining: 49.9s
790:	test: 0.7391762	test1: 0.7245447	best: 0.7245926 (770)	total: 32.3s	remaining: 49.4s
800:	test: 0.7394589	test1: 0.7245659	best: 0.7246343 (791)	total: 32.7s	remaining: 48.9s
810:	test: 0.7397353	test1: 0.7245585	best: 0.7246343 (791)	total: 33.1s	remaining: 48.5s
820:	test: 0.7399488	test1: 0.7246681	best: 0.7247389 (817)	total: 33.5s	remaining: 48.1s
830:	test: 0.7402586	test1: 0.7248297	best: 0.7248297 (830)	total: 33.9s	remaining: 47.6s
840:	test: 0.7403591	test1: 0.7247592	best: 0.7248297 (830)	total: 34.3s	remaining: 47.2s
850:	test: 0.7405282	test1: 0.7246789	best: 0.7248297 (830)	total: 34.7s	remaining: 46.8s
860:	test: 0.7407074	test1: 0.7248225	best: 0.7248297 (830)	total: 35s	remaining: 46.3s
870:	test: 0.7409808	test1: 0.7248437	best: 0.7249032 (864)	total: 35.4s	remaining: 45.9s
880:	test: 0

In [149]:
# ROC AUC of the out-of-fold predictions over the whole training set
oof_score = roc_auc_score(y_true=target, y_score=oof_preds)
print(f"OOF-score = {round(oof_score, 5)}")
# Reference run, score by each fold:
# [0.72194, 0.72659, 0.73283, 0.72053, 0.72657]
# OOF-score = 0.72481

OOF-score = 0.72481


## Подготовка прогноза

In [154]:
# Apply to the test part the same dtypes the models were fitted on
test[numerical] = test[numerical].astype(float)
test[categorial] = test[categorial].astype(str)

# Sum of the positive-class probabilities over all fold models
y_pred = sum(
    (fold_model.predict_proba(test)[:, 1] for fold_model in estimators),
    np.zeros(test.shape[0]),
)

In [156]:
 = pd.DataFrame({
    "APPLICATION_NUMBER": test_id,
    "TARGET": y_pred / cv.n_splits
})
y_pred.to_csv("./geekbrains-competitive-data-analysis/baseline_submit.csv", index=False)

Unnamed: 0,APPLICATION_NUMBER,TARGET
110093,123724268,0.056370
110094,123456549,0.222611
110095,123428178,0.185588
110096,123619984,0.084373
110097,123671104,0.020926
...,...,...
275229,123487967,0.084373
275230,123536402,0.046302
275231,123718238,0.084373
275232,123631557,0.019437
