## Урок 7. Тюнинг гиперпараметров, построение ансамблей алгоритмов.

In [1]:
# !pip install catboost

In [2]:
# !pip install --upgrade --force-reinstall --no-deps kaggle

In [3]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt

from tqdm import tqdm
from scipy.stats import gmean, rankdata
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold

In [4]:
# !mkdir -p ~/.kaggle
# !cp /content/kaggle.json ~/.kaggle/
# !chmod 600 /root/.kaggle/kaggle.json
# !kaggle competitions download -c geekbrains-competitive-data-analysis

In [5]:
# !kaggle -v

In [6]:
# !unzip -q /content/geekbrains-competitive-data-analysis.zip -d /content/data

## Useful Functions

In [7]:
def get_input(data_path: str, base_path: str = "data") -> pd.DataFrame:
    """
    Read a CSV file and print basic information about the data set.

    Column names are converted to lower case, and the number of rows
    and columns is printed.

    Parameters
    ----------
    data_path: str
        File name, relative to ``base_path``.

    base_path: str, optional, default = "data"
        Directory that contains the file. The default keeps the
        original behaviour of reading from the "data" directory.

    Returns
    -------
    data: pandas.core.frame.DataFrame
        Loaded data set with lower-cased column names.

    """
    data = pd.read_csv(f"{base_path}/{data_path}")
    data.columns = [col.lower() for col in data.columns]
    print(f"{data_path}: shape = {data.shape[0]} rows, {data.shape[1]} cols")

    return data

def lightgbm_cross_validation(params, X, y, cv, categorical = None):
    """
    Run cross-validation for a LightGBM classifier.

    One ``lgb.LGBMClassifier`` is fitted per fold. Its positive-class
    probabilities for the validation rows are stored in the out-of-fold
    (OOF) vector and scored with ROC AUC.

    Parameters
    ----------
    params: dict
        Hyperparameters passed to ``lgb.LGBMClassifier``.

    X: pandas.core.frame.DataFrame
        Feature matrix used for training.

    y: pandas.core.series.Series
        Target vector used for training.

    cv: KFold or StratifiedKFold generator.
        Splitter object that defines the cross-validation strategy.

    categorical: list, optional, default = None
        Names of the categorical features. When empty or None,
        "auto" is passed to LightGBM as ``categorical_feature``.

    Returns
    -------
    estimators: list
        Fitted models, one per fold.

    oof_preds: np.array
        Vector of OOF predictions.

    """
    if not categorical:
        categorical = "auto"

    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])
    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    # cv.split yields positional indices, so rows must be selected by
    # position; label-based lookup is only equivalent for a 0..n-1 index.
    # Plain arrays (no .iloc) are already positional.
    y_rows = y.iloc if hasattr(y, "iloc") else y

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_rows[train_idx], y_rows[valid_idx]

        model = lgb.LGBMClassifier(**params)
        # NOTE(review): a patience of 5000 rounds only triggers early stopping
        # when n_estimators is far larger; the xgboost helper in this file
        # uses 50 -- confirm that 5000 is intended.
        model.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],
            eval_metric="auc", verbose=50, early_stopping_rounds=5000,
            categorical_feature=categorical
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds


def xgboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Run cross-validation for an XGBoost model (native ``xgb.train`` API).

    Categorical features, when given, are label-encoded on a copy of X
    before the folds are built. One booster is trained per fold with
    early stopping on the validation fold, and its predictions for the
    validation rows are stored in the out-of-fold (OOF) vector.

    Parameters
    ----------
    params: dict
        Booster parameters passed to ``xgb.train``.

    X: pandas.core.frame.DataFrame
        Feature matrix used for training.

    y: pandas.core.series.Series
        Target vector used for training.

    cv: KFold or StratifiedKFold generator.
        Splitter object that defines the cross-validation strategy.

    categorical: list, optional, default = None
        Names of the categorical features to label-encode.
        Optional; nothing is encoded by default.

    Returns
    -------
    estimators: list
        Trained boosters, one per fold.

    encoders: dict
        Fitted LabelEncoder objects keyed by feature name.

    oof_preds: np.array
        Vector of OOF predictions.

    """
    estimators, encoders = [], {}
    oof_preds = np.zeros(X.shape[0])

    if categorical:
        # Imported locally: the module-level imports do not provide LabelEncoder.
        from sklearn.preprocessing import LabelEncoder

        # Encode on a copy so the caller's frame is not overwritten with codes.
        X = X.copy()
        for feature in categorical:
            encoder = LabelEncoder()
            # NOTE(review): on pandas versions where astype("str") renders NaN
            # as the string "nan", the fillna("NA") below has no effect.
            # Kept as is so the encoder vocabulary does not change.
            X[feature] = encoder.fit_transform(X[feature].astype("str").fillna("NA"))
            encoders[feature] = encoder

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    # cv.split yields positional indices, so rows must be selected by
    # position; label-based lookup is only equivalent for a 0..n-1 index.
    # Plain arrays (no .iloc) are already positional.
    y_rows = y.iloc if hasattr(y, "iloc") else y

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_rows[train_idx], y_rows[valid_idx]
        dtrain = xgb.DMatrix(x_train, y_train)
        dvalid = xgb.DMatrix(x_valid, y_valid)

        # "valid" is listed last in evals; the captured training log in this
        # file shows 'valid-auc' being used for early stopping.
        model = xgb.train(
            params=params,
            dtrain=dtrain,
            maximize=True,
            num_boost_round=10000,
            early_stopping_rounds=50,
            evals=[(dtrain, "train"), (dvalid, "valid")],
            verbose_eval=10,
        )
        oof_preds[valid_idx] = model.predict(dvalid)
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        estimators.append(model)

    return estimators, encoders, oof_preds

def catboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Run cross-validation for a CatBoost classifier.

    Categorical features, when given, are cast to strings on a copy of X.
    One ``CatBoostClassifier`` is fitted per fold; its positive-class
    probabilities for the validation rows are stored in the out-of-fold
    (OOF) vector and scored with ROC AUC.

    Parameters
    ----------
    params: dict
        Hyperparameters passed to ``CatBoostClassifier``.

    X: pandas.core.frame.DataFrame
        Feature matrix used for training.

    y: pandas.core.series.Series
        Target vector used for training.

    cv: KFold or StratifiedKFold generator.
        Splitter object that defines the cross-validation strategy.

    categorical: list, optional, default = None
        Names of the categorical features.
        Optional; no categorical features are used by default.

    Returns
    -------
    estimators: list
        Fitted models, one per fold.

    oof_preds: np.array
        Vector of OOF predictions.

    """
    # Imported locally: the visible module-level imports never bind `cb`.
    import catboost as cb

    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")
    if categorical:
        # Cast on a copy so the caller's frame keeps its original dtypes.
        # The guard also makes the default categorical=None usable;
        # indexing X with None would raise.
        X = X.copy()
        X[categorical] = X[categorical].astype(str)

    # cv.split yields positional indices, so rows must be selected by
    # position; label-based lookup is only equivalent for a 0..n-1 index.
    # Plain arrays (no .iloc) are already positional.
    y_rows = y.iloc if hasattr(y, "iloc") else y

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_rows[train_idx], y_rows[valid_idx]

        model = cb.CatBoostClassifier(**params)
        # The third positional argument of fit() is the categorical feature list.
        model.fit(
            x_train, y_train, categorical,
            eval_set=[(x_train, y_train), (x_valid, y_valid)]
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds


## Prepare data

In [28]:
def create_client_profile_features(X: pd.DataFrame, copy: bool = True) -> pd.DataFrame:
    """
    Build additional features from the client profile.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Feature matrix with the raw client profile.

    copy: bool, optional, default = True
        Whether to work on a copy of X. When False, the new
        columns are added to the passed frame itself.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Client profile extended with the engineered features.

    """
    if copy:
        X = X.copy()

    # 365243 is replaced with NaN -- presumably a sentinel for "no job",
    # not a real number of days; confirm against the data description.
    X["days_on_last_job"] = X["days_on_last_job"].replace(365243, np.nan)
    bki_flags = [flag for flag in X.columns if "amt_req_credit_bureau" in flag]
    X["bki_requests_count"] = X[bki_flags].sum(axis=1)
    X["bki_kurtosis"] = X[bki_flags].kurtosis(axis=1)

    X["external_scoring_prod"] = X["external_scoring_rating_1"] * X["external_scoring_rating_2"] * X["external_scoring_rating_3"]
    X["external_scoring_weighted"] = X.external_scoring_rating_1 * 2 + X.external_scoring_rating_2 * 1 + X.external_scoring_rating_3 * 3
    # X["log_external_scoring_rating_1"] = np.log(X["external_scoring_rating_1"])
    # X["log_external_scoring_rating_2"] = np.log(X["external_scoring_rating_2"])
    # X["log_external_scoring_rating_3"] = np.log(X["external_scoring_rating_3"])

    # Row-wise statistics over the three external scoring ratings.
    scoring_columns = [
        "external_scoring_rating_1",
        "external_scoring_rating_2",
        "external_scoring_rating_3",
    ]
    for function_name in ["min", "max", "mean", "nanmedian", "var"]:
        feature_name = "external_scoring_rating_{}".format(function_name)
        # Look the NumPy function up by name; no eval() is needed for this.
        X[feature_name] = getattr(np, function_name)(X[scoring_columns], axis=1)

    # Ratios between the main financial indicators
    X['ratio_credit_to_annuity'] = X['amount_credit'] / X['amount_annuity']
    X["ratio_annuity_to_salary"] = X['amount_annuity'] / X['total_salary']
    X['ratio_credit_to_salary'] = X['amount_credit'] / X['total_salary']
    X["total_salary_net"] = X["total_salary"] - X["amount_annuity"]

    # Financial indicators relative to age and time-based features
    X["ratio_annuity_to_age"] = X["amount_annuity"] / X["age"]
    X["ratio_credit_to_age"] = X["amount_credit"] / X["age"]
    X["ratio_salary_to_age"] = X["total_salary"] / X["age"]

    X["ratio_salary_to_experience"] = X["total_salary"] / X["days_on_last_job"]
    X["ratio_credit_to_experience"] = X["amount_credit"] / X["days_on_last_job"]
    X["ratio_annuity_to_experience"] = X["amount_annuity"] / X["days_on_last_job"]

    # Ratios between time-based features
    X["ratio_age_to_experience"] = X["age"] / X["days_on_last_job"]
    # NOTE(review): despite the "ratio_" name this is a product, not a ratio.
    # The column name is kept because it is part of the output.
    X["ratio_salary_to_region_population"] = X["total_salary"] * X["region_population"]
    X["ratio_car_to_experience"] = X["own_car_age"] / X["days_on_last_job"]
    X["ratio_car_to_age"] = X["own_car_age"] / X["age"]

    # Product of the credit's financial indicators and the default probability.
    # This is known as the expected default, or the expected loss.
    # X["expected_total_loss_1"] = X["external_scoring_rating_1"] * X["amount_credit"]
    # X["expected_total_loss_2"] = X["external_scoring_rating_2"] * X["amount_credit"]
    # X["expected_total_loss_3"] = X["external_scoring_rating_3"] * X["amount_credit"]
    # X["expected_monthly_loss_1"] = X["external_scoring_rating_1"] * X["amount_annuity"]
    # X["expected_monthly_loss_2"] = X["external_scoring_rating_2"] * X["amount_annuity"]
    # X["expected_monthly_loss_3"] = X["external_scoring_rating_3"] * X["amount_annuity"]

    X["log_total_salary"] = np.log(X["total_salary"])
    X["log_amount_credit"] = np.log(X["amount_credit"])
    # NOTE(review): this column holds log(amount_annuity); the misspelled
    # name is kept because it is part of the output.
    X["log_amount_credity"] = np.log(X["amount_annuity"])

    return X

def append_to_data(data: pd.DataFrame, X: pd.DataFrame, groupby: str="application_number") -> pd.DataFrame:
    """
    Aggregate X per key and left-join the aggregates onto data.

    Every value equal to 365243 in X is replaced with NaN. The numeric
    columns are then averaged within each ``groupby`` key and merged
    onto ``data`` with a left join on that key.

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        Base table; all of its rows are kept.

    X: pandas.core.frame.DataFrame
        Auxiliary table with possibly several rows per key.

    groupby: str, optional, default = "application_number"
        Name of the key column present in both tables.

    Returns
    -------
    data: pandas.core.frame.DataFrame
        ``data`` extended with the per-key means of X's numeric columns.

    """
    X = X.replace(365243, np.nan)
    # numeric_only=True: pandas >= 2.0 raises on non-numeric columns instead
    # of silently dropping them, which was the behaviour this code relied on.
    X = X.groupby(groupby).mean(numeric_only=True)

    data = data.merge(X, how="left", on=groupby)

    return data


# Load the labelled and unlabelled application lists.
train = get_input("train.csv")
test = get_input("test.csv")

# Stack train and test so features are built once for both parts.
# Per the captured output, test.csv has one column fewer than train.csv,
# so the test rows get NaN in that column after the concat.
data = pd.concat([train, test], axis=0)
data = data.reset_index(drop=True)
data.head(n=2)

# Load the client profile and extend it with engineered features.
client_profile = get_input("client_profile.csv")
client_profile = create_client_profile_features(client_profile)
client_profile.head(n=2)

# Left join: applications without a profile row keep NaN in the profile columns.
data = data.merge(
    client_profile, how="left", on="application_number"
)
data.shape



train.csv: shape = 110093 rows, 3 cols
test.csv: shape = 165141 rows, 2 cols
client_profile.csv: shape = 250000 rows, 24 cols


  overwrite_input=overwrite_input)


(275234, 52)

In [29]:
# Split the columns into object-typed ("categorial") and everything else.
categorial = data.dtypes[data.dtypes == "object"].index
numerical = list(set(data.columns) - set(categorial))
# NOTE(review): `numerical` holds every non-object column, so the identifier
# "application_number" and "target" are cast to float here as well.
data[numerical] = data[numerical].astype(float)
data[categorial] = data[categorial].astype('category')

# One-hot encode the categorical columns.
data = pd.get_dummies(data)

# Rows with a missing target are the test part of the concatenated frame.
mask = data["target"].isnull()
train, test = data.loc[~mask], data.loc[mask]

# Keep the target and the test identifiers, then drop both columns from the features.
target = train["target"]
test_id = test["application_number"]
train = train.drop(["application_number", "target"], axis=1)
test = test.drop(["application_number", "target"], axis=1)

data.head(3)

Unnamed: 0,application_number,target,childrens,total_salary,amount_credit,amount_annuity,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,external_scoring_rating_1,external_scoring_rating_2,external_scoring_rating_3,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,bki_requests_count,bki_kurtosis,external_scoring_prod,external_scoring_weighted,external_scoring_rating_min,external_scoring_rating_max,external_scoring_rating_mean,external_scoring_rating_nanmedian,external_scoring_rating_var,ratio_credit_to_annuity,ratio_annuity_to_salary,ratio_credit_to_salary,total_salary_net,ratio_annuity_to_age,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,log_total_salary,log_amount_credit,log_amount_credity,name_contract_type_Cash,name_contract_type_Credit Card,gender_F,gender_M,gender_XNA,education_level_Academic degree,education_level_Higher education,education_level_Incomplete higher,education_level_Lower secondary,education_level_Secondary / secondary special,family_status_Civil marriage,family_status_Married,family_status_Separated,family_status_Single / not married,family_status_Unknown,family_status_Widow
0,123687442.0,0.0,1.0,157500.0,855000.0,25128.0,0.019101,15728.0,1719.0,11.0,0.0,0.0,3.0,0.700784,0.645914,0.71657,0.0,0.0,1.0,0.0,0.0,2.0,3.0,1.428571,0.324353,4.197193,0.645914,0.71657,0.687756,0.700784,0.000917,34.025788,0.159543,5.428571,132372.0,1.59766,54.361648,10.013988,91.623037,497.382199,14.617801,9.149506,3008.4075,0.006399,0.000699,11.967181,13.658857,10.131738,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0
1,123597908.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,123526683.0,0.0,0.0,135000.0,1006920.0,42660.0,0.026392,21557.0,3618.0,,1.0,0.0,2.0,,0.682149,0.267869,0.0,0.0,0.0,7.0,0.0,4.0,11.0,0.539379,,,0.267869,0.682149,0.475009,0.475009,0.042907,23.603376,0.316,7.458667,92340.0,1.97894,46.709653,6.262467,37.313433,278.308458,11.791045,5.958264,3562.92,,,11.81303,13.822407,10.661017,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0


In [10]:
# categorial = data.dtypes[data.dtypes == "object"].index
# numerical = list(set(data.columns) - set(categorial))

# # data = data.reset_index(drop=True)
# data = pd.get_dummies(data, columns=categorial)
# mask = data["target"].isnull()
# features_to_drop = ["application_number", "target"]

# train, test = data.loc[~mask], data.loc[mask]

# target, test_id = train["target"], test["application_number"]
# train = train.drop(features_to_drop, axis=1)
# test = test.drop(features_to_drop, axis=1)

# train = train.replace(np.inf, np.nan)
# train = train.replace(-np.inf, np.nan)


# # categorical = [col for col in data.columns if col.endswith("_cat")]

# # train = pd.concat([data, test], axis=0).reset_index(drop=True)
# # train = pd.get_dummies(train, columns=categorical)

# # data_ohe = train.loc[:data.shape[0]-1]
# # test_ohe = train.loc[data.shape[0]:]

In [11]:
# Load the auxiliary tables. The commented-out lines are disabled
# feature-engineering experiments.
payments = get_input("payments.csv")
# payments['day_instalment_payment_ratio'] = payments["days_instalment"]/payments['days_entry_payment']
# payments['installment_payment_ratio'] = payments["amt_instalment"]/payments['amt_payment']

# payments["pay_log_days_entry_payment"] = np.log(payments["days_entry_payment"])
# payments["pay_log_amt_instalment"] = np.log(payments["amt_instalment"])
# payments["pay_log_amt_payment"] = np.log(payments["amt_payment"])

bki = get_input("bki.csv")
# bki['day_fact_enddate'] = bki['days_enddate_fact'] / bki['days_credit_enddate']
# bki['dlr'] = (bki['amt_credit_sum_debt']+0)/(bki['amt_credit_sum']+1) 
# bki['ratio_overdue'] = (bki['amt_credit_sum_debt']+0)/(bki['amt_credit_sum']+1)

appl_history = get_input("applications_history.csv")
# appl_history['count_history'] = appl_history.groupby(['application_number'])['prev_application_number'].count()

# Average each auxiliary table per application and left-join it onto `data`.
# NOTE(review): in file order this runs after `train`/`test` were split from
# `data`, and no later visible cell splits again, so these columns do not reach
# the frames used for training. The cell execution counts suggest the notebook
# was run out of order -- confirm the intended sequence.
# NOTE(review): if the tables share non-key column names, merge will add
# _x/_y suffixes -- verify against the actual schemas.
data = append_to_data(data, payments)
data = append_to_data(data, bki)
data = append_to_data(data, appl_history)

payments.csv: shape = 1023932 rows, 8 cols
bki.csv: shape = 945234 rows, 17 cols
applications_history.csv: shape = 1670214 rows, 26 cols


Чтобы было больше времени на выполнение курсовой работы, задание выполнить на наборе данных для соревнования:

Тестовая выборка - это выборка для применения модели и загрузки на ЛБ.

1. Обучить алгоритмы LightGBM и XGBoost, получить OOF прогнозы, оценить корреляцию прогнозов на обучающей выборке. Применить модели на тестовую выборку и оценить корреляцию.


In [12]:
# Shared training settings.
# NOTE(review): none of these names is referenced by the other visible cells;
# the xgboost parameters below hard-code their own values -- confirm whether
# these are still needed.
n_estimators = 50
early_stop = 50
random_state = 42
eta = 0.3

In [14]:
# Booster parameters for the native xgboost API (xgb.train).
xgb_params = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "eval_metric": "auc",
    # Numeric learning rate (was given as the string "0.03").
    "eta": 0.03,
    "max_depth": 6,
    "gamma": 10,
    "subsample": 0.85,
    "colsample_bytree": 0.7,
    "colsample_bylevel": 0.632,
    "min_child_weight": 30,
    # "early_stopping_rounds" is not a booster parameter; the patience is
    # passed to xgb.train inside xgboost_cross_validation.
    "alpha": 0,
    "lambda": 0,
    "nthread": 6,
    # The native API reads the RNG seed from "seed"; "random_seed" is not a
    # documented XGBoost parameter, so the seed stayed at its default.
    "seed": 42,
}

# Three shuffled folds with a fixed seed so the split is reproducible.
xgb_cv = KFold(n_splits=3, random_state=42, shuffle=True)


# The encoders are discarded: no categorical features are passed.
xgb_estimators, _, xgb_oof = xgboost_cross_validation(
    params=xgb_params, X=train, y=target, cv=xgb_cv
)


Sun Dec 13 13:35:49 2020, Cross-Validation, 110093 rows, 62 cols
[0]	train-auc:0.684393	valid-auc:0.692691
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[10]	train-auc:0.699872	valid-auc:0.705969
[20]	train-auc:0.704736	valid-auc:0.709873
[30]	train-auc:0.707218	valid-auc:0.710996
[40]	train-auc:0.709543	valid-auc:0.712231
[50]	train-auc:0.712078	valid-auc:0.714118
[60]	train-auc:0.71541	valid-auc:0.716593
[70]	train-auc:0.717786	valid-auc:0.718024
[80]	train-auc:0.720623	valid-auc:0.720064
[90]	train-auc:0.723435	valid-auc:0.721546
[100]	train-auc:0.725827	valid-auc:0.72327
[110]	train-auc:0.727891	valid-auc:0.724637
[120]	train-auc:0.730002	valid-auc:0.726421
[130]	train-auc:0.731796	valid-auc:0.727431
[140]	train-auc:0.734056	valid-auc:0.728895
[150]	train-auc:0.735822	valid-auc:0.730124
[160]	train-auc:0.737059	valid-auc:0.730896
[170]	train-auc:0.738987	valid-auc:0.731697
[180]	train-a

In [20]:
# ROC AUC of the pooled out-of-fold XGBoost predictions against the training target.
oof_score = roc_auc_score(target, xgb_oof)
print(f"OOF-score = {round(oof_score, 5)}")

OOF-score = 0.72801


In [17]:
# Recompute the column groups on the training features:
# object-typed columns versus everything else.
# NOTE(review): the order of `numerical` comes from set iteration and is not stable.
categorial = train.dtypes[train.dtypes == "object"].index
numerical = list(set(train.columns) - set(categorial))

In [25]:
# Accumulator for the summed test predictions of all fold models.
y_pred = np.zeros(test.shape[0])
# Align the test dtypes with the column groups computed on train.
test[numerical] = test[numerical].astype(float)
test[categorial] = test[categorial].astype(str)

dtest = xgb.DMatrix(data=test)

# Sum the per-fold predictions; dividing by the number of folds is done
# when the submission frame is built.
for estimator in xgb_estimators:
    y_pred += estimator.predict(dtest)

In [30]:
# Build the submission: average the summed fold predictions and write them out.
y_pred = pd.DataFrame({
    # The identifiers were cast to float together with the other numeric
    # columns; write them back as integers so the CSV holds plain ids.
    "APPLICATION_NUMBER": test_id.astype(int),
    "TARGET": y_pred / xgb_cv.n_splits
})
y_pred.to_csv("baseline_submit_II.csv", index=False)

2. Усреднить прогнозы с помощью арифметического среднего, геометрического среднего и усреднить ранги, сделать выводы о качестве отдельных моделей и о качестве комбинации.

In [31]:
# Reload the submission file that was just written.
ddd = pd.read_csv("baseline_submit_II.csv")

In [36]:
# Store the application identifiers as integers.
ddd["APPLICATION_NUMBER"] = ddd["APPLICATION_NUMBER"].astype(int)

In [39]:
# Overwrite the submission file with the current contents of `ddd`.
ddd.to_csv("baseline_submit_II.csv", index=False)

3. Обучить CatBoost, получить OOF прогнозы и выполнить задание 1 для трех моделей.


4. Выполнить задание 2 для трех моделей.


5. (опция) Объединить OOF-прогнозы для трех моделей и обучить алгоритм Логистической регрессии (и любой другой, на ваше усмотрение). Сделать выводы о достигаемом качестве, сравнить достигаемое качество с качеством отдельных моделей и моделей, полученных в п.2 и п.4.

6. (опция) Обучить алгоритм RandomForest (желательно подтюнить параметры) и добавить к построенным ранее моделям. Выполнить задание 5.

