In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Load the raw train/test tables from Excel files addressed by relative
# path, i.e. resolved against the kernel's current working directory.
df_train = pd.read_excel("train.xlsx")
df_test = pd.read_excel("test.xlsx")

In [None]:
def prepare_df(df):
    """
    Return a cleaned copy of a raw bookings frame.

    The technical columns "Unnamed: 0" and "№ брони" are removed and the
    three date columns are parsed with ``pd.to_datetime``. The input frame
    is left untouched.
    """
    date_columns = ("Дата бронирования", "Заезд", "Выезд")
    cleaned = df.copy().drop(columns=["Unnamed: 0", "№ брони"])
    for column in date_columns:
        cleaned[column] = pd.to_datetime(cleaned[column])
    return cleaned


def create_date_features(df, prefix):
    """
    Add calendar features derived from the datetime column ``prefix``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a datetime64 column named ``prefix``.
    prefix : str
        Name of the column to build features for; it is also used as the
        prefix of every generated column name.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with these extra columns (all prefixed with
        ``prefix``): ``_month``, ``_day_of_month``, ``_day_of_year``,
        ``_week_of_month``, ``_week_of_year``, ``_day_of_week`` (1-7),
        ``_year``, ``_is_wknd`` and ``_season``.
    """
    df = df.copy()
    dates = df[prefix].dt
    month = dates.month

    df[prefix + "_month"] = month.astype("int8")
    df[prefix + "_day_of_month"] = dates.day.astype("int8")
    df[prefix + "_day_of_year"] = dates.dayofyear.astype("int16")
    # Days 1-7 -> week 1, 8-14 -> week 2, ... (vectorised, no row-wise apply).
    df[prefix + "_week_of_month"] = ((dates.day - 1) // 7 + 1).astype("int8")
    df[prefix + "_week_of_year"] = dates.isocalendar().week.astype("int8")
    df[prefix + "_day_of_week"] = (dates.dayofweek + 1).astype("int8")
    df[prefix + "_year"] = dates.year.astype("int32")
    # weekday // 4 is 1 for weekday values 4, 5 and 6 and 0 otherwise.
    df[prefix + "_is_wknd"] = (dates.weekday // 4).astype("int8")
    # Season code: months 12, 1, 2 -> 0; 3-5 -> 1; 6-8 -> 2; 9-11 -> 3.
    # Computed directly from the month Series so the result keeps df's index;
    # wrapping the values in a fresh pd.Series would re-align on a 0..n-1
    # index and produce NaN for frames with any other index.
    df[prefix + "_season"] = (month % 12 // 3).astype("int8")
    return df


def create_diff_features(df, prefix1, prefix2):
    df = df.copy()
    df[prefix1 + "_" + prefix2 + "_diff_in_days"] = (df[prefix1] - df[prefix2]).dt.days
    df[prefix1 + "_" + prefix2 + "_diff_in_weeks"] = (
        df[prefix1 + "_" + prefix2 + "_diff_in_days"] / 7
    )
    df[prefix1 + "_" + prefix2 + "_diff_in_hours"] = (
        df[prefix1] - df[prefix2]
    ).dt.total_seconds() / 3600
    return df


def create_payment_method_features(df):
    """
    Add 0/1 indicator columns for payment methods mentioned in "Способ оплаты".

    Each indicator is 1 when the corresponding marker text occurs in the
    payment-method description. Most markers are matched case-sensitively;
    "Банковская карта" additionally matches "банк. карта" and
    "банковской картой" case-insensitively. The "СБП" column is set when the
    description contains "Система быстрых платежей".

    Cells that are not strings (NaN / None) are treated as an empty
    description, so every indicator is 0 for them instead of raising
    ``TypeError``.

    Returns a copy of ``df``; the input frame is not modified.
    """
    df = df.copy()

    # (output column, case-sensitive markers, case-insensitive markers).
    # The order defines the order of the generated columns.
    flag_specs = [
        ("SberPay", ("SberPay",), ()),
        ("Yandex Pay", ("Yandex Pay",), ()),
        ("МИР", ("МИР",), ()),
        ("ComfortBooking", ("ComfortBooking",), ()),
        ("TravelLine Pro", ("TravelLine Pro",), ()),
        ("Банк Россия", ("Банк Россия",), ()),
        ("Внешняя система оплаты", ("Внешняя система оплаты",), ()),
        (
            "Банковская карта",
            ("Банковская карта",),
            ("банк. карта", "банковской картой"),
        ),
        ("Оплата наличными", ("Оплата наличными",), ()),
        ("С предоплатой", ("С предоплатой",), ()),
        ("СБП", ("Система быстрых платежей",), ()),
    ]

    def matches(text, exact, folded):
        # Return 1 if any marker occurs in ``text``, else 0.
        lowered = text.lower()
        return int(
            any(marker in text for marker in exact)
            or any(marker in lowered for marker in folded)
        )

    # Normalise once: anything that is not a string becomes "".
    methods = df["Способ оплаты"].map(
        lambda value: value if isinstance(value, str) else ""
    )

    for column, exact, folded in flag_specs:
        df[column] = methods.map(lambda text: matches(text, exact, folded))
    return df


def clear_source_column(text):
    """
    Collapse a free-form booking source into one of a few known labels.

    The known labels are checked in a fixed order; the first one that occurs
    in ``text`` (case-insensitively) is returned. If none occurs, the result
    is "other".
    """
    known_sources = (
        "Официальный сайт",
        "Бронирование из экстранета",
        "Яндекс.Путешествия",
        "ostrovok",
        "booking",
        "Программа лояльности",
        "Bronevik",
        "OneTwoTrip",
    )
    lowered = text.lower()
    return next(
        (label for label in known_sources if label.lower() in lowered),
        "other",
    )


def clear_category(text):
    """
    Normalise a room-category description to a single category name.

    Only the first line of ``text`` is kept, surrounding whitespace is
    removed, and a leading "1. " list enumerator is dropped.

    ``str.strip("1. ")`` is deliberately not used: it removes any of the
    characters "1", "." and " " from *both* ends, which would corrupt names
    that end in "1" or "." (e.g. "Апартаменты 1") or start with "11.".
    """
    first_line = text.split("\n", 1)[0].strip()
    enumerator = "1. "
    if first_line.startswith(enumerator):
        first_line = first_line[len(enumerator):].lstrip()
    return first_line

In [None]:
# Build the training features: parse dates, expand each date column into
# calendar features, add the gaps between the dates, flag payment methods and
# normalise the free-text categoricals.
df_train = prepare_df(df_train)
df_train = create_date_features(df_train, "Дата бронирования")
df_train = create_date_features(df_train, "Заезд")
df_train = create_date_features(df_train, "Выезд")
df_train = create_diff_features(df_train, "Заезд", "Дата бронирования")
df_train = create_diff_features(df_train, "Выезд", "Заезд")

df_train = create_payment_method_features(df_train)

df_train["Источник"] = df_train["Источник"].apply(clear_source_column)
df_train["Категория номера"] = df_train["Категория номера"].apply(clear_category)

# The raw datetime columns are dropped once the derived features exist.
df_train = df_train.drop(["Дата бронирования", "Заезд", "Выезд"], axis=1)

# other features

# Price per night as a vectorised column division instead of a row-wise
# apply(axis=1). NOTE(review): with numeric columns a zero in "Ночей" now
# yields inf/NaN rather than possibly raising — confirm zero-night bookings
# cannot occur.
df_train["Стоимость за ночь"] = df_train["Стоимость"] / df_train["Ночей"]

In [None]:
# target is 1 when "Дата отмены" is missing and 0 when it is present.
df_train["target"] = df_train["Дата отмены"].isna().astype("int64")
# Both source columns are removed from the frame after the target is built.
df_train = df_train.drop(columns=["Дата отмены", "Статус брони"])

In [None]:
# Separate the label from the feature matrix.
y_train = df_train["target"]
X_train = df_train.drop(columns=["target"])

In [None]:
# Cross-validated ROC AUC of a CatBoost classifier (silent training) over
# StratifiedKFold with its default settings; the mean score is the cell output.
scores = cross_val_score(
    estimator=CatBoostClassifier(
        verbose=0,
        cat_features=["Способ оплаты", "Источник", "Категория номера"],
    ),
    X=X_train,
    y=y_train,
    scoring="roc_auc",
    cv=StratifiedKFold(),
    n_jobs=5,
)
scores.mean()

0.8568531848233123

In [None]:
# Fit the final model on the full training set, treating the three text
# columns as categorical features. Unlike the cross-validation cell, verbose
# is not set here, so CatBoost prints its per-iteration training log.
model = CatBoostClassifier(cat_features=["Способ оплаты", "Источник", "Категория номера"])
model.fit(X_train, y_train)

Learning rate set to 0.04153
0:	learn: 0.6520870	total: 69.2ms	remaining: 1m 9s
1:	learn: 0.6145599	total: 84.1ms	remaining: 42s
2:	learn: 0.5802388	total: 103ms	remaining: 34.1s
3:	learn: 0.5520985	total: 117ms	remaining: 29.2s
4:	learn: 0.5255038	total: 135ms	remaining: 26.8s
5:	learn: 0.5032544	total: 148ms	remaining: 24.5s
6:	learn: 0.4837115	total: 164ms	remaining: 23.3s
7:	learn: 0.4663103	total: 179ms	remaining: 22.2s
8:	learn: 0.4502047	total: 195ms	remaining: 21.4s
9:	learn: 0.4349404	total: 211ms	remaining: 20.9s
10:	learn: 0.4224598	total: 226ms	remaining: 20.3s
11:	learn: 0.4100825	total: 241ms	remaining: 19.8s
12:	learn: 0.4002770	total: 258ms	remaining: 19.6s
13:	learn: 0.3916483	total: 275ms	remaining: 19.4s
14:	learn: 0.3835698	total: 294ms	remaining: 19.3s
15:	learn: 0.3752340	total: 313ms	remaining: 19.3s
16:	learn: 0.3692410	total: 343ms	remaining: 19.9s
17:	learn: 0.3623313	total: 362ms	remaining: 19.7s
18:	learn: 0.3568621	total: 375ms	remaining: 19.4s
19:	learn: 0

<catboost.core.CatBoostClassifier at 0x75025d848730>

In [None]:
# Pair each training column name (column 0) with its importance from the
# fitted model (column 1) and list the most important features first.
pd.DataFrame(
    list(zip(X_train.columns, model.feature_importances_))
).sort_values(by=1, ascending=False)

Unnamed: 0,0,1
2,Внесена предоплата,38.939505
47,Банк Россия,8.701483
3,Способ оплаты,7.091679
38,Заезд_Дата бронирования_diff_in_hours,4.992083
53,Стоимость за ночь,4.235243
1,Стоимость,3.099267
37,Заезд_Дата бронирования_diff_in_weeks,3.072218
36,Заезд_Дата бронирования_diff_in_days,2.532103
5,Источник,2.466568
10,Дата бронирования_day_of_month,2.311244


In [None]:
# Apply to the test set the same feature-engineering steps that were applied
# to the training set, in the same order.
df_test = prepare_df(df_test)
df_test = create_date_features(df_test, "Дата бронирования")
df_test = create_date_features(df_test, "Заезд")
df_test = create_date_features(df_test, "Выезд")
df_test = create_diff_features(df_test, "Заезд", "Дата бронирования")
df_test = create_diff_features(df_test, "Выезд", "Заезд")

df_test = create_payment_method_features(df_test)

df_test["Источник"] = df_test["Источник"].apply(clear_source_column)
df_test["Категория номера"] = df_test["Категория номера"].apply(clear_category)

# The raw datetime columns are dropped once the derived features exist.
df_test = df_test.drop(["Дата бронирования", "Заезд", "Выезд"], axis=1)

# other features

# Price per night as a vectorised column division instead of a row-wise
# apply(axis=1). NOTE(review): with numeric columns a zero in "Ночей" now
# yields inf/NaN rather than possibly raising — confirm zero-night bookings
# cannot occur.
df_test["Стоимость за ночь"] = df_test["Стоимость"] / df_test["Ночей"]

In [None]:
# Pass the model exactly the columns it was trained on, in training order,
# rather than relying on df_test happening to have the same layout. A column
# missing from the test frame now fails here with a clear KeyError.
# [:, 1] keeps the second probability column — presumably P(target == 1),
# i.e. the booking has no cancellation date; confirm against model.classes_.
pred = model.predict_proba(df_test[X_train.columns])[:, 1]
# The variable name (including its spelling) is kept unchanged in case
# later cells refer to it.
submissiom = pd.DataFrame(pred)
# One probability per line, without index or header row.
submissiom.to_csv("default_catboost_with_new_features.csv", index=False, header=False)