In [None]:
# Импортируем все необходимые библиотеки

import os

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from dotenv import load_dotenv
from sqlalchemy import create_engine

In [None]:
# Load environment variables from the .env file; the POSTGRES_* settings read
# when building the database URL are expected to come from here.
load_dotenv()

In [None]:
# Build the SQLAlchemy connection URL from the POSTGRES_* environment variables.
# Fail fast with a readable message when a variable is absent: os.getenv returns
# None for a missing name, which would otherwise be formatted into the URL as
# the literal text "None".
_pg_keys = ("USER", "PASSWORD", "HOST", "PORT", "DATABASE")
_pg = {key: os.getenv(f"POSTGRES_{key}") for key in _pg_keys}
_missing = [f"POSTGRES_{key}" for key, value in _pg.items() if value is None]
if _missing:
    raise RuntimeError(f"Missing environment variables: {', '.join(_missing)}")

SQLALCHEMY_DATABASE_URL = (
    f"postgresql://"
    f"{_pg['USER']}:{_pg['PASSWORD']}@"
    f"{_pg['HOST']}:{_pg['PORT']}/"
    f"{_pg['DATABASE']}"
)
engine = create_engine(SQLALCHEMY_DATABASE_URL)

In [None]:
### Per-user data together with our engineered features

users_query = """SELECT * FROM users_info_features_ruslan_prashchurovich"""
user_info = pd.read_sql(users_query, con=engine)

user_info.head()

In [None]:
### Posts and topics together with our engineered features

posts_query = """SELECT * FROM posts_info_features_ruslan_prashchurovich"""
posts_info = pd.read_sql(posts_query, con=engine)

posts_info.head()

In [None]:
### The feed table is too large to load in full (over 70 million rows), so we
### train on a sample of "view" events. FEED_ROW_LIMIT is the single knob for
### the sample size: 5 million rows for a real run; lower it for a quick smoke test.
# NOTE(review): the query has no ORDER BY, so which rows come back is up to the
# database -- confirm the sample covers both sides of the train/test cutoff date.
FEED_ROW_LIMIT = 5_000_000

feed_data = pd.read_sql(
    f"""SELECT * FROM feed_data WHERE action = 'view' LIMIT {int(FEED_ROW_LIMIT)}""",
    con=engine,
)

feed_data.head()

In [None]:
### Assemble a single frame: feed rows enriched with post features, then user features
df = (
    feed_data
    .merge(posts_info, on="post_id", how="left")
    .merge(user_info, on="user_id", how="left")
)

df.head()

In [None]:
### Derive time features from the event timestamp
# Parse the column once and use the vectorised .dt accessor instead of running
# a Python lambda on every row (and parsing the column twice).
event_time = pd.to_datetime(df["timestamp"])
df["hour"] = event_time.dt.hour
df["month"] = event_time.dt.month

# Cyclical encoding: project hour and month onto a circle so that values at the
# ends of the range (hour 23 and hour 0, month 12 and month 1) end up close.
df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)

df.head()

In [None]:
### Drop the columns we do not need

# "timestamp" is deliberately kept for now.
unneeded_columns = ["action", "text"]
df = df.drop(columns=unneeded_columns)

In [None]:
# Free the source frames: everything needed from them has been merged into df
import gc

del user_info, posts_info, feed_data

gc.collect()

In [None]:
### Time-based split: rows before SPLIT_DATE go to train, rows on/after it to test
SPLIT_DATE = "2021-12-10"
# Columns that are not model inputs: the split key, the label and the raw ids.
NON_FEATURE_COLS = ["timestamp", "target", "user_id", "post_id"]

# Build each mask once so features and targets are always cut by the same rows.
is_train = df["timestamp"] < SPLIT_DATE
is_test = df["timestamp"] >= SPLIT_DATE

X_train = df[is_train].drop(columns=NON_FEATURE_COLS)
X_test = df[is_test].drop(columns=NON_FEATURE_COLS)

y_train = df.loc[is_train, "target"]
y_test = df.loc[is_test, "target"]

y_train.shape, y_test.shape

In [None]:
# Names of the columns passed to CatBoost as categorical features (cat_features).
object_cols = [
    "topic", "TextCluster",
    "gender", "country", "city", "exp_group",
    "hour", "month",
    "favorite_topic", "os", "source", "age_category", "favorite_hour",
]

In [None]:
### Train the CatBoost classifier

# Fixed seed for reproducibility. The name must match the one passed to
# random_state below (it was previously defined as `seed`, so SEED was undefined).
SEED = 0
catboost = CatBoostClassifier(
    iterations=150,
    learning_rate=0.1,
    depth=5,
    cat_features=object_cols,
    verbose=0,
    random_state=SEED,
)

# NOTE(review): X_test is passed as eval_set; if CatBoost uses it to pick the
# best iteration, the test metric reported afterwards is optimistic -- confirm.
catboost.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=50)

In [None]:
### Measure model quality with ROC-AUC on both splits
from sklearn.metrics import roc_auc_score

# Column 1 of predict_proba is used as the score for each row.
train_scores = catboost.predict_proba(X_train)[:, 1]
test_scores = catboost.predict_proba(X_test)[:, 1]

print(f"Качество на трейне: {roc_auc_score(y_train, train_scores)}")
print(f"Качество на тесте: {roc_auc_score(y_test, test_scores)}")

In [None]:
### Из любопытства посмотрим на feature_importance

import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt


def plot_feature_importance(importance, names, model_type):
    """Draw a bar chart of feature importances, largest first.

    Args:
        importance: Sequence of importance values, one per feature.
        names: Sequence of feature names, aligned with ``importance``.
        model_type: Text label placed in front of the chart title.

    Returns:
        None. The chart is drawn on a new matplotlib figure.
    """
    # sort_values returns a new frame, so no in-place mutation is needed.
    fi_df = pd.DataFrame(
        {
            "feature_names": np.array(names),
            "feature_importance": np.array(importance),
        }
    ).sort_values(by="feature_importance", ascending=False)

    plt.figure(figsize=(10, 8))
    sns.barplot(x=fi_df["feature_importance"], y=fi_df["feature_names"])
    # Put a space between the label and the fixed title text; plain
    # concatenation rendered e.g. "catboostFEATURE IMPORTANCE".
    plt.title(f"{model_type.strip()} FEATURE IMPORTANCE")
    plt.xlabel("FEATURE IMPORTANCE")
    plt.ylabel("FEATURE NAMES")


plot_feature_importance(catboost.feature_importances_, X_train.columns, "catboost")

In [None]:
### Save the trained model to the file "catboost_model" using the "cbm" format
catboost.save_model("catboost_model", format="cbm")