In [2]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

print("Num GPUs Available: ", len(tf.config.list_physical_devices("GPU")))
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.model_selection import StratifiedKFold

Num GPUs Available:  0


Считаем начальные данные, лучшие фичи, посылку с лучшим скором

In [7]:
train_df = pd.read_parquet("train_data.pqt")
test_df = pd.read_parquet("test_data.pqt")
best_features = list(pd.read_csv("best_features_optuna.csv").Feature.values)
cluster_weights = pd.read_excel("cluster_weights.xlsx").set_index("cluster")
weights_dict = cluster_weights["unnorm_weight"].to_dict()
best_predictions = pd.read_csv("best_predictions.csv")

cat_cols = [
    "channel_code",
    "city",
    "city_type",
    "okved",
    "segment",
    "start_cluster",
    "index_city_code",
    "ogrn_month",
    "ogrn_year",
]

labels = [
    "{other}",
    "{}",
    "{α, β}",
    "{α, γ}",
    "{α, δ}",
    "{α, ε, η}",
    "{α, ε, θ}",
    "{α, ε, ψ}",
    "{α, ε}",
    "{α, η}",
    "{α, θ}",
    "{α, λ}",
    "{α, μ}",
    "{α, π}",
    "{α, ψ}",
    "{α}",
    "{λ}",
]

В данных удаляются колонки с >50% пропусками, добавляются к тренировочным данным тестовые, на колторых модель уверенна, применяется обработка lda добавляющая 16 дополнительных признаков


In [8]:
def extend_test(df):
    ddf = df.sort_values(by=["id", "date"], ascending=True)
    ddf.reset_index(drop=True, inplace=True)
    mask = (ddf["date"] == "month_5") & (ddf["date"].shift(1) == "month_6")
    rows_to_insert = df[mask].copy()
    rows_to_insert["date"] = "month_4"
    ddf = pd.concat([ddf, rows_to_insert], ignore_index=True)
    ddf = ddf.sort_values(by=["id", "date"], ascending=True)
    ddf.reset_index(drop=True, inplace=True)
    return ddf


def prepare_data_for_3m(
    df_train,
    df_test,
    cat_cols,
    nan_process="0",
    drop_lot_nan_cols_thr=0.5,
):
    """
    Operations:
    1. fill na
    2. drop columns with too many nans
    3. delete bad features

    df_train: pd.DataFrame
    df_test: pd.DataFrame
    nan_process: "0" or "mean" or "median" or "999"
    drop_lot_nan_cols_thr: float, 0-1, 1 - no drop, 0 - drop all

    return: pd.DataFrame, pd.DataFrame, list

    """
    df_train["is_train"] = 1
    df_test["is_train"] = 0

    df_concat = pd.concat([df_train, df_test])
    bad_features = df_concat.columns[
        df_concat.isna().sum() > df_concat.shape[0] * drop_lot_nan_cols_thr
    ]

    if nan_process == "999":
        df_concat = df_concat.fillna(-999)

    if nan_process == "0":
        df_concat = df_concat.fillna(0)
    elif nan_process == "mean":
        df_concat = df_concat.fillna(df_concat.mean())
    elif nan_process == "median":
        df_concat = df_concat.fillna(df_concat.median())
    df_concat[cat_cols] = df_concat[cat_cols].astype("category")

    df_concat = df_concat.drop(bad_features, axis=1)
    df_train.drop("is_train", axis=1, inplace=True)
    df_test.drop("is_train", axis=1, inplace=True)

    a = df_concat[df_concat["is_train"] == 1].drop("is_train", axis=1)
    b = df_concat[df_concat["is_train"] == 0].drop(
        ["is_train", "end_cluster"], axis=1
    )

    return (
        a,
        b,
        list(set(cat_cols).intersection(a.columns)),
    )


def concatenate_months(df_train, df_test, cat_cols):
    """Concatenate data across months"""
    df_train = df_train.sort_values(by=["id", "date"], ascending=True)
    df_test = df_test.sort_values(by=["id", "date"], ascending=True)

    cat_cols_3 = (
        [f"{i}_1" for i in cat_cols]
        + [f"{i}_2" for i in cat_cols]
        + [f"{i}_3" for i in cat_cols]
    )

    concatenated_train = pd.DataFrame()
    concatenated_test = pd.DataFrame()

    concatenated_train["end_cluster"] = df_train["end_cluster"][
        df_train["date"] == "month_3"
    ].reset_index(drop=True)

    for i, month in enumerate(["month_1", "month_2", "month_3"]):
        data = (
            df_train[df_train["date"] == month]
            .drop(["date", "id", "end_cluster"], axis=1)
            .reset_index(drop=True)
        )
        data.columns = [f"{col}_{i+1}" for col in data.columns]
        concatenated_train = pd.concat([concatenated_train, data], axis=1)

    for i, month in enumerate(["month_4", "month_5", "month_6"]):

        data = (
            df_test[df_test["date"] == month]
            .drop(["date", "id"], axis=1)
            .reset_index(drop=True)
        )
        data.columns = [f"{col}_{i+1}" for col in data.columns]
        concatenated_test = pd.concat([concatenated_test, data], axis=1)

    assert concatenated_train.shape[1] == concatenated_test.shape[1] + 1
    assert (
        concatenated_train.columns[1:] == concatenated_test.columns
    ).sum() == concatenated_test.shape[1]
    return concatenated_train, concatenated_test, cat_cols_3


def extract_optuna_features(df):
    if "end_cluster" in df.columns:
        return df[list(a.columns.intersection(best_features)) + ["end_cluster"]]
    else:
        return df[list(a.columns.intersection(best_features))]


def aug_train_from_test(train, test, thr=0.9):
    sub_df = test[best_predictions.drop(["id"], axis=1).max(axis=1) > thr]
    sub_df.insert(
        1,
        "end_cluster",
        best_predictions[
            best_predictions.drop(["id"], axis=1).max(axis=1) > thr
        ]
        .drop(["id"], axis=1)
        .idxmax(axis=1),
    )

    return pd.concat([train, sub_df])


test_df = extend_test(test_df)
a, b, c = concatenate_months(train_df, test_df, cat_cols)
train_df_3m, test_df_3m, cat_cols_3m = prepare_data_for_3m(
    a, b, c, nan_process="999"
)
train_df_3m.drop("start_cluster_3", axis=1, inplace=True)
test_df_3m.drop("start_cluster_3", axis=1, inplace=True)
cat_cols_3m.remove("start_cluster_3")
# start_cluster_3 удаляется так как в тесте он почти везде пропущен

train_df_3m = aug_train_from_test(train_df_3m, test_df_3m, 0.9)

In [9]:
def add_lda(numeric_train, target, numeric_test, train, test):
    scaler = StandardScaler()
    scaler.fit(pd.concat([numeric_train, numeric_test]))
    X_train_scaled = scaler.transform(numeric_train)
    X_test_scaled = scaler.transform(numeric_test)
    lda = LinearDiscriminantAnalysis(n_components=target.unique().shape[0] - 1)
    lda.fit(X_train_scaled, target)

    train_lda = lda.transform(X_train_scaled)
    test_lda = lda.transform(X_test_scaled)

    train_lda = pd.DataFrame(
        train_lda,
        columns=[f"lda_{i}" for i in range(target.unique().shape[0] - 1)],
        index=train.index,
    )
    test_lda = pd.DataFrame(
        test_lda,
        columns=[f"lda_{i}" for i in range(target.unique().shape[0] - 1)],
        index=test.index,
    )
    new_train = pd.concat([train, train_lda], axis=1)
    new_test = pd.concat([test, test_lda], axis=1)
    return new_train, new_test

Создадим данные для обучения

In [10]:
def weighted_roc_auc(y_true, y_pred):
    global weights_dict
    unnorm_weights = np.array([weights_dict[label] for label in labels])
    weights = unnorm_weights / unnorm_weights.sum()
    classes_roc_auc = roc_auc_score(
        y_true, y_pred, labels=labels, multi_class="ovr", average=None
    )
    return sum(weights * classes_roc_auc), dict(zip(labels, classes_roc_auc))


train_df_3m_1he = pd.get_dummies(
    extract_optuna_features(train_df_3m),
    columns=list(set(cat_cols_3m).intersection(best_features)),
).replace(-999, 0)
test_df_3m_1he = pd.get_dummies(
    extract_optuna_features(test_df_3m),
    columns=list(set(cat_cols_3m).intersection(best_features)),
).replace(-999, 0)

X = train_df_3m.drop(["end_cluster"], axis=1)
X_1he = train_df_3m_1he.drop(["end_cluster"], axis=1)
X_lda, X_test_lda = add_lda(
    X_1he, train_df_3m_1he["end_cluster"], test_df_3m_1he, X, test_df_3m
)
y = train_df_3m["end_cluster"]

(
    x_train,
    x_val,
    x_train_1he,
    x_val_1he,
    x_train_lda,
    x_val_lda,
    y_train,
    y_val,
) = train_test_split(X, X_1he, X_lda, y, test_size=0.2, random_state=42)

x_train.shape, x_train_1he.shape, X_lda.shape, x_train_lda.shape

((167626, 260), (167626, 391), (209533, 276), (167626, 276))

Запустим стекинг из катбустов на 5и фолдах, для того чтобы найти выбросы в тренировочных данных

In [11]:
class StackCatBoost:
    def __init__(
        self, n_folds, n_estimators, learning_rate, max_depth, cat_features
    ):
        self.n_folds = n_folds
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.cat_features = cat_features
        self.models = []
        for i in range(n_folds):
            self.models.append(
                CatBoostClassifier(
                    n_estimators=n_estimators,
                    learning_rate=learning_rate,
                    max_depth=max_depth,
                    cat_features=cat_features,
                    verbose=0,
                    random_seed=42,
                    task_type="GPU",
                    devices="0:1",
                )
            )

    def fit(self, X, y):
        skf = StratifiedKFold(
            n_splits=self.n_folds, shuffle=True, random_state=42
        )
        self.stacking_train_preds = np.zeros((len(X), 17))
        for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
            print(f"Training on fold {fold+1}/{self.n_folds}")
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]
            self.models[fold].fit(X_train, y_train)
            val_preds = self.models[fold].predict_proba(X_val)
            self.stacking_train_preds[val_index] = val_preds
        self.overall_score = weighted_roc_auc(y, self.stacking_train_preds)
        print(f"Overall validation score: {self.overall_score}")

    def predict_proba(self, X):
        preds = np.zeros((len(X), 17))
        for model in self.models:
            preds += model.predict_proba(X)
        return preds / self.n_folds


stack = StackCatBoost(5, 3000, 0.05, 3, cat_cols_3m)
stack.fit(X_lda, y)

Training on fold 1/5
Training on fold 2/5
Training on fold 3/5
Training on fold 4/5
Training on fold 5/5
Overall validation score: (0.912539298426308, {'{other}': 0.9141819795373111, '{}': 0.9193086787473701, '{α, β}': 0.8953225074292648, '{α, γ}': 0.9204913320004809, '{α, δ}': 0.9060672556613131, '{α, ε, η}': 0.9509625686473151, '{α, ε, θ}': 0.9104360247065114, '{α, ε, ψ}': 0.9385037547790268, '{α, ε}': 0.8823184655151521, '{α, η}': 0.9674370101248616, '{α, θ}': 0.8839686814709112, '{α, λ}': 0.9215138583777598, '{α, μ}': 0.8801453478772847, '{α, π}': 0.8771801382154175, '{α, ψ}': 0.9389086993363798, '{α}': 0.869776365708007, '{λ}': 0.9026180558721741})


Удалим выбросы

In [12]:
from sklearn.preprocessing import LabelEncoder
thr = 0.5
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
misses = stack.stacking_train_preds.argmax(axis=1) != label_encoder.transform(y)
clear_X_lda = X_lda.drop(
    X_lda[stack.stacking_train_preds.max(axis=1) < thr].index, axis=0
)
clear_y = y.drop(y[stack.stacking_train_preds.max(axis=1) < thr].index, axis=0)
clear_X_lda.shape

(188612, 276)

Обучим итоговую модель

In [13]:
model = CatBoostClassifier(
    cat_features=cat_cols_3m,
    iterations=3000,
    depth=4,  # 4 better 3 auto better
    learning_rate=0.05,
    task_type="GPU",
    devices="0:1",
    # loss_function="MultiClassOneVsAll",
    random_seed=42,
    # eval_metric="AUC",
)

model.fit(clear_X_lda, clear_y)

0:	learn: 2.3972303	total: 24.2ms	remaining: 1m 12s
1:	learn: 2.1588612	total: 50ms	remaining: 1m 14s
2:	learn: 1.9912567	total: 72.4ms	remaining: 1m 12s
3:	learn: 1.8608643	total: 97ms	remaining: 1m 12s
4:	learn: 1.7559524	total: 122ms	remaining: 1m 12s
5:	learn: 1.6678832	total: 147ms	remaining: 1m 13s
6:	learn: 1.5919152	total: 172ms	remaining: 1m 13s
7:	learn: 1.5273357	total: 198ms	remaining: 1m 14s
8:	learn: 1.4707392	total: 223ms	remaining: 1m 14s
9:	learn: 1.4197667	total: 248ms	remaining: 1m 14s
10:	learn: 1.3750910	total: 272ms	remaining: 1m 14s
11:	learn: 1.3351406	total: 300ms	remaining: 1m 14s
12:	learn: 1.2981989	total: 324ms	remaining: 1m 14s
13:	learn: 1.2656926	total: 349ms	remaining: 1m 14s
14:	learn: 1.2352934	total: 373ms	remaining: 1m 14s
15:	learn: 1.2077077	total: 397ms	remaining: 1m 14s
16:	learn: 1.1823706	total: 420ms	remaining: 1m 13s
17:	learn: 1.1592332	total: 443ms	remaining: 1m 13s
18:	learn: 1.1381053	total: 466ms	remaining: 1m 13s
19:	learn: 1.1182612	t

<catboost.core.CatBoostClassifier at 0x1cfdc9d95e0>

Создадим файл для отправки

In [16]:
def make_submission_3m(model):
    sample_submission_df = pd.read_csv("sample_submission.csv")
    test_df["date"] == "month_6"
    try:
        test_pred_proba = model.predict_proba(X_test_lda)
    except:
        test_pred_proba = model.predict(X_test_lda)
    test_pred_proba /= test_pred_proba.sum(axis=1).reshape(-1, 1)
    test_pred_proba_df = pd.DataFrame(test_pred_proba, columns=labels)
    sorted_classes = sorted(test_pred_proba_df.columns.to_list())
    test_pred_proba_df = test_pred_proba_df[sorted_classes]

    sample_submission_df[sorted_classes] = test_pred_proba_df
    sample_submission_df.to_csv("vitya.csv", index=False)


make_submission_3m(model)