In [40]:
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np

np.random.seed(42)

# Голова модели в виде композиции обученных градиентных бустингов

Градиетные бустинги обучались независимо на задачу предсказания таргетов, каждый на своей таблице с данным

- model_a - градиентный бустинг на эмбеддингах графов
- model_c - градиентный бустинг на признаках пользователей
- model_d - градиентный бустинг на агрегированных признаках друзей

In [154]:
target_traintest = pd.read_csv("../data/FINAL_TARGETS_DATES_TRAINTEST.tsv", sep='\t')

Unnamed: 0,CLIENT_ID,RETRO_DT,TARGET
0,1011725,20210501,0
1,1018784,20210501,1
2,1021812,20210501,0
3,1024003,20210501,0
4,1025140,20210501,test


In [170]:
model_a = CatBoostClassifier().load_model('catboosts/ctb_graph.ctb')
model_c = CatBoostClassifier().load_model('catboosts/ctb_friends_features.ctb')
model_d = CatBoostClassifier().load_model('catboosts/ctb_user_features.ctb')

In [171]:
# summed graph embeddings features
graph_sequences = np.load("../data/graph_sequences.npy")
Xgs_train = graph_sequences[target_traintest['TARGET'] != 'test']
Xgs_test = graph_sequences[target_traintest['TARGET'] == 'test']


# friends features
ff_target = pd.read_csv('../data/features_friends_target.csv')
ff_test = ff_target[ff_target['TARGET'] == 'test']
ff_train = ff_target[ff_target['TARGET'] != 'test']
Xff_train, Xff_test = ff_train.drop(["TARGET"], axis=1), ff_test.drop(["TARGET"], axis=1)
y_train = ff_train["TARGET"].apply(int)


# user features
uf_target = pd.read_csv('../data/features_user_target.csv')
uf_train = uf_target[uf_target['TARGET'] != 'test']
uf_test = uf_target[uf_target['TARGET'] == 'test']
Xuf_train, Xuf_test = uf_train.drop(["TARGET"], axis=1), uf_test.drop(["TARGET"], axis=1)

Xff_train.fillna(0, inplace=True)
Xff_test.fillna(0, inplace=True)
Xuf_train.fillna(0, inplace=True)
Xuf_test.fillna(0, inplace=True)

mask = np.random.choice(Xff_train.shape[0], size=Xff_train.shape[0], replace=False)
ratio = int(0.8 * len(mask))

Xff_train, Xff_val = Xff_train.iloc[mask[:ratio]], Xff_train.iloc[mask[ratio:]]
Xuf_train, Xuf_val = Xuf_train.iloc[mask[:ratio]], Xuf_train.iloc[mask[ratio:]]
y_train, y_val = y_train.iloc[mask[:ratio]], y_train.iloc[mask[ratio:]]

Xgs_train, Xgs_val = Xgs_train[mask[:ratio]], Xgs_train[mask[ratio:]]

In [None]:
# weights for loss to comepnsate class disbalance
class_1_count = len(target_traintest[target_traintest['TARGET'] == '1'])
class_0_count = len(target_traintest[target_traintest['TARGET'] == '0'])
class_0_count, class_1_count
class_0_weight = class_1_count / (class_0_count + class_1_count)
class_1_weight = class_0_count / (class_0_count + class_1_count)
class_0_weight, class_1_weight

In [172]:
preds_a = model_a.predict_proba(Xgs_train)[:, 1]
preds_c = model_c.predict_proba(Xff_train)[:, 1]
preds_d = model_d.predict_proba(Xuf_train)[:, 1]

X_preds_train = np.stack([preds_a, preds_d, preds_c], axis=1)

clf = LogisticRegression(random_state=42, class_weight={0: class_0_weight, 1: class_1_weight}).fit(X_preds_train, y_train)

X_preds_val = np.stack([
    model_a.predict_proba(Xgs_val)[:, 1],
    model_c.predict_proba(Xff_val)[:, 1],
    model_d.predict_proba(Xuf_val)[:, 1]
], axis=1)

probs = clf.predict_proba(X_preds_val)
roc_auc_score(y_val, probs[:, 1])

0.6951701596804765

In [173]:
clf.coef_ # выученные моделью коэффициэнты

array([[ 4.36128685, 24.0825212 , 11.38729949]])

# Предсказание тестовых данных и создание submit-а

In [176]:
# lets try submit
preds_a = model_a.predict_proba(Xgs_test)[:, 1]
preds_c = model_c.predict_proba(Xff_test)[:, 1]
preds_d = model_d.predict_proba(Xuf_test)[:, 1]

preds = np.stack([preds_a, preds_c, preds_d], axis=1)
X_submit = preds
submit_probas = clf.predict_proba(X_submit)[:, 1]

In [177]:
result = target_traintest[target_traintest["TARGET"] == 'test'].reset_index(drop=True)
result["TARGET"] = submit_probas
result.drop("RETRO_DT", inplace=True, axis=1)
result.to_csv("gb3_submit.csv", index=False)

# Предсказывание на дообученных по всей выборке моделях
Для получения лучшего качества скора в хакатоне градиентные бустинги дообучаются на всех доступных метках

In [None]:
model_a_full = CatBoostClassifier().load_model('catboosts/ctb_graph_full.ctb')
model_b_full = CatBoostClassifier().load_model('catboosts/ctb_friend_features_full.ctb')

In [None]:
# get learned proba features
preds_a = model_a_full.predict_proba(Xgs_test)[:, 1]
preds_c = model_c.predict_proba(Xff_test)[:, 1]
preds_d = model_d.predict_proba(Xuf_test)[:, 1]

preds = np.stack([preds_a, preds_c, preds_d], axis=1)
X_submit = preds
submit_probas = clf.predict_proba(X_submit)[:, 1]

In [None]:
result = target_traintest[target_traintest["TARGET"] == 'test'].reset_index(drop=True)
result["TARGET"] = submit_probas
result.drop("RETRO_DT", inplace=True, axis=1)
result.to_csv("submit.csv", index=False)