In [40]:
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np

np.random.seed(42)

In [131]:
model_a = CatBoostClassifier().load_model('catboosts/ctb_graph.ctb')
model_b = CatBoostClassifier().load_model('catboosts/ctb_friend_features.ctb')

In [169]:
target_traintest = pd.read_csv("../data/FINAL_TARGETS_DATES_TRAINTEST.tsv", sep='\t')

In [132]:
# summed graph embaddings features
graph_sequences = np.load("../data/graph_sequences.npy")
Xgs_train = graph_sequences[target_traintest['TARGET'] != 'test']
Xgs_test = graph_sequences[target_traintest['TARGET'] == 'test']


# friends&user features
friends_users = pd.read_csv("../data/user_friends_embeddings.csv")
friends_users_target = pd.merge(target_traintest, friends_users, on="CLIENT_ID", how="left")
ff_train = friends_users_target[friends_users_target["TARGET"] != "test"]
ff_test  = friends_users_target[friends_users_target["TARGET"] == "test"]
Xff_train, Xff_test = ff_train.drop(["TARGET"], axis=1), ff_test.drop(["TARGET"], axis=1)
y_train = ff_train["TARGET"].apply(int)

Xff_train.fillna(0, inplace=True)
Xff_test.fillna(0, inplace=True)

mask = np.random.choice(Xff_train.shape[0], size=Xff_train.shape[0], replace=False)
ratio = int(0.8 * len(mask))

Xff_train, Xff_val = Xff_train.iloc[mask[:ratio]], Xff_train.iloc[mask[ratio:]]
y_train, y_val = y_train.iloc[mask[:ratio]], y_train.iloc[mask[ratio:]]

Xgs_train, Xgs_val = Xgs_train[mask[:ratio]], Xgs_train[mask[ratio:]]

In [133]:
# apriori weights for class imbalance
class_1_count = len(target_traintest[target_traintest['TARGET'] == '1'])
class_0_count = len(target_traintest[target_traintest['TARGET'] == '0'])
class_0_count, class_1_count
class_0_weight = class_1_count / (class_0_count + class_1_count)
class_1_weight = class_0_count / (class_0_count + class_1_count)
class_0_weight, class_1_weight

(0.2741932952476615, 0.7258067047523385)

In [134]:
# get learned proba features
preds_a = model_a.predict_proba(Xgs_train)[:, 1]
preds_b = model_b.predict_proba(Xff_train)[:, 1]

X_preds_train = np.stack([preds_a, preds_b], axis=1)

clf = LogisticRegression(random_state=42, class_weight={0: class_0_weight, 1: class_1_weight}).fit(X_preds_train, y_train)

X_preds_val = np.stack([model_a.predict_proba(Xgs_val)[:, 1], model_b.predict_proba(Xff_val)[:, 1]], axis=1)
probs = clf.predict_proba(X_preds_val)
roc_auc_score(y_val, probs[:, 1])

0.6173079690492267

In [135]:
clf.coef_

array([[5.79025102, 5.8515054 ]])

# Предсказывание на дообученных катбустах по всей выборке

In [136]:
model_a_full = CatBoostClassifier().load_model('catboosts/ctb_graph_full.ctb')
model_b_full = CatBoostClassifier().load_model('catboosts/ctb_friend_features_full.ctb')

In [137]:
# do same, merge all features for submit_user_target_df
# get learned proba features
preds_a = model_a_full.predict_proba(Xgs_test)[:, 1]
preds_b = model_b_full.predict_proba(Xff_test)[:, 1]

preds = np.stack([preds_a, preds_b], axis=1)
X_submit = preds
submit_probas = clf.predict_proba(X_submit)[:, 1]

In [138]:
result = target_traintest[target_traintest["TARGET"] == 'test'].reset_index(drop=True)
result["TARGET"] = submit_probas

In [139]:
result.drop("RETRO_DT", inplace=True, axis=1)
result.to_csv("submit.csv", index=False)

In [104]:
submit_probas

array([0.38436508, 0.35052214, 0.37131258, ..., 0.32375402, 0.29108237,
       0.37434077])

In [73]:
subm_ind_to_client_id = target_traintest[target_traintest['TARGET'] == 'test']['CLIENT_ID'].reset_index(drop=True)

In [89]:
subm_ind_to_client_id.head().values

array([1025140, 1029732, 1079794, 1116331, 1136822])

In [90]:
submit_probas[0]

0.3843650825569296

In [105]:
submit_probas = pd.DataFrame(submit_probas)
submit_probas.rename(columns={0: 'TARGET'}, inplace=True)
#submit_probas.index = subm_ind_to_client_id.values
submit_probas

Unnamed: 0,TARGET
0,0.384365
1,0.350522
2,0.371313
3,0.332309
4,0.346852
...,...
31853,0.388096
31854,0.367897
31855,0.323754
31856,0.291082


In [106]:
submission_df = pd.read_csv('../sample_submission.csv')
submission_df[]

(31858, 2)

In [102]:
submission_df = submission_df.drop('TARGET', axis=1)
submission_df = submission_df.merge(submit_probas, how='left', left_on='CLIENT_ID', right_index=True)
submission_df.head()

Unnamed: 0,CLIENT_ID,TARGET
0,1504364,
1,1592092,
2,1582159,
3,1431753,
4,1571196,


In [51]:
submission_df.to_csv('../gb_submission.csv', index=False)

# Попробуем композицию трех моделей

In [154]:
target_traintest.head()

Unnamed: 0,CLIENT_ID,RETRO_DT,TARGET
0,1011725,20210501,0
1,1018784,20210501,1
2,1021812,20210501,0
3,1024003,20210501,0
4,1025140,20210501,test


In [170]:
model_a = CatBoostClassifier().load_model('catboosts/ctb_graph.ctb')
model_c = CatBoostClassifier().load_model('catboosts/ctb_friends_features.ctb')
model_d = CatBoostClassifier().load_model('catboosts/ctb_user_features.ctb')

In [171]:
# summed graph embaddings features
graph_sequences = np.load("../data/graph_sequences.npy")
Xgs_train = graph_sequences[target_traintest['TARGET'] != 'test']
Xgs_test = graph_sequences[target_traintest['TARGET'] == 'test']


# friends features
ff_target = pd.read_csv('../data/features_friends_target.csv')
#ff_target = pd.merge(target_traintest, friends_features, on='CLIENT_ID', how='left')
ff_test = ff_target[ff_target['TARGET'] == 'test']
ff_train = ff_target[ff_target['TARGET'] != 'test']
Xff_train, Xff_test = ff_train.drop(["TARGET"], axis=1), ff_test.drop(["TARGET"], axis=1)
y_train = ff_train["TARGET"].apply(int)


# user features
uf_target = pd.read_csv('../data/features_user_target.csv')
#uf_target = pd.merge(target_traintest, user_features, on='CLIENT_ID', how='left')
uf_train = uf_target[uf_target['TARGET'] != 'test']
uf_test = uf_target[uf_target['TARGET'] == 'test']
Xuf_train, Xuf_test = uf_train.drop(["TARGET"], axis=1), uf_test.drop(["TARGET"], axis=1)

Xff_train.fillna(0, inplace=True)
Xff_test.fillna(0, inplace=True)
Xuf_train.fillna(0, inplace=True)
Xuf_test.fillna(0, inplace=True)

mask = np.random.choice(Xff_train.shape[0], size=Xff_train.shape[0], replace=False)
ratio = int(0.8 * len(mask))

Xff_train, Xff_val = Xff_train.iloc[mask[:ratio]], Xff_train.iloc[mask[ratio:]]
Xuf_train, Xuf_val = Xuf_train.iloc[mask[:ratio]], Xuf_train.iloc[mask[ratio:]]
y_train, y_val = y_train.iloc[mask[:ratio]], y_train.iloc[mask[ratio:]]

Xgs_train, Xgs_val = Xgs_train[mask[:ratio]], Xgs_train[mask[ratio:]]

In [172]:
preds_a = model_a.predict_proba(Xgs_train)[:, 1]
preds_c = model_c.predict_proba(Xff_train)[:, 1]
preds_d = model_d.predict_proba(Xuf_train)[:, 1]

X_preds_train = np.stack([preds_a, preds_d, preds_c], axis=1)

clf = LogisticRegression(random_state=42, class_weight={0: class_0_weight, 1: class_1_weight}).fit(X_preds_train, y_train)

X_preds_val = np.stack([
    model_a.predict_proba(Xgs_val)[:, 1],
    model_c.predict_proba(Xff_val)[:, 1],
    model_d.predict_proba(Xuf_val)[:, 1]
], axis=1)

probs = clf.predict_proba(X_preds_val)
roc_auc_score(y_val, probs[:, 1])

0.6951701596804765

In [173]:
clf.coef_

array([[ 4.36128685, 24.0825212 , 11.38729949]])

In [176]:
# lets try submit
preds_a = model_a.predict_proba(Xgs_test)[:, 1]
preds_c = model_c.predict_proba(Xff_test)[:, 1]
preds_d = model_d.predict_proba(Xuf_test)[:, 1]

preds = np.stack([preds_a, preds_c, preds_d], axis=1)
X_submit = preds
submit_probas = clf.predict_proba(X_submit)[:, 1]

In [177]:
result = target_traintest[target_traintest["TARGET"] == 'test'].reset_index(drop=True)
result["TARGET"] = submit_probas
result.drop("RETRO_DT", inplace=True, axis=1)
result.to_csv("gb3_submit.csv", index=False)