# Baseline

In [70]:
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import numpy as np

## Загрузка данных

In [2]:
train_df = pd.read_parquet('train.parquet')
test_df = pd.read_parquet('test.parquet')

In [3]:
train_df.head()

Unnamed: 0,id,rko_start_months,max_end_fact_fin_deals,max_end_plan_non_fin_deals,max_start_fin_deals,max_start_non_fin_deals,min_end_fact_fin_deals,min_end_plan_non_fin_deals,min_start_fin_deals,min_start_non_fin_deals,...,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,target_1,target_2,total_target
0,0,-1.279132,,,,,,,,,...,0.352516,-0.08131,0.210189,1.140808,0.647477,0.810887,1.909568,0,0,0
1,1,-1.142591,,,,,,,,,...,-0.188287,-0.650616,-0.316563,-1.136536,-0.127326,-0.147319,-0.517805,0,0,0
2,2,1.81227,,,,,,,,,...,-0.323487,0.632731,2.541558,3.017482,0.033394,0.019972,0.578428,0,0,0
3,3,-0.479407,,0.356677,,-0.332867,,-0.704164,,0.905748,...,-0.323487,-0.003549,-0.219016,0.228624,-0.428185,-0.220932,-0.596108,0,0,0
4,4,-1.50369,,,,,,,,,...,-0.323487,0.182726,-0.287299,-0.528921,-0.563335,-0.021506,-0.361201,0,0,0


In [4]:
test_df.head()

Unnamed: 0,id,rko_start_months,max_end_fact_fin_deals,max_end_plan_non_fin_deals,max_start_fin_deals,max_start_non_fin_deals,min_end_fact_fin_deals,min_end_plan_non_fin_deals,min_start_fin_deals,min_start_non_fin_deals,...,cnt_days_deb_g_oper_3m,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m
0,300000,-1.076009,,,,,,,,,...,-0.51461,-0.045603,-0.131653,-0.323487,-0.16994,-0.316563,-0.639934,-0.1716,-0.237763,-0.596108
1,300001,-0.33507,,,,,,,,,...,-0.51461,-0.045603,-0.131653,-0.323487,-0.16994,-0.316563,-0.639934,-0.17158,-0.222361,-0.517805
2,300002,0.920834,,,,,,,,,...,-0.463706,-0.045603,-0.131653,-0.323487,-0.16994,-0.316563,-0.639934,0.020943,-0.160754,-0.204596
3,300003,-0.772897,-0.740714,,-0.635375,,-0.595302,,-0.738424,,...,-0.51461,-0.045603,-0.131653,-0.323487,-0.137993,-0.277545,-0.381706,-0.1716,-0.237763,-0.596108
4,300004,-0.665342,,,,,,,,,...,-0.209184,-0.045603,-0.131653,-0.323487,-0.139926,-0.277545,-0.381706,-0.158918,-0.222361,-0.517805


## Обработка данных

Делаем 2 датасета: 1-ый с таргетом "target_1", 2-ой с таргетом "target_2". Финальный таргет будет считаться формулой max(target_1, target_2)

In [5]:
train_df_1 = train_df.drop(["id", "total_target", "target_2"], axis=1)
train_df_2 = train_df.drop(["id", "total_target", "target_1"], axis=1)

Преобразуем тип категориальных признаков

In [6]:
cat_cols = [
    'channel_code', 'city', 'city_type',
    'index_city_code', 'ogrn_month', 'ogrn_year',
    'branch_code', 'okved', 'segment'
]

In [59]:
# Label encoder
label_encoder = LabelEncoder()

for column in cat_cols:
    train_df_1[column] = label_encoder.fit_transform(train_df_1[column])
    train_df_2[column] = label_encoder.fit_transform(train_df_2[column])
    test_df[column] = label_encoder.fit_transform(test_df[column])

In [8]:
# Other cleaning
def clean_dataset(df, cat_cols, fillna_percentage):
    df_copy = df.copy()
    df_copy.drop_duplicates(inplace=True)
    for column in tqdm(df_copy.columns):
        if column not in cat_cols:
            df_copy[column] = df_copy[column].astype(float)
            
            if df_copy[column].nunique() == 1:
                df_copy.drop(column, axis=1, inplace=True)
                
            nan_percentage = df_copy[column].isnull().mean() * 100
            if nan_percentage > fillna_percentage:
                df_copy.drop(column, axis=1, inplace=True)
            else:
                df_copy[column].fillna(df_copy[column].mean(), inplace=True)

    return df_copy

In [9]:
clean_df_1 = clean_dataset(train_df_1, cat_cols, 60)
clean_df_2 = clean_dataset(train_df_2, cat_cols, 60)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 38.05it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 38.71it/s]


In [60]:
clean_test = clean_dataset(test_df, cat_cols, 60)

100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 135.52it/s]


In [177]:
clean_test

Unnamed: 0,id,rko_start_months,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,cnt_days_deb_g_oper_3m,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m
0,300000.0,-1.076009,-0.148918,-0.179080,-0.106596,-0.148477,41,3942,2753,222,...,-0.514610,-0.045603,-0.131653,-0.323487,-0.169940,-0.316563,-0.639934,-0.171600,-0.237763,-0.596108
1,300001.0,-0.335070,-0.148947,-0.179163,-0.106596,-0.148468,20,1631,585,156,...,-0.514610,-0.045603,-0.131653,-0.323487,-0.169940,-0.316563,-0.639934,-0.171580,-0.222361,-0.517805
2,300002.0,0.920834,0.168571,0.157359,0.300480,0.169247,28,3942,2753,222,...,-0.463706,-0.045603,-0.131653,-0.323487,-0.169940,-0.316563,-0.639934,0.020943,-0.160754,-0.204596
3,300003.0,-0.772897,15.663146,10.264577,31.918232,15.673477,21,3942,2753,158,...,-0.514610,-0.045603,-0.131653,-0.323487,-0.137993,-0.277545,-0.381706,-0.171600,-0.237763,-0.596108
4,300004.0,-0.665342,0.311747,0.125395,0.825424,0.312513,21,1328,585,185,...,-0.209184,-0.045603,-0.131653,-0.323487,-0.139926,-0.277545,-0.381706,-0.158918,-0.222361,-0.517805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,399995.0,-0.462181,-0.136041,-0.125734,-0.106464,-0.135554,25,2556,303,177,...,-0.463706,-0.044762,-0.101828,-0.188287,0.035977,-0.258035,-0.317149,-0.060968,-0.068343,0.108614
99996,399996.0,1.162020,0.016445,-0.005244,0.118241,0.017027,23,1450,585,222,...,-0.514610,-0.045250,-0.101828,-0.188287,-0.169707,-0.258035,-0.446263,-0.077180,0.024067,0.265219
99997,399997.0,-0.277489,-0.147806,-0.160523,-0.106596,-0.147327,25,3942,2753,222,...,-0.514610,-0.045603,-0.131653,-0.323487,-0.169940,-0.316563,-0.639934,-0.171600,-0.237763,-0.596108
99998,399998.0,-0.743564,-0.146039,-0.171194,-0.106595,-0.145559,23,3942,2753,222,...,0.147146,-0.045603,-0.131653,-0.323487,-0.169940,-0.316563,-0.639934,-0.142708,-0.022138,0.421823


## Обучение 1-ой модели на target_1

In [22]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

In [171]:
X = clean_df_1.drop("target_1", axis=1)
y = clean_df_1['target_1']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                  test_size=0.2,
                                                  random_state=42)

In [41]:
best_value = float('-inf')

In [45]:
def objective(trial, study):
    # optuna logic
    global best_value
    
    model_name = trial.suggest_categorical('model_name', ['KNN', 'LGBM', 'RF', 'CatBoost', 'XGBoost', 'LogReg'])
    
    if model_name == 'KNN':
        n_neighbors = trial.suggest_int('n_neighbors', 1, 10)
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
    elif model_name == 'LGBM':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        max_depth = trial.suggest_int('max_depth', 2, 32)
        learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 0.1)
        model = LGBMClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate)
    elif model_name == 'RF':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        max_depth = trial.suggest_int('max_depth', 2, 32)
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
    elif model_name == 'CatBoost':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        depth = trial.suggest_int('depth', 2, 10)
        learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 0.1)
        model = CatBoostClassifier(n_estimators=n_estimators, depth=depth, learning_rate=learning_rate, verbose=0)
    elif model_name == 'XGBoost':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        max_depth = trial.suggest_int('max_depth', 2, 32)
        learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 0.1)
        model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, verbosity=0)
    elif model_name == 'LogReg':
        C = trial.suggest_loguniform('C', 0.001, 10)
        model = LogisticRegression(C=C)
    
    model.fit(X_train, y_train)
    
    y_proba = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_proba)
    
    if roc_auc > best_value:
        study.set_user_attr('best_model', model)
        study.set_user_attr('best_model_params', trial.params)
        best_value = roc_auc

    print(f'{model_name} learned; roc-auc: {roc_auc}')
    
    return roc_auc

In [46]:
study = optuna.create_study(direction='maximize')

[I 2023-10-07 00:22:07,769] A new study created in memory with name: no-name-d7716410-a55b-4353-8d50-16ec1e901aea


In [47]:
study.optimize(lambda trial: objective(trial, study), n_trials=10)

best_model = study.user_attrs['best_model']
best_params = study.user_attrs['best_model_params']
best_roc_auc = study.best_value

print(f"Лучшая модель: {best_model}")
print(f"Лучшие параметры для лучшей модели: {best_params}")
print(f"Лучший ROC-AUC для лучшей модели: {best_roc_auc}")

  learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 0.1)
[I 2023-10-07 00:22:51,632] Trial 0 finished with value: 0.9130909359642315 and parameters: {'model_name': 'XGBoost', 'n_estimators': 73, 'max_depth': 14, 'learning_rate': 0.012519067706074711}. Best is trial 0 with value: 0.9130909359642315.


XGBoost learned; roc-auc: 0.9130909359642315


  learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 0.1)
[I 2023-10-07 00:23:16,332] Trial 1 finished with value: 0.9185776344140161 and parameters: {'model_name': 'CatBoost', 'n_estimators': 145, 'depth': 8, 'learning_rate': 0.05845209506840664}. Best is trial 1 with value: 0.9185776344140161.


CatBoost learned; roc-auc: 0.9185776344140161


[I 2023-10-07 00:30:16,583] Trial 2 finished with value: 0.912728723197629 and parameters: {'model_name': 'RF', 'n_estimators': 72, 'max_depth': 18}. Best is trial 1 with value: 0.9185776344140161.


RF learned; roc-auc: 0.912728723197629


[I 2023-10-07 00:43:15,857] Trial 3 finished with value: 0.9133428918197791 and parameters: {'model_name': 'RF', 'n_estimators': 121, 'max_depth': 22}. Best is trial 1 with value: 0.9185776344140161.


RF learned; roc-auc: 0.9133428918197791


  learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 0.1)


[LightGBM] [Info] Number of positive: 13264, number of negative: 226736
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.176477 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19106
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.055267 -> initscore=-2.838733
[LightGBM] [Info] Start training from score -2.838733


[I 2023-10-07 00:43:24,533] Trial 4 finished with value: 0.9005915216956122 and parameters: {'model_name': 'LGBM', 'n_estimators': 95, 'max_depth': 16, 'learning_rate': 0.0031964567435740173}. Best is trial 1 with value: 0.9185776344140161.


LGBM learned; roc-auc: 0.9005915216956122


[I 2023-10-07 00:52:47,394] Trial 5 finished with value: 0.9084692142158056 and parameters: {'model_name': 'RF', 'n_estimators': 81, 'max_depth': 30}. Best is trial 1 with value: 0.9185776344140161.


RF learned; roc-auc: 0.9084692142158056


  C = trial.suggest_loguniform('C', 0.001, 10)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[I 2023-10-07 00:52:53,959] Trial 6 finished with value: 0.643184186024226 and parameters: {'model_name': 'LogReg', 'C': 5.58384774633466}. Best is trial 1 with value: 0.9185776344140161.


LogReg learned; roc-auc: 0.643184186024226


[I 2023-10-07 00:54:20,189] Trial 7 finished with value: 0.8783311440311774 and parameters: {'model_name': 'RF', 'n_estimators': 117, 'max_depth': 2}. Best is trial 1 with value: 0.9185776344140161.


RF learned; roc-auc: 0.8783311440311774


  C = trial.suggest_loguniform('C', 0.001, 10)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[I 2023-10-07 00:54:26,542] Trial 8 finished with value: 0.7158054730252934 and parameters: {'model_name': 'LogReg', 'C': 0.1853617630465456}. Best is trial 1 with value: 0.9185776344140161.
  C = trial.suggest_loguniform('C', 0.001, 10)


LogReg learned; roc-auc: 0.7158054730252934


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[I 2023-10-07 00:54:33,055] Trial 9 finished with value: 0.7251268786885681 and parameters: {'model_name': 'LogReg', 'C': 0.7811958508752938}. Best is trial 1 with value: 0.9185776344140161.


LogReg learned; roc-auc: 0.7251268786885681
Лучшая модель: <catboost.core.CatBoostClassifier object at 0x000001B738EE38E0>
Лучшие параметры для лучшей модели: {'model_name': 'CatBoost', 'n_estimators': 145, 'depth': 8, 'learning_rate': 0.05845209506840664}
Лучший ROC-AUC для лучшей модели: 0.9185776344140161


# Обучение 2-ой модели на target_2

In [166]:
X = clean_df_2.drop("target_2", axis=1)
y = clean_df_2['target_2']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                  test_size=0.2,
                                                  random_state=42)

In [167]:
best_value = float('-inf')

In [168]:
study2 = optuna.create_study(direction='maximize')
study2.optimize(lambda trial: objective(trial, study2), n_trials=10)

best_model2 = study2.user_attrs['best_model']
best_params2 = study2.user_attrs['best_model_params']
best_roc_auc2 = study2.best_value

print(f"Лучшая модель: {best_model2}")
print(f"Лучшие параметры для лучшей модели: {best_params2}")
print(f"Лучший ROC-AUC для лучшей модели: {best_roc_auc2}")

[I 2023-10-07 13:15:26,734] A new study created in memory with name: no-name-db026f69-8504-4b12-8792-6f614ac1f47c
  learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 0.1)
[I 2023-10-07 13:15:38,225] Trial 0 finished with value: 0.7478387906800186 and parameters: {'model_name': 'CatBoost', 'n_estimators': 118, 'depth': 5, 'learning_rate': 0.021321570230776788}. Best is trial 0 with value: 0.7478387906800186.


CatBoost learned; roc-auc: 0.7478387906800186


[I 2023-10-07 13:16:55,234] Trial 1 finished with value: 0.5401268927254532 and parameters: {'model_name': 'KNN', 'n_neighbors': 6}. Best is trial 0 with value: 0.7478387906800186.


KNN learned; roc-auc: 0.5401268927254532


  learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 0.1)


[LightGBM] [Info] Number of positive: 6364, number of negative: 233636
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.182836 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19106
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.026517 -> initscore=-3.603107
[LightGBM] [Info] Start training from score -3.603107


[I 2023-10-07 13:17:05,852] Trial 2 finished with value: 0.7648114013441629 and parameters: {'model_name': 'LGBM', 'n_estimators': 107, 'max_depth': 18, 'learning_rate': 0.01596084293885124}. Best is trial 2 with value: 0.7648114013441629.


LGBM learned; roc-auc: 0.7648114013441629


[I 2023-10-07 13:28:48,308] Trial 3 finished with value: 0.7275529745150435 and parameters: {'model_name': 'RF', 'n_estimators': 96, 'max_depth': 24}. Best is trial 2 with value: 0.7648114013441629.


RF learned; roc-auc: 0.7275529745150435


[I 2023-10-07 13:30:21,430] Trial 4 finished with value: 0.5428799757725059 and parameters: {'model_name': 'KNN', 'n_neighbors': 7}. Best is trial 2 with value: 0.7648114013441629.


KNN learned; roc-auc: 0.5428799757725059


[I 2023-10-07 13:40:30,729] Trial 5 finished with value: 0.7108684487032279 and parameters: {'model_name': 'RF', 'n_estimators': 74, 'max_depth': 31}. Best is trial 2 with value: 0.7648114013441629.


RF learned; roc-auc: 0.7108684487032279


[I 2023-10-07 13:41:58,195] Trial 6 finished with value: 0.5475396031788913 and parameters: {'model_name': 'KNN', 'n_neighbors': 8}. Best is trial 2 with value: 0.7648114013441629.


KNN learned; roc-auc: 0.5475396031788913


  learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 0.1)
[I 2023-10-07 13:42:11,546] Trial 7 finished with value: 0.753334258606871 and parameters: {'model_name': 'CatBoost', 'n_estimators': 107, 'depth': 6, 'learning_rate': 0.028690151087439053}. Best is trial 2 with value: 0.7648114013441629.


CatBoost learned; roc-auc: 0.753334258606871


[I 2023-10-07 13:57:59,637] Trial 8 finished with value: 0.7522532262314077 and parameters: {'model_name': 'RF', 'n_estimators': 178, 'max_depth': 15}. Best is trial 2 with value: 0.7648114013441629.


RF learned; roc-auc: 0.7522532262314077


[I 2023-10-07 14:07:24,328] Trial 9 finished with value: 0.7146157431652441 and parameters: {'model_name': 'RF', 'n_estimators': 73, 'max_depth': 27}. Best is trial 2 with value: 0.7648114013441629.


RF learned; roc-auc: 0.7146157431652441
Лучшая модель: LGBMClassifier(learning_rate=0.01596084293885124, max_depth=18,
               n_estimators=107)
Лучшие параметры для лучшей модели: {'model_name': 'LGBM', 'n_estimators': 107, 'max_depth': 18, 'learning_rate': 0.01596084293885124}
Лучший ROC-AUC для лучшей модели: 0.7648114013441629


## Выгрузка результатов

In [150]:
y_pred = np.where((best_model.predict_proba(X_train)[:, 1] > 0.5), 1, 0)

In [172]:
y_pred_1 = best_model.predict_proba(X_test)[:, 1]

In [173]:
y_pred_2 = best_model2.predict_proba(X_test)[:, 1]



In [203]:
y_pred_test1 = best_model.predict_proba(clean_test.drop("id", axis=1))[:, 1]
y_pred_test2 = best_model2.predict_proba(clean_test.drop("id", axis=1))[:, 1]



In [204]:
test_score = [max(a, b) for a, b in zip(y_pred_test1, y_pred_test2)]

In [205]:
sample_submission_df = pd.read_csv("sample_submission.csv")

In [206]:
sample_submission_df.head()

Unnamed: 0,id,score
0,300000,0.5
1,300001,0.5
2,300002,0.5
3,300003,0.5
4,300004,0.5


In [207]:
sample_submission_df["score"] = test_score

In [208]:
sample_submission_df.head()

Unnamed: 0,id,score
0,300000,True
1,300001,True
2,300002,False
3,300003,False
4,300004,False


In [202]:
sample_submission_df.to_csv("2modelmax.csv", index=False)

# Voting Classifier

In [209]:
from sklearn.ensemble import VotingClassifier

In [214]:
catboost_model = CatBoostClassifier(
    n_estimators=145,  # Гиперпараметры CatBoost
    depth=8,
    learning_rate=0.05845209506840664,
    verbose = 0
)

rf_model = RandomForestClassifier(
    n_estimators=121,  # Гиперпараметры RandomForest
    max_depth=22
)

xgb_model = XGBClassifier(
    n_estimators=73,  # Гиперпараметры XGBoost
    max_depth=14,
    learning_rate=0.012519067706074711
)

# Создайте VotingClassifier
voting_classifier1 = VotingClassifier(
    estimators=[
        ('catboost', catboost_model),
        ('random_forest', rf_model),
        ('xgboost', xgb_model)
    ],
    voting='soft'  # 'soft' или 'hard', в зависимости от вашей стратегии голосования
)

voting_classifier1.fit(X_train, y_train)

In [215]:
y_pred_vote1 = voting_classifier1.predict_proba(clean_test.drop("id", axis=1))[:, 1]

In [217]:
catboost_model_2 = CatBoostClassifier(
    n_estimators=118,  # Гиперпараметры CatBoost
    depth=5,
    learning_rate=0.021321570230776788,
    verbose = 0
)

rf_model_2 = RandomForestClassifier(
    n_estimators=178,  # Гиперпараметры RandomForest
    max_depth=15
)

lgbm_model = LGBMClassifier(
    n_estimators=107,  # Гиперпараметры XGBoost
    max_depth=18,
    learning_rate=0.01596084293885124
)

# Создайте VotingClassifier
voting_classifier2 = VotingClassifier(
    estimators=[
        ('catboost', catboost_model_2),
        ('random_forest', rf_model_2),
        ('xgboost', lgbm_model)
    ],
    voting='soft'  # 'soft' или 'hard', в зависимости от вашей стратегии голосования
)

# Обучите VotingClassifier на обучающих данных
voting_classifier2.fit(X_train, y_train)

# Получите предсказания от VotingClassifier
y_pred_vote2 = voting_classifier2.predict_proba(clean_test.drop("id", axis=1))[:, 1]

[LightGBM] [Info] Number of positive: 13264, number of negative: 226736
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.189597 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19106
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.055267 -> initscore=-2.838733
[LightGBM] [Info] Start training from score -2.838733


In [218]:
test_score2 = [max(a, b) for a, b in zip(y_pred_vote1, y_pred_vote2)]

In [219]:
sample_submission_df["score"]

0         True
1         True
2        False
3        False
4        False
         ...  
99995    False
99996    False
99997    False
99998    False
99999    False
Name: score, Length: 100000, dtype: bool

In [221]:
sample_submission_df["score"] = test_score2

In [223]:
sample_submission_df.to_csv("2modelvotemax.csv", index=False)