In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import missingno as msno
import seaborn as sns
import scipy.stats as st
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import xgboost as xg
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from eli5.sklearn import PermutationImportance
from eli5 import show_weights
from sklearn.tree import DecisionTreeRegressor
from bayes_opt import BayesianOptimization

In [124]:
#Загрузим данные из файла.
df = pd.read_excel('/home/rik/Рабочий стол/МИФИ/учеба/мо/curse/data.xlsx', index_col=0)
df.head()

Unnamed: 0,"IC50, mM","CC50, mM",SI,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,6.239374,175.482382,28.125,5.094096,5.094096,0.387225,0.387225,0.417362,42.928571,384.652,...,0,0,0,0,0,0,0,0,3,0
1,0.771831,5.402819,7.0,3.961417,3.961417,0.533868,0.533868,0.462473,45.214286,388.684,...,0,0,0,0,0,0,0,0,3,0
2,223.808778,161.14232,0.72,2.627117,2.627117,0.543231,0.543231,0.260923,42.1875,446.808,...,0,0,0,0,0,0,0,0,3,0
3,1.705624,107.855654,63.235294,5.09736,5.09736,0.390603,0.390603,0.377846,41.862069,398.679,...,0,0,0,0,0,0,0,0,4,0
4,107.131532,139.270991,1.3,5.15051,5.15051,0.270476,0.270476,0.429038,36.514286,466.713,...,0,0,0,0,0,0,0,0,0,0


In [125]:
print("train.shape = {} rows, {} cols".format(*df.shape))

train.shape = 1001 rows, 213 cols


# Предобработка данных.

In [126]:
def transform(df): 
    discrete_feature = [feature for feature in df.columns if len(df[feature].unique())<25]
    count_discrete_feature = {}
    for i in tqdm(discrete_feature):
        cnt = 0
        for j in df[i]:
            if j != 0:
                cnt += 1
        count_discrete_feature[f'{i}'] = cnt
    count_discrete_feature = pd.Series(count_discrete_feature)
    df = df.drop(columns=count_discrete_feature[count_discrete_feature == 0].index.to_list())
    df = df.dropna()
    feature_del = ['BertzCT', 'Chi0', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v',
     'ExactMolWt', 'FpDensityMorgan1', 'FpDensityMorgan3', 'HeavyAtomCount', 'HeavyAtomMolWt', 'Kappa1', 'Kappa2',
     'Kappa3', 'LabuteASA', 'MaxAbsEStateIndex', 'MaxAbsPartialCharge', 'MolMR', 'MolWt', 'NHOHCount', 'NumAromaticCarbocycles',
     'NumHAcceptors', 'NumHeteroatoms', 'NumSaturatedCarbocycles', 'NumValenceElectrons', 'SMR_VSA4', 'SMR_VSA9',
     'SlogP_VSA6', 'TPSA', 'VSA_EState2', 'VSA_EState3', 'VSA_EState6', 'fr_Al_OH_noTert', 'fr_COO', 'fr_COO2',
     'fr_C_O_noCOO', 'fr_Nhpyrrole', 'fr_benzene', 'fr_nitro_arom_nonortho', 'fr_phenol', 'fr_phenol_noOrthoHbond',
    'MinAbsPartialCharge', 'Chi2n']
    df = df.drop(columns=feature_del)
    return df

In [127]:
df = transform(df)

100%|███████████████████████████████████████| 110/110 [00:00<00:00, 4135.88it/s]


## Обучение нескольких моделей, их сравнение. 

In [129]:
target = df['IC50, mM']
data = df.drop(['CC50, mM','IC50, mM','SI'], axis=1)

In [130]:
x_train, x_valid = train_test_split(
    data, train_size=0.8, random_state=1
)
y_train, y_valid = train_test_split(
    target, train_size=0.8, random_state=1
)
print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))

x_train.shape = 798 rows, 144 cols
x_valid.shape = 200 rows, 144 cols


In [131]:
# Создадим модель дерева решений
tree = DecisionTreeRegressor(random_state=27)
# Создадим модель CatBoost
model = CatBoostRegressor(random_state=27)
# Создадим модель lightgbm
model_LGBM = LGBMRegressor(random_state=27)
# Создадим модель xgboost
xgb_r = xg.XGBRegressor() 
# обучение моделей
tree.fit(x_train, y_train)
model.fit(x_train, y_train, verbose=0)
model_LGBM.fit(x_train, y_train, verbose=0)
xgb_r.fit(x_train, y_train, verbose=0)
# предсказание ответов для тренеровочной выборки
y_pred_tree_train = tree.predict(x_train)
y_pred_сat_Boost_train = model.predict(x_train)
y_pred_LGBM_train = model_LGBM.predict(x_train)
y_pred_xgb_train = xgb_r.predict(x_train)
# предсказание ответов для тестовой выборки
y_pred_tree = tree.predict(x_valid)
y_pred_сat_Boost = model.predict(x_valid)
y_pred_LGBM = model_LGBM.predict(x_valid)
y_pred_xgb = xgb_r.predict(x_valid)

print("Train tree R2: ", r2_score(y_train, y_pred_tree_train))
print("Train tree RMSE: ", np.sqrt(mse(y_train, y_pred_tree_train)))
print("Validation tree R2: ", r2_score(y_valid, y_pred_tree))
print("Validation tree RMSE: ", np.sqrt(mse(y_valid, y_pred_tree)))
print('*' * 50)
print("Train сat_Boost R2: ", r2_score(y_train, y_pred_сat_Boost_train))
print("Train сat_Boost RMSE: ", np.sqrt(mse(y_train, y_pred_сat_Boost_train)))
print("Validation сat_Boost R2: ", r2_score(y_valid, y_pred_сat_Boost))
print("Validation сat_Boost RMSE: ", np.sqrt(mse(y_valid, y_pred_сat_Boost)))
print('*' * 50)
print("Train LGBM R2: ", r2_score(y_train, y_pred_LGBM_train))
print("Train LGBM RMSE: ", np.sqrt(mse(y_train, y_pred_LGBM_train)))
print("Validation LGBM R2: ", r2_score(y_valid, y_pred_LGBM))
print("Validation LGBM RMSE: ", np.sqrt(mse(y_valid, y_pred_LGBM)))
print('*' * 50)
print("Train xgb R2: ", r2_score(y_train, y_pred_xgb_train))
print("Train xgb RMSE: ", np.sqrt(mse(y_train, y_pred_xgb_train)))
print("Validation xgb R2: ", r2_score(y_valid, y_pred_xgb))
print("Validation xgb RMSE: ", np.sqrt(mse(y_valid, y_pred_xgb)))



Train tree R2:  0.889131715332103
Train tree RMSE:  136.88914000017067
Validation tree R2:  -0.01042255300655337
Validation tree RMSE:  355.49894124921184
**************************************************
Train сat_Boost R2:  0.8644102035204829
Train сat_Boost RMSE:  151.38360227787837
Validation сat_Boost R2:  0.5112163085662592
Validation сat_Boost RMSE:  247.25503804391244
**************************************************
Train LGBM R2:  0.8256318508892195
Train LGBM RMSE:  171.67177981743524
Validation LGBM R2:  0.5163119338288629
Validation LGBM RMSE:  245.96283050028833
**************************************************
Train xgb R2:  0.8889492425616939
Train xgb RMSE:  137.001743308657
Validation xgb R2:  0.46019583877053827
Validation xgb RMSE:  259.8393433670049


Из приведенных данных выше мы видим переобучение всех моделей. Наилучшие результаты показали модели СatBoost и LGBM. С ними и продолжим работать.

# Подбор признаков, их анализ и оценка важности.

**CatBoost**

In [107]:
col = x_train.columns.tolist()
estimator_cat_boost = PermutationImportance(
    model, scoring= "neg_mean_squared_error", random_state=27
)
importance_cat_boost = estimator_cat_boost.fit(x_train, y_train)



In [108]:
show_weights(importance_cat_boost, feature_names=col, top=x_train.shape[1])

Weight,Feature
11631.9793  ± 2136.7744,BCUT2D_MWLOW
6659.8472  ± 1692.5280,FpDensityMorgan2
4352.0578  ± 2521.6342,VSA_EState4
3805.7491  ± 614.9132,MinAbsEStateIndex
3626.3694  ± 895.0285,EState_VSA3
3338.5074  ± 533.1589,Ipc
3268.4161  ± 1211.9014,MolLogP
3019.8144  ± 627.2794,EState_VSA5
2892.0419  ± 1664.9561,Chi0n
2597.5190  ± 576.1394,EState_VSA7


Оставим только те признаки которые которые улучшают работу модели. И посмотрим как изменится метрика

In [109]:
feature_cat_boost = ['BCUT2D_MWLOW','FpDensityMorgan2','MinAbsEStateIndex','VSA_EState8','VSA_EState4','EState_VSA5',
'Ipc','PEOE_VSA7','MolLogP','SlogP_VSA2','EState_VSA3','Chi0n','PEOE_VSA9','EState_VSA6','PEOE_VSA6','VSA_EState7',
'BCUT2D_MRLOW','EState_VSA7','qed','BalabanJ','MaxPartialCharge','EState_VSA4','EState_VSA9','MinPartialCharge',
'MaxEStateIndex','SMR_VSA10','BCUT2D_MRHI','VSA_EState5','BCUT2D_MWHI','SlogP_VSA5','EState_VSA1','BCUT2D_LOGPHI',
'BCUT2D_LOGPLOW','BCUT2D_CHGHI','PEOE_VSA10','SPS','fr_C_S','EState_VSA2','fr_Ar_N','PEOE_VSA1','VSA_EState10',
'SMR_VSA6','PEOE_VSA3','AvgIpc','SlogP_VSA4','PEOE_VSA8','EState_VSA8','HallKierAlpha','FractionCSP3','VSA_EState1',
'MinEStateIndex','NumSaturatedHeterocycles','NumHDonors','VSA_EState9','EState_VSA10']

In [110]:
target_cb = df['IC50, mM']
data_cb = df[feature_cat_boost]

In [111]:
x_train_cb, x_valid_cb = train_test_split(
    data_cb, train_size=0.8, random_state=1
)
y_train_cb, y_valid_cb = train_test_split(
    target_cb, train_size=0.8, random_state=1
)
print("x_train.shape = {} rows, {} cols".format(*x_train_cb.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid_cb.shape))

x_train.shape = 798 rows, 55 cols
x_valid.shape = 200 rows, 55 cols


In [133]:
model = CatBoostRegressor(random_state=27)
model.fit(x_train_cb, y_train_cb, verbose=100)
y_pred_cb = model.predict(x_train_cb)
y_val_cb = model.predict(x_valid_cb)
print("Train R2: ", r2_score(y_train_cb, y_pred_cb))
print("Train RMSE: ", np.sqrt(mse(y_train_cb, y_pred_cb)))
print("Validation R2: ", r2_score(y_valid_cb, y_val_cb))
print("Validation RMSE: ", np.sqrt(mse(y_valid_cb, y_val_cb)))

Learning rate set to 0.039509
0:	learn: 406.8948349	total: 6.63ms	remaining: 6.62s
100:	learn: 259.8173803	total: 619ms	remaining: 5.51s
200:	learn: 225.1957812	total: 1.31s	remaining: 5.23s
300:	learn: 206.2182270	total: 2.13s	remaining: 4.94s
400:	learn: 187.1896966	total: 2.87s	remaining: 4.28s
500:	learn: 174.4352669	total: 3.64s	remaining: 3.63s
600:	learn: 166.0291147	total: 4.38s	remaining: 2.91s
700:	learn: 159.5545032	total: 5s	remaining: 2.13s
800:	learn: 154.7058831	total: 5.71s	remaining: 1.42s
900:	learn: 151.2224838	total: 6.33s	remaining: 696ms
999:	learn: 148.5116001	total: 7.12s	remaining: 0us
Train R2:  0.869506133837601
Train RMSE:  148.51160103760293
Validation R2:  0.5355583338266647
Validation RMSE:  241.01961213404743


Отобрав признаки мы улучшили метрики и немного снизили переобучение для модели CatBoost на вадидации R2: 0.54, RMSE: 241.02.

**lightgbm**

In [113]:
col = x_train.columns.tolist()
estimator_LGBM = PermutationImportance(
    model_LGBM, scoring= "neg_mean_squared_error", random_state=27
)
importance_LGBM = estimator_LGBM.fit(x_train, y_train)


In [114]:
show_weights(importance_LGBM, feature_names=col, top=x_train.shape[1])

Weight,Feature
13640.4373  ± 1256.4587,VSA_EState4
10639.5615  ± 3117.6103,BCUT2D_MWLOW
8379.7345  ± 1794.8888,EState_VSA5
8376.0174  ± 1714.5843,MolLogP
8011.1899  ± 3524.6957,Chi0n
7341.4316  ± 2595.3453,SlogP_VSA2
6551.9233  ± 621.9774,MaxPartialCharge
5080.5194  ± 1013.2736,SMR_VSA3
4943.2708  ± 1923.9932,MinAbsEStateIndex
4619.2539  ± 383.3610,FpDensityMorgan2


Оставим только те признаки которые которые улучшают работу модели. И посмотрим как изменится метрика

In [115]:
feature_LGBM = ['VSA_EState4','BCUT2D_MWLOW','EState_VSA5','MolLogP','Chi0n','SlogP_VSA2','MaxPartialCharge','SMR_VSA3',
'MinAbsEStateIndex','FpDensityMorgan2','VSA_EState8','qed','PEOE_VSA7','Ipc','BCUT2D_MRLOW','EState_VSA2','BCUT2D_CHGLO',
'MinPartialCharge','fr_Ar_NH','SPS','SlogP_VSA5','EState_VSA4','PEOE_VSA6','EState_VSA3','BCUT2D_LOGPHI',
'EState_VSA9','BCUT2D_MRHI','BalabanJ','EState_VSA7','fr_nitro','BCUT2D_MWHI','PEOE_VSA1','VSA_EState5',
'BCUT2D_LOGPLOW','AvgIpc','VSA_EState1','SMR_VSA1','fr_C_S','VSA_EState7','PEOE_VSA10','EState_VSA8','MaxEStateIndex',
'BCUT2D_CHGHI','PEOE_VSA8','PEOE_VSA9','PEOE_VSA5','PEOE_VSA2','MinEStateIndex','HallKierAlpha','SlogP_VSA3',
'SMR_VSA10','NumRotatableBonds','SlogP_VSA10','SMR_VSA6','EState_VSA10','SMR_VSA5','fr_NH1','EState_VSA6',
'SlogP_VSA1','SMR_VSA7','fr_NH0','VSA_EState10','FractionCSP3','fr_sulfide','SlogP_VSA4','PEOE_VSA4','fr_para_hydroxylation',
'SlogP_VSA12','NumHDonors','PEOE_VSA3','NumSaturatedHeterocycles','PEOE_VSA12']

In [116]:
target_LGBM = df['IC50, mM']
data_LGBM = df[feature_LGBM]

In [117]:
x_train_LGBM, x_valid_LGBM = train_test_split(
    data_LGBM, train_size=0.8, random_state=1
)
y_train_LGBM, y_valid_LGBM = train_test_split(
    target_LGBM, train_size=0.8, random_state=1
)
print("x_train.shape = {} rows, {} cols".format(*x_train_LGBM.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid_LGBM.shape))

x_train.shape = 798 rows, 72 cols
x_valid.shape = 200 rows, 72 cols


In [118]:
model_LGBM = LGBMRegressor(random_state=27)
model_LGBM.fit(x_train_LGBM, y_train_LGBM, verbose=100)



LGBMRegressor(random_state=27)

In [119]:
y_pred_LGBM = model_LGBM.predict(x_train_LGBM)
y_val_LGBM = model_LGBM.predict(x_valid_LGBM)
print("Train R2: ", r2_score(y_train_LGBM, y_pred_LGBM))
print("Train RMSE: ", np.sqrt(mse(y_train_LGBM, y_pred_LGBM)))
print("Validation R2: ", r2_score(y_valid_LGBM, y_val_LGBM))
print("Validation RMSE: ", np.sqrt(mse(y_valid_LGBM, y_val_LGBM)))

Train R2:  0.8256191984268786
Train RMSE:  171.6780081089532
Validation R2:  0.5219687708615579
Validation RMSE:  244.5203060633179


Отобрав признаки мы улучшили метрики и немного снизили переобучение для модели lightgbm на вадидации R2: 0.52, RMSE: 244.52.

# Подбор гиперпараметров

**CatBoost**

In [136]:
def catboost_cv(depth, learning_rate,l2_leaf_reg):
    depth = int(depth)
    l2_leaf_reg = int(l2_leaf_reg)
    model = CatBoostRegressor(
        depth=depth,
        learning_rate=learning_rate,
        l2_leaf_reg=l2_leaf_reg,
        early_stopping_rounds=5
    )
    model.fit(x_train_cb, y_train_cb, verbose=100)

    return r2_score(y_valid_cb, model.predict(x_valid_cb))

In [137]:
param_space = {
    'depth': (3, 10),             
    'learning_rate': (0.001, 0.3),  
    'l2_leaf_reg': (1, 10)       
}

bayesian_opt = BayesianOptimization(
    f=catboost_cv, pbounds=param_space, random_state=27)
bayesian_opt.maximize(init_points=5, n_iter=20)
results = pd.DataFrame(bayesian_opt.res)
results.sort_values(by='target', ascending=False, inplace=True)

|   iter    |  target   |   depth   | l2_lea... | learni... |
-------------------------------------------------------------
0:	learn: 399.4297020	total: 11.8ms	remaining: 11.8s
100:	learn: 203.7712813	total: 416ms	remaining: 3.7s
200:	learn: 171.9736852	total: 830ms	remaining: 3.3s
300:	learn: 156.5787769	total: 1.22s	remaining: 2.84s
400:	learn: 148.2527474	total: 1.61s	remaining: 2.41s
500:	learn: 143.3444339	total: 1.98s	remaining: 1.98s
600:	learn: 140.6934172	total: 2.34s	remaining: 1.56s
700:	learn: 139.2986539	total: 2.7s	remaining: 1.15s
800:	learn: 138.2386882	total: 3.07s	remaining: 762ms
900:	learn: 137.7245231	total: 3.41s	remaining: 375ms
999:	learn: 137.4032240	total: 3.75s	remaining: 0us
| [0m1        [0m | [0m0.439    [0m | [0m5.98     [0m | [0m8.331    [0m | [0m0.2209   [0m |
0:	learn: 380.2917604	total: 74.2ms	remaining: 1m 14s
100:	learn: 144.7163415	total: 5.36s	remaining: 47.7s
200:	learn: 137.7311358	total: 10.9s	remaining: 43.4s
300:	learn: 136.9617939	

900:	learn: 201.3251765	total: 13.3s	remaining: 1.46s
999:	learn: 196.6822190	total: 14.4s	remaining: 0us
| [95m12       [0m | [95m0.5294   [0m | [95m7.822    [0m | [95m8.392    [0m | [95m0.01796  [0m |
0:	learn: 410.0259613	total: 13.9ms	remaining: 13.9s
100:	learn: 335.5651907	total: 1.27s	remaining: 11.3s
200:	learn: 296.7162409	total: 2.6s	remaining: 10.3s
300:	learn: 271.5130346	total: 3.86s	remaining: 8.96s
400:	learn: 255.8326603	total: 5.13s	remaining: 7.67s
500:	learn: 244.2761129	total: 6.32s	remaining: 6.29s
600:	learn: 234.9896025	total: 7.66s	remaining: 5.09s
700:	learn: 225.5209196	total: 8.92s	remaining: 3.8s
800:	learn: 216.6278425	total: 10.3s	remaining: 2.57s
900:	learn: 211.2284577	total: 11.5s	remaining: 1.26s
999:	learn: 205.6386992	total: 12.9s	remaining: 0us
| [0m13       [0m | [0m0.5273   [0m | [0m7.872    [0m | [0m8.379    [0m | [0m0.01483  [0m |
0:	learn: 409.7398135	total: 13ms	remaining: 13s
100:	learn: 321.9671108	total: 1.42s	remaining:

800:	learn: 205.2801619	total: 12.4s	remaining: 3.07s
900:	learn: 198.5655201	total: 13.8s	remaining: 1.51s
999:	learn: 193.9004939	total: 15.1s	remaining: 0us
| [0m24       [0m | [0m0.5345   [0m | [0m7.85     [0m | [0m8.271    [0m | [0m0.01881  [0m |
0:	learn: 409.3964431	total: 13.3ms	remaining: 13.3s
100:	learn: 311.1222617	total: 1.46s	remaining: 13s
200:	learn: 267.5889547	total: 3.09s	remaining: 12.3s
300:	learn: 246.7748233	total: 4.3s	remaining: 9.99s
400:	learn: 231.4773049	total: 5.62s	remaining: 8.39s
500:	learn: 216.9653155	total: 6.85s	remaining: 6.83s
600:	learn: 207.2237003	total: 8.22s	remaining: 5.46s
700:	learn: 201.6381295	total: 9.47s	remaining: 4.04s
800:	learn: 194.9977342	total: 10.7s	remaining: 2.65s
900:	learn: 188.1881724	total: 11.9s	remaining: 1.31s
999:	learn: 183.2743734	total: 13.2s	remaining: 0us
| [95m25       [0m | [95m0.5463   [0m | [95m7.889    [0m | [95m8.227    [0m | [95m0.02344  [0m |


In [138]:
best_hyperparameters = bayesian_opt.max
best_hyperparameters['params'] = {param: int(value) if param in [
    'depth', 'iterations', 'min_data_in_leaf'] else value for param, value in best_hyperparameters['params'].items()}
print("Best hyperparameters:", best_hyperparameters['params'])
print(f"Best R-squared Score: {best_hyperparameters['target']:.4f}")

Best hyperparameters: {'depth': 7, 'l2_leaf_reg': 8, 'learning_rate': 0.023435002204803896}
Best R-squared Score: 0.5463


In [165]:
model = CatBoostRegressor(
        depth=7,
        learning_rate=0.023435002204803896,
        l2_leaf_reg=8,
        early_stopping_rounds=5
    )
model.fit(x_train_cb, y_train_cb, verbose=100)
model.fit(x_train_cb, y_train_cb, verbose=100)
y_pred_cb = model.predict(x_train_cb)
y_val_cb = model.predict(x_valid_cb)
print("Train R2: ", r2_score(y_train_cb, y_pred_cb))
print("Train RMSE: ", np.sqrt(mse(y_train_cb, y_pred_cb)))
print("Validation R2: ", r2_score(y_valid_cb, y_val_cb))
print("Validation RMSE: ", np.sqrt(mse(y_valid_cb, y_val_cb)))

0:	learn: 409.3964431	total: 32.2ms	remaining: 32.2s
100:	learn: 311.1222617	total: 1.31s	remaining: 11.7s
200:	learn: 267.5889547	total: 2.51s	remaining: 9.97s
300:	learn: 246.7748233	total: 3.66s	remaining: 8.5s
400:	learn: 231.4773049	total: 4.8s	remaining: 7.17s
500:	learn: 216.9653155	total: 5.93s	remaining: 5.91s
600:	learn: 207.2237003	total: 7.09s	remaining: 4.71s
700:	learn: 201.6381295	total: 8.37s	remaining: 3.57s
800:	learn: 194.9977342	total: 9.65s	remaining: 2.4s
900:	learn: 188.1881724	total: 10.8s	remaining: 1.19s
999:	learn: 183.2743734	total: 12s	remaining: 0us
0:	learn: 409.3964431	total: 13.9ms	remaining: 13.9s
100:	learn: 311.1222617	total: 1.23s	remaining: 11s
200:	learn: 267.5889547	total: 2.42s	remaining: 9.63s
300:	learn: 246.7748233	total: 3.56s	remaining: 8.27s
400:	learn: 231.4773049	total: 4.71s	remaining: 7.03s
500:	learn: 216.9653155	total: 5.87s	remaining: 5.84s
600:	learn: 207.2237003	total: 7.02s	remaining: 4.66s
700:	learn: 201.6381295	total: 8.15s	re

После подбора гиперпараметров мы еще улучшили метрики и снизили переобучение для модели CatBoost на вадидации R2: 0.55, RMSE: 238.20.

**lightgbm**

In [95]:
def LGBM_cv(max_depth, learning_rate, min_data_in_leaf):
    max_depth = int(max_depth)
    min_data_in_leaf = int(min_data_in_leaf)
    model = LGBMRegressor(
        max_depth=max_depth,
        learning_rate=learning_rate,
        min_data_in_leaf=min_data_in_leaf,          
        eval_metric  = 'r2',
        early_stopping_rounds=10
    )
    model.fit(x_train_LGBM, y_train_LGBM, eval_set = (x_valid_LGBM, y_valid_LGBM))
    return r2_score(y_valid_LGBM, model.predict(x_valid_LGBM))

In [96]:
param_space = {
    'max_depth': (3, 15),            
    'learning_rate': (0.001, 0.5),  
    'min_data_in_leaf': (1, 10)       
}

bayesian_opt = BayesianOptimization(
    f=LGBM_cv, pbounds=param_space, random_state=27)
bayesian_opt.maximize(init_points=5, n_iter=15)
results = pd.DataFrame(bayesian_opt.res)
results.sort_values(by='target', ascending=False, inplace=True)

|   iter    |  target   | learni... | max_depth | min_da... |
-------------------------------------------------------------
[1]	valid_0's l2: 82278.8
[2]	valid_0's l2: 68515.5
[3]	valid_0's l2: 62496.6
[4]	valid_0's l2: 58738.5
[5]	valid_0's l2: 55398.7
[6]	valid_0's l2: 54683.4
[7]	valid_0's l2: 54512.3
[8]	valid_0's l2: 54701.6
[9]	valid_0's l2: 54546.6
[10]	valid_0's l2: 54035.8
[11]	valid_0's l2: 53828.7
[12]	valid_0's l2: 54898.1
[13]	valid_0's l2: 53499.1
[14]	valid_0's l2: 51870.2
[15]	valid_0's l2: 51015.3
[16]	valid_0's l2: 50762.7
[17]	valid_0's l2: 49946.9
[18]	valid_0's l2: 49043.3
[19]	valid_0's l2: 48390.3
[20]	valid_0's l2: 48506.5
[21]	valid_0's l2: 48993.1
[22]	valid_0's l2: 48782.9
[23]	valid_0's l2: 47975.9
[24]	valid_0's l2: 47925.5
[25]	valid_0's l2: 48492.3
[26]	valid_0's l2: 48420.4
[27]	valid_0's l2: 48374.1
[28]	valid_0's l2: 48061.1
[29]	valid_0's l2: 47661.3
[30]	valid_0's l2: 47836.3
[31]	valid_0's l2: 47959.1
[32]	valid_0's l2: 48065.8
[33]	valid_0's l2: 48

[67]	valid_0's l2: 94503.6
[68]	valid_0's l2: 94423
[69]	valid_0's l2: 94331.4
[70]	valid_0's l2: 94252.2
[71]	valid_0's l2: 94162
[72]	valid_0's l2: 94083.3
[73]	valid_0's l2: 93994.6
[74]	valid_0's l2: 93912.2
[75]	valid_0's l2: 93828.8
[76]	valid_0's l2: 93746.9
[77]	valid_0's l2: 93653.1
[78]	valid_0's l2: 93561.7
[79]	valid_0's l2: 93477.7
[80]	valid_0's l2: 93402.7
[81]	valid_0's l2: 93315.5
[82]	valid_0's l2: 93237.2
[83]	valid_0's l2: 93151.5
[84]	valid_0's l2: 93060.4
[85]	valid_0's l2: 92965.9
[86]	valid_0's l2: 92892.3
[87]	valid_0's l2: 92794.3
[88]	valid_0's l2: 92723.3
[89]	valid_0's l2: 92627.5
[90]	valid_0's l2: 92556.9
[91]	valid_0's l2: 92447.4
[92]	valid_0's l2: 92363.1
[93]	valid_0's l2: 92282.3
[94]	valid_0's l2: 92190
[95]	valid_0's l2: 92098.4
[96]	valid_0's l2: 92003.7
[97]	valid_0's l2: 91909.8
[98]	valid_0's l2: 91840.7
[99]	valid_0's l2: 91734.7
[100]	valid_0's l2: 91652.1
| [0m8        [0m | [0m0.08529  [0m | [0m0.001    [0m | [0m12.65    [0m | [0m7

[1]	valid_0's l2: 98710
[2]	valid_0's l2: 96806.2
[3]	valid_0's l2: 94903.7
[4]	valid_0's l2: 93092.4
[5]	valid_0's l2: 91435.1
[6]	valid_0's l2: 89657.1
[7]	valid_0's l2: 88010.4
[8]	valid_0's l2: 86521.9
[9]	valid_0's l2: 84891.8
[10]	valid_0's l2: 83657.2
[11]	valid_0's l2: 82312.7
[12]	valid_0's l2: 81256.3
[13]	valid_0's l2: 79810.3
[14]	valid_0's l2: 78789
[15]	valid_0's l2: 77616.7
[16]	valid_0's l2: 76014.2
[17]	valid_0's l2: 74912.6
[18]	valid_0's l2: 73810.9
[19]	valid_0's l2: 72852.2
[20]	valid_0's l2: 71824
[21]	valid_0's l2: 70958
[22]	valid_0's l2: 70264.1
[23]	valid_0's l2: 69391.3
[24]	valid_0's l2: 68670.7
[25]	valid_0's l2: 68037.6
[26]	valid_0's l2: 67493
[27]	valid_0's l2: 66923.1
[28]	valid_0's l2: 66252.7
[29]	valid_0's l2: 65492.3
[30]	valid_0's l2: 64898.7
[31]	valid_0's l2: 64311
[32]	valid_0's l2: 63776
[33]	valid_0's l2: 63245.7
[34]	valid_0's l2: 62675
[35]	valid_0's l2: 62249.1
[36]	valid_0's l2: 61909.9
[37]	valid_0's l2: 61620.7
[38]	valid_0's l2: 61112.2

[1]	valid_0's l2: 98233.7
[2]	valid_0's l2: 96080
[3]	valid_0's l2: 93861
[4]	valid_0's l2: 91884.3
[5]	valid_0's l2: 89550.5
[6]	valid_0's l2: 87409.8
[7]	valid_0's l2: 85606.3
[8]	valid_0's l2: 83495.7
[9]	valid_0's l2: 81713.7
[10]	valid_0's l2: 80039.5
[11]	valid_0's l2: 78341.9
[12]	valid_0's l2: 76826.1
[13]	valid_0's l2: 75612.1
[14]	valid_0's l2: 73800.5
[15]	valid_0's l2: 72523.7
[16]	valid_0's l2: 71009.4
[17]	valid_0's l2: 69349.6
[18]	valid_0's l2: 68191.3
[19]	valid_0's l2: 66903.8
[20]	valid_0's l2: 65849.9
[21]	valid_0's l2: 64805.2
[22]	valid_0's l2: 64069.8
[23]	valid_0's l2: 63567.5
[24]	valid_0's l2: 63061.4
[25]	valid_0's l2: 62420.3
[26]	valid_0's l2: 61640.4
[27]	valid_0's l2: 61285.4
[28]	valid_0's l2: 60713.8
[29]	valid_0's l2: 59855.7
[30]	valid_0's l2: 58847.2
[31]	valid_0's l2: 57929.6
[32]	valid_0's l2: 57325.1
[33]	valid_0's l2: 56730.9
[34]	valid_0's l2: 56098.5
[35]	valid_0's l2: 55251.1
[36]	valid_0's l2: 54730.6
[37]	valid_0's l2: 54053.3
[38]	valid_0's

In [97]:
best_hyperparameters = bayesian_opt.max
best_hyperparameters['params'] = {param: int(value) if param in [
    'depth', 'iterations', 'min_data_in_leaf'] else value for param, value in best_hyperparameters['params'].items()}
print("Best hyperparameters:", best_hyperparameters['params'])
print(f"Best R-squared Score: {best_hyperparameters['target']:.4f}")

Best hyperparameters: {'learning_rate': 0.23282574247739554, 'max_depth': 12.918960214719949, 'min_data_in_leaf': 7.835134318880956}
Best R-squared Score: 0.5726


In [98]:
params = {
    'max_depth': 15,            
    'learning_rate': 0.2627375866654199,  
    'min_data_in_leaf': 6       
}
model_LGBM = LGBMRegressor(**params, eval_metric  = 'r2', early_stopping_rounds=10, random_state=27)
model_LGBM.fit(x_train_LGBM, y_train_LGBM, eval_set = (x_valid_LGBM, y_valid_LGBM))
y_pred_LGBM = model_LGBM.predict(x_train_LGBM)
y_val_LGBM = model_LGBM.predict(x_valid_LGBM)
print("Train R2: ", r2_score(y_train_LGBM, y_pred_LGBM))
print("Train RMSE: ", np.sqrt(mse(y_train_LGBM, y_pred_LGBM)))
print("Validation R2: ", r2_score(y_valid_LGBM, y_val_LGBM))
print("Validation RMSE: ", np.sqrt(mse(y_valid_LGBM, y_val_LGBM)))

[1]	valid_0's l2: 77626.3
[2]	valid_0's l2: 65702.9
[3]	valid_0's l2: 55493.6
[4]	valid_0's l2: 51532.6
[5]	valid_0's l2: 51329.5
[6]	valid_0's l2: 46591.2
[7]	valid_0's l2: 45239.6
[8]	valid_0's l2: 44843
[9]	valid_0's l2: 44588.9
[10]	valid_0's l2: 44407.2
[11]	valid_0's l2: 43927.7
[12]	valid_0's l2: 44002.7
[13]	valid_0's l2: 43558.1
[14]	valid_0's l2: 43931.7
[15]	valid_0's l2: 43523.9
[16]	valid_0's l2: 43955.7
[17]	valid_0's l2: 43432.7
[18]	valid_0's l2: 43258.1
[19]	valid_0's l2: 42799.4
[20]	valid_0's l2: 42864
[21]	valid_0's l2: 43413.8
[22]	valid_0's l2: 42986.7
[23]	valid_0's l2: 43080.1
[24]	valid_0's l2: 43483.5
[25]	valid_0's l2: 43577
[26]	valid_0's l2: 43946.7
[27]	valid_0's l2: 44093
[28]	valid_0's l2: 44491.4
[29]	valid_0's l2: 45094.8
Train R2:  0.8478780235873504
Train RMSE:  159.33045543675047
Validation R2:  0.5728535857033172
Validation RMSE:  206.88008569073023


После подбора гиперпараметров мы еще улучшили метрики и снизили переобучение для модели lightgbm на вадидации R2: 0.57, RMSE: 206.88.

**Выводы**

Мы сравнили 4 регрессионые модели: DecisionTreeRegressor, CatBoostRegressor, LGBMRegressor, XGBRegressor. Наилучший результат и наименьшее переобучение показала модель lightgbm с результатом на трейне: r2=0.84 и RMSE=159.33 и тесте: r2=0.57 и RMSE=206.88. Данная модель объясняет 57% дисперсии между тренеровочными и тестовыми данными и корень среднеквадратичной ощибки составляет 206.88 микромоль (мкМ).

**Рекомендации**

Для улучшения значения метрики и снижения переобучения нужно будет увеличить количество данных. Также можно попробовать логафрифмировать целевую переменную.