In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import missingno as msno
import seaborn as sns
import scipy.stats as st
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import xgboost as xg
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from eli5.sklearn import PermutationImportance
from eli5 import show_weights
from sklearn.tree import DecisionTreeRegressor
from bayes_opt import BayesianOptimization

In [4]:
#Загрузим данные из файла.
df = pd.read_excel('/home/rik/Рабочий стол/МИФИ/учеба/мо/curse/data.xlsx', index_col=0)
df.head()

Unnamed: 0,"IC50, mM","CC50, mM",SI,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,6.239374,175.482382,28.125,5.094096,5.094096,0.387225,0.387225,0.417362,42.928571,384.652,...,0,0,0,0,0,0,0,0,3,0
1,0.771831,5.402819,7.0,3.961417,3.961417,0.533868,0.533868,0.462473,45.214286,388.684,...,0,0,0,0,0,0,0,0,3,0
2,223.808778,161.14232,0.72,2.627117,2.627117,0.543231,0.543231,0.260923,42.1875,446.808,...,0,0,0,0,0,0,0,0,3,0
3,1.705624,107.855654,63.235294,5.09736,5.09736,0.390603,0.390603,0.377846,41.862069,398.679,...,0,0,0,0,0,0,0,0,4,0
4,107.131532,139.270991,1.3,5.15051,5.15051,0.270476,0.270476,0.429038,36.514286,466.713,...,0,0,0,0,0,0,0,0,0,0


In [5]:
print("train.shape = {} rows, {} cols".format(*df.shape))

train.shape = 1001 rows, 213 cols


# Предобработка данных.

In [6]:
def transform(df): 
    discrete_feature = [feature for feature in df.columns if len(df[feature].unique())<25]
    count_discrete_feature = {}
    for i in tqdm(discrete_feature):
        cnt = 0
        for j in df[i]:
            if j != 0:
                cnt += 1
        count_discrete_feature[f'{i}'] = cnt
    count_discrete_feature = pd.Series(count_discrete_feature)
    df = df.drop(columns=count_discrete_feature[count_discrete_feature == 0].index.to_list())
    df = df.dropna()
    feature_del = ['BertzCT', 'Chi0', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v',
     'ExactMolWt', 'FpDensityMorgan1', 'FpDensityMorgan3', 'HeavyAtomCount', 'HeavyAtomMolWt', 'Kappa1', 'Kappa2',
     'Kappa3', 'LabuteASA', 'MaxAbsEStateIndex', 'MaxAbsPartialCharge', 'MolMR', 'MolWt', 'NHOHCount', 'NumAromaticCarbocycles',
     'NumHAcceptors', 'NumHeteroatoms', 'NumSaturatedCarbocycles', 'NumValenceElectrons', 'SMR_VSA4', 'SMR_VSA9',
     'SlogP_VSA6', 'TPSA', 'VSA_EState2', 'VSA_EState3', 'VSA_EState6', 'fr_Al_OH_noTert', 'fr_COO', 'fr_COO2',
     'fr_C_O_noCOO', 'fr_Nhpyrrole', 'fr_benzene', 'fr_nitro_arom_nonortho', 'fr_phenol', 'fr_phenol_noOrthoHbond',
    'MinAbsPartialCharge', 'Chi2n']
    df = df.drop(columns=feature_del)
    df =  df[df['SI'] < 250]
    
    return df
    

In [7]:
df = transform(df)

100%|███████████████████████████████████████| 110/110 [00:00<00:00, 7146.10it/s]


## Обучение нескольких моделей, их сравнение. 

In [8]:
target = df['SI']
data = df.drop(['CC50, mM','IC50, mM','SI'], axis=1)

In [9]:
x_train, x_valid = train_test_split(
    data, train_size=0.8, random_state=1
)
y_train, y_valid = train_test_split(
    target, train_size=0.8, random_state=1
)
print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))

x_train.shape = 776 rows, 144 cols
x_valid.shape = 195 rows, 144 cols


In [10]:
# Создадим модель дерева решений
tree = DecisionTreeRegressor(random_state=27)
# Создадим модель CatBoost
model = CatBoostRegressor(random_state=27)
# Создадим модель lightgbm
model_LGBM = LGBMRegressor(random_state=27)
# Создадим модель xgboost
xgb_r = xg.XGBRegressor() 
# обучение моделей
tree.fit(x_train, y_train)
model.fit(x_train, y_train, verbose=0)
model_LGBM.fit(x_train, y_train, verbose=0)
xgb_r.fit(x_train, y_train, verbose=0)
# предсказание ответов для тренеровочной выборки
y_pred_tree_train = tree.predict(x_train)
y_pred_сat_Boost_train = model.predict(x_train)
y_pred_LGBM_train = model_LGBM.predict(x_train)
y_pred_xgb_train = xgb_r.predict(x_train)
# предсказание ответов для тестовой выборки
y_pred_tree = tree.predict(x_valid)
y_pred_сat_Boost = model.predict(x_valid)
y_pred_LGBM = model_LGBM.predict(x_valid)
y_pred_xgb = xgb_r.predict(x_valid)

print("Train tree R2: ", r2_score(y_train, y_pred_tree_train))
print("Train tree RMSE: ", np.sqrt(mse(y_train, y_pred_tree_train)))
print("Validation tree R2: ", r2_score(y_valid, y_pred_tree))
print("Validation tree RMSE: ", np.sqrt(mse(y_valid, y_pred_tree)))
print('*' * 50)
print("Train сat_Boost R2: ", r2_score(y_train, y_pred_сat_Boost_train))
print("Train сat_Boost RMSE: ", np.sqrt(mse(y_train, y_pred_сat_Boost_train)))
print("Validation сat_Boost R2: ", r2_score(y_valid, y_pred_сat_Boost))
print("Validation сat_Boost RMSE: ", np.sqrt(mse(y_valid, y_pred_сat_Boost)))
print('*' * 50)
print("Train LGBM R2: ", r2_score(y_train, y_pred_LGBM_train))
print("Train LGBM RMSE: ", np.sqrt(mse(y_train, y_pred_LGBM_train)))
print("Validation LGBM R2: ", r2_score(y_valid, y_pred_LGBM))
print("Validation LGBM RMSE: ", np.sqrt(mse(y_valid, y_pred_LGBM)))
print('*' * 50)
print("Train xgb R2: ", r2_score(y_train, y_pred_xgb_train))
print("Train xgb RMSE: ", np.sqrt(mse(y_train, y_pred_xgb_train)))
print("Validation xgb R2: ", r2_score(y_valid, y_pred_xgb))
print("Validation xgb RMSE: ", np.sqrt(mse(y_valid, y_pred_xgb)))



Train tree R2:  0.7853512323114229
Train tree RMSE:  13.201754721573199
Validation tree R2:  -0.06496199201884623
Validation tree RMSE:  40.10073909433003
**************************************************
Train сat_Boost R2:  0.7575471416484366
Train сat_Boost RMSE:  14.030757356287289
Validation сat_Boost R2:  0.28260186060565406
Validation сat_Boost RMSE:  32.91284662794601
**************************************************
Train LGBM R2:  0.7307946478917475
Train LGBM RMSE:  14.7845906628946
Validation LGBM R2:  0.2761200143134499
Validation LGBM RMSE:  33.06119960083822
**************************************************
Train xgb R2:  0.7852165709765964
Train xgb RMSE:  13.205895176689957
Validation xgb R2:  0.24542893428645796
Validation xgb RMSE:  33.75478895627341


Из приведенных данных выше мы видем низкое качество моделей, а также переобучение. Наилучшие результаты показали модели сat_Boost и LGBM. С ними и продолжим работать.

# Подбор признаков, их анализ и оценка важности.

**CatBoost**

In [11]:
col = x_train.columns.tolist()
estimator_cat_boost = PermutationImportance(
    model, scoring= "neg_mean_squared_error", random_state=27
)
importance_cat_boost = estimator_cat_boost.fit(x_train, y_train)

In [12]:
show_weights(importance_cat_boost, feature_names=col, top=x_train.shape[1])

Weight,Feature
28.5388  ± 8.6523,BCUT2D_LOGPHI
26.3721  ± 4.3332,BCUT2D_MRLOW
20.7108  ± 4.8897,FractionCSP3
20.0227  ± 4.8797,BCUT2D_CHGHI
19.2817  ± 4.2424,MaxPartialCharge
17.9971  ± 3.7007,Ipc
16.0440  ± 3.2527,fr_aldehyde
14.5020  ± 5.0065,SMR_VSA7
14.1764  ± 5.9535,VSA_EState4
13.7340  ± 2.4645,BCUT2D_CHGLO


Оставим только те признаки которые которые улучшают работу модели. И посмотрим как изменится метрика

In [13]:
feature_cat_boost = ['BCUT2D_LOGPHI','BCUT2D_MRLOW','FractionCSP3','BCUT2D_CHGHI','MaxPartialCharge','Ipc',
'fr_aldehyde','SMR_VSA7','VSA_EState4','BCUT2D_CHGLO','qed','PEOE_VSA7','VSA_EState8','AvgIpc','BalabanJ','BCUT2D_LOGPLOW',
'VSA_EState7','SlogP_VSA5','MinPartialCharge','fr_unbrch_alkane','PEOE_VSA6','FpDensityMorgan2','EState_VSA7',
'MaxEStateIndex','EState_VSA5','EState_VSA4','PEOE_VSA10','VSA_EState5','PEOE_VSA9','SlogP_VSA4','BCUT2D_MWHI',
'PEOE_VSA3','SMR_VSA6','MinAbsEStateIndex','MinEStateIndex','PEOE_VSA8','fr_methoxy','VSA_EState9','SPS',
'fr_urea','SlogP_VSA3','EState_VSA9','VSA_EState10','RingCount','EState_VSA8','PEOE_VSA1','BCUT2D_MWLOW','BCUT2D_MRHI',
'SlogP_VSA2','MolLogP','EState_VSA3','Chi0n','SlogP_VSA11','VSA_EState1','NumRotatableBonds','EState_VSA6','PEOE_VSA11',
'SMR_VSA5','SlogP_VSA1','EState_VSA10','fr_Imine','fr_amide','SMR_VSA10','HallKierAlpha','fr_bicyclic','SMR_VSA1',
'fr_sulfonamd','EState_VSA2','fr_oxime','fr_para_hydroxylation','NOCount','NumHDonors','NumAliphaticRings',
'PEOE_VSA2','fr_priamide','fr_morpholine','NumAliphaticCarbocycles','SMR_VSA2','SlogP_VSA8','fr_C_O','fr_Ar_NH',
'PEOE_VSA14','NumSaturatedHeterocycles']

In [14]:
target = df['SI']
data_cb = data[feature_cat_boost]

In [15]:
x_train_cb, x_valid_cb = train_test_split(
    data_cb, train_size=0.8, random_state=1
)
y_train_cb, y_valid_cb = train_test_split(
    target, train_size=0.8, random_state=1
)
print("x_train.shape = {} rows, {} cols".format(*x_train_cb.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid_cb.shape))

x_train.shape = 776 rows, 83 cols
x_valid.shape = 195 rows, 83 cols


In [16]:
# Создадим модель CatBoost
model = CatBoostRegressor(random_state=27)
# обучение моделей
model.fit(x_train_cb, y_train_cb, verbose=100)
# предсказание ответов для тренеровочной выборки
y_pred_сat_Boost_train = model.predict(x_train_cb)
# предсказание ответов для тестовой выборки
y_pred_сat_Boost = model.predict(x_valid_cb)


print("Train сat_Boost R2: ", r2_score(y_train, y_pred_сat_Boost_train))
print("Train сat_Boost RMSE: ", np.sqrt(mse(y_train, y_pred_сat_Boost_train)))
print("Validation сat_Boost R2: ", r2_score(y_valid, y_pred_сat_Boost))
print("Validation сat_Boost RMSE: ", np.sqrt(mse(y_valid, y_pred_сat_Boost)))


Learning rate set to 0.039335
0:	learn: 28.3453583	total: 26.7ms	remaining: 26.7s
100:	learn: 21.6273821	total: 799ms	remaining: 7.11s
200:	learn: 19.6355922	total: 1.57s	remaining: 6.24s
300:	learn: 18.1347459	total: 2.37s	remaining: 5.5s
400:	learn: 16.9500338	total: 3.13s	remaining: 4.68s
500:	learn: 16.0855615	total: 3.8s	remaining: 3.79s
600:	learn: 15.4529743	total: 4.55s	remaining: 3.02s
700:	learn: 14.8884506	total: 5.31s	remaining: 2.26s
800:	learn: 14.4763212	total: 6.1s	remaining: 1.52s
900:	learn: 14.1840655	total: 6.9s	remaining: 758ms
999:	learn: 13.9520259	total: 7.59s	remaining: 0us
Train сat_Boost R2:  0.760260477694893
Train сat_Boost RMSE:  13.952026025265265
Validation сat_Boost R2:  0.2930092847667485
Validation сat_Boost RMSE:  32.67323814202681


Отобрав признаки мы немного улучшили метрики для модели CatBoost на вадидации R2: 0.29, RMSE: 32.67.

**lightgbm**

In [17]:
col = x_train.columns.tolist()
estimator_LGBM = PermutationImportance(
    model_LGBM, scoring= "neg_mean_squared_error", random_state=27
)
importance_LGBM = estimator_LGBM.fit(x_train, y_train)


In [18]:
show_weights(importance_LGBM, feature_names=col, top=x_train.shape[1])

Weight,Feature
207.2388  ± 28.4792,BCUT2D_LOGPHI
114.6838  ± 9.8871,BCUT2D_CHGHI
66.8938  ± 13.6866,FractionCSP3
54.4499  ± 5.9538,BCUT2D_MRLOW
38.5593  ± 9.1944,EState_VSA8
31.3051  ± 5.7337,BCUT2D_LOGPLOW
29.1320  ± 6.3528,SMR_VSA7
28.9441  ± 6.0977,VSA_EState4
28.6507  ± 6.3326,AvgIpc
26.6182  ± 4.1667,VSA_EState8


Оставим только те признаки которые которые улучшают работу модели. И посмотрим как изменится метрика

In [19]:
feature_LGBM = ['BCUT2D_LOGPHI','BCUT2D_CHGHI','FractionCSP3','BCUT2D_MRLOW','EState_VSA8','BCUT2D_LOGPLOW',
'SMR_VSA7','VSA_EState4','AvgIpc','VSA_EState8','BCUT2D_MWHI','MaxEStateIndex','qed','PEOE_VSA7','BalabanJ',
'NumHDonors','EState_VSA5','SlogP_VSA5','FpDensityMorgan2','VSA_EState7','VSA_EState5','NumRotatableBonds',
'MinPartialCharge','SlogP_VSA1','BCUT2D_MRHI','MinAbsEStateIndex','PEOE_VSA6','EState_VSA4','MaxPartialCharge',
'MinEStateIndex','MolLogP','Chi0n','PEOE_VSA11','EState_VSA2','BCUT2D_MWLOW','PEOE_VSA3','HallKierAlpha',
'SMR_VSA5','SlogP_VSA2','BCUT2D_CHGLO','EState_VSA3','VSA_EState1','PEOE_VSA9','EState_VSA9','Ipc','fr_amide',
'SPS','fr_methoxy','SMR_VSA10','NumAliphaticHeterocycles','PEOE_VSA10','EState_VSA6','PEOE_VSA8','PEOE_VSA1',
'fr_Al_OH','EState_VSA7','SMR_VSA6','SlogP_VSA3','SMR_VSA3','PEOE_VSA2','fr_NH1','VSA_EState9','EState_VSA1',
'fr_NH2','SlogP_VSA4','fr_priamide','VSA_EState10','EState_VSA10','RingCount','SMR_VSA1','SlogP_VSA10','fr_Ar_NH',
'fr_unbrch_alkane','NumSaturatedRings','fr_para_hydroxylation','fr_aryl_methyl','fr_ether','PEOE_VSA12',
'SlogP_VSA8','fr_allylic_oxid','fr_C_O','PEOE_VSA5','fr_alkyl_halide','PEOE_VSA4','NumAliphaticCarbocycles',
'PEOE_VSA14','fr_bicyclic','fr_thiophene','SlogP_VSA11','NumAliphaticRings','fr_Ndealkylation1']

In [20]:
target_LGBM = df['SI']
data_LGBM = df[feature_LGBM]

In [21]:
x_train_LGBM, x_valid_LGBM = train_test_split(
    data_LGBM, train_size=0.8, random_state=1
)
y_train_LGBM, y_valid_LGBM = train_test_split(
    target_LGBM, train_size=0.8, random_state=1
)
print("x_train.shape = {} rows, {} cols".format(*x_train_LGBM.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid_LGBM.shape))

x_train.shape = 776 rows, 91 cols
x_valid.shape = 195 rows, 91 cols


In [22]:
model_LGBM = LGBMRegressor(random_state=27)
model_LGBM.fit(x_train_LGBM, y_train_LGBM, verbose=100)



LGBMRegressor(random_state=27)

In [23]:
y_pred_LGBM = model_LGBM.predict(x_train_LGBM)
y_val_LGBM = model_LGBM.predict(x_valid_LGBM)
print("Train R2: ", r2_score(y_train_LGBM, y_pred_LGBM))
print("Train RMSE: ", np.sqrt(mse(y_train_LGBM, y_pred_LGBM)))
print("Validation R2: ", r2_score(y_valid_LGBM, y_val_LGBM))
print("Validation RMSE: ", np.sqrt(mse(y_valid_LGBM, y_val_LGBM)))

Train R2:  0.7303942542588666
Train RMSE:  14.795581262550755
Validation R2:  0.29354599698615036
Validation RMSE:  32.66083383799346


Отобрав признаки мы улучшили метрики для модели lightgbm на вадидации R2: 0.29, RMSE: 32.66.

# Подбор гиперпараметров

**CatBoost**

In [24]:
def catboost_cv(depth, learning_rate,n_estimators):
    depth = int(depth)
    n_estimators = int(n_estimators)
    model = CatBoostRegressor(
        depth=depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        early_stopping_rounds=10,
        random_seed=27
    )
    model.fit(x_train_cb, y_train_cb, verbose=100)

    return r2_score(y_valid_cb, model.predict(x_valid_cb))

In [25]:
param_space = {
    'depth': (5, 8),             
    'learning_rate': (0.01, 0.3),
    'n_estimators': (50, 500)
}

bayesian_opt = BayesianOptimization(
    f=catboost_cv, pbounds=param_space, random_state=27)
bayesian_opt.maximize(init_points=5, n_iter=20)
results = pd.DataFrame(bayesian_opt.res)
results.sort_values(by='target', ascending=False, inplace=True)

|   iter    |  target   |   depth   | learni... | n_esti... |
-------------------------------------------------------------
0:	learn: 27.6224652	total: 10.6ms	remaining: 4.01s
100:	learn: 14.9795517	total: 813ms	remaining: 2.24s
200:	learn: 13.6080618	total: 1.63s	remaining: 1.45s
300:	learn: 13.3149273	total: 2.42s	remaining: 634ms
379:	learn: 13.2463211	total: 3.03s	remaining: 0us
| [0m1        [0m | [0m0.23     [0m | [0m6.277    [0m | [0m0.2462   [0m | [0m380.9    [0m |
0:	learn: 28.0438290	total: 14.9ms	remaining: 7.26s
100:	learn: 17.2746516	total: 1.32s	remaining: 5.1s
200:	learn: 14.5464402	total: 2.74s	remaining: 3.94s
300:	learn: 13.6815032	total: 4.28s	remaining: 2.69s
400:	learn: 13.4161528	total: 5.83s	remaining: 1.29s
489:	learn: 13.3089233	total: 6.98s	remaining: 0us
| [95m2        [0m | [95m0.268    [0m | [95m7.604    [0m | [95m0.1212   [0m | [95m490.8    [0m |
0:	learn: 28.2269034	total: 14.6ms	remaining: 5.58s
100:	learn: 19.1533749	total: 1.25s	rem

100:	learn: 15.6202572	total: 1.28s	remaining: 419ms
133:	learn: 14.6305998	total: 1.73s	remaining: 0us
| [0m25       [0m | [0m0.2724   [0m | [0m7.437    [0m | [0m0.1776   [0m | [0m134.1    [0m |


In [26]:
best_hyperparameters = bayesian_opt.max
best_hyperparameters['params'] = {param: int(value) if param in [
    'depth', 'iterations', 'n_estimators'] else value for param, value in best_hyperparameters['params'].items()}
print("Best hyperparameters:", best_hyperparameters['params'])
print(f"Best R-squared Score: {best_hyperparameters['target']:.4f}")

Best hyperparameters: {'depth': 6, 'learning_rate': 0.1232383002229961, 'n_estimators': 384.0028483013008}
Best R-squared Score: 0.2936


In [27]:
params = {
    'depth': 6,             
    'learning_rate': 0.1232383002229961, 
    'n_estimators': 384
}
model = CatBoostRegressor(**params, random_state=27)
model.fit(x_train_cb, y_train_cb, verbose=100)
y_pred_cb = model.predict(x_train_cb)
y_val_cb = model.predict(x_valid_cb)
print("Train R2: ", r2_score(y_train_cb, y_pred_cb))
print("Train RMSE: ", np.sqrt(mse(y_train_cb, y_pred_cb)))
print("Validation R2: ", r2_score(y_valid_cb, y_val_cb))
print("Validation RMSE: ", np.sqrt(mse(y_valid_cb, y_val_cb)))

0:	learn: 28.0390753	total: 10ms	remaining: 3.84s
100:	learn: 17.6617970	total: 706ms	remaining: 1.98s
200:	learn: 15.1185245	total: 1.43s	remaining: 1.3s
300:	learn: 14.0892049	total: 2.12s	remaining: 586ms
383:	learn: 13.6840060	total: 2.69s	remaining: 0us
Train R2:  0.7693828497003066
Train RMSE:  13.684006145052464
Validation R2:  0.29358405431428536
Validation RMSE:  32.659954091537315


После подбора гиперпараметров мы незначительно улучшили метрики модели CatBoost на вадидации R2: 0.29, RMSE: 32.66.

**lightgbm**

In [28]:
def LGBM_cv(max_depth, learning_rate, n_estimators):
    max_depth = int(max_depth)
    n_estimators = int(n_estimators)
    model = LGBMRegressor(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,          
        eval_metric  = 'r2',
        early_stopping_rounds=10,
        random_seed=27
    )
    model.fit(x_train_LGBM, y_train_LGBM, eval_set = (x_valid_LGBM, y_valid_LGBM))
    return r2_score(y_valid_LGBM, model.predict(x_valid_LGBM))

In [29]:
param_space = {
    'max_depth': (5, 8),             
    'learning_rate': (0.01, 0.3),
    'n_estimators': (50, 500)
}
bayesian_opt = BayesianOptimization(
    f=LGBM_cv, pbounds=param_space, random_state=27)
bayesian_opt.maximize(init_points=5, n_iter=15)
results = pd.DataFrame(bayesian_opt.res)
results.sort_values(by='target', ascending=False, inplace=True)

|   iter    |  target   | learni... | max_depth | n_esti... |
-------------------------------------------------------------
[1]	valid_0's l2: 1461.02
[2]	valid_0's l2: 1425.69
[3]	valid_0's l2: 1359.21
[4]	valid_0's l2: 1320.94
[5]	valid_0's l2: 1301.5
[6]	valid_0's l2: 1282.08
[7]	valid_0's l2: 1266.03
[8]	valid_0's l2: 1226.25
[9]	valid_0's l2: 1215.38
[10]	valid_0's l2: 1198.93
[11]	valid_0's l2: 1187.43
[12]	valid_0's l2: 1179.95
[13]	valid_0's l2: 1179.93
[14]	valid_0's l2: 1176.43
[15]	valid_0's l2: 1165.39
[16]	valid_0's l2: 1161.65
[17]	valid_0's l2: 1154.69
[18]	valid_0's l2: 1136.47
[19]	valid_0's l2: 1141.21
[20]	valid_0's l2: 1139.14
[21]	valid_0's l2: 1141.58
[22]	valid_0's l2: 1144.47
[23]	valid_0's l2: 1147.52
[24]	valid_0's l2: 1138.97
[25]	valid_0's l2: 1139.83
[26]	valid_0's l2: 1138.67
[27]	valid_0's l2: 1135.86
[28]	valid_0's l2: 1134.63
[29]	valid_0's l2: 1133.21
[30]	valid_0's l2: 1134.9
[31]	valid_0's l2: 1129.82
[32]	valid_0's l2: 1128.67
[33]	valid_0's l2: 1118

[1]	valid_0's l2: 1419.59
[2]	valid_0's l2: 1349.15
[3]	valid_0's l2: 1321.46
[4]	valid_0's l2: 1293.75
[5]	valid_0's l2: 1251.56
[6]	valid_0's l2: 1188.31
[7]	valid_0's l2: 1170.52
[8]	valid_0's l2: 1145.34
[9]	valid_0's l2: 1118.83
[10]	valid_0's l2: 1122.13
[11]	valid_0's l2: 1117.64
[12]	valid_0's l2: 1129.54
[13]	valid_0's l2: 1120.41
[14]	valid_0's l2: 1122.11
[15]	valid_0's l2: 1089.63
[16]	valid_0's l2: 1086.14
[17]	valid_0's l2: 1085.11
[18]	valid_0's l2: 1086.35
[19]	valid_0's l2: 1089.99
[20]	valid_0's l2: 1088.73
[21]	valid_0's l2: 1084.62
[22]	valid_0's l2: 1079.85
[23]	valid_0's l2: 1080.55
[24]	valid_0's l2: 1076.18
[25]	valid_0's l2: 1070.49
[26]	valid_0's l2: 1072.39
[27]	valid_0's l2: 1075.17
[28]	valid_0's l2: 1071.37
[29]	valid_0's l2: 1072.91
[30]	valid_0's l2: 1072.08
[31]	valid_0's l2: 1072.38
[32]	valid_0's l2: 1079.03
[33]	valid_0's l2: 1077.71
[34]	valid_0's l2: 1068.45
[35]	valid_0's l2: 1065.14
[36]	valid_0's l2: 1066.76
[37]	valid_0's l2: 1063.24
[38]	valid

[98]	valid_0's l2: 1268.82
[99]	valid_0's l2: 1268.06
[100]	valid_0's l2: 1266.94
[101]	valid_0's l2: 1265.14
[102]	valid_0's l2: 1264.52
[103]	valid_0's l2: 1263.92
[104]	valid_0's l2: 1263.16
[105]	valid_0's l2: 1262.18
[106]	valid_0's l2: 1260.76
[107]	valid_0's l2: 1260.18
[108]	valid_0's l2: 1259.14
[109]	valid_0's l2: 1258.23
[110]	valid_0's l2: 1257.32
[111]	valid_0's l2: 1255.94
[112]	valid_0's l2: 1254.52
[113]	valid_0's l2: 1253.11
[114]	valid_0's l2: 1252.79
[115]	valid_0's l2: 1250.87
[116]	valid_0's l2: 1249.93
[117]	valid_0's l2: 1248.98
[118]	valid_0's l2: 1248.15
[119]	valid_0's l2: 1247.26
[120]	valid_0's l2: 1246.29
[121]	valid_0's l2: 1245.76
[122]	valid_0's l2: 1245.1
[123]	valid_0's l2: 1244.32
[124]	valid_0's l2: 1243.45
[125]	valid_0's l2: 1242.57
[126]	valid_0's l2: 1241.48
[127]	valid_0's l2: 1239.66
[128]	valid_0's l2: 1238.27
[129]	valid_0's l2: 1238.29
[130]	valid_0's l2: 1237.67
[131]	valid_0's l2: 1236.88
[132]	valid_0's l2: 1235.89
[133]	valid_0's l2: 123

[15]	valid_0's l2: 1072.84
[16]	valid_0's l2: 1091.2
[17]	valid_0's l2: 1095
[18]	valid_0's l2: 1094.59
[19]	valid_0's l2: 1071
[20]	valid_0's l2: 1069.9
[21]	valid_0's l2: 1070.73
[22]	valid_0's l2: 1067.11
[23]	valid_0's l2: 1085.4
| [0m13       [0m | [0m0.2953   [0m | [0m0.3      [0m | [0m5.828    [0m | [0m268.0    [0m |
[1]	valid_0's l2: 1517.32
[2]	valid_0's l2: 1502.71
[3]	valid_0's l2: 1487.89
[4]	valid_0's l2: 1474.8
[5]	valid_0's l2: 1467.04
[6]	valid_0's l2: 1451.82
[7]	valid_0's l2: 1445.51
[8]	valid_0's l2: 1434.14
[9]	valid_0's l2: 1425.02
[10]	valid_0's l2: 1414
[11]	valid_0's l2: 1411.25
[12]	valid_0's l2: 1404.28
[13]	valid_0's l2: 1392.36
[14]	valid_0's l2: 1383.55
[15]	valid_0's l2: 1374.21
[16]	valid_0's l2: 1366.01
[17]	valid_0's l2: 1358.36
[18]	valid_0's l2: 1347.22
[19]	valid_0's l2: 1340.24
[20]	valid_0's l2: 1331.17
[21]	valid_0's l2: 1325.19
[22]	valid_0's l2: 1318.23
[23]	valid_0's l2: 1311.55
[24]	valid_0's l2: 1308.36
[25]	valid_0's l2: 1302.78
[2

[160]	valid_0's l2: 1169
[161]	valid_0's l2: 1168.87
[162]	valid_0's l2: 1166.62
[163]	valid_0's l2: 1166.44
[164]	valid_0's l2: 1166.01
[165]	valid_0's l2: 1165.98
[166]	valid_0's l2: 1165.5
[167]	valid_0's l2: 1165.45
[168]	valid_0's l2: 1164.35
[169]	valid_0's l2: 1164.29
[170]	valid_0's l2: 1163.47
[171]	valid_0's l2: 1163.01
[172]	valid_0's l2: 1162.24
[173]	valid_0's l2: 1161.96
[174]	valid_0's l2: 1161.93
[175]	valid_0's l2: 1161.55
[176]	valid_0's l2: 1159.98
[177]	valid_0's l2: 1159.67
[178]	valid_0's l2: 1157.46
[179]	valid_0's l2: 1156.7
[180]	valid_0's l2: 1156.31
[181]	valid_0's l2: 1156.36
[182]	valid_0's l2: 1156.15
[183]	valid_0's l2: 1155.5
[184]	valid_0's l2: 1155.17
[185]	valid_0's l2: 1154.46
[186]	valid_0's l2: 1153.75
[187]	valid_0's l2: 1153
[188]	valid_0's l2: 1152.38
[189]	valid_0's l2: 1152
[190]	valid_0's l2: 1151.65
[191]	valid_0's l2: 1150.56
[192]	valid_0's l2: 1150.4
[193]	valid_0's l2: 1150.44
[194]	valid_0's l2: 1149.46
[195]	valid_0's l2: 1149.33
[196]

[1]	valid_0's l2: 1483.46
[2]	valid_0's l2: 1459.98
[3]	valid_0's l2: 1422.43
[4]	valid_0's l2: 1403.08
[5]	valid_0's l2: 1375.31
[6]	valid_0's l2: 1352.46
[7]	valid_0's l2: 1329.1
[8]	valid_0's l2: 1301.95
[9]	valid_0's l2: 1282.71
[10]	valid_0's l2: 1275.46
[11]	valid_0's l2: 1270.14
[12]	valid_0's l2: 1256.42
[13]	valid_0's l2: 1248.84
[14]	valid_0's l2: 1244.78
[15]	valid_0's l2: 1232.24
[16]	valid_0's l2: 1226.29
[17]	valid_0's l2: 1224.46
[18]	valid_0's l2: 1216.78
[19]	valid_0's l2: 1212.43
[20]	valid_0's l2: 1208.25
[21]	valid_0's l2: 1206.06
[22]	valid_0's l2: 1198.99
[23]	valid_0's l2: 1197.23
[24]	valid_0's l2: 1196.56
[25]	valid_0's l2: 1188.42
[26]	valid_0's l2: 1181.23
[27]	valid_0's l2: 1179.73
[28]	valid_0's l2: 1176.08
[29]	valid_0's l2: 1177.34
[30]	valid_0's l2: 1178.85
[31]	valid_0's l2: 1169.82
[32]	valid_0's l2: 1166.27
[33]	valid_0's l2: 1165.43
[34]	valid_0's l2: 1165.63
[35]	valid_0's l2: 1166.23
[36]	valid_0's l2: 1157.03
[37]	valid_0's l2: 1145.72
[38]	valid_

In [30]:
best_hyperparameters = bayesian_opt.max
best_hyperparameters['params'] = {param: int(value) if param in [
    'depth', 'iterations', 'n_estimators'] else value for param, value in best_hyperparameters['params'].items()}
print("Best hyperparameters:", best_hyperparameters['params'])
print(f"Best R-squared Score: {best_hyperparameters['target']:.4f}")

Best hyperparameters: {'learning_rate': 0.2266081897853194, 'max_depth': 5.754556550446216, 'n_estimators': 495.63868474115566}
Best R-squared Score: 0.3055


In [31]:
params = {
    'max_depth': 5,            
    'learning_rate': 0.2266081897853194,  
    'n_estimators':495     
}
model_LGBM = LGBMRegressor(**params, eval_metric  = 'r2', early_stopping_rounds=10, random_state=27)
model_LGBM.fit(x_train_LGBM, y_train_LGBM, eval_set = (x_valid_LGBM, y_valid_LGBM))
y_pred_LGBM = model_LGBM.predict(x_train_LGBM)
y_val_LGBM = model_LGBM.predict(x_valid_LGBM)
print("Train R2: ", r2_score(y_train_LGBM, y_pred_LGBM))
print("Train RMSE: ", np.sqrt(mse(y_train_LGBM, y_pred_LGBM)))
print("Validation R2: ", r2_score(y_valid_LGBM, y_val_LGBM))
print("Validation RMSE: ", np.sqrt(mse(y_valid_LGBM, y_val_LGBM)))

[1]	valid_0's l2: 1419.59
[2]	valid_0's l2: 1349.15
[3]	valid_0's l2: 1321.46
[4]	valid_0's l2: 1293.75
[5]	valid_0's l2: 1251.56
[6]	valid_0's l2: 1188.31
[7]	valid_0's l2: 1170.52
[8]	valid_0's l2: 1145.34
[9]	valid_0's l2: 1118.83
[10]	valid_0's l2: 1122.13
[11]	valid_0's l2: 1117.64
[12]	valid_0's l2: 1129.54
[13]	valid_0's l2: 1120.41
[14]	valid_0's l2: 1122.11
[15]	valid_0's l2: 1089.63
[16]	valid_0's l2: 1086.14
[17]	valid_0's l2: 1085.11
[18]	valid_0's l2: 1086.35
[19]	valid_0's l2: 1089.99
[20]	valid_0's l2: 1088.73
[21]	valid_0's l2: 1084.62
[22]	valid_0's l2: 1079.85
[23]	valid_0's l2: 1080.55
[24]	valid_0's l2: 1076.18
[25]	valid_0's l2: 1070.49
[26]	valid_0's l2: 1072.39
[27]	valid_0's l2: 1075.17
[28]	valid_0's l2: 1071.37
[29]	valid_0's l2: 1072.91
[30]	valid_0's l2: 1072.08
[31]	valid_0's l2: 1072.38
[32]	valid_0's l2: 1079.03
[33]	valid_0's l2: 1077.71
[34]	valid_0's l2: 1068.45
[35]	valid_0's l2: 1065.14
[36]	valid_0's l2: 1066.76
[37]	valid_0's l2: 1063.24
[38]	valid

После подбора гиперпараметров мы еще улучшили метрики и снизили переобучение для модели lightgbm на вадидации R2: 0.31, RMSE: 32.38.

**Выводы**

Мы сравнили 4 регрессионые модели: DecisionTreeRegressor, CatBoostRegressor, LGBMRegressor, XGBRegressor. Наилучший результат и наименьшее переобучение показала модель lightgbm с результатом на трейне: r2= 0.70 и RMSE= 15.54 и тесте: r2=0.31 и RMSE=32.38. Данная модель объясняет 32% дисперсии между тренеровочными и тестовыми данными и корень среднеквадратичной ощибки составляет 32.38 микромоль (мкМ).

**Рекомендации**

Для улучшения значения метрики и снижения переобучения нужно будет увеличить количество данных. Также можно попробовать логафрифмировать целевую переменную.