In [1]:
import os
from scipy.stats import randint
import random
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import joblib
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
import pandas as pd
import numpy as np


# Путь к папке с файлами
folder_path = 'Clean_data'

# Список файлов в папке
files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Проверка на наличие CSV файлов
if not files:
    print("В папке нет CSV файлов!")
else:
    print(f"Найдено {len(files)} CSV файлов в папке.")

    # Загрузка первого файла для инициализации
    merged_data = pd.read_csv(os.path.join(folder_path, files[0]))

    # Объединение остальных файлов по ключу ID
    for file in files[1:]:
        data = pd.read_csv(os.path.join(folder_path, file))

        # Добавляем префикс к столбцам перед объединением
        prefix = os.path.splitext(file)[0] + '_'
        data.columns = [prefix + col if col != 'ID' else col for col in data.columns]

        merged_data = pd.merge(merged_data, data, on='ID', how='outer')

Найдено 16 CSV файлов в папке.


In [2]:
df = merged_data.drop('ID', axis=1)
df.head(10)

Unnamed: 0.1,Unnamed: 0,Number_male,Weight_of_progeny,Sex_progeny,File_10_Unnamed: 0,File_10_Number_male,File_10_Weight_of_progeny,File_10_Sex_progeny,File_12_Unnamed: 0,File_12_Class,...,File_3_Weight_in_plus1,File_1_Unnamed: 0,File_1_Date_of_birth,File_1_Weight_on_buy,File_1_Weight_on_3.01.17,File_1_Weight_in_plus,File_1_Weight_on_4.01.18,File_1_Weight_on_31.12.18,File_1_Lactation,File_1_Combined_Column
0,0.0,973,2000.0,м,,,,,,,...,,,,,,,,,,
1,0.0,973,2000.0,м,,,,,,,...,,,,,,,,,,
2,0.0,973,2000.0,м,,,,,,,...,,,,,,,,,,
3,0.0,973,2000.0,м,,,,,,,...,,,,,,,,,,
4,2.0,973,2950.0,ж,,,,,,,...,,,,,,,,,,
5,3.0,973,1900.0,м,,,,,,,...,,,,,,,,,,
6,4.0,973,2890.178571,ж,,,,,,,...,,,,,,,,,,
7,4.0,973,2890.178571,ж,,,,,,,...,,,,,,,,,,
8,4.0,973,2890.178571,ж,,,,,,,...,,,,,,,,,,
9,4.0,973,2890.178571,ж,,,,,,,...,,,,,,,,,,


In [282]:
merged_data = merged_data.loc[:, ~merged_data.columns.str.contains('Unnamed')]

In [283]:
target_columns = merged_data.filter(like='Target').columns
target_columns

Index(['Target_Target'], dtype='object')

In [284]:
# Находим 25 и 75 перцентили
percentile_25 = merged_data['Target_Target'].quantile(0.25)
percentile_75 = merged_data['Target_Target'].quantile(0.75)

print(f"25 перцентиль: {percentile_25}")
print(f"75 перцентиль: {percentile_75}")

25 перцентиль: 2.1041851510844047
75 перцентиль: 3.16363032289184


In [285]:
# Функция для генерации рандомного значения в заданном диапазоне с шагом 0.01
def generate_random_value(start, stop, step=0.01):
    return round(random.uniform(start, stop) // step * step, 2)

# Заполнение пропусков рандомными значениями
for index, value in merged_data['Target_Target'].items():
    if pd.isnull(value):
        merged_data.at[index, 'Target_Target'] = generate_random_value(percentile_25, percentile_75)

In [286]:
merged_data.Target_Target

0       2.41
1       2.87
2       2.91
3       2.26
4       3.07
        ... 
4675    2.12
4676    2.76
4677    2.64
4678    2.23
4679    2.67
Name: Target_Target, Length: 4680, dtype: float64

In [287]:
missing_threshold = 0.8 * len(merged_data)
columns_to_drop = merged_data.columns[merged_data.isna().sum() > missing_threshold]

# Удаление найденных колонок из датафрейма
merged_data = merged_data.drop(columns=columns_to_drop)

In [289]:
len(merged_data.ID.unique())

768

In [290]:
# Применение агрегаций к столбцам типа float64 и object
data = merged_data.drop('ID', axis=1)
aggregations = {
    col: 'mean' for col in data.select_dtypes(include=['float64']).columns
}

aggregations.update({
    col: lambda x: ','.join(map(str, x.unique())) for col in data.select_dtypes(include=['object']).columns
})

df = merged_data.groupby('ID').agg(aggregations).reset_index()

In [292]:
df.isna().sum()

ID                             0
File_10_Weight_of_progeny    316
File_12_Fat                  600
File_12_Protein              600
File_12_Lact_count           600
File_12_Total_yield          600
File_12_Count_of_month       600
File_13_Fat                  565
File_13_Protein              565
File_13_Lact_count           565
File_13_Total_yield          565
File_13_Count_of_month       565
File_8_Weight_of_progeny     658
File_14_Fat                  565
File_14_Protein              565
File_14_Lact_count           565
File_14_Total_yield          565
File_14_Count_of_month       565
File_15_Fat                  630
File_15_Protein              630
File_15_Lact_count           630
File_15_Total_yield          630
File_15_Count_of_month       630
File_9_Weight_of_progeny     658
File_4_Weight_of_progeny     424
File_5_Weight_of_progeny     613
File_7_Weight_of_progeny     471
File_2_Weight                518
File_2_Weight_on_31.12.18    518
Target_Target                  0
File_3_Wei

In [293]:
# 1. Найдем все колонки с корнем "Weight_of_progeny"
weight_columns = [col for col in df.columns if "Weight_of_progeny" in col]

# 2. Найдем минимальное и максимальное значение для каждой колонки
min_values = {col: df[col].min() for col in weight_columns}
max_values = {col: df[col].max() for col in weight_columns}

# 3. Заполним пропуски в каждой колонке
for col in weight_columns:
    # Генерируем случайные значения с шагом 0.1 в диапазоне от минимального до максимального
    random_values = np.arange(min_values[col], max_values[col] + 0.1, 0.1)

    # Заполняем пропуски случайными значениями из этого диапазона
    df[col].fillna(np.random.choice(random_values), inplace=True)

# 4. Создадим новую колонку "Weight_of_progeny_final"
df['Weight_of_progeny_final'] = df[weight_columns].max(axis=1)


In [294]:
df.Weight_of_progeny_final

0      3.05
1      3.05
2      3.05
3      3.05
4      3.05
       ... 
763    3.05
764    3.20
765    3.20
766    3.20
767    3.05
Name: Weight_of_progeny_final, Length: 768, dtype: float64

In [295]:
# Удаляем исходные колонки
df.drop(columns=weight_columns, inplace=True)

In [297]:
fat_columns = [col for col in df.columns if "Fat" in col]

min_values = {col: df[col].min() for col in fat_columns}
max_values = {col: df[col].max() for col in fat_columns}

for col in fat_columns:

    random_values = np.arange(min_values[col], max_values[col] + 0.1, 0.1)

    df[col].fillna(np.random.choice(random_values), inplace=True)

df['Fat_final'] = df[fat_columns].max(axis=1)

In [298]:
df.drop(columns=fat_columns, inplace=True)

In [300]:
protein_columns = [col for col in df.columns if "Protein" in col]

min_values = {col: df[col].min() for col in protein_columns}
max_values = {col: df[col].max() for col in protein_columns}

for col in protein_columns:

    random_values = np.arange(min_values[col], max_values[col] + 0.1, 0.1)

    df[col].fillna(np.random.choice(random_values), inplace=True)

df['Protein_final'] = df[protein_columns].max(axis=1)

In [301]:
df.drop(columns=protein_columns, inplace=True)

In [303]:
lact_columns = [col for col in df.columns if "Lact_count" in col]

min_values = {col: df[col].min() for col in lact_columns}
max_values = {col: df[col].max() for col in lact_columns}

for col in lact_columns:

    random_values = np.arange(min_values[col], max_values[col] + 0.1, 0.1)

    df[col].fillna(np.random.choice(random_values), inplace=True)

df['Lact_final'] = df[lact_columns].max(axis=1)

In [304]:
df.drop(columns=lact_columns, inplace=True)

In [306]:
yield_columns = [col for col in df.columns if "Total_yield" in col]

min_values = {col: df[col].min() for col in yield_columns}
max_values = {col: df[col].max() for col in yield_columns}

for col in yield_columns:

    random_values = np.arange(min_values[col], max_values[col] + 0.1, 0.1)

    df[col].fillna(np.random.choice(random_values), inplace=True)

df['Yield_final'] = df[yield_columns].max(axis=1)

In [307]:
df.drop(columns=yield_columns, inplace=True)

In [309]:
month_columns = [col for col in df.columns if "Count_of_month" in col]

min_values = {col: df[col].min() for col in month_columns}
max_values = {col: df[col].max() for col in month_columns}

for col in month_columns:

    random_values = np.arange(min_values[col], max_values[col] + 0.1, 0.1)

    df[col].fillna(np.random.choice(random_values), inplace=True)

df['Count_month_final'] = df[month_columns].max(axis=1)

In [310]:
df.drop(columns=month_columns, inplace=True)

In [312]:
weight_on_columns = [col for col in df.columns if "Weight_on" in col]

min_values = {col: df[col].min() for col in weight_on_columns}
max_values = {col: df[col].max() for col in weight_on_columns}

for col in weight_on_columns:

    random_values = np.arange(min_values[col], max_values[col] + 0.1, 0.1)

    df[col].fillna(np.random.choice(random_values), inplace=True)

df['Weight_on_final'] = df[weight_on_columns].max(axis=1)

In [313]:
df.drop(columns=weight_on_columns, inplace=True)

In [315]:
weight_in_columns = [col for col in df.columns if "Weight_in" in col]

min_values = {col: df[col].min() for col in weight_in_columns}
max_values = {col: df[col].max() for col in weight_in_columns}

for col in weight_in_columns:

    random_values = np.arange(min_values[col], max_values[col])

    df[col].fillna(np.random.choice(random_values), inplace=True)

df['Weight_in_final'] = df[weight_in_columns].max(axis=1)

df.drop(columns=weight_in_columns, inplace=True)

In [317]:
lactation_in_columns = [col for col in df.columns if "Lactation" in col]

min_values = {col: df[col].min() for col in lactation_in_columns}
max_values = {col: df[col].max() for col in lactation_in_columns}

for col in lactation_in_columns:

    random_values = np.arange(min_values[col], max_values[col] + 0.1, 0.1)

    df[col].fillna(np.random.choice(random_values), inplace=True)

df['Lactation_final'] = df[lactation_in_columns].max(axis=1)

df.drop(columns=lactation_in_columns, inplace=True)

In [318]:
df.head()

Unnamed: 0,ID,File_2_Weight,Target_Target,File_1_Combined_Column,File_10_Number_male,File_10_Sex_progeny,File_12_Class,File_13_Class,File_8_Number_male,File_8_Sex_progeny,...,File_1_Date_of_birth,Weight_of_progeny_final,Fat_final,Protein_final,Lact_final,Yield_final,Count_month_final,Weight_on_final,Weight_in_final,Lactation_final
0,1,,2.83,,75,м,,,,,...,,3.05,3.8,3.3,3.1,3093.6,11.7,45.0,22.2,2.3
1,2,,2.363333,,75,м,,,,,...,,3.05,3.8,3.3,3.1,3093.6,11.7,45.0,22.2,2.3
2,3,,2.643333,,75,м,,,,,...,,3.05,3.8,3.3,3.1,3093.6,11.7,45.0,22.2,2.3
3,7,,2.655,,К925,"м,ж",,,,,...,,3.05,3.8,3.3,3.1,3093.6,11.7,45.0,22.2,2.3
4,19,,2.34,,75,м,,,,,...,,3.05,3.8,3.3,3.1,3093.6,11.7,45.0,22.2,2.3


In [319]:
combined_columns = [col for col in df.columns if "Combined_Column" in col]

min_values = {col: df[col].min() for col in combined_columns}
max_values = {col: df[col].max() for col in combined_columns}

for col in combined_columns:

    random_values = np.arange(min_values[col], max_values[col])

    df[col].fillna(np.random.choice(random_values), inplace=True)

df['Combined_final'] = df[combined_columns].max(axis=1)

df.drop(columns=combined_columns, inplace=True)

In [320]:
df.head()

Unnamed: 0,ID,File_2_Weight,Target_Target,File_10_Number_male,File_10_Sex_progeny,File_12_Class,File_13_Class,File_8_Number_male,File_8_Sex_progeny,File_14_Class,...,Weight_of_progeny_final,Fat_final,Protein_final,Lact_final,Yield_final,Count_month_final,Weight_on_final,Weight_in_final,Lactation_final,Combined_final
0,1,,2.83,75,м,,,,,,...,3.05,3.8,3.3,3.1,3093.6,11.7,45.0,22.2,2.3,12.0
1,2,,2.363333,75,м,,,,,,...,3.05,3.8,3.3,3.1,3093.6,11.7,45.0,22.2,2.3,12.0
2,3,,2.643333,75,м,,,,,,...,3.05,3.8,3.3,3.1,3093.6,11.7,45.0,22.2,2.3,12.0
3,7,,2.655,К925,"м,ж",,,,,,...,3.05,3.8,3.3,3.1,3093.6,11.7,45.0,22.2,2.3,12.0
4,19,,2.34,75,м,,,,,,...,3.05,3.8,3.3,3.1,3093.6,11.7,45.0,22.2,2.3,12.0


In [321]:
# Создаем список колонок, содержащих "Number_male"
cols_to_combine = [col for col in df.columns if 'Number_male' in col]

# Заполним пропуски самыми популярными значениями для каждой колонки
most_common_values = df[cols_to_combine].mode().iloc[0]
for col in cols_to_combine:
    df[col].fillna(most_common_values[col], inplace=True)

df['Number_male_final'] = df[cols_to_combine[0]]

for col in cols_to_combine[1:]:
    df['Number_male_final'].fillna(df[col], inplace=True)

df.drop(columns=cols_to_combine, inplace=True)

In [322]:
df.Number_male_final

0            75
1            75
2            75
3          К925
4            75
         ...   
763        К914
764         nan
765         nan
766         nan
767    нач сент
Name: Number_male_final, Length: 768, dtype: object

In [324]:
progeny_columns = [col for col in df.columns if 'Sex_progeny' in col]

most_common_values = df[progeny_columns].mode().iloc[0]
for col in progeny_columns:
    df[col].fillna(most_common_values[col], inplace=True)

df['Progeny_final'] = df[progeny_columns[0]]

for col in progeny_columns[1:]:
    df['Progeny_final'].fillna(df[col], inplace=True)

df.drop(columns=progeny_columns, inplace=True)

In [326]:
# Найдем все колонки, содержащие 'Class'
class_columns = [col for col in df.columns if 'Class' in col]

# Заполним пропуски самыми популярными значениями для каждой колонки
most_common_values = df[class_columns].mode().iloc[0]
for col in class_columns:
    df[col].fillna(most_common_values[col], inplace=True)

# Создаем новую колонку 'Class_final' и заполняем ее значениями из первой колонки из class_columns
df['Class_final'] = df[class_columns[0]]

# Добавляем значения из остальных колонок
for col in class_columns[1:]:
    df['Class_final'].fillna(df[col], inplace=True)

# Удаляем исходные колонки с 'Class'
df.drop(columns=class_columns, inplace=True)

In [328]:
birth_columns = [col for col in df.columns if 'Date_of_birth' in col]

In [329]:
df[birth_columns]

Unnamed: 0,File_2_Date_of_birth,File_1_Date_of_birth
0,,
1,,
2,,
3,,
4,,
...,...,...
763,,
764,2017-02-27,
765,2017-02-27,
766,,2015-09-02 00:00:00.000000000


In [330]:
df.File_2_Date_of_birth.unique()

array(['nan', '2017-02-27', '2015-02-05,2015-04-17,2015-05-01',
       '2015-04-20,2015-04-29', '2015-05-05,2015-06-11',
       '2015-02-25,2015-05-06', '2015-04-22', '2015-03-22', '2015-02-18',
       '2015-04-23', '2015-06-13', '2015-05-10', '2015-05-07',
       '2015-09-19', '2015-02-22,2017-02-27', '2015-05-13',
       '2015-03-04,2015-06-16',
       '2014-12-22,2015-05-01,2015-05-03,2015-10-01', '2015-05-22',
       '2015-02-19', '2015-03-02,2017-02-27', '2015-02-03,2015-02-27',
       '2015-05-06,2017-02-27', '2015-03-03,2015-04-26',
       '2015-02-03,2015-02-20,2017-02-27', '2015-09-23',
       '2015-03-21,2015-03-12', '2015-02-06,2017-02-27', '2015-04-15',
       '2015-02-18,2015-03-21,2015-10-04', '2015-02-20,2015-03-05',
       '2015-02-25', '2015-03-13,2015-06-08,2015-09-15',
       '2015-03-10,2015-06-15', '2015-03-20',
       '2015-05-16,2015-09-02,2015-10-07', '2015-02-10,2015-02-27',
       '2015-05-03,2017-02-27', '2015-03-04,2015-10-21', '2015-04-10',
       '2015-10-

In [331]:
df = df.drop(birth_columns, axis=1)
df = df.drop('File_2_Weight', axis=1)
df = df.drop('ID', axis=1)

In [332]:
df

Unnamed: 0,Target_Target,Weight_of_progeny_final,Fat_final,Protein_final,Lact_final,Yield_final,Count_month_final,Weight_on_final,Weight_in_final,Lactation_final,Combined_final,Number_male_final,Progeny_final,Class_final
0,2.830000,3.05,3.8,3.3,3.1,3093.6,11.7,45.0,22.2,2.3,12.000000,75,м,
1,2.363333,3.05,3.8,3.3,3.1,3093.6,11.7,45.0,22.2,2.3,12.000000,75,м,
2,2.643333,3.05,3.8,3.3,3.1,3093.6,11.7,45.0,22.2,2.3,12.000000,75,м,
3,2.655000,3.05,3.8,3.3,3.1,3093.6,11.7,45.0,22.2,2.3,12.000000,К925,"м,ж",
4,2.340000,3.05,3.8,3.3,3.1,3093.6,11.7,45.0,22.2,2.3,12.000000,75,м,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,2.580000,3.05,3.8,3.3,3.1,3093.6,11.7,45.0,22.2,2.3,12.000000,К914,м,
764,2.920000,3.20,3.8,3.3,3.1,3093.6,11.7,60.0,22.2,2.3,12.000000,,,
765,2.870000,3.20,3.8,3.3,3.1,3093.6,11.7,60.0,22.2,2.3,12.000000,,,
766,2.670000,3.20,3.8,3.3,3.1,3093.6,11.7,59.3,22.2,2.0,30.829064,,,


In [333]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Target_Target            768 non-null    float64
 1   Weight_of_progeny_final  768 non-null    float64
 2   Fat_final                768 non-null    float64
 3   Protein_final            768 non-null    float64
 4   Lact_final               768 non-null    float64
 5   Yield_final              768 non-null    float64
 6   Count_month_final        768 non-null    float64
 7   Weight_on_final          768 non-null    float64
 8   Weight_in_final          768 non-null    float64
 9   Lactation_final          768 non-null    float64
 10  Combined_final           768 non-null    float64
 11  Number_male_final        768 non-null    object 
 12  Progeny_final            768 non-null    object 
 13  Class_final              768 non-null    object 
dtypes: float64(11), object(3)


In [334]:
X = df.drop('Target_Target', axis=1)
y = df['Target_Target']

# Обработка категориальных признаков
categorical_cols = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Стандартизация числовых признаков
numerical_cols = X.select_dtypes(include=['float64']).columns
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Обучение модели линейной регрессии
model = LinearRegression()
model.fit(X_train, y_train)

# Предсказание на тестовой выборке
y_pred = model.predict(X_test)

# Оценка качества модели (RMSE и MAE)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")

Root Mean Squared Error (RMSE): 12345073.436580572
Mean Absolute Error (MAE): 1192951.330950536


In [335]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

# Предсказание на тестовой выборке
y_pred = rf.predict(X_test)

# Оценка качества модели (RMSE и MAE)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")

Root Mean Squared Error (RMSE): 0.3455640129087661
Mean Absolute Error (MAE): 0.22786558321955042


In [336]:
predictions = pd.DataFrame({
    'Real_Value': y_test,
    'Predicted_Value': y_pred
})
predictions

Unnamed: 0,Real_Value,Predicted_Value
668,2.428889,2.415743
324,2.220000,2.459227
624,2.715667,2.548331
690,2.670000,2.578052
473,2.430000,2.428636
...,...,...
355,2.520000,2.620671
534,3.110000,2.621746
344,2.506667,2.620671
296,2.520000,2.674716


In [337]:
svr_model = SVR(kernel='rbf')
svr_model.fit(X_train, y_train)

y_pred = svr_model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")

Root Mean Squared Error (RMSE): 0.35518579161756547
Mean Absolute Error (MAE): 0.23634695254165702


In [338]:
catboost_model = CatBoostRegressor()
catboost_model.fit(X_train, y_train, eval_set=(X_test, y_test))

y_pred = catboost_model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")

Learning rate set to 0.047167
0:	learn: 0.3981672	test: 0.4170169	best: 0.4170169 (0)	total: 1.08ms	remaining: 1.08s
1:	learn: 0.3923528	test: 0.4124919	best: 0.4124919 (1)	total: 1.49ms	remaining: 744ms
2:	learn: 0.3873129	test: 0.4080969	best: 0.4080969 (2)	total: 1.86ms	remaining: 617ms
3:	learn: 0.3804474	test: 0.4039477	best: 0.4039477 (3)	total: 2.39ms	remaining: 595ms
4:	learn: 0.3770430	test: 0.4019249	best: 0.4019249 (4)	total: 2.93ms	remaining: 583ms
5:	learn: 0.3716350	test: 0.3996748	best: 0.3996748 (5)	total: 3.47ms	remaining: 574ms
6:	learn: 0.3672644	test: 0.3978666	best: 0.3978666 (6)	total: 4.07ms	remaining: 577ms
7:	learn: 0.3644064	test: 0.3958621	best: 0.3958621 (7)	total: 4.58ms	remaining: 568ms
8:	learn: 0.3610224	test: 0.3937949	best: 0.3937949 (8)	total: 5.14ms	remaining: 566ms
9:	learn: 0.3572060	test: 0.3906463	best: 0.3906463 (9)	total: 5.64ms	remaining: 558ms
10:	learn: 0.3531181	test: 0.3883355	best: 0.3883355 (10)	total: 6.11ms	remaining: 549ms
11:	learn: 

In [350]:
param_grid = {
    'n_estimators': randint(100, 1000),
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None] + list(randint(3, 30).rvs(10)),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 11),
    'bootstrap': [True, False]
}

# Создание объекта RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=100,
                                   cv=3, verbose=2, random_state=42, n_jobs=-1)

# Выполнение поиска по сетке
random_search.fit(X, y)

# Вывод лучших гиперпараметров
print("Лучшие гиперпараметры:", random_search.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


102 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
88 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/andreypronin/anaconda3/envs/GoatsFarm/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/andreypronin/anaconda3/envs/GoatsFarm/lib/python3.8/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/andreypronin/anaconda3/envs/GoatsFarm/lib/python3.8/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/andreypronin/anaconda3/envs/GoatsFarm/lib/python3.

Лучшие гиперпараметры: {'bootstrap': True, 'max_depth': 28, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 476}


In [351]:
rf = RandomForestRegressor(n_estimators=476, min_samples_leaf=1, min_samples_split=2, max_depth=28, bootstrap=False, max_features='sqrt')
rf.fit(X_train, y_train)

# Предсказание на тестовой выборке
y_pred = rf.predict(X_test)

# Оценка качества модели (RMSE и MAE)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")

Root Mean Squared Error (RMSE): 0.35205101564842384
Mean Absolute Error (MAE): 0.22761180916592055


In [341]:
# Задаем модель CatBoost
catboost = CatBoostRegressor(verbose=0)

# Сетка гиперпараметров
param_dist = {
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1],
    'depth': sp_randint(3, 10),
    'l2_leaf_reg': sp_randint(1, 10),
    'iterations': sp_randint(50, 1000),
    'border_count': [32, 64, 128],
    'bagging_temperature': [0, 1, 2, 3, 4],
    'random_strength': [0.1, 0.5, 1, 2, 3],
    'bootstrap_type': ['Bayesian', 'Bernoulli'],
    'boosting_type': ['Ordered', 'Plain'],
    'max_ctr_complexity': [1, 2, 3],
    'has_time': [False, True],
    'one_hot_max_size': [2, 5, 10, 20],
    'min_child_samples': [1, 2, 3, 4],
    'model_shrink_rate': [0.01, 0.05, 0.1, 0.5, 1],
    'feature_border_type': ['GreedyLogSum', 'Median', 'Uniform', 'MaxLogSum']
}

# Инициализация RandomizedSearchCV
random_search = RandomizedSearchCV(catboost, param_distributions=param_dist, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

random_search.fit(X, y)

# Лучшие параметры
print("Лучшие гиперпараметры:", random_search.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=True, max_depth=11, max_features=auto, min_samples_leaf=8, min_samples_split=6, n_estimators=714; total time=   0.0s
[CV] END bootstrap=False, max_depth=19, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=559; total time=   0.2s
[CV] END bootstrap=True, max_depth=29, max_features=log2, min_samples_leaf=3, min_samples_split=8, n_estimators=343; total time=   0.2s
[CV] END bootstrap=False, max_depth=14, max_features=sqrt, min_samples_leaf=9, min_samples_split=6, n_estimators=997; total time=   0.4s
[CV] END bootstrap=True, max_depth=26, max_features=sqrt, min_samples_leaf=7, min_samples_split=10, n_estimators=801; total time=   0.4s
[CV] END bootstrap=False, max_depth=26, max_features=sqrt, min_samples_leaf=3, min_samples_split=2, n_estimators=838; total time=   0.3s
[CV] END bootstrap=False, max_depth=11, max_features=sqrt, min_samples_leaf=10, min_samples_split=8, n_estimators=447

141 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
129 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/andreypronin/anaconda3/envs/GoatsFarm/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/andreypronin/anaconda3/envs/GoatsFarm/lib/python3.8/site-packages/catboost/core.py", line 5734, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
  File "/Users/andreypronin/anaconda3/envs/GoatsFarm/lib/python3.8/site-packages/catboost/core.py", line 2341, in _fit
    train_param

[CV] END bagging_temperature=2, boosting_type=Plain, bootstrap_type=Bayesian, border_count=64, depth=3, feature_border_type=GreedyLogSum, has_time=False, iterations=460, l2_leaf_reg=7, learning_rate=0.01, max_ctr_complexity=2, min_child_samples=4, model_shrink_rate=0.05, one_hot_max_size=5, random_strength=3; total time=   0.3s
[CV] END bagging_temperature=2, boosting_type=Plain, bootstrap_type=Bayesian, border_count=64, depth=3, feature_border_type=GreedyLogSum, has_time=False, iterations=460, l2_leaf_reg=7, learning_rate=0.01, max_ctr_complexity=2, min_child_samples=4, model_shrink_rate=0.05, one_hot_max_size=5, random_strength=3; total time=   0.3s
[CV] END bagging_temperature=2, boosting_type=Plain, bootstrap_type=Bayesian, border_count=64, depth=3, feature_border_type=GreedyLogSum, has_time=False, iterations=460, l2_leaf_reg=7, learning_rate=0.01, max_ctr_complexity=2, min_child_samples=4, model_shrink_rate=0.05, one_hot_max_size=5, random_strength=3; total time=   0.2s
[CV] END b

In [342]:
catboost_model = CatBoostRegressor(bagging_temperature=2, boosting_type='Ordered', bootstrap_type='Bayesian',
                                   border_count=64, depth=9, feature_border_type='MaxLogSum', has_time=False,
                                   iterations=519, l2_leaf_reg=2, learning_rate=0.01, max_ctr_complexity=3,
                                   min_child_samples=2, model_shrink_rate=0.01, one_hot_max_size=2, random_strength=2)
catboost_model.fit(X_train, y_train, eval_set=(X_test, y_test))

y_pred = catboost_model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")

0:	learn: 0.4016885	test: 0.4197203	best: 0.4197203 (0)	total: 2.46ms	remaining: 1.27s
1:	learn: 0.4007478	test: 0.4189093	best: 0.4189093 (1)	total: 3.71ms	remaining: 958ms
2:	learn: 0.3995857	test: 0.4182341	best: 0.4182341 (2)	total: 5.36ms	remaining: 923ms
3:	learn: 0.3985746	test: 0.4173807	best: 0.4173807 (3)	total: 6.08ms	remaining: 783ms
4:	learn: 0.3975203	test: 0.4168539	best: 0.4168539 (4)	total: 8.67ms	remaining: 892ms
5:	learn: 0.3965042	test: 0.4160080	best: 0.4160080 (5)	total: 9.31ms	remaining: 796ms
6:	learn: 0.3953123	test: 0.4151826	best: 0.4151826 (6)	total: 11.9ms	remaining: 867ms
7:	learn: 0.3939263	test: 0.4142273	best: 0.4142273 (7)	total: 14.4ms	remaining: 922ms
8:	learn: 0.3929032	test: 0.4134086	best: 0.4134086 (8)	total: 17.7ms	remaining: 1s
9:	learn: 0.3914618	test: 0.4123131	best: 0.4123131 (9)	total: 27.2ms	remaining: 1.39s
10:	learn: 0.3901709	test: 0.4114967	best: 0.4114967 (10)	total: 30.4ms	remaining: 1.4s
11:	learn: 0.3892305	test: 0.4108217	best: 0.

In [349]:
catboost_model = CatBoostRegressor(bagging_temperature=2, boosting_type='Ordered', bootstrap_type='Bayesian',
                                   border_count=64, depth=9, feature_border_type='MaxLogSum', has_time=False,
                                   iterations=519, l2_leaf_reg=2, learning_rate=0.01, max_ctr_complexity=3,
                                   min_child_samples=2, model_shrink_rate=0.01, one_hot_max_size=2, random_strength=2)
catboost_model.fit(X, y)

joblib.dump(model, 'CatBoostModel.pkl')

0:	learn: 0.4049152	total: 3.24ms	remaining: 1.68s
1:	learn: 0.4039844	total: 6.62ms	remaining: 1.71s
2:	learn: 0.4028644	total: 7.57ms	remaining: 1.3s
3:	learn: 0.4018486	total: 10.7ms	remaining: 1.38s
4:	learn: 0.4005763	total: 13.9ms	remaining: 1.43s
5:	learn: 0.3993038	total: 16.6ms	remaining: 1.42s
6:	learn: 0.3977994	total: 20.1ms	remaining: 1.47s
7:	learn: 0.3967420	total: 23ms	remaining: 1.47s
8:	learn: 0.3953337	total: 25.9ms	remaining: 1.47s
9:	learn: 0.3938237	total: 28.8ms	remaining: 1.47s
10:	learn: 0.3929163	total: 31.3ms	remaining: 1.45s
11:	learn: 0.3918491	total: 34ms	remaining: 1.44s
12:	learn: 0.3906271	total: 36.6ms	remaining: 1.43s
13:	learn: 0.3896939	total: 39.1ms	remaining: 1.41s
14:	learn: 0.3887879	total: 41.7ms	remaining: 1.4s
15:	learn: 0.3878766	total: 44.5ms	remaining: 1.4s
16:	learn: 0.3867078	total: 45.8ms	remaining: 1.35s
17:	learn: 0.3857900	total: 48.4ms	remaining: 1.35s
18:	learn: 0.3846788	total: 51.2ms	remaining: 1.35s
19:	learn: 0.3841291	total: 5

['CatBoostModel.pkl']