In [9]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [10]:
import os
os.chdir('/content/drive/My Drive/')  # Укажите путь к папке, где находится ваш CSV-файл

In [11]:
!pip install catboost -q
!pip install optuna -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.8/226.8 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h

# Accidents with hazardous/toxic substances emission

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
import pandas as pd
df = pd.read_csv("for_learning.csv")

# Список целевых меток, которые вы хотите удалить
target_columns = ['accidents with hazardous/toxic substances emission'] #замените на свое

bad_features = ['natural hazards', 'housing and utilities', 'accidents in transport',
                 'explosions/fires/damages', 'other hazards']

train_features = ['district','T', 'Po', 'P', 'Pa', 'U', 'Ff', 'ff10', 'ff3', 'N',
                  'Tn', 'Tx', 'Nh', 'VV', 'Td', 'RRR', 'tR', 'Tg', 'sss', 'Spring',
                  'Winter', 'Summer', 'Autumn', 'year', 'month', 'day', 'heating network',
                  'cold water network', 'hot water network', 'electricity network',
                  'gas network', 'water treatment stations', 'water pumping stations',
                  'water intake facilities', 'sewage network', 'sewage treatment plants',
                  'sewage pumping stations', 'boiler houses', 'heat points']


train = df.drop(bad_features, axis=1)

x = train.drop(target_columns, axis=1)
y = train.drop(train_features, axis=1)

class_counts = y.value_counts()

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [20]:
from catboost import CatBoostRegressor, Pool, CatBoostClassifier

cat_features =['district']

train_pool = Pool(
    xtrain,
    ytrain,
    cat_features=cat_features

)
valid_pool = Pool(
    xtest,
    ytest,
    cat_features=cat_features
)

In [19]:
import optuna
from optuna.samplers import TPESampler
from sklearn.metrics import f1_score

In [None]:
def objective(trial):
    model = CatBoostClassifier(
        iterations=trial.suggest_int("iterations", 100, 1000),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        depth=trial.suggest_int("depth", 4, 8),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        bootstrap_type=trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
        random_strength=trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        bagging_temperature=trial.suggest_float("bagging_temperature", 0.0, 10.0),
        od_type=trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        #od_wait=trial.suggest_int("od_wait", 10, 50),
        auto_class_weights =trial.suggest_categorical('auto_class_weights',['SqrtBalanced', 'Balanced']),
        verbose=False,

        eval_metric = 'TotalF1:use_weights=false;average=Macro',
        task_type="GPU",
        early_stopping_rounds = 100,
        use_best_model = True,
    )
    model.fit(train_pool, eval_set=valid_pool)
    y_pred = model.predict(xtest)
    return f1_score(ytest, y_pred, average='macro')

In [None]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
sampler = TPESampler(seed=1)

In [None]:
study = optuna.create_study(study_name="catboost", direction="maximize", sampler=sampler)

In [None]:
study.optimize(objective, n_trials=50, show_progress_bar=True)

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
print("Количество завершенных испытаний: ", len(study.trials))
print("Лучшее испытание:")
trial = study.best_trial
print("  Значение: ", trial.value)
print("  Параметры: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Количество завершенных испытаний:  50
Лучшее испытание:
  Значение:  0.5159258123850399
  Параметры: 
    iterations: 230
    learning_rate: 0.08265237900953774
    depth: 8
    l2_leaf_reg: 13.441320205873179
    bootstrap_type: Bayesian
    random_strength: 0.10723069953941497
    bagging_temperature: 0.2666614136302563
    od_type: IncToDec
    auto_class_weights: Balanced


In [27]:
catboost_params = {
    'learning_rate':  0.08265237900953774,
    'depth': 8,
    'l2_leaf_reg': 13.441320205873179,
    'bootstrap_type': 'Bayesian',
    'random_strength': 0.10723069953941497,
    'bagging_temperature': 0.2666614136302563,
    'od_type': 'IncToDec',
    'task_type': 'GPU',
}

model = CatBoostRegressor(**catboost_params)
model.fit(train_pool, eval_set=valid_pool)

0:	learn: 0.2213463	test: 0.2215832	best: 0.2215832 (0)	total: 232ms	remaining: 3m 51s
1:	learn: 0.2194922	test: 0.2199588	best: 0.2199588 (1)	total: 449ms	remaining: 3m 44s
2:	learn: 0.2177603	test: 0.2184497	best: 0.2184497 (2)	total: 666ms	remaining: 3m 41s
3:	learn: 0.2163565	test: 0.2172544	best: 0.2172544 (3)	total: 887ms	remaining: 3m 40s
4:	learn: 0.2150734	test: 0.2163268	best: 0.2163268 (4)	total: 1.05s	remaining: 3m 29s
5:	learn: 0.2139691	test: 0.2154443	best: 0.2154443 (5)	total: 1.13s	remaining: 3m 7s
6:	learn: 0.2129984	test: 0.2146544	best: 0.2146544 (6)	total: 1.2s	remaining: 2m 50s
7:	learn: 0.2120994	test: 0.2139235	best: 0.2139235 (7)	total: 1.27s	remaining: 2m 37s
8:	learn: 0.2113044	test: 0.2132926	best: 0.2132926 (8)	total: 1.31s	remaining: 2m 24s
9:	learn: 0.2106614	test: 0.2127882	best: 0.2127882 (9)	total: 1.36s	remaining: 2m 15s
10:	learn: 0.2099415	test: 0.2123233	best: 0.2123233 (10)	total: 1.41s	remaining: 2m 7s
11:	learn: 0.2094415	test: 0.2119791	best: 0

<catboost.core.CatBoostRegressor at 0x7cf6445c95d0>

In [28]:
import pickle

pickle.dump(model, open('accidents_with_hazardous_toxic substances emission.pkl', 'wb'))

In [None]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
from catboost import CatBoostClassifier, Pool

n_splits = 3
clfs = []
scores = []

catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 6,
    'l2_leaf_reg': 3,
    'random_state': 42,
    'eval_metric': 'TotalF1:average=Macro',
    'verbose': False
}

kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=7575)
for train_index, test_index in kf.split(x,y):

    X_train, X_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    cat_features =['district']

    train_dataset = Pool(
        X_train,
        y_train,
        cat_features=cat_features
    )
    eval_dataset = Pool(
        X_test,
        y_test,
        cat_features=cat_features
    )

    clf = CatBoostClassifier(**catboost_params)

    clfs.append(clf)

    clf.fit(
        train_dataset,
        eval_set=eval_dataset,
        verbose=500,
        use_best_model=True,
        plot=False)

    scores.append(np.mean([v for k, v in clf.best_score_["validation"].items() if "TotalF1:average=Macro" in k], dtype="float16"))

assert len(clfs) == n_splits
print("mean TotalF1:use_weights=false;average=Macro score --------->", np.mean(scores, dtype="float16") - np.std(scores, dtype="float16"))

# Housing and utilites

In [16]:
import pandas as pd
df = pd.read_csv("for_learning.csv")

# Список целевых меток, которые вы хотите удалить
target_columns = ['housing and utilities'] #замените на свое

bad_features = ['natural hazards', 'accidents with hazardous/toxic substances emission', 'accidents in transport',
                 'explosions/fires/damages', 'other hazards']

train_features = ['district','T', 'Po', 'P', 'Pa', 'U', 'Ff', 'ff10', 'ff3', 'N',
                  'Tn', 'Tx', 'Nh', 'VV', 'Td', 'RRR', 'tR', 'Tg', 'sss', 'Spring',
                  'Winter', 'Summer', 'Autumn', 'year', 'month', 'day', 'heating network',
                  'cold water network', 'hot water network', 'electricity network',
                  'gas network', 'water treatment stations', 'water pumping stations',
                  'water intake facilities', 'sewage network', 'sewage treatment plants',
                  'sewage pumping stations', 'boiler houses', 'heat points']


train = df.drop(bad_features, axis=1)

x = train.drop(target_columns, axis=1)
y = train.drop(train_features, axis=1)

class_counts = y.value_counts()

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [17]:
def objective(trial):
    model = CatBoostClassifier(
        iterations=trial.suggest_int("iterations", 100, 1000),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        depth=trial.suggest_int("depth", 4, 8),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        bootstrap_type=trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
        random_strength=trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        bagging_temperature=trial.suggest_float("bagging_temperature", 0.0, 10.0),
        od_type=trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        #od_wait=trial.suggest_int("od_wait", 10, 50),
        auto_class_weights =trial.suggest_categorical('auto_class_weights',['SqrtBalanced', 'Balanced']),
        verbose=False,

        eval_metric = 'TotalF1:use_weights=false;average=Macro',
        task_type="GPU",
        early_stopping_rounds = 100,
        use_best_model = True,
    )
    model.fit(train_pool, eval_set=valid_pool)
    y_pred = model.predict(xtest)
    return f1_score(ytest, y_pred, average='macro')

In [21]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [22]:
sampler = TPESampler(seed=1)

In [23]:
study = optuna.create_study(study_name="catboost", direction="maximize", sampler=sampler)

In [24]:
study.optimize(objective, n_trials=50, show_progress_bar=True)

  0%|          | 0/50 [00:00<?, ?it/s]

In [25]:
print("Количество завершенных испытаний: ", len(study.trials))
print("Лучшее испытание:")
trial = study.best_trial
print("  Значение: ", trial.value)
print("  Параметры: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Количество завершенных испытаний:  50
Лучшее испытание:
  Значение:  0.34541604666503944
  Параметры: 
    iterations: 700
    learning_rate: 0.09966258158429915
    depth: 7
    l2_leaf_reg: 0.007503090399418994
    bootstrap_type: Bayesian
    random_strength: 0.00043995594230790923
    bagging_temperature: 7.63421070914145
    od_type: IncToDec
    auto_class_weights: Balanced


In [29]:
catboost_params = {
    'learning_rate':  0.09966258158429915,
    'depth': 7,
    'l2_leaf_reg': 0.007503090399418994,
    'bootstrap_type': 'Bayesian',
    'random_strength': 0.00043995594230790923,
    'bagging_temperature': 7.63421070914145,
    'od_type': 'IncToDec',
    'task_type': 'GPU',
}

model = CatBoostRegressor(**catboost_params)
model.fit(train_pool, eval_set=valid_pool)

0:	learn: 0.2232037	test: 0.2233558	best: 0.2233558 (0)	total: 52ms	remaining: 52s
1:	learn: 0.2219002	test: 0.2223527	best: 0.2223527 (1)	total: 103ms	remaining: 51.2s
2:	learn: 0.2216528	test: 0.2222622	best: 0.2222622 (2)	total: 150ms	remaining: 49.9s
3:	learn: 0.2212942	test: 0.2219497	best: 0.2219497 (3)	total: 199ms	remaining: 49.6s
4:	learn: 0.2211451	test: 0.2218756	best: 0.2218756 (4)	total: 238ms	remaining: 47.4s
5:	learn: 0.2210614	test: 0.2218200	best: 0.2218200 (5)	total: 282ms	remaining: 46.7s
6:	learn: 0.2208259	test: 0.2216452	best: 0.2216452 (6)	total: 314ms	remaining: 44.5s
7:	learn: 0.2204032	test: 0.2214408	best: 0.2214408 (7)	total: 346ms	remaining: 42.9s
8:	learn: 0.2202830	test: 0.2213857	best: 0.2213857 (8)	total: 377ms	remaining: 41.5s
9:	learn: 0.2201212	test: 0.2212758	best: 0.2212758 (9)	total: 412ms	remaining: 40.8s
10:	learn: 0.2197734	test: 0.2209543	best: 0.2209543 (10)	total: 440ms	remaining: 39.6s
11:	learn: 0.2195584	test: 0.2208885	best: 0.2208885 (1

<catboost.core.CatBoostRegressor at 0x7cf6445ff580>

In [30]:
import pickle

pickle.dump(model, open('Housing_utilities.pkl', 'wb'))

# Other hazards

In [39]:
import pandas as pd
df = pd.read_csv("for_learning.csv")

# Список целевых меток, которые вы хотите удалить
target_columns = ['other hazards'] #замените на свое

bad_features = ['natural hazards', 'accidents with hazardous/toxic substances emission', 'accidents in transport',
                 'explosions/fires/damages', 'housing and utilities']

train_features = ['district','T', 'Po', 'P', 'Pa', 'U', 'Ff', 'ff10', 'ff3', 'N',
                  'Tn', 'Tx', 'Nh', 'VV', 'Td', 'RRR', 'tR', 'Tg', 'sss', 'Spring',
                  'Winter', 'Summer', 'Autumn', 'year', 'month', 'day', 'heating network',
                  'cold water network', 'hot water network', 'electricity network',
                  'gas network', 'water treatment stations', 'water pumping stations',
                  'water intake facilities', 'sewage network', 'sewage treatment plants',
                  'sewage pumping stations', 'boiler houses', 'heat points']


train = df.drop(bad_features, axis=1)

x = train.drop(target_columns, axis=1)
y = train.drop(train_features, axis=1)

class_counts = y.value_counts()

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [40]:
def objective(trial):
    model = CatBoostClassifier(
        iterations=trial.suggest_int("iterations", 100, 1000),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        depth=trial.suggest_int("depth", 4, 8),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        bootstrap_type=trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
        random_strength=trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        bagging_temperature=trial.suggest_float("bagging_temperature", 0.0, 10.0),
        od_type=trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        #od_wait=trial.suggest_int("od_wait", 10, 50),
        auto_class_weights =trial.suggest_categorical('auto_class_weights',['SqrtBalanced', 'Balanced']),
        verbose=False,

        eval_metric = 'TotalF1:use_weights=false;average=Macro',
        task_type="GPU",
        early_stopping_rounds = 100,
        use_best_model = True,
    )
    model.fit(train_pool, eval_set=valid_pool)
    y_pred = model.predict(xtest)
    return f1_score(ytest, y_pred, average='macro')

In [41]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [42]:
sampler = TPESampler(seed=1)

In [43]:
study = optuna.create_study(study_name="catboost", direction="maximize", sampler=sampler)

In [44]:
study.optimize(objective, n_trials=50, show_progress_bar=True)

  0%|          | 0/50 [00:00<?, ?it/s]

In [45]:
print("Количество завершенных испытаний: ", len(study.trials))
print("Лучшее испытание:")
trial = study.best_trial
print("  Значение: ", trial.value)
print("  Параметры: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Количество завершенных испытаний:  50
Лучшее испытание:
  Значение:  0.33949783413803164
  Параметры: 
    iterations: 932
    learning_rate: 0.004515936826730637
    depth: 4
    l2_leaf_reg: 0.6692513671003254
    bootstrap_type: Bayesian
    random_strength: 0.0001267831596207871
    bagging_temperature: 0.7834511947085497
    od_type: IncToDec
    auto_class_weights: SqrtBalanced


In [46]:
catboost_params = {
    'learning_rate':  0.004515936826730637,
    'depth': 4,
    'l2_leaf_reg': 0.6692513671003254,
    'bootstrap_type': 'Bayesian',
    'random_strength': 0.0001267831596207871,
    'bagging_temperature':  0.7834511947085497,
    'od_type': 'IncToDec',
    'task_type': 'GPU',
}

model = CatBoostRegressor(**catboost_params)
model.fit(train_pool, eval_set=valid_pool)

0:	learn: 0.2233795	test: 0.2233889	best: 0.2233889 (0)	total: 19.2ms	remaining: 19.2s
1:	learn: 0.2232807	test: 0.2233041	best: 0.2233041 (1)	total: 40.4ms	remaining: 20.1s
2:	learn: 0.2231924	test: 0.2232234	best: 0.2232234 (2)	total: 59.2ms	remaining: 19.7s
3:	learn: 0.2230933	test: 0.2231341	best: 0.2231341 (3)	total: 79.8ms	remaining: 19.9s
4:	learn: 0.2229965	test: 0.2230454	best: 0.2230454 (4)	total: 99.1ms	remaining: 19.7s
5:	learn: 0.2229009	test: 0.2229607	best: 0.2229607 (5)	total: 118ms	remaining: 19.6s
6:	learn: 0.2228045	test: 0.2228761	best: 0.2228761 (6)	total: 137ms	remaining: 19.5s
7:	learn: 0.2227088	test: 0.2227874	best: 0.2227874 (7)	total: 157ms	remaining: 19.4s
8:	learn: 0.2226155	test: 0.2227044	best: 0.2227044 (8)	total: 176ms	remaining: 19.4s
9:	learn: 0.2225319	test: 0.2226297	best: 0.2226297 (9)	total: 195ms	remaining: 19.3s
10:	learn: 0.2224380	test: 0.2225452	best: 0.2225452 (10)	total: 219ms	remaining: 19.7s
11:	learn: 0.2223462	test: 0.2224631	best: 0.22

<catboost.core.CatBoostRegressor at 0x7cf6446669e0>

In [47]:
import pickle

pickle.dump(model, open('Other_hazards.pkl', 'wb'))