# Read data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error
from sklearn.metrics import max_error

In [34]:
msk_studio = pd.read_csv('Data_low/msk_studio.csv', sep=';')
msk_1 = pd.read_csv('Data_low/msk_1.csv', sep=';')
msk_2 = pd.read_csv('Data_low/msk_2.csv', sep=';')
msk_3 = pd.read_csv('Data_low/msk_3.csv', sep=';')
msk_4 = pd.read_csv('Data_low/msk_4.csv', sep=';')
msk_5 = pd.read_csv('Data_low/msk_5.csv', sep=';')

In [28]:
kzn_studio = pd.read_csv('Data_low/kzn_studio.csv', sep=';')
kzn_1 = pd.read_csv('Data_low/kzn_1.csv', sep=';')
kzn_2 = pd.read_csv('Data_low/kzn_2.csv', sep=';')
kzn_3 = pd.read_csv('Data_low/kzn_3.csv', sep=';')
kzn_4 = pd.read_csv('Data_low/kzn_4.csv', sep=';')

# KZN

In [25]:
# Здесь для каждой х-комнатной квартиры одна модель

kzn_studio = kzn_studio[kzn_studio['rooms_count'] == 1]
kzn_1 = kzn_1[kzn_1['rooms_count'] == 1]
kzn_2 = kzn_2[kzn_2['rooms_count'] == 2]
kzn_3 = kzn_3[kzn_3['rooms_count'] == 3]
kzn_4 = kzn_4[kzn_4['rooms_count'] == 4]

kzn_list = [kzn_studio, kzn_1, kzn_2, kzn_3, kzn_4]

for kzn in kzn_list:

    print(kzn.describe())
    kzn = kzn[kzn.total_meters > kzn.total_meters.quantile(q=0.03)]
    kzn = kzn[kzn.total_meters < kzn.total_meters.quantile(q=0.97)]
    kzn = kzn[kzn.price > kzn.price.quantile(q=0.01)]
    kzn = kzn[kzn.price < kzn.price.quantile(q=0.99)]
    print(kzn.describe())
    print('-----------------------------------------------------------------------------------------------------')


    
    kzn = kzn.drop(columns=['author', 'author_type', 'url', 'residential_complex', 'house_number'])
    X = kzn.drop(columns=['location', 'deal_type', 'accommodation_type', 'street', 'price', 'floors_count'])
    y = kzn['price']
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                          test_size=0.1,
                                                          random_state=42)

    #OneHotEncoder
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    OH_X_train = pd.DataFrame(OH_encoder.fit_transform(X_train[['district', 'underground']]))
    OH_X_valid = pd.DataFrame(OH_encoder.transform(X_valid[['district', 'underground']]))
    OH_X_train.index = X_train.index
    OH_X_valid.index = X_valid.index
    num_X_train = X_train.drop(columns=['district', 'underground'])
    num_X_valid = X_valid.drop(columns=['district', 'underground'])
    X_train_ = pd.concat([num_X_train, OH_X_train], axis=1)
    X_valid_ = pd.concat([num_X_valid, OH_X_valid], axis=1)
    X_train_.columns = X_train_.columns.astype(str)
    X_valid_.columns = X_valid_.columns.astype(str)


    #MODEL
    model_CBR = CatBoostRegressor(iterations=10000, 
                                  learning_rate=0.1)
    model_CBR.fit(X_train_, y_train, 
                  verbose=False
                 )
    pred = model_CBR.predict(X_valid_)

    print('MSE = ', mean_squared_error(y_valid, pred))
    print('Max Error = ', max_error(y_valid, pred))
    print('median_error = ', median_absolute_error(y_valid, pred))
    print('r2_score = ', r2_score(y_valid, pred))
    print('----------------------------------------------------------------------------------------------------------------------------------')

            floor  floors_count  rooms_count  total_meters         price
count  479.000000    479.000000        479.0    479.000000  4.790000e+02
mean     6.887265     13.536534          1.0     26.635887  5.801324e+06
std      5.480645      7.898561          0.0     17.205651  5.074796e+06
min      1.000000      2.000000          1.0      9.300000  1.350000e+06
25%      3.000000      5.000000          1.0     18.000000  3.300000e+06
50%      5.000000     12.000000          1.0     25.600000  5.400000e+06
75%     10.000000     20.000000          1.0     30.470000  6.865000e+06
max     25.000000     37.000000          1.0    283.000000  8.550000e+07
            floor  floors_count  rooms_count  total_meters         price
count  437.000000    437.000000        437.0    437.000000  4.370000e+02
mean     7.137300     13.951945          1.0     25.129245  5.321779e+06
std      5.558645      7.765483          0.0      7.194107  2.208159e+06
min      1.000000      2.000000          1.0     12

In [31]:
# здесь одна модель на ввсе виды квартир казани

kzn_studio = kzn_studio[kzn_studio['rooms_count'] == 0]
kzn_1 = kzn_1[kzn_1['rooms_count'] == 1]
kzn_2 = kzn_2[kzn_2['rooms_count'] == 2]
kzn_3 = kzn_3[kzn_3['rooms_count'] == 3]
kzn_4 = kzn_4[kzn_4['rooms_count'] == 4]

kzn_studio = kzn_studio[kzn_studio.total_meters > kzn_studio.total_meters.quantile(q=0.03)]
kzn_studio = kzn_studio[kzn_studio.total_meters < kzn_studio.total_meters.quantile(q=0.97)]
kzn_studio = kzn_studio[kzn_studio.price > kzn_studio.price.quantile(q=0.01)]
kzn_studio = kzn_studio[kzn_studio.price < kzn_studio.price.quantile(q=0.99)]

kzn_1 = kzn_1[kzn_1.total_meters > kzn_1.total_meters.quantile(q=0.03)]
kzn_1 = kzn_1[kzn_1.total_meters < kzn_1.total_meters.quantile(q=0.97)]
kzn_1 = kzn_1[kzn_1.price > kzn_1.price.quantile(q=0.01)]
kzn_1 = kzn_1[kzn_1.price < kzn_1.price.quantile(q=0.99)]

kzn_2 = kzn_2[kzn_2.total_meters > kzn_2.total_meters.quantile(q=0.03)]
kzn_2 = kzn_2[kzn_2.total_meters < kzn_2.total_meters.quantile(q=0.97)]
kzn_2 = kzn_2[kzn_2.price > kzn_2.price.quantile(q=0.01)]
kzn_2 = kzn_2[kzn_2.price < kzn_2.price.quantile(q=0.99)]

kzn_3 = kzn_3[kzn_3.total_meters > kzn_3.total_meters.quantile(q=0.03)]
kzn_3 = kzn_3[kzn_3.total_meters < kzn_3.total_meters.quantile(q=0.97)]
kzn_3 = kzn_3[kzn_3.price > kzn_3.price.quantile(q=0.01)]
kzn_3 = kzn_3[kzn_3.price < kzn_3.price.quantile(q=0.99)]

kzn_4 = kzn_4[kzn_4.total_meters > kzn_4.total_meters.quantile(q=0.03)]
kzn_4 = kzn_4[kzn_4.total_meters < kzn_4.total_meters.quantile(q=0.97)]
kzn_4 = kzn_4[kzn_4.price > kzn_4.price.quantile(q=0.01)]
kzn_4 = kzn_4[kzn_4.price < kzn_4.price.quantile(q=0.99)]



kzn = pd.concat([kzn_studio, kzn_1, kzn_2, kzn_3, kzn_4], ignore_index=True)
kzn = kzn.drop(columns=['author', 'author_type', 'url', 'residential_complex', 'house_number'])

X = kzn.drop(columns=['location', 'deal_type', 'accommodation_type', 'street', 'price', 'floors_count'])
y = kzn['price']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                    test_size=0.1,
                                                    random_state=42)



OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_X_train = pd.DataFrame(OH_encoder.fit_transform(X_train[['district', 'underground']]))
OH_X_valid = pd.DataFrame(OH_encoder.transform(X_valid[['district', 'underground']]))
OH_X_train.index = X_train.index
OH_X_valid.index = X_valid.index
num_X_train = X_train.drop(columns=['district', 'underground'])
num_X_valid = X_valid.drop(columns=['district', 'underground'])
X_train_ = pd.concat([num_X_train, OH_X_train], axis=1)
X_valid_ = pd.concat([num_X_valid, OH_X_valid], axis=1)
X_train_.columns = X_train_.columns.astype(str)
X_valid_.columns = X_valid_.columns.astype(str)



model_CBR = CatBoostRegressor(iterations=10000, 
                          learning_rate=0.08)
model_CBR.fit(X_train_, y_train, 
              verbose=True
             )

pred = model_CBR.predict(X_valid_)



print('Max Error (макксимальное отклонение от цены) = ', max_error(y_valid, pred))
print('Median_error (медиана отклонения) = ', median_absolute_error(y_valid, pred))
print('r2_score = ', r2_score(y_valid, pred))

1----------------------------------------------------
2----------------------------------------------------
3----------------------------------------------------
0:	learn: 5480091.5243813	total: 4.7ms	remaining: 47s
1:	learn: 5217564.7688953	total: 8.21ms	remaining: 41s
2:	learn: 5007314.7537448	total: 12ms	remaining: 40s
3:	learn: 4796294.9653323	total: 15.5ms	remaining: 38.7s
4:	learn: 4601746.3966344	total: 18.6ms	remaining: 37.2s
5:	learn: 4431673.2246308	total: 22.2ms	remaining: 37s
6:	learn: 4283070.8230104	total: 26.8ms	remaining: 38.3s
7:	learn: 4141288.5984917	total: 30.8ms	remaining: 38.5s
8:	learn: 4016515.4043257	total: 35.5ms	remaining: 39.4s
9:	learn: 3906810.9897563	total: 40.3ms	remaining: 40.3s
10:	learn: 3801012.1340414	total: 45.1ms	remaining: 40.9s
11:	learn: 3710666.6726677	total: 50.1ms	remaining: 41.7s
12:	learn: 3629658.9876771	total: 55ms	remaining: 42.3s
13:	learn: 3551894.8485103	total: 59.9ms	remaining: 42.7s
14:	learn: 3480745.2985483	total: 64.7ms	remainin

# MSK

In [38]:
# отдельная модель для каждой х-комнатной квартиры

msk_studio = msk_studio[msk_studio['rooms_count'] == 1]
msk_1 = msk_1[msk_1['rooms_count'] == 1]
msk_2 = msk_2[msk_2['rooms_count'] == 2]
msk_3 = msk_3[msk_3['rooms_count'] == 3]
msk_4 = msk_4[msk_4['rooms_count'] == 4]
msk_5 = msk_5[msk_5['rooms_count'] == 5]

msk_list = [msk_studio, msk_1, msk_2, msk_3, msk_4, msk_5]

for item in msk_list:
    print(item.describe())
    # item = item[item.price < item.price.quantile(q=0.95)]
    # item = item[item.price > item.price.quantile(q=0.05)]
    # item = item[item.total_meters > item.total_meters.quantile(q=0.05)]
    item = item[item.total_meters > item.total_meters.quantile(q=0.03)]
    item = item[item.total_meters < item.total_meters.quantile(q=0.97)]
    item = item[item.price > item.price.quantile(q=0.01)]
    item = item[item.price < item.price.quantile(q=0.99)]
    print(item.describe())
    print('-----------------------------------------------------------------------------------------------------')


    
    X = item.drop(columns=['location', 'deal_type', 'accommodation_type', 'street', 'price', 'floors_count', 'author', 'author_type', 'url', 'residential_complex', 'house_number'])
    y = item['price']
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                          test_size=0.1,
                                                          random_state=42)

    #OneHotEncoder
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    OH_X_train = pd.DataFrame(OH_encoder.fit_transform(X_train[['district', 'underground']]))
    OH_X_valid = pd.DataFrame(OH_encoder.transform(X_valid[['district', 'underground']]))
    OH_X_train.index = X_train.index
    OH_X_valid.index = X_valid.index
    num_X_train = X_train.drop(columns=['district', 'underground'])
    num_X_valid = X_valid.drop(columns=['district', 'underground'])
    X_train_ = pd.concat([num_X_train, OH_X_train], axis=1)
    X_valid_ = pd.concat([num_X_valid, OH_X_valid], axis=1)
    X_train_.columns = X_train_.columns.astype(str)
    X_valid_.columns = X_valid_.columns.astype(str)


    #MODEL
    model_CBR = CatBoostRegressor(iterations=10000, 
                                  learning_rate=0.1)
    model_CBR.fit(X_train_, y_train, 
                  verbose=False
                 )
    pred = model_CBR.predict(X_valid_)

    print('Max Error = ', max_error(y_valid, pred))
    print('median_error = ', median_absolute_error(y_valid, pred))
    print('r2_score = ', r2_score(y_valid, pred))
    print('----------------------------------------------------------------------------------------------------------------------------------')

             floor  floors_count  rooms_count  total_meters         price
count  1163.000000   1163.000000       1163.0   1163.000000  1.163000e+03
mean     10.011178     21.766982          1.0     25.067231  9.958501e+06
std       8.551195     11.849670          0.0      5.305680  7.091135e+06
min       1.000000      2.000000          1.0     11.200000  2.950000e+06
25%       3.000000     14.000000          1.0     21.900000  7.245258e+06
50%       8.000000     19.000000          1.0     24.000000  8.662897e+06
75%      14.000000     28.000000          1.0     26.780000  1.098126e+07
max      43.000000     85.000000          1.0     61.810000  1.234300e+08
             floor  floors_count  rooms_count  total_meters         price
count  1071.000000   1071.000000       1071.0   1071.000000  1.071000e+03
mean     10.332400     22.128852          1.0     24.732605  9.401607e+06
std       8.589913     11.158092          0.0      3.722136  2.814450e+06
min       1.000000      4.000000      

KeyboardInterrupt: 

In [32]:
# здесь одна модель на все виды квартир Москвы

kzn_studio = pd.read_csv('Data_low/msk_studio.csv', sep=';')
kzn_1 = pd.read_csv('Data_low/msk_1.csv', sep=';')
kzn_2 = pd.read_csv('Data_low/msk_2.csv', sep=';')
kzn_3 = pd.read_csv('Data_low/msk_3.csv', sep=';')
kzn_4 = pd.read_csv('Data_low/msk_4.csv', sep=';')
kzn_5 = pd.read_csv('Data_low/msk_5.csv', sep=';')


kzn_studio = kzn_studio[kzn_studio['rooms_count'] == 0]
kzn_1 = kzn_1[kzn_1['rooms_count'] == 1]
kzn_2 = kzn_2[kzn_2['rooms_count'] == 2]
kzn_3 = kzn_3[kzn_3['rooms_count'] == 3]
kzn_4 = kzn_4[kzn_4['rooms_count'] == 4]
kzn_5 = kzn_5[kzn_5['rooms_count'] == 5]

kzn_studio = kzn_studio[kzn_studio.total_meters > kzn_studio.total_meters.quantile(q=0.03)]
kzn_studio = kzn_studio[kzn_studio.total_meters < kzn_studio.total_meters.quantile(q=0.97)]
kzn_studio = kzn_studio[kzn_studio.price > kzn_studio.price.quantile(q=0.01)]
kzn_studio = kzn_studio[kzn_studio.price < kzn_studio.price.quantile(q=0.99)]

kzn_1 = kzn_1[kzn_1.total_meters > kzn_1.total_meters.quantile(q=0.03)]
kzn_1 = kzn_1[kzn_1.total_meters < kzn_1.total_meters.quantile(q=0.97)]
kzn_1 = kzn_1[kzn_1.price > kzn_1.price.quantile(q=0.01)]
kzn_1 = kzn_1[kzn_1.price < kzn_1.price.quantile(q=0.99)]

kzn_2 = kzn_2[kzn_2.total_meters > kzn_2.total_meters.quantile(q=0.03)]
kzn_2 = kzn_2[kzn_2.total_meters < kzn_2.total_meters.quantile(q=0.97)]
kzn_2 = kzn_2[kzn_2.price > kzn_2.price.quantile(q=0.01)]
kzn_2 = kzn_2[kzn_2.price < kzn_2.price.quantile(q=0.99)]

kzn_3 = kzn_3[kzn_3.total_meters > kzn_3.total_meters.quantile(q=0.03)]
kzn_3 = kzn_3[kzn_3.total_meters < kzn_3.total_meters.quantile(q=0.97)]
kzn_3 = kzn_3[kzn_3.price > kzn_3.price.quantile(q=0.01)]
kzn_3 = kzn_3[kzn_3.price < kzn_3.price.quantile(q=0.99)]

kzn_4 = kzn_4[kzn_4.total_meters > kzn_4.total_meters.quantile(q=0.03)]
kzn_4 = kzn_4[kzn_4.total_meters < kzn_4.total_meters.quantile(q=0.97)]
kzn_4 = kzn_4[kzn_4.price > kzn_4.price.quantile(q=0.01)]
kzn_4 = kzn_4[kzn_4.price < kzn_4.price.quantile(q=0.99)]

kzn_5 = kzn_5[kzn_5.total_meters > kzn_5.total_meters.quantile(q=0.03)]
kzn_5 = kzn_5[kzn_5.total_meters < kzn_5.total_meters.quantile(q=0.97)]
kzn_5 = kzn_5[kzn_5.price > kzn_5.price.quantile(q=0.01)]
kzn_5 = kzn_5[kzn_5.price < kzn_5.price.quantile(q=0.99)]



kzn = pd.concat([kzn_studio, kzn_1, kzn_2, kzn_3, kzn_4, kzn_5], ignore_index=True)
kzn = kzn.drop(columns=['author', 'author_type', 'url', 'residential_complex', 'house_number'])

X = kzn.drop(columns=['location', 'deal_type', 'accommodation_type', 'street', 'price', 'floors_count'])
y = kzn['price']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                    test_size=0.1,
                                                    random_state=42)



OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_X_train = pd.DataFrame(OH_encoder.fit_transform(X_train[['district', 'underground']]))
OH_X_valid = pd.DataFrame(OH_encoder.transform(X_valid[['district', 'underground']]))
OH_X_train.index = X_train.index
OH_X_valid.index = X_valid.index
num_X_train = X_train.drop(columns=['district', 'underground'])
num_X_valid = X_valid.drop(columns=['district', 'underground'])
X_train_ = pd.concat([num_X_train, OH_X_train], axis=1)
X_valid_ = pd.concat([num_X_valid, OH_X_valid], axis=1)
X_train_.columns = X_train_.columns.astype(str)
X_valid_.columns = X_valid_.columns.astype(str)



model_CBR = CatBoostRegressor(iterations=10000, 
                          learning_rate=0.08)
model_CBR.fit(X_train_, y_train, 
              verbose=True
             )

pred = model_CBR.predict(X_valid_)



print('Max Error (макксимальное отклонение от цены) = ', max_error(y_valid, pred))
print('Median_error (медиана отклонения) = ', median_absolute_error(y_valid, pred))
print('r2_score = ', r2_score(y_valid, pred))

0:	learn: 93258831.3219874	total: 8.52ms	remaining: 1m 25s
1:	learn: 88514905.5130690	total: 17.4ms	remaining: 1m 26s
2:	learn: 84198823.2698543	total: 26.5ms	remaining: 1m 28s
3:	learn: 80261459.8193142	total: 35.7ms	remaining: 1m 29s
4:	learn: 76867961.5793368	total: 45.1ms	remaining: 1m 30s
5:	learn: 73880455.2477050	total: 53.5ms	remaining: 1m 29s
6:	learn: 71107419.7578098	total: 62.1ms	remaining: 1m 28s
7:	learn: 68658743.3938707	total: 70.6ms	remaining: 1m 28s
8:	learn: 66330945.0467937	total: 79.9ms	remaining: 1m 28s
9:	learn: 64347227.5784728	total: 88.8ms	remaining: 1m 28s
10:	learn: 62606063.8560595	total: 97.2ms	remaining: 1m 28s
11:	learn: 60950762.5240571	total: 106ms	remaining: 1m 28s
12:	learn: 59385090.6115339	total: 115ms	remaining: 1m 27s
13:	learn: 58030376.9065676	total: 123ms	remaining: 1m 27s
14:	learn: 56857249.4367879	total: 131ms	remaining: 1m 26s
15:	learn: 55785255.4156697	total: 139ms	remaining: 1m 26s
16:	learn: 54812336.8834174	total: 148ms	remaining: 1m 

In [33]:
# здесь одна модель на все виды квартир Москвы (без учета станций метро)

kzn_studio = pd.read_csv('Data_low/msk_studio.csv', sep=';')
kzn_1 = pd.read_csv('Data_low/msk_1.csv', sep=';')
kzn_2 = pd.read_csv('Data_low/msk_2.csv', sep=';')
kzn_3 = pd.read_csv('Data_low/msk_3.csv', sep=';')
kzn_4 = pd.read_csv('Data_low/msk_4.csv', sep=';')
kzn_5 = pd.read_csv('Data_low/msk_5.csv', sep=';')


kzn_studio = kzn_studio[kzn_studio['rooms_count'] == 0]
kzn_1 = kzn_1[kzn_1['rooms_count'] == 1]
kzn_2 = kzn_2[kzn_2['rooms_count'] == 2]
kzn_3 = kzn_3[kzn_3['rooms_count'] == 3]
kzn_4 = kzn_4[kzn_4['rooms_count'] == 4]
kzn_5 = kzn_5[kzn_5['rooms_count'] == 5]

kzn_studio = kzn_studio[kzn_studio.total_meters > kzn_studio.total_meters.quantile(q=0.03)]
kzn_studio = kzn_studio[kzn_studio.total_meters < kzn_studio.total_meters.quantile(q=0.97)]
kzn_studio = kzn_studio[kzn_studio.price > kzn_studio.price.quantile(q=0.01)]
kzn_studio = kzn_studio[kzn_studio.price < kzn_studio.price.quantile(q=0.99)]

kzn_1 = kzn_1[kzn_1.total_meters > kzn_1.total_meters.quantile(q=0.03)]
kzn_1 = kzn_1[kzn_1.total_meters < kzn_1.total_meters.quantile(q=0.97)]
kzn_1 = kzn_1[kzn_1.price > kzn_1.price.quantile(q=0.01)]
kzn_1 = kzn_1[kzn_1.price < kzn_1.price.quantile(q=0.99)]

kzn_2 = kzn_2[kzn_2.total_meters > kzn_2.total_meters.quantile(q=0.03)]
kzn_2 = kzn_2[kzn_2.total_meters < kzn_2.total_meters.quantile(q=0.97)]
kzn_2 = kzn_2[kzn_2.price > kzn_2.price.quantile(q=0.01)]
kzn_2 = kzn_2[kzn_2.price < kzn_2.price.quantile(q=0.99)]

kzn_3 = kzn_3[kzn_3.total_meters > kzn_3.total_meters.quantile(q=0.03)]
kzn_3 = kzn_3[kzn_3.total_meters < kzn_3.total_meters.quantile(q=0.97)]
kzn_3 = kzn_3[kzn_3.price > kzn_3.price.quantile(q=0.01)]
kzn_3 = kzn_3[kzn_3.price < kzn_3.price.quantile(q=0.99)]

kzn_4 = kzn_4[kzn_4.total_meters > kzn_4.total_meters.quantile(q=0.03)]
kzn_4 = kzn_4[kzn_4.total_meters < kzn_4.total_meters.quantile(q=0.97)]
kzn_4 = kzn_4[kzn_4.price > kzn_4.price.quantile(q=0.01)]
kzn_4 = kzn_4[kzn_4.price < kzn_4.price.quantile(q=0.99)]

kzn_5 = kzn_5[kzn_5.total_meters > kzn_5.total_meters.quantile(q=0.03)]
kzn_5 = kzn_5[kzn_5.total_meters < kzn_5.total_meters.quantile(q=0.97)]
kzn_5 = kzn_5[kzn_5.price > kzn_5.price.quantile(q=0.01)]
kzn_5 = kzn_5[kzn_5.price < kzn_5.price.quantile(q=0.99)]



kzn = pd.concat([kzn_studio, kzn_1, kzn_2, kzn_3, kzn_4, kzn_5], ignore_index=True)
kzn = kzn.drop(columns=['author', 'author_type', 'url', 'residential_complex', 'house_number', 'underground'])

X = kzn.drop(columns=['location', 'deal_type', 'accommodation_type', 'street', 'price', 'floors_count'])
y = kzn['price']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                    test_size=0.1,
                                                    random_state=42)



OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_X_train = pd.DataFrame(OH_encoder.fit_transform(X_train[['district']]))
OH_X_valid = pd.DataFrame(OH_encoder.transform(X_valid[['district']]))
OH_X_train.index = X_train.index
OH_X_valid.index = X_valid.index
num_X_train = X_train.drop(columns=['district'])
num_X_valid = X_valid.drop(columns=['district'])
X_train_ = pd.concat([num_X_train, OH_X_train], axis=1)
X_valid_ = pd.concat([num_X_valid, OH_X_valid], axis=1)
X_train_.columns = X_train_.columns.astype(str)
X_valid_.columns = X_valid_.columns.astype(str)



model_CBR = CatBoostRegressor(iterations=10000, 
                          learning_rate=0.08)
model_CBR.fit(X_train_, y_train, 
              verbose=True
             )

pred = model_CBR.predict(X_valid_)



print('Max Error (макксимальное отклонение от цены) = ', max_error(y_valid, pred))
print('Median_error (медиана отклонения) = ', median_absolute_error(y_valid, pred))
print('r2_score = ', r2_score(y_valid, pred))

0:	learn: 93220622.0192569	total: 6.17ms	remaining: 1m 1s
1:	learn: 88617903.7235666	total: 12.1ms	remaining: 1m
2:	learn: 84289730.6990428	total: 18ms	remaining: 1m
3:	learn: 80443405.5133658	total: 23.6ms	remaining: 59s
4:	learn: 76840700.6223277	total: 29ms	remaining: 58s
5:	learn: 73609816.7762149	total: 34.9ms	remaining: 58.1s
6:	learn: 70804244.0695821	total: 40.2ms	remaining: 57.4s
7:	learn: 68379753.5873756	total: 45.7ms	remaining: 57.1s
8:	learn: 66149758.2616213	total: 51.1ms	remaining: 56.7s
9:	learn: 64190248.4696670	total: 56.4ms	remaining: 56.4s
10:	learn: 62345290.4285696	total: 61.8ms	remaining: 56.2s
11:	learn: 60721339.0446743	total: 67.5ms	remaining: 56.2s
12:	learn: 59091484.2916651	total: 73.2ms	remaining: 56.2s
13:	learn: 57913698.0592453	total: 78.6ms	remaining: 56.1s
14:	learn: 56684958.5816208	total: 84ms	remaining: 55.9s
15:	learn: 55680085.6379114	total: 89.4ms	remaining: 55.8s
16:	learn: 54837956.6462080	total: 95ms	remaining: 55.8s
17:	learn: 53914446.40626