# Building a model

## Imports and extensions

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

import pandas as pd
from catboost import CatBoostRegressor
from dotenv import load_dotenv
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

## Connecting to DB

In [3]:
# Load settings from a .env file (python-dotenv) into the process environment;
# the DB_DESTINATION_* variables read in the next cell are expected to come from it.
load_dotenv()

True

In [4]:
# Destination-database connection settings, read from the environment.
# Each value is None when the corresponding variable is not set.
dst_host, dst_port, dst_username, dst_password, dst_db = map(
    os.getenv,
    (
        'DB_DESTINATION_HOST',
        'DB_DESTINATION_PORT',
        'DB_DESTINATION_USER',
        'DB_DESTINATION_PASSWORD',
        'DB_DESTINATION_NAME',
    ),
)

In [5]:
# Fail fast with a readable message when a connection setting is missing;
# interpolating None would otherwise put the literal text "None" into the URL.
missing_settings = [
    env_name
    for env_name, env_value in {
        'DB_DESTINATION_HOST': dst_host,
        'DB_DESTINATION_PORT': dst_port,
        'DB_DESTINATION_USER': dst_username,
        'DB_DESTINATION_PASSWORD': dst_password,
        'DB_DESTINATION_NAME': dst_db,
    }.items()
    if env_value is None
]
if missing_settings:
    raise RuntimeError(
        f"Missing environment variables: {', '.join(missing_settings)}"
    )

# Build the URL from its components instead of string interpolation so that
# reserved characters in the credentials (e.g. '@', '/', ':') are escaped
# correctly. NOTE: values are taken verbatim, so they must NOT be
# percent-encoded in the .env file.
dst_conn = create_engine(
    URL.create(
        drivername='postgresql',
        username=dst_username,
        password=dst_password,
        host=dst_host,
        # An empty port falls back to the driver default, as an empty port in
        # the URL string did.
        port=int(dst_port) if dst_port else None,
        database=dst_db,
    )
)

## Pulling the data

In [6]:
# Read the cleaned flats table, indexed by flat_id.
# Only Exception is caught (not a bare `except:`), so KeyboardInterrupt still
# works, and the original error is chained instead of being replaced by a
# hard-coded guess about its cause. Re-raising stops the notebook here rather
# than letting later cells fail with a NameError on `data`.
try:
    flats_raw = pd.read_sql(
        'SELECT * FROM flats_clean', dst_conn, index_col='flat_id'
    )
except Exception as exc:
    raise RuntimeError(
        "Failed to load table 'flats_clean' from the destination database."
    ) from exc

# Drop the columns that are not used as model features; a new frame is
# produced so the raw query result stays intact.
data = flats_raw.drop(columns=['id', 'build_year'])
data.head()

Unnamed: 0_level_0,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator,floor,kitchen_area,living_area,rooms,is_apartment,studio,total_area,price
flat_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
130624,4,55.693623,37.593281,2.64,360,16,True,5,12.0,33.799999,2,False,False,57.799999,19777000
130625,4,55.87133,37.664104,2.7,191,12,True,12,6.68,36.849998,2,False,False,45.810001,11500000
130626,1,55.813946,37.599911,3.0,223,7,True,6,10.0,34.0,2,False,False,58.0,12750000
130627,4,55.606606,37.740093,2.7,469,12,True,8,7.0,30.0,2,False,False,47.0,7500000
130629,4,55.681316,37.665867,2.64,208,17,True,6,16.0,30.0,2,False,False,57.0,12990000


## Columns analysis

In [7]:
# Inspect the column dtypes; the following cells split features by dtype.
data.dtypes

building_type_int      int64
latitude             float64
longitude            float64
ceiling_height       float64
flats_count            int64
floors_total           int64
has_elevator            bool
floor                  int64
kitchen_area         float64
living_area          float64
rooms                  int64
is_apartment            bool
studio                  bool
total_area           float64
price                  int64
dtype: object

Рассмотрим все целочисленные колонки, в которых может содержаться категориальная структура:

In [8]:
# Integer-typed columns, with the target `price` removed, are the candidates
# for categorical treatment.
cat_features = data.select_dtypes(include='int64').drop(columns='price')

# Number of distinct values per candidate column, shown largest first.
cat_features_uniq_counts = cat_features.nunique()
cat_features_uniq_counts.sort_values(ascending=False)

flats_count          588
floors_total          29
floor                 20
building_type_int      6
rooms                  5
dtype: int64

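Будем считать категориальными те колонки, в которых меньше 11 уникальных значений; остальные целочисленные колонки (более 10 уникальных значений) будем масштабировать вместе с числовыми: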

In [9]:
# Integer columns with more than 10 distinct values are treated as numeric
# and will be scaled.
is_high_cardinality = cat_features_uniq_counts.gt(10)
cat_features_to_scale = cat_features_uniq_counts.loc[is_high_cardinality].index.tolist()
cat_features_to_scale

['flats_count', 'floors_total', 'floor']

In [10]:
# Names of all float64 columns, in their original order.
float_columns = data.select_dtypes(include='float64')
num_features = float_columns.columns.tolist()
num_features

['latitude',
 'longitude',
 'ceiling_height',
 'kitchen_area',
 'living_area',
 'total_area']

In [11]:
# Everything that goes through the scaler: float columns first, then the
# high-cardinality integer columns.
features_to_scale = [*num_features, *cat_features_to_scale]
features_to_scale

['latitude',
 'longitude',
 'ceiling_height',
 'kitchen_area',
 'living_area',
 'total_area',
 'flats_count',
 'floors_total',
 'floor']

## Training CatBoost

In [12]:
# Standardize the selected columns and pass every other column through
# unchanged; output feature names keep their original form (no prefix).
scaler = StandardScaler()
preprocessor = make_column_transformer(
    (scaler, features_to_scale),
    verbose_feature_names_out=False,
    remainder='passthrough',
)

In [13]:
# CatBoost regressor with training output silenced.
model = CatBoostRegressor(verbose=False)

# Full pipeline: column preprocessing followed by the regressor.
pipe = make_pipeline(
    preprocessor,
    model,
)

In [14]:
# Split the frame into the target column and the remaining feature columns.
target = data['price']
features = data.drop(columns='price')

In [15]:
# Sanity check: fit on the full dataset and predict on the same rows
# (fit returns the fitted pipeline, so the calls can be chained).
pipe.fit(features, target).predict(features)

array([16527289.10589424, 10794725.73972196, 16743355.42170538, ...,
        7362527.12075652, 10627543.88302161,  5401518.21288675])

## Cross-validation

In [16]:
# Cross-validate the whole pipeline with cv=5 on all available cores,
# scored by negated RMSE.
cv_raw = cross_validate(
    pipe,
    features,
    target,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# Collapse each per-fold array to its mean, rounded to 3 decimals.
cv_res = {
    metric: round(fold_values.mean(), 3)
    for metric, fold_values in cv_raw.items()
}

cv_res

{'fit_time': 24.327, 'score_time': 0.102, 'test_score': -2415562.858}

In [17]:
# Dispose of the engine created above now that the notebook no longer needs
# the database.
dst_conn.dispose()