# Regression. Part 2

---
Author: Durkin Anatoliy

Updated: 31.03.2025

---
В данном ноутбуке рассматриваются модели регрессии, отбор признаков, построение Pipeline и подбор гиперпараметров.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
def metrics(true, pred):
    """Print R2, MAE and RMSE for predictions against ground-truth values.

    Args:
        true: Ground-truth target values.
        pred: Predicted target values, aligned with ``true``.

    Returns:
        None. Each score is printed on its own line, prefixed by its label.
    """
    scorers = (
        ('R2:', r2_score),
        ('MAE:', mean_absolute_error),
        # RMSE is the square root of the mean squared error.
        ('RMSE:', lambda t, p: mean_squared_error(t, p)**0.5),
    )
    for label, scorer in scorers:
        print(label, scorer(true, pred))

In [None]:
# Load the house-price dataset; the path is relative to the working directory.
df = pd.read_csv('house_price_regression_dataset.csv')

In [None]:
# Preview the first rows of the loaded DataFrame.
df.head()

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Separate the 'House_Price' target from the features and hold out 20% of the
# rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['House_Price']),
    df['House_Price'],
    test_size=0.2,
    random_state=42,
)

# Модели

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Baseline: ordinary least-squares regression fitted on the training split.
lr = LinearRegression().fit(X=X_train, y=y_train)

In [None]:
# Score the linear model on the held-out test split.
metrics(y_test, lr.predict(X_test))

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with default hyper-parameters.
# The seed is fixed so the reported metrics are identical on every run;
# 42 matches the seed already used for the train/test split.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

In [None]:
# Score the random forest on the held-out test split.
metrics(y_test, rf.predict(X_test))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Gradient boosting with default hyper-parameters.
# Seed the estimator so repeated runs of the notebook produce the same model
# and therefore the same metrics (42 matches the train/test split seed).
gbr = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)

In [None]:
# Score the gradient-boosting model on the held-out test split.
metrics(y_test, gbr.predict(X_test))

In [None]:
from catboost import CatBoostRegressor

In [None]:
# CatBoost with default hyper-parameters; verbose=False silences the
# per-iteration training log.
cb = CatBoostRegressor().fit(X=X_train, y=y_train, verbose=False)

In [None]:
# Score the CatBoost model on the held-out test split.
metrics(y_test, cb.predict(X_test))

# Отбор признаков

## Прямой отбор

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector

In [None]:
# Forward selection: start from an empty feature set and add features one at
# a time, scoring each candidate with a fresh (unfitted) CatBoost model.
# NOTE: this rebinds `cb`, replacing the fitted model from the earlier cell.
cb = CatBoostRegressor(verbose=False)
sfs = SequentialFeatureSelector(estimator=cb, direction='forward')
sfs.fit(X=X_train, y=y_train)

In [None]:
# Boolean mask over the input columns marking the features the selector kept.
sfs.get_support()

In [None]:
# Show the selector's configuration parameters.
sfs.get_params()

In [None]:
# Reduce the test features to the columns chosen by the fitted selector.
sfs.transform(X_test)

## Обратный отбор

In [None]:
# Backward selection: start from all features and remove them one at a time,
# scoring each candidate with a fresh (unfitted) CatBoost model.
cb = CatBoostRegressor(verbose=False)
sfs = SequentialFeatureSelector(estimator=cb, direction='backward')
sfs.fit(X=X_train, y=y_train)

In [None]:
# Boolean mask of the features that survived backward selection.
sfs.get_support()

## Исчерпывающий выбор

In [None]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector

In [None]:
# Exhaustive search: cross-validate CatBoost on every feature subset whose
# size lies between min_features and max_features, scored by R2 with 5 folds.
# NOTE: the number of subsets grows exponentially with the number of
# features, and each subset is fitted once per fold, so this cell is slow.
cb = CatBoostRegressor(verbose=False)
efs = ExhaustiveFeatureSelector(
    cb,
    min_features=1,
    # Take the upper bound from the data instead of hard-coding the column
    # count, so the search stays exhaustive if the feature set changes.
    max_features=X_train.shape[1],
    scoring='r2',
    cv=5,
)
efs.fit(X_train, y_train)

In [None]:
# Best cross-validated score found by the exhaustive search (scoring='r2').
efs.best_score_

In [None]:
# Names of the features in the best-scoring subset.
efs.best_feature_names_

In [None]:
# Results recorded for the evaluated feature subsets.
efs.subsets_

# Pipeline

## Pipeline as transformer

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Preprocessing steps: fill missing values with the column mean, then apply
# min-max scaling. Later cells reuse these same instances in their pipelines.
simple_imputer = SimpleImputer(strategy='mean')
scaler = MinMaxScaler()

In [None]:
# Chain imputation and scaling into one transformer-only pipeline.
pipe = Pipeline(
    steps=[
        ('imputer', simple_imputer),
        ('scaler', scaler),
    ]
)

In [None]:
# Fit the preprocessing steps on the training features only (no target needed).
pipe.fit(X_train)

In [None]:
# Apply the fitted preprocessing steps to the test features.
pipe.transform(X_test)

## Pipeline as model

In [None]:
# Unfitted estimator that becomes the final step of the pipeline below.
model = LinearRegression()

In [None]:
# Same preprocessing steps followed by an estimator, so the whole pipeline
# can be fitted and used for prediction like a single model.
pipe = Pipeline(
    steps=[
        ('imputer', simple_imputer),
        ('scaler', scaler),
        ('model', model),
    ]
)

In [None]:
# Fit the preprocessing steps and the final estimator on the training split.
pipe.fit(X_train, y_train)

In [None]:
# Score the full pipeline (preprocessing + model) on the test split.
metrics(y_test, pipe.predict(X_test))

## Обработка разнородных данных

In [None]:
# Load the insurance dataset; this rebinds `df`, replacing the house-price
# frame loaded earlier. The path is relative to the working directory.
df = pd.read_csv('insurance.csv')

In [None]:
# Preview the first rows of the insurance DataFrame.
df.head()

In [None]:
# Separate the 'charges' target from the features and hold out 20% of the
# rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['charges']),
    df['charges'],
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Route each column to the preprocessing that suits its type: numeric columns
# are min-max scaled, every other column is one-hot encoded as integers.
# Columns are chosen with select_dtypes rather than by comparing the dtype to
# 'object', because string-backed dtypes are not 'object' and such text
# columns would otherwise be sent to the numeric scaler.
col_transformer = ColumnTransformer([
    ('num_preproc', MinMaxScaler(),
     X_train.select_dtypes(include='number').columns.tolist()),
    ('cat_preproc', OneHotEncoder(dtype='int'),
     X_train.select_dtypes(exclude='number').columns.tolist()),
])

In [None]:
# Column-wise preprocessing followed by linear regression.
pipe = Pipeline(
    steps=[
        ('preproc', col_transformer),
        ('LR', LinearRegression()),
    ]
)

In [None]:
# Fit the column transformer and the linear model on the training split.
pipe.fit(X_train, y_train)

In [None]:
# Score the preprocessing + linear-regression pipeline on the test split.
metrics(y_test, pipe.predict(X_test))

## Подбор гиперпараметров

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Column-wise preprocessing followed by CatBoost. The step name 'CatBoost' is
# the prefix used to address this step's parameters in a parameter grid.
pipe = Pipeline(
    steps=[
        ('preproc', col_transformer),
        ('CatBoost', CatBoostRegressor(verbose=False)),
    ]
)

In [None]:
# Fit the pipeline with CatBoost's default hyper-parameters as a baseline.
pipe.fit(X_train, y_train)

In [None]:
# Baseline score of the untuned pipeline on the test split.
pipe.score(X_test, y_test)

In [None]:
# Hyper-parameter grid: 2 x 2 x 3 = 12 combinations. The "CatBoost__" prefix
# routes each entry to the pipeline step named 'CatBoost'.
param_grid = {
    "CatBoost__iterations": [1000, 2000],
    "CatBoost__learning_rate": [0.01, 0.05],
    "CatBoost__depth": [3, 5, 7],
}
# The whole pipeline is the estimator, so preprocessing is refitted for each
# candidate during the search.
search = GridSearchCV(estimator=pipe, param_grid=param_grid)

In [None]:
# Run the grid search on the training split only; the test split stays unseen.
search.fit(X_train, y_train)

In [None]:
# Cross-validated score of the best parameter combination.
search.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score.
search.best_params_