<a href="https://colab.research.google.com/github/sashavorot/SHIFT-intensive/blob/main/deep_n_diggers_2nd_note_predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Предобработка данных

### Импорт библиотек

In [None]:
# Mount Google Drive into the Colab filesystem; the data-loading cells below
# read their CSV files from underneath this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/gdrive')

In [None]:
pip install catboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer, PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor

### Предобработка

In [None]:
# Load the training table (first CSV column becomes the index) and separate
# the target column `price_doc` from the feature columns.
df = pd.read_csv('./gdrive/MyDrive/SHIFT_SUMMER/train.csv', index_col=0)
y = df.loc[:, 'price_doc']
x = df.drop(columns='price_doc')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardization: statistics are fitted on the train part only and then
# reused unchanged for the test part.
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Yeo-Johnson power transform, again fitted on train and applied to test.
pt = PowerTransformer(method='yeo-johnson')
x_train_normalized = pt.fit_transform(x_train_scaled)
x_test_normalized = pt.transform(x_test_scaled)

# Degree-2 polynomial expansion: squares and pairwise products of the
# features, without the constant (bias) column.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(x_train_normalized)
x_train_poly = poly.transform(x_train_normalized)
x_test_poly = poly.transform(x_test_normalized)

## Предсказание

### Boosting


In [None]:
%%time
model = CatBoostRegressor()
model.fit(x_train_poly, y_train)

In [None]:
# Predictions on the training split, scored against the training targets.
y_pred_cb_train = model.predict(x_train_poly)

mae_train_cb = mean_absolute_error(y_true=y_train, y_pred=y_pred_cb_train)
mse_train_cb = mean_squared_error(y_true=y_train, y_pred=y_pred_cb_train)
rmse_train_cb = np.sqrt(mse_train_cb)  # RMSE is the square root of the MSE above
r2_train_cb = r2_score(y_true=y_train, y_pred=y_pred_cb_train)

In [None]:
# Predictions on the held-out split, scored against the held-out targets.
y_pred_cb = model.predict(x_test_poly)

mae_test_cb = mean_absolute_error(y_true=y_test, y_pred=y_pred_cb)
mse_test_cb = mean_squared_error(y_true=y_test, y_pred=y_pred_cb)
rmse_test_cb = np.sqrt(mse_test_cb)  # RMSE is the square root of the MSE above
r2_test_cb = r2_score(y_true=y_test, y_pred=y_pred_cb)

### Результаты

In [None]:
# Side-by-side metric table: one row per metric, one column per data split.
# Left as the cell's final expression so the notebook renders it.
pd.DataFrame(
    [
        [r2_train_cb, r2_test_cb],
        [mse_train_cb, mse_test_cb],
        [rmse_train_cb, rmse_test_cb],
        [mae_train_cb, mae_test_cb],
    ],
    index=['R2', 'MSE', 'RMSE', 'MAE'],
    columns=['Train', 'Test'],
)

### Submit_load

In [None]:
# Load the unlabeled data and the submission template (first CSV column is the index).
new_data = pd.read_csv('./gdrive/MyDrive/SHIFT_SUMMER/starting_k/test.csv', index_col=0)
submission = pd.read_csv('./gdrive/MyDrive/SHIFT_SUMMER/starting_k/submission.csv', index_col=0)

# Push the new rows through the already-fitted transformers, taking the
# feature columns in the same order as the training frame `x`.
new_data_scaled = scaler.transform(new_data.loc[:, x.columns])
new_data_normalized = pt.transform(new_data_scaled)
new_data_poly = poly.transform(new_data_normalized)

# Predict and store the values in the template's target column.
# NOTE(review): the assignment is positional, so it assumes the template rows
# are in the same order as the rows of test.csv - confirm.
new_predictions = model.predict(new_data_poly)
submission['price_doc'] = new_predictions

# Write the filled-in template back to Drive.
submission.to_csv('./gdrive/MyDrive/SHIFT_SUMMER/submission.csv')