In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Подключение библиотек и скриптов

In [None]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import KFold, GridSearchCV

from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
import warnings
warnings.filterwarnings('ignore') #отключили предупреждения

In [None]:
#единый шрифт для графиков
matplotlib.rcParams.update({'font.size': 12})

In [None]:
matplotlib.rcParams.update({'font.size': 14})

In [None]:
def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):
    """Report R2 for the train and test samples and plot predictions against truth.

    Prints both R2 scores rounded to three decimals, then shows one figure
    with two scatter plots (train on the left, test on the right) where the
    x axis holds predicted values and the y axis holds true values.

    Parameters
    ----------
    train_true_values, train_pred_values :
        True and predicted targets for the training sample.
    test_true_values, test_pred_values :
        True and predicted targets for the held-out sample.
    """
    train_score = round(r2(train_true_values, train_pred_values), 3)
    print("Train R2:\t" + str(train_score))
    test_score = round(r2(test_true_values, test_pred_values), 3)
    print("Test R2:\t" + str(test_score))

    plt.figure(figsize=(18, 10))

    # (subplot position, predicted values, true values, panel title)
    panels = (
        (121, train_pred_values, train_true_values, 'Train sample prediction'),
        (122, test_pred_values, test_true_values, 'Test sample prediction'),
    )
    for position, predicted, actual, heading in panels:
        plt.subplot(position)
        sns.scatterplot(x=predicted, y=actual)
        plt.xlabel('Predicted values')
        plt.ylabel('True values')
        plt.title(heading)

    plt.show()

In [None]:
#Пути к файлам

TRAIN_DATASET_PATH = '../input/real-estate-price-prediction/train.csv'
TEST_DATASET_PATH = '../input/real-estate-price-prediction/test.csv'

* Загрузка данных
* Описание датасета

* Id - идентификационный номер квартиры
* DistrictId - идентификационный номер района
* Rooms - количество комнат
* Square - общая площадь квартиры
* LifeSquare - жилая площадь
* KitchenSquare - площадь кухни
* Floor - этаж
* HouseFloor - количество этажей в доме
* HouseYear - год постройки дома
* Ecology_1, Ecology_2, Ecology_3 - экологические показатели местности
* Social_1, Social_2, Social_3 - социальные показатели местности
* Healthcare_1, Healthcare_2 - показатели местности, связанные со здравоохранением
* Shops_1, Shops_2 - показатели, связанные с наличием магазинов, торговых центров
* Price - цена квартиры

In [None]:
#считываем данные из train.csv
train_df = pd.read_csv(TRAIN_DATASET_PATH)
train_df.tail(10)

In [None]:
#размерность train датасета (количество объектов, количество признаков)
print(train_df.shape)
print("Всего квартир:", train_df.shape[0])
print("Всего признаков:", train_df.shape[1])

In [None]:
#считываем данные из test.csv
test_df = pd.read_csv(TEST_DATASET_PATH)
test_df.head(10)

In [None]:
#размерность test датасета (количество объектов, количество признаков)
print(test_df.shape)
print("Всего квартир:", test_df.shape[0])
print("Всего признаков:", test_df.shape[1])

**Приведение типов данных**

In [None]:
train_df.dtypes

In [None]:
test_df.dtypes

In [None]:
# Identifiers are labels rather than quantities, so they are stored as strings;
# room and storey counts are whole numbers, so they are stored as integers.
# 'Ecology_1' is intentionally NOT cast to int: an integer cast truncates any
# fractional part of the indicator.
# NOTE(review): Ecology_1 looks like a fractional indicator - confirm with
# train_df['Ecology_1'].describe() before re-introducing any cast.
for frame in (train_df, test_df):
    frame['Id'] = frame['Id'].astype(str)
    frame['DistrictId'] = frame['DistrictId'].astype(str)
    frame['Rooms'] = frame['Rooms'].astype(int)
    frame['HouseFloor'] = frame['HouseFloor'].astype(int)

## ***Анализ данных***

1. **EDA** разведочный анализ данных

In [None]:
plt.figure(figsize = (12, 6))

train_df['Price'].hist(bins=40)
plt.ylabel('Count')
plt.xlabel('Price')

plt.title('Целевая переменная')
plt.show()

In [None]:
#количественные переменные
train_df.describe()

In [None]:
#номинативные переменные
train_df.select_dtypes(include='object').columns.tolist()

**2. Обработка выбросов**

In [None]:
train_df['Rooms'].value_counts()

In [None]:
#вводим новый признак, обозначающий выбросы среди квартир: "1" означает выброс, "0" не попадает под условия выброса
train_df['Rooms_outlier'] = 0
train_df.loc[(train_df['Rooms'] == 0) | (train_df['Rooms'] >= 7), 'Rooms_outlier'] = 1
train_df.head()

In [None]:
# Repair implausible room counts (0 rooms, or 7 and more rooms):
#   * flats of at most 100 sq.m. are assumed to have a single room;
#   * larger flats receive the median of the 'Rooms' column.
# Each right-hand side is evaluated at the moment its line runs, so the median
# already reflects the replacements made by the preceding lines.
train_df.loc[(train_df['Rooms'] == 0) & (train_df['Square'] <= 100), 'Rooms'] = 1
train_df.loc[(train_df['Rooms'] == 0) & (train_df['Square'] > 100), 'Rooms'] = train_df['Rooms'].median()
train_df.loc[(train_df['Rooms'] >= 7) & (train_df['Square'] <= 100), 'Rooms'] = 1
train_df.loc[(train_df['Rooms'] >= 7) & (train_df['Square'] > 100), 'Rooms'] = train_df['Rooms'].median()

In [None]:
train_df['Rooms'].value_counts()

In [None]:
train_df.loc[(train_df['Square'] <= 31) | (train_df['Square'] > 300)]

In [None]:
#вводим новый признак, обозначающий выбросы среди общей площади квартир: "1" означает выброс, "0" не попадает под условия выброса
train_df['Square_outlier'] = 0
train_df.loc[(train_df['Square'] <= 31) | (train_df['Square'] > 300), 'Square_outlier'] = 1
train_df.head()

In [None]:
train_df['Square'].quantile(.975), train_df['Square'].quantile(.025)

In [None]:
#заменим значение на квантили в случае, если в датасете общая площадь квартиры больше 300 кв.м. 
#и на среднее значение, если общая площадь меньше или равна 31 кв.м.
train_df.loc[train_df['Square'] <= 31, 'Square'] = train_df['Square'].mean()
train_df.loc[train_df['Square'] > 300, 'Square'] = train_df['Square'].quantile(.975)

In [None]:
train_df['Square'].value_counts()

In [None]:
train_df['KitchenSquare'].quantile(.800), train_df['KitchenSquare'].quantile(.200)

In [None]:
train_df['KitchenSquare'].quantile(.995), train_df['KitchenSquare'].quantile(.005)

In [None]:
# Replace 'KitchenSquare' according to the total area of the flat: the smaller
# the flat, the smaller the kitchen.
# The area tiers are joined with `&` so that they are mutually exclusive; with
# `|` the "medium" tier matched every row and the "large" tier matched nearly
# every row, which overwrote the values assigned to the smaller tiers.
condition_1 = (train_df['KitchenSquare'].isna()) \
             | (train_df['KitchenSquare'] > train_df['KitchenSquare'].quantile(.995))

# Small flats: up to 40 sq.m.
condition_2 = train_df['Square'] <= 40

# Medium flats: above 40 sq.m. and up to the mean area.
condition_3 = (train_df['Square'] > 40) \
             & (train_df['Square'] <= train_df['Square'].mean())

# Large flats: above the mean area and up to 90 sq.m.
condition_4 = (train_df['Square'] > train_df['Square'].mean()) \
             & (train_df['Square'] <= 90)

# Missing or extreme kitchens first (author's note: 20 stands for quantile(.995)).
train_df.loc[condition_1, 'KitchenSquare'] = 20

train_df.loc[condition_2, 'KitchenSquare'] = train_df['KitchenSquare'].median()

train_df.loc[condition_3, 'KitchenSquare'] = 9   # author's note: quantile(.800)

train_df.loc[condition_4, 'KitchenSquare'] = 13  # author's note: quantile(.975)

# Very large flats: above 90 sq.m.
train_df.loc[train_df['Square'] > 90, 'KitchenSquare'] = 20  # author's note: quantile(.995)

# No kitchen is allowed to be smaller than 5 sq.m.
train_df.loc[train_df['KitchenSquare'] < 5, 'KitchenSquare'] = 5

In [None]:
train_df['KitchenSquare'].value_counts()

In [None]:
#посмотрим, в домах какой этажности находятся квартиры в датасете
train_df['HouseFloor'].sort_values().unique()

In [None]:
#по состоянию на 04.08.2021 в самом высоком здании Москвы насчитывается 75 этажей, выбросами являются значения 0 и все больше 75
#таких выбросов всего 3, поэтому произведем замены их на средние значения без обозначения дополнительными признаками
train_df.loc[(train_df['HouseFloor'] == 0) | (train_df['HouseFloor'] > 75), 'HouseFloor'] = train_df['HouseFloor'].mean()

In [None]:
train_df['HouseFloor'].sort_values().unique()

In [None]:
train_df['HouseFloor'].value_counts()

In [None]:
#проверяем соответствие указаного этажа квартиры этажности дома, в котором расположена квартира, сколько квартир находятся на более
#высоком этаже, чем вместимость дома
(train_df['Floor'] > train_df['HouseFloor']).sum()

In [None]:
#вводим новый признак, обозначающий выбросы среди 'HouseFloor': "1" означает выброс, "0" не попадает под условия выброса
train_df['HouseFloor_outlier'] = 0
train_df.loc[train_df['HouseFloor'] == 0, 'HouseFloor_outlier'] = 1
train_df.loc[train_df['Floor'] > train_df['HouseFloor'], 'HouseFloor_outlier'] = 1

In [None]:
#заменим значение 'HouseFloor' в случаях, если оно равно "0" или меньше указанного в датасете этажа на значение 'Floor'.
train_df.loc[train_df['HouseFloor'] == 0, 'HouseFloor'] = train_df['Floor']
train_df.loc[train_df['Floor'] > train_df['HouseFloor'], 'HouseFloor'] = train_df['Floor']

In [None]:
train_df['HouseFloor'].value_counts()

In [None]:
train_df['HouseYear'].sort_values(ascending=False)

In [None]:
train_df.loc[train_df['HouseYear'] > 2021, 'HouseYear'] = 2021
train_df.loc[train_df['HouseYear'] <= 1900, 'HouseYear'] = 1900


In [None]:
train_df['HouseYear'].value_counts()

**3. Обработка пропусков**

In [None]:
train_df[['Square', 'LifeSquare', 'KitchenSquare']].head(10)

In [None]:
train_df['LifeSquare_nan'] = train_df['LifeSquare'].isna() * 1

condition = (train_df['LifeSquare'].isna()) \
             & (~train_df['Square'].isna()) \
             & (~train_df['KitchenSquare'].isna())
        
train_df.loc[condition, 'LifeSquare'] = train_df.loc[condition, 'Square'] \
                                            - train_df.loc[condition, 'KitchenSquare'] - 10

In [None]:
train_df[['Square', 'LifeSquare', 'KitchenSquare']].tail(20)

In [None]:
#проверяем, есть ли превышение жилой площади над общей площадью квартиры; 
#сколько объектов с подобными данными
(train_df['LifeSquare'] >= train_df['Square']).sum()

In [None]:
#вводим новый признак, обозначающий выбросы среди 'LifeSquare': "1" означает выброс, "0" не попадает под условия выброса
train_df['LifeSquare_outlier'] = 0
train_df.loc[train_df['LifeSquare'] == 0, 'LifeSquare_outlier'] = 1
train_df.loc[train_df['LifeSquare'] >= train_df['Square'], 'LifeSquare_outlier'] = 1

In [None]:
train_df['LifeSquare'].quantile(.975), train_df['LifeSquare'].quantile(.025)

In [None]:
#заменим значение 'LifeSquare' в случаях, если оно меньше или равно квантили 025 и больше или равно общей площади квартиры
train_df.loc[train_df['LifeSquare'] <= train_df['LifeSquare'].quantile(.025), 'LifeSquare'] = train_df['LifeSquare'].quantile(.025)
train_df.loc[train_df['LifeSquare'] >= train_df['Square'], 'LifeSquare'] = train_df['LifeSquare'].quantile(.350)

In [None]:
(train_df['LifeSquare'] >= train_df['Square']).sum()

In [None]:
train_df['LifeSquare'].value_counts()

In [None]:
train_df['Healthcare_1'].sort_values().unique()

In [None]:
#вводим новый признак, обозначающий выбросы среди 'Healthcare_1': "1" означает выброс, "0" не попадает под условия выброса
train_df['Healthcare_1_outlier'] = 0
train_df.loc[train_df['Healthcare_1'] == 0, 'Healthcare_1_outlier'] = 1
train_df.loc[train_df['Healthcare_1'] >= 1000, 'Healthcare_1_outlier'] = 1

In [None]:
#заменим выбросы 'Healthcare_1' на значение медианы
train_df.loc[(train_df['Healthcare_1'] == 0) | (train_df['Healthcare_1'] >= 1000), 'Healthcare_1'] = train_df['Healthcare_1'].median()

In [None]:
train_df['Healthcare_1'].value_counts()

In [None]:
train_df['Healthcare_1'].quantile(.700), train_df['Healthcare_1'].quantile(.300)

In [None]:
#заполним пропуски значениями квантили 300
fill_Hc1 = train_df['Healthcare_1'].quantile(.300)

In [None]:
train_df['Healthcare_1'] = train_df['Healthcare_1'].fillna(fill_Hc1)

In [None]:
train_df['Healthcare_1'].value_counts()

In [None]:
class DataPreprocessing:
    """Clean raw apartment features using statistics learned from a training frame.

    Call ``fit`` once on the training features to memorise medians, means and
    quantiles, then call ``transform`` on every frame (train, validation, test)
    so that all of them are repaired with the same training statistics.
    """

    def __init__(self):
        """Create an unfitted preprocessor; every statistic is ``None`` until ``fit``."""
        self.means = None
        self.medians = None
        self.kitchensquare1_quantile = None  # KitchenSquare 0.995 quantile
        self.kitchensquare2_quantile = None  # KitchenSquare 0.975 quantile
        self.kitchensquare3_quantile = None  # KitchenSquare 0.800 quantile
        self.square_quantile = None          # Square 0.975 quantile
        self.lifesquare1_quantile = None     # LifeSquare 0.350 quantile
        self.lifesquare2_quantile = None     # LifeSquare 0.025 quantile
        self.healthcare1_quantile = None     # Healthcare_1 0.300 quantile

    def fit(self, X):
        """Memorise the statistics of ``X`` that ``transform`` substitutes for bad values.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features. Must contain the columns 'KitchenSquare',
            'Square', 'LifeSquare' and 'Healthcare_1'.
        """
        # numeric_only=True keeps string columns out of the reduction; without
        # it recent pandas versions raise instead of silently skipping them.
        self.medians = X.median(numeric_only=True)
        self.kitchensquare1_quantile = X['KitchenSquare'].quantile(.995)
        self.kitchensquare2_quantile = X['KitchenSquare'].quantile(.975)
        self.kitchensquare3_quantile = X['KitchenSquare'].quantile(.800)
        self.square_quantile = X['Square'].quantile(.975)
        self.lifesquare1_quantile = X['LifeSquare'].quantile(.350)
        self.lifesquare2_quantile = X['LifeSquare'].quantile(.025)
        self.healthcare1_quantile = X['Healthcare_1'].quantile(.300)

        self.means = X.mean(numeric_only=True)

    def transform(self, X):
        """Return a cleaned copy of ``X`` with outlier / missing-value flags added.

        The input frame is left untouched. Columns added: 'Rooms_outlier',
        'Square_outlier', 'HouseFloor_outlier', 'HouseYear_outlier',
        'LifeSquare_nan', 'LifeSquare_outlier', 'Healthcare_1_outlier'.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with the same feature columns that ``fit`` received.

        Returns
        -------
        pandas.DataFrame
            The repaired copy.
        """
        # Work on a copy so that slices produced by train_test_split are not
        # modified behind the caller's back.
        X = X.copy()

        # Rooms: 0 rooms or 7+ rooms are implausible.
        X['Rooms_outlier'] = 0
        X.loc[(X['Rooms'] == 0) | (X['Rooms'] >= 7), 'Rooms_outlier'] = 1

        # Every comparison is parenthesised: `&` binds tighter than `>=`, so
        # `X['Rooms'] >= 7 & mask` would compare Rooms with `7 & mask`.
        X.loc[(X['Rooms'] == 0) & (X['Square'] <= 100), 'Rooms'] = 1
        X.loc[(X['Rooms'] == 0) & (X['Square'] > 100), 'Rooms'] = self.medians['Rooms']
        X.loc[(X['Rooms'] >= 7) & (X['Square'] <= 100), 'Rooms'] = 1
        X.loc[(X['Rooms'] >= 7) & (X['Square'] > 100), 'Rooms'] = self.medians['Rooms']

        # Square: too small -> training mean, too large -> training 0.975 quantile.
        X['Square_outlier'] = 0
        X.loc[(X['Square'] <= 31) | (X['Square'] > 300), 'Square_outlier'] = 1

        X.loc[X['Square'] <= 31, 'Square'] = self.means['Square']
        X.loc[X['Square'] > 300, 'Square'] = self.square_quantile

        # KitchenSquare: the bigger the flat, the bigger the substituted kitchen.
        # The area tiers are mutually exclusive (`&`); joined with `|` the
        # medium tier matched every row and undid the smaller tiers.
        kitchen_bad = X['KitchenSquare'].isna() \
                    | (X['KitchenSquare'] > self.kitchensquare1_quantile)
        tier_small = X['Square'] <= 40
        tier_medium = (X['Square'] > 40) & (X['Square'] <= self.means['Square'])
        tier_large = (X['Square'] > self.means['Square']) & (X['Square'] <= 90)
        tier_huge = X['Square'] > 90

        X.loc[kitchen_bad, 'KitchenSquare'] = self.kitchensquare1_quantile
        X.loc[tier_small, 'KitchenSquare'] = self.medians['KitchenSquare']
        X.loc[tier_medium, 'KitchenSquare'] = self.kitchensquare3_quantile
        X.loc[tier_large, 'KitchenSquare'] = self.kitchensquare2_quantile
        X.loc[tier_huge, 'KitchenSquare'] = self.kitchensquare1_quantile
        X.loc[X['KitchenSquare'] < 5, 'KitchenSquare'] = 5

        # HouseFloor, Floor: flag first, because the repairs below erase the evidence.
        X['HouseFloor_outlier'] = 0
        X.loc[X['HouseFloor'] == 0, 'HouseFloor_outlier'] = 1
        X.loc[X['Floor'] > X['HouseFloor'], 'HouseFloor_outlier'] = 1

        # Zero or more than 75 storeys -> training mean. The former separate
        # "0 -> Floor" step is dropped: no zeros remain after this line.
        X.loc[(X['HouseFloor'] == 0) | (X['HouseFloor'] > 75), 'HouseFloor'] = self.means['HouseFloor']
        # A flat cannot be above the top storey: raise the house to the flat's floor.
        X.loc[X['Floor'] > X['HouseFloor'], 'HouseFloor'] = X['Floor']

        # HouseYear: clamp to the range [1900, current year].
        current_year = datetime.now().year

        X['HouseYear_outlier'] = 0
        X.loc[X['HouseYear'] > current_year, 'HouseYear_outlier'] = 1
        X.loc[X['HouseYear'] <= 1900, 'HouseYear_outlier'] = 1

        X.loc[X['HouseYear'] > current_year, 'HouseYear'] = current_year
        # The clamp targets 'HouseYear' itself; the flag column must stay 0/1.
        X.loc[X['HouseYear'] <= 1900, 'HouseYear'] = 1900

        # LifeSquare: missing values are estimated as Square - KitchenSquare - 10.
        X['LifeSquare_nan'] = X['LifeSquare'].isna() * 1
        condition = (X['LifeSquare'].isna()) & \
                      (~X['Square'].isna()) & \
                      (~X['KitchenSquare'].isna())

        X.loc[condition, 'LifeSquare'] = X.loc[condition, 'Square'] - X.loc[condition, 'KitchenSquare'] - 10

        X['LifeSquare_outlier'] = 0
        X.loc[X['LifeSquare'] == 0, 'LifeSquare_outlier'] = 1
        X.loc[X['LifeSquare'] >= X['Square'], 'LifeSquare_outlier'] = 1

        # Floor the living area at the 0.025 quantile; a living area that is not
        # smaller than the total area is replaced by the 0.350 quantile.
        X.loc[X['LifeSquare'] <= self.lifesquare2_quantile, 'LifeSquare'] = self.lifesquare2_quantile
        X.loc[X['LifeSquare'] >= X['Square'], 'LifeSquare'] = self.lifesquare1_quantile

        # Healthcare_1: 0 and values from 1000 upwards -> training median,
        # missing values -> training 0.300 quantile.
        X['Healthcare_1_outlier'] = 0
        X.loc[X['Healthcare_1'] == 0, 'Healthcare_1_outlier'] = 1
        X.loc[X['Healthcare_1'] >= 1000, 'Healthcare_1_outlier'] = 1

        X.loc[(X['Healthcare_1'] == 0) | (X['Healthcare_1'] >= 1000), 'Healthcare_1'] = self.medians['Healthcare_1']
        X['Healthcare_1'] = X['Healthcare_1'].fillna(self.healthcare1_quantile)

        # Any remaining gap in a numeric column falls back to the training median.
        X = X.fillna(self.medians)

        return X

**4. Построение новых признаков**

**Dummies**

In [None]:
#переводим строковые признаки в числовые
binary_to_numbers = {'A': 0, 'B': 1}

train_df['Ecology_2'] = train_df['Ecology_2'].replace(binary_to_numbers)
train_df['Ecology_3'] = train_df['Ecology_3'].replace(binary_to_numbers)
train_df['Shops_2'] = train_df['Shops_2'].replace(binary_to_numbers)

**DistrictSize, IsDistrictLarge**

In [None]:
# Turn DistrictId into a numeric feature: the number of flats per district.
# rename_axis + reset_index(name=...) yields the columns
# ['DistrictId', 'DistrictSize'] regardless of how the installed pandas version
# names the output of value_counts().
district_size = train_df['DistrictId'].value_counts()\
                    .rename_axis('DistrictId')\
                    .reset_index(name='DistrictSize')

district_size.head(7)

In [None]:
#присоединяем к train_df
train_df = train_df.merge(district_size, on='DistrictId', how='left')
train_df.head(7)

In [None]:
#добавим новый признак и разделим квартиры в зависисмости от размеров района
(train_df['DistrictSize'] > 100).value_counts()

In [None]:
train_df['IsDistrictLarge'] = (train_df['DistrictSize'] > 100).astype(int)

Расчет переменной в зависимости от количества комнат и района расположения квартиры - **M_Price_Room_dstr**

In [None]:
m_price_room_dstr = train_df.groupby(['DistrictId', 'Rooms'], as_index=False).agg({'Price':'mean'})\
                            .rename(columns={'Price':'M_Price_Room_dstr'})

m_price_room_dstr.head(7)

In [None]:
m_price_room_dstr.shape

In [None]:
#присоединяем к train_df
train_df = train_df.merge(m_price_room_dstr, on=['DistrictId', 'Rooms'], how='left')
train_df.head(7)

**M_PriceByFloorYear** - добавим признак, кодирующий среднюю цену (целевую переменную) по категориям этажа и года постройки дома

In [None]:
def floor_to_cat(X):
    """Add an integer 'floor_cat' column that bands the 'Floor' column.

    Bands: 1 for floors up to 2, 2 for 3-5, 3 for 6-9, 4 for 10-15 and
    5 for anything above 15. Rows matching no band keep the default 0.
    The frame is modified in place and also returned.
    """
    floors = X['Floor']
    X['floor_cat'] = 0

    # (inclusive upper bound, category code); the lower bound of each band is
    # the upper bound of the previous one.
    banded_limits = ((2, 1), (5, 2), (9, 3), (15, 4))
    previous_limit = None
    for limit, code in banded_limits:
        in_band = floors <= limit
        if previous_limit is not None:
            in_band = in_band & (floors > previous_limit)
        X.loc[in_band, 'floor_cat'] = code
        previous_limit = limit

    X.loc[floors > 15, 'floor_cat'] = 5

    return X

def year_to_cat(X):
    """Add an integer 'year_cat' column that bands the 'HouseYear' column.

    Bands: 1 up to 1920, 2 for 1921-1946, 3 for 1947-1959, 4 for 1960-1989,
    5 for 1990-2009 and 6 for 2010 onwards. Rows matching no band (for
    example a missing year) keep the default 0. The frame is modified in
    place and also returned.
    """
    X['year_cat'] = 0

    # Every lower bound equals the previous upper bound, so no year falls
    # between two bands (1960 and 2010 used to be left in category 0).
    X.loc[X['HouseYear'] <= 1920, 'year_cat'] = 1
    X.loc[(X['HouseYear'] > 1920) & (X['HouseYear'] <= 1946), 'year_cat'] = 2
    X.loc[(X['HouseYear'] > 1946) & (X['HouseYear'] <= 1959), 'year_cat'] = 3
    X.loc[(X['HouseYear'] > 1959) & (X['HouseYear'] <= 1989), 'year_cat'] = 4
    X.loc[(X['HouseYear'] > 1989) & (X['HouseYear'] <= 2009), 'year_cat'] = 5
    X.loc[X['HouseYear'] > 2009, 'year_cat'] = 6

    return X

In [None]:
bins = [0, 3, 5, 9, 15, train_df['Floor'].max()]
pd.cut(train_df['Floor'], bins=bins, labels=False)

In [None]:
train_df = year_to_cat(train_df)
train_df = floor_to_cat(train_df)
train_df.head()

In [None]:
m_price_by_floor_year = train_df.groupby(['year_cat', 'floor_cat'], as_index=False).agg({'Price':'mean'}).\
                                            rename(columns={'Price':'M_PriceByFloorYear'})
m_price_by_floor_year.head(7)

In [None]:
#присоединяем к train_df
train_df = train_df.merge(m_price_by_floor_year, on=['year_cat', 'floor_cat'], how='left')
train_df.head(7)

In [None]:
class FeatureGenetator():
    """Generate district-size, categorical-band and target-encoded features.

    ``fit`` learns district sizes and, when the target is supplied, mean
    prices per (district, rooms) and per (year band, floor band).
    ``transform`` attaches those features to any frame with the same columns.
    """

    def __init__(self):
        """Create an unfitted generator; every learned table is ``None`` until ``fit``."""
        self.DistrictId_counts = None
        self.binary_to_numbers = None
        self.m_price_room_dstr = None
        self.m_price_room_dstr_mean = None
        self.m_price_by_floor_year = None
        self.m_price_by_floor_year_mean = None
        self.house_year_max = None
        self.floor_max = None
        self.district_size = None

    def fit(self, X, y=None):
        """Learn the lookup tables used by ``transform``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features; must contain 'DistrictId', 'Rooms', 'Floor'
            and 'HouseYear'.
        y : pandas.Series, optional
            Target prices aligned row by row with ``X``. Without it no
            target-encoded feature is learned.
        """
        df = X.copy()

        # Binary features
        self.binary_to_numbers = {'A': 0, 'B': 1}

        # DistrictId -> number of training flats in the district.
        # rename_axis + reset_index(name=...) gives ['DistrictId', 'DistrictSize']
        # whatever column names the installed pandas assigns to value_counts().
        self.district_size = df['DistrictId'].value_counts() \
                               .rename_axis('DistrictId') \
                               .reset_index(name='DistrictSize')

        # Target encoding
        if y is not None:
            df['Price'] = y.values

            ## District, Rooms
            self.m_price_room_dstr = df.groupby(['DistrictId', 'Rooms'], as_index=False).agg({'Price':'mean'})\
                                            .rename(columns={'Price':'M_Price_Room_dstr'})
            # Fallback for (district, rooms) pairs unseen during training.
            self.m_price_room_dstr_mean = self.m_price_room_dstr['M_Price_Room_dstr'].mean()

            ## floor, year
            self.floor_max = df['Floor'].max()
            self.house_year_max = df['HouseYear'].max()
            df = self.floor_to_cat(df)
            df = self.year_to_cat(df)
            self.m_price_by_floor_year = df.groupby(['year_cat', 'floor_cat'], as_index=False).agg({'Price':'mean'}).\
                                            rename(columns={'Price':'M_PriceByFloorYear'})
            # Fallback for (year band, floor band) pairs unseen during training.
            self.m_price_by_floor_year_mean = self.m_price_by_floor_year['M_PriceByFloorYear'].mean()

    def transform(self, X):
        """Return a copy of ``X`` extended with the generated features.

        Adds 'DistrictSize', 'new_district', 'IsDistrictLarge', 'floor_cat',
        'year_cat' and, when ``fit`` received a target, 'M_Price_Room_dstr'
        and 'M_PriceByFloorYear'. 'Ecology_2', 'Ecology_3' and 'Shops_2' are
        mapped from 'A'/'B' to 0/1. The input frame is left untouched; the
        merges give the result a fresh 0..n-1 index.
        """
        # Copy first: mapping the binary columns in place would corrupt the
        # caller's frame and turn them into NaN on a second call.
        X = X.copy()

        # Binary features
        X['Ecology_2'] = X['Ecology_2'].map(self.binary_to_numbers)  # self.binary_to_numbers = {'A': 0, 'B': 1}
        X['Ecology_3'] = X['Ecology_3'].map(self.binary_to_numbers)
        X['Shops_2'] = X['Shops_2'].map(self.binary_to_numbers)

        # DistrictId, IsDistrictLarge
        X = X.merge(self.district_size, on='DistrictId', how='left')

        # Districts absent from the training data are flagged and given size 5.
        X['new_district'] = 0
        X.loc[X['DistrictSize'].isna(), 'new_district'] = 1

        # Plain assignment instead of a chained inplace fillna, which does not
        # reliably write back to the frame.
        X['DistrictSize'] = X['DistrictSize'].fillna(5)

        X['IsDistrictLarge'] = (X['DistrictSize'] > 100).astype(int)

        # More categorical features
        X = self.floor_to_cat(X)  # adds the floor_cat column
        X = self.year_to_cat(X)   # adds the year_cat column

        # Target encoding
        if self.m_price_room_dstr is not None:
            X = X.merge(self.m_price_room_dstr, on=['DistrictId', 'Rooms'], how='left')
            X['M_Price_Room_dstr'] = X['M_Price_Room_dstr'].fillna(self.m_price_room_dstr_mean)

        if self.m_price_by_floor_year is not None:
            X = X.merge(self.m_price_by_floor_year, on=['year_cat', 'floor_cat'], how='left')
            X['M_PriceByFloorYear'] = X['M_PriceByFloorYear'].fillna(self.m_price_by_floor_year_mean)

        return X

    def floor_to_cat(self, X):
        """Add 'floor_cat' (1: <=2, 2: 3-5, 3: 6-9, 4: 10-15, 5: >15) to ``X`` in place."""
        X['floor_cat'] = 0

        X.loc[X['Floor'] <= 2, 'floor_cat'] = 1
        X.loc[(X['Floor'] > 2) & (X['Floor'] <= 5), 'floor_cat'] = 2
        X.loc[(X['Floor'] > 5) & (X['Floor'] <= 9), 'floor_cat'] = 3
        X.loc[(X['Floor'] > 9) & (X['Floor'] <= 15), 'floor_cat'] = 4
        X.loc[X['Floor'] > 15, 'floor_cat'] = 5

        return X

    def year_to_cat(self, X):
        """Add 'year_cat' (1: <=1920, 2: -1946, 3: -1959, 4: -1989, 5: -2009, 6: later) to ``X`` in place."""
        X['year_cat'] = 0

        # Each lower bound equals the previous upper bound so that 1960 and
        # 2010 are no longer left in the default category 0.
        X.loc[X['HouseYear'] <= 1920, 'year_cat'] = 1
        X.loc[(X['HouseYear'] > 1920) & (X['HouseYear'] <= 1946), 'year_cat'] = 2
        X.loc[(X['HouseYear'] > 1946) & (X['HouseYear'] <= 1959), 'year_cat'] = 3
        X.loc[(X['HouseYear'] > 1959) & (X['HouseYear'] <= 1989), 'year_cat'] = 4
        X.loc[(X['HouseYear'] > 1989) & (X['HouseYear'] <= 2009), 'year_cat'] = 5
        X.loc[X['HouseYear'] > 2009, 'year_cat'] = 6

        return X

**5. Отбор признаков**

In [None]:
train_df.columns.tolist()

In [None]:
# Original dataset columns passed to the model.
# NOTE(review): 'Helthcare_2' is spelled differently from 'Healthcare_1' used
# elsewhere in this notebook - presumably it mirrors the CSV header; verify
# against train_df.columns before renaming it.
# 'Healthcare_1' itself is not listed here, although its outlier flag is.
feature_names = ['Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear',
                 'Ecology_1', 'Ecology_2', 'Ecology_3', 'Social_1', 'Social_2', 'Social_3',
                 'Helthcare_2', 'Shops_1', 'Shops_2']

# Columns created by DataPreprocessing.transform (flags) and
# FeatureGenetator.transform (district and target-encoded features).
new_feature_names = ['Rooms_outlier', 'Square_outlier','HouseFloor_outlier', 'HouseYear_outlier', 'LifeSquare_nan', 'LifeSquare_outlier', 'Healthcare_1_outlier',
                     'DistrictSize', 'new_district', 'IsDistrictLarge',  'M_Price_Room_dstr', 'M_PriceByFloorYear']

# Column the model learns to predict.
target_name = 'Price'

**6. Разбиение на train и test**

In [None]:
train_df = pd.read_csv(TRAIN_DATASET_PATH)
test_df = pd.read_csv(TEST_DATASET_PATH)

X = train_df.drop(columns=target_name)
y = train_df[target_name]

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, shuffle=True, random_state=39)

In [None]:
preprocessor = DataPreprocessing()
preprocessor.fit(X_train)

X_train = preprocessor.transform(X_train)
X_valid = preprocessor.transform(X_valid)
test_df = preprocessor.transform(test_df)

X_train.shape, X_valid.shape, test_df.shape

In [None]:
features_gen = FeatureGenetator()
features_gen.fit(X_train, y_train)

X_train = features_gen.transform(X_train)
X_valid = features_gen.transform(X_valid)
test_df = features_gen.transform(test_df)

X_train.shape, X_valid.shape, test_df.shape

In [None]:
X_train = X_train[feature_names + new_feature_names]
X_valid = X_valid[feature_names + new_feature_names]
test_df = test_df[feature_names + new_feature_names]

In [None]:
X_train.isna().sum().sum(), X_valid.isna().sum().sum(), test_df.isna().sum().sum()

**7. Построение модели**

**Обучение**

In [None]:
# The criterion argument is omitted on purpose: mean squared error is the
# default, and its old spelling 'mse' is rejected by current scikit-learn
# releases (it was renamed to 'squared_error').
rf_model = RandomForestRegressor(random_state=39)
rf_model.fit(X_train, y_train)

**Оценка модели**

In [None]:
y_train_preds = rf_model.predict(X_train)
y_test_preds = rf_model.predict(X_valid)

evaluate_preds(y_train, y_train_preds, y_valid, y_test_preds)

In [None]:
#проверяем на кросс-валидации
cv_score = cross_val_score(rf_model, X_train, y_train, scoring='r2', cv=KFold(n_splits=3, shuffle=True, random_state=21))
cv_score

In [None]:
cv_score.mean()

In [None]:
#важность признаков, проверка "полезности" признаков
feature_importances = pd.DataFrame(zip(X_train.columns, rf_model.feature_importances_), 
                                   columns=['feature_name', 'importance'])

feature_importances.sort_values(by='importance', ascending=False)

In [None]:
from sklearn.ensemble import StackingRegressor, VotingRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

# Stack a linear model and the random forest; gradient boosting combines
# their predictions into the final estimate.
lr = LinearRegression()
# Seeded like the other models in this notebook so that re-running the cell
# reproduces the same stacked score.
gb = GradientBoostingRegressor(random_state=39)

stack = StackingRegressor([('lr', lr), ('rf', rf_model)], final_estimator=gb)
stack.fit(X_train, y_train)

In [None]:
y_train_preds = stack.predict(X_train)
y_test_preds = stack.predict(X_valid)

evaluate_preds(y_train, y_train_preds, y_valid, y_test_preds)

**8. Прогнозирование на тестовом датасете**

In [None]:
test_df

In [None]:
submit = pd.read_csv('/kaggle/input/real-estate-price-prediction/sample_submission.csv')
submit.head()

In [None]:
predictions = rf_model.predict(test_df)
predictions

In [None]:
submit['Price'] = predictions
submit.head()

In [None]:
submit.to_csv('rf_submit.csv', index=False)