In [404]:
## import numpy as np
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesRegressor

import xgboost as xgb
import lightgbm as lgb

In [405]:
# загружаю данные
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [406]:
# проверяю размерности
print(f'train: {train.shape}, test: {test.shape}')

train: (10000, 20), test: (5000, 19)


## Обучение модели

In [407]:
# сменю индекс датафрейма на индекс квартиры
# предварительно проверю, все ли значения уникальны
len(train['Id'].unique())
train = train.set_index('Id')

#### Обработка признаков

In [408]:
# оцениваю датасет по признакам
print(train.columns)
train.head()

Index(['DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor',
       'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2', 'Ecology_3',
       'Social_1', 'Social_2', 'Social_3', 'Healthcare_1', 'Helthcare_2',
       'Shops_1', 'Shops_2', 'Price'],
      dtype='object')


Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


In [409]:
# разделаю цену и набор признаков, цену кладу из Series в DataFrame
y = pd.DataFrame(data=train['Price'], columns=['Price'])
X = train.drop('Price', axis=1)

In [410]:
# проверяю цену по наполнению
y.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 14038 to 6306
Data columns (total 1 columns):
Price    10000 non-null float64
dtypes: float64(1)
memory usage: 156.2 KB


In [411]:
# проверяю признаки по наполнению
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 14038 to 6306
Data columns (total 18 columns):
DistrictId       10000 non-null int64
Rooms            10000 non-null float64
Square           10000 non-null float64
LifeSquare       7887 non-null float64
KitchenSquare    10000 non-null float64
Floor            10000 non-null int64
HouseFloor       10000 non-null float64
HouseYear        10000 non-null int64
Ecology_1        10000 non-null float64
Ecology_2        10000 non-null object
Ecology_3        10000 non-null object
Social_1         10000 non-null int64
Social_2         10000 non-null int64
Social_3         10000 non-null int64
Healthcare_1     5202 non-null float64
Helthcare_2      10000 non-null int64
Shops_1          10000 non-null int64
Shops_2          10000 non-null object
dtypes: float64(7), int64(8), object(3)
memory usage: 1.4+ MB


In [412]:
# ищу аномалии
X.describe()

Unnamed: 0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1
count,10000.0,10000.0,10000.0,7887.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,5202.0,10000.0,10000.0
mean,50.4008,1.8905,56.315775,37.199645,6.2733,8.5267,12.6094,3990.166,0.118858,24.687,5352.1574,8.0392,1142.90446,1.3195,4.2313
std,43.587592,0.839512,21.058732,86.241209,28.560917,5.241148,6.775974,200500.3,0.119025,17.532614,4006.799803,23.831875,1021.517264,1.493601,4.806341
min,0.0,0.0,1.136859,0.370619,0.0,1.0,0.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0
25%,20.0,1.0,41.774881,22.769832,1.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,350.0,0.0,1.0
50%,36.0,2.0,52.51331,32.78126,6.0,7.0,13.0,1977.0,0.075424,25.0,5285.0,2.0,900.0,1.0,3.0
75%,75.0,2.0,65.900625,45.128803,9.0,12.0,17.0,2001.0,0.195781,36.0,7227.0,5.0,1548.0,2.0,6.0
max,209.0,19.0,641.065193,7480.592129,2014.0,42.0,117.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0


##### нормализирую признак DistrictId

In [413]:
# хоть и числовое значение, признак является категориальным
# => перевожу его в категориальный вид
#X['DistrictId'] = X['DistrictId'].astype('category')

##### нормализирую признак Rooms

In [414]:
X['Rooms'].unique()

array([ 2.,  3.,  1.,  4., 10.,  0.,  5., 19.,  6.])

In [415]:
# начинаю с нулей, смотрю наблюдения с этим значением признака
X.loc[X['Rooms'] == 0]

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
12638,27,0.0,138.427694,136.215499,0.0,4,3.0,2016,0.075424,B,B,11,3097,0,,0,0,B
7917,27,0.0,212.932361,211.231125,0.0,2,3.0,2008,0.211401,B,B,9,1892,0,,0,1,B
7317,27,0.0,41.790881,,0.0,13,0.0,1977,0.211401,B,B,9,1892,0,,0,1,B
770,28,0.0,49.483501,,0.0,16,0.0,2015,0.118537,B,B,30,6207,1,1183.0,1,0,B
456,6,0.0,81.491446,,0.0,4,0.0,1977,0.243205,B,B,5,1564,0,540.0,0,0,B
3224,27,0.0,2.377248,0.873147,0.0,1,0.0,1977,0.017647,B,B,2,469,0,,0,0,B
3159,88,0.0,38.697117,19.345131,9.0,9,16.0,1982,0.127376,B,B,43,8429,3,,3,9,B
9443,27,0.0,87.762616,85.125471,0.0,5,15.0,1977,0.211401,B,B,9,1892,0,,0,1,B


In [416]:
# заполню средними значениями признака Rooms по признаку Square
# для этого узнаю, какие средние значения для известных значений признака Rooms 
print('среднее значение для кол-ва комнат 1:', X.loc[X['Rooms'] == 1].mean()['Square'])
print('среднее значение для кол-ва комнат 2:', X.loc[X['Rooms'] == 2].mean()['Square'])
print('среднее значение для кол-ва комнат 3:', X.loc[X['Rooms'] == 3].mean()['Square'])
print('среднее значение для кол-ва комнат 4:', X.loc[X['Rooms'] == 4].mean()['Square'])
print('среднее значение для кол-ва комнат 5:', X.loc[X['Rooms'] == 5].mean()['Square'])
print('среднее значение для кол-ва комнат 6:', X.loc[X['Rooms'] == 6].mean()['Square'])

среднее значение для кол-ва комнат 1: 41.32327748796116
среднее значение для кол-ва комнат 2: 56.78821360752913
среднее значение для кол-ва комнат 3: 76.90323406052026
среднее значение для кол-ва комнат 4: 98.37754356017346
среднее значение для кол-ва комнат 5: 122.61494139178704
среднее значение для кол-ва комнат 6: 59.41433379627719


In [417]:
X.columns

Index(['DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor',
       'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2', 'Ecology_3',
       'Social_1', 'Social_2', 'Social_3', 'Healthcare_1', 'Helthcare_2',
       'Shops_1', 'Shops_2'],
      dtype='object')

In [418]:
# 6-комнатная похожа на выброс, чекаю, действительно так
X.loc[X['Rooms'] == 6]

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
14003,99,6.0,59.414334,38.702244,6.0,7,9.0,1969,0.033494,B,B,66,10573,1,1322.0,3,8,B


In [419]:
# сразу поправлю кол-во комнат для этого Id по среднему Square
X.loc[X.index == 14003, 'Rooms'] = 2

In [420]:
# возвращаюсь к нулевым значеним 
# по-хорошему надо придумать правило заполнения, но не успеваю до сдачи 
X.loc[X.index == 12638, 'Rooms'] = 5
X.loc[X.index == 7917, 'Rooms'] = 5
X.loc[X.index == 7317, 'Rooms'] = 1
X.loc[X.index == 770, 'Rooms'] = 1
X.loc[X.index == 456, 'Rooms'] = 3
X.loc[X.index == 3159, 'Rooms'] = 1
X.loc[X.index == 9443, 'Rooms'] = 4
# есть одна аномалия, для которой сложно предположить что либо, поэтому удалю ее:
X = X.drop(3224)
y = y.drop(3224)

In [421]:
# далее смотрю количество комнат 10 и 16
X.loc[X['Rooms'] >= 10]

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
5927,57,10.0,59.056975,36.223072,10.0,22,22.0,2002,0.090799,B,B,74,19083,2,,5,15,B
8491,1,19.0,42.006046,21.779288,7.0,17,17.0,2014,0.007122,B,B,1,264,0,,0,1,B
14865,9,10.0,60.871266,38.420681,10.0,3,2.0,1994,0.161532,B,B,25,5648,1,30.0,2,4,B


In [422]:
# предполагаю, что кол-во комнат для такого метража 1, а 2ая цифра была опечаткой
i = [5927, 8491, 14865]
for i in i:
    X.loc[X.index == i, 'Rooms'] = 1

##### нормализирую признак Square

In [423]:
X.loc[X['Square'] <10]

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1748,88,2.0,5.497061,67.628717,1.0,24,22.0,1977,0.127376,B,B,43,8429,3,,3,9,B
11526,27,1.0,4.633498,1.969969,1.0,18,1.0,1977,0.011654,B,B,4,915,0,,0,0,B
10202,6,1.0,2.596351,4.604943,1.0,3,25.0,2014,0.243205,B,B,5,1564,0,540.0,0,0,B
10527,27,1.0,4.380726,40.805837,1.0,10,17.0,2013,0.211401,B,B,9,1892,0,,0,1,B
9487,5,1.0,5.129222,5.549458,1.0,1,1.0,1977,0.150818,B,B,16,3433,4,2643.0,4,5,B
4504,27,3.0,4.390331,5.610772,1.0,8,19.0,2016,0.211401,B,B,9,1892,0,,0,1,B
14786,1,1.0,1.136859,4.525736,1.0,3,1.0,1977,0.007122,B,B,1,264,0,,0,1,B
13265,1,3.0,4.823679,79.767964,0.0,6,17.0,1977,0.007122,B,B,1,264,0,,0,1,B
15744,34,1.0,1.988943,2.642219,1.0,21,4.0,1977,0.069753,B,B,53,13670,4,,1,11,B
6782,45,1.0,2.954309,5.257278,1.0,3,1.0,1977,0.195781,B,B,23,5212,6,,3,2,B


In [424]:
X.loc[X.index == 1748, 'Square'] = X.loc[X.index == 1748, 'Square'] * 10
X.loc[X.index == 10527, 'Square'] = X.loc[X.index == 10527, 'Square'] * 10
X.loc[X.index == 10527, 'Square'] = X.loc[X.index == 10527, 'Square'] * 10

##### нормализирую признак LifeSquare

In [425]:
X['Rooms'].unique()

array([2., 3., 1., 4., 5.])

In [426]:
# признак LifeSquare имеет пропущенные значения, заполняю средним
X['LifeSquare'] = X['LifeSquare'].fillna(X['LifeSquare'].mean())

In [427]:
# пробовал вывести среднее по кол-ву комнат, заполнить пропущенным в зависимости от комнат
# но метрика оказалась намного хуже

#room_mean_1 = X.loc[(X['Rooms'] == i)].mean()['LifeSquare']
#rooms = [1, 2, 3, 4, 5]
#for i in rooms:
#X.loc[(X['Rooms'] == i), 'LifeSquare'] = X.loc[(X['Rooms'] == i)]['LifeSquare'].fillna(X.loc[(X['Rooms'] == i)].mean()['LifeSquare'])
#X.info()

In [428]:
# проверяю экстремальные выбросы
X.loc[X['LifeSquare'] > 200]

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
14990,23,2.0,48.449873,263.54202,5.0,6,5.0,1972,0.075779,B,B,6,1437,3,,0,2,B
7917,27,5.0,212.932361,211.231125,0.0,2,3.0,2008,0.211401,B,B,9,1892,0,,0,1,B
5548,86,5.0,275.645284,233.949309,26.0,12,37.0,2011,0.161976,B,A,31,7010,5,4508.0,3,7,B
16550,27,3.0,81.694417,7480.592129,1.0,9,17.0,2016,0.017647,B,B,2,469,0,,0,0,B
2307,102,1.0,409.425181,410.639749,10.0,4,4.0,2016,0.238617,B,B,26,3889,6,705.0,3,6,B
11602,30,2.0,641.065193,638.163193,10.0,20,19.0,2019,7.8e-05,B,B,22,6398,141,1046.0,3,23,B
15886,85,3.0,78.059331,461.463614,10.0,12,16.0,1998,0.037178,B,B,52,11217,1,2300.0,1,7,B
16568,27,4.0,200.334539,201.627361,25.0,1,2.0,2013,0.041116,B,B,53,14892,4,,1,4,B


In [429]:
X.loc[X.index == 14990, 'LifeSquare'] = X.loc[X.index == 14990, 'LifeSquare'] / 10
X.loc[X.index == 16550, 'LifeSquare'] = X.loc[X.index == 16550, 'LifeSquare'] / 100
X.loc[X.index == 15886, 'LifeSquare'] = X.loc[X.index == 15886, 'LifeSquare'] / 10

##### нормализирую признак KitchenSquare

In [430]:
X['KitchenSquare'].value_counts()

1.0       2460
8.0       1306
5.0       1169
10.0      1075
6.0       1038
9.0        843
0.0        696
7.0        609
12.0       249
11.0       233
13.0        67
14.0        51
4.0         39
15.0        31
3.0         22
16.0        16
20.0        14
17.0        12
19.0        11
18.0         6
2.0          4
22.0         3
41.0         2
112.0        2
25.0         2
43.0         2
51.0         2
37.0         2
32.0         2
30.0         2
58.0         2
72.0         1
96.0         1
66.0         1
48.0         1
40.0         1
2014.0       1
35.0         1
60.0         1
78.0         1
27.0         1
84.0         1
62.0         1
42.0         1
63.0         1
39.0         1
1970.0       1
36.0         1
75.0         1
26.0         1
21.0         1
29.0         1
23.0         1
73.0         1
123.0        1
31.0         1
53.0         1
54.0         1
Name: KitchenSquare, dtype: int64

In [431]:
# смотрю аномалии
X.loc[(X['KitchenSquare'] > 50)]

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
16395,2,3.0,79.722243,44.731219,72.0,12,16.0,1987,0.130618,B,B,39,10418,9,900.0,1,9,B
14656,62,1.0,47.100719,46.44796,2014.0,4,1.0,2014,0.072158,B,B,2,629,1,,0,0,A
2371,27,2.0,68.841073,64.234956,66.0,4,2.0,2014,0.017647,B,B,2,469,0,,0,0,B
12507,54,2.0,79.810535,79.578961,78.0,10,15.0,2014,0.006076,B,B,30,5285,0,645.0,6,6,B
4265,161,2.0,53.216778,32.644859,53.0,7,17.0,1994,0.000699,B,B,14,3369,24,4129.0,0,3,B
12390,72,3.0,97.490674,99.323558,96.0,22,25.0,2019,0.210473,B,B,11,2398,2,1994.0,3,0,B
7441,62,3.0,114.734473,112.589083,112.0,3,3.0,2015,0.072158,B,B,2,629,1,,0,0,A
6508,23,2.0,67.146049,33.959154,63.0,5,17.0,2019,0.034656,B,B,0,168,0,,0,0,B
299,27,2.0,66.787523,64.616662,60.0,14,20.0,2015,0.017647,B,B,2,469,0,,0,0,B
12552,58,3.0,116.405693,113.109653,112.0,3,3.0,2016,0.437885,B,B,23,5735,3,1084.0,0,5,B


In [432]:
# опечатка, заменю средним по комнатам
X.loc[X.index == 14656, 'KitchenSquare'] = X.loc[X['Rooms'] == 1, 'KitchenSquare'].mean()
X.loc[X.index == 14679, 'KitchenSquare'] = X.loc[X['Rooms'] == 1, 'KitchenSquare'].mean()
# думаю, слишком большой метраж кухни, заменю средним по комнатам
X.loc[X.index == 16395, 'KitchenSquare'] = X.loc[X['Rooms'] == 3, 'KitchenSquare'].mean()
X.loc[X.index == 2371, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
X.loc[X.index == 12507, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
X.loc[X.index == 4265, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
X.loc[X.index == 12390, 'KitchenSquare'] = X.loc[X['Rooms'] == 3, 'KitchenSquare'].mean()
X.loc[X.index == 7441, 'KitchenSquare'] = X.loc[X['Rooms'] == 3, 'KitchenSquare'].mean()
X.loc[X.index == 6508, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
X.loc[X.index == 299, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
X.loc[X.index == 12552, 'KitchenSquare'] = X.loc[X['Rooms'] == 3, 'KitchenSquare'].mean()
X.loc[X.index == 13703, 'KitchenSquare'] = X.loc[X['Rooms'] == 1, 'KitchenSquare'].mean()
X.loc[X.index == 11739, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
X.loc[X.index == 16593, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
X.loc[X.index == 673, 'KitchenSquare'] = X.loc[X['Rooms'] == 1, 'KitchenSquare'].mean()
X.loc[X.index == 12666, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
X.loc[X.index == 4966, 'KitchenSquare'] = X.loc[X['Rooms'] == 3, 'KitchenSquare'].mean()
X.loc[X.index == 6569, 'KitchenSquare'] = X.loc[X['Rooms'] == 1, 'KitchenSquare'].mean()
X.loc[X.index == 7162, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
X.loc[X.index == 12918, 'KitchenSquare'] = X.loc[X['Rooms'] == 2, 'KitchenSquare'].mean()
X.loc[X.index == 2737, 'KitchenSquare'] = X.loc[X['Rooms'] == 3, 'KitchenSquare'].mean()

#X.loc[(X['KitchenSquare'] > 50), 'KitchenSquare'] = X.loc[(X['KitchenSquare'] > 50), 'KitchenSquare']/10


##### нормализирую признак HouseFloor

In [433]:
# смотрю аномалии
X.loc[X['HouseFloor'] > 50]

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
10806,5,1.0,51.944587,48.709601,1.0,6,99.0,1977,0.150818,B,B,16,3433,4,2643.0,4,5,B
9300,74,2.0,71.747869,74.579809,9.0,5,99.0,1977,0.075779,B,B,6,1437,3,,0,2,B
78,30,2.0,65.773749,66.811789,1.0,8,117.0,1977,7.8e-05,B,B,22,6398,141,1046.0,3,23,B


In [434]:
# в 1977 году маловероятно, что существовали жилые здания с этажностью 99 и 117
X.loc[X.index == 10806, 'HouseFloor'] = X.loc[X['HouseYear'] == 1977, 'HouseFloor'].mean()
X.loc[X.index == 9300, 'HouseFloor'] = X.loc[X['HouseYear'] == 1977, 'HouseFloor'].mean()
X.loc[X.index == 78, 'HouseFloor'] = X.loc[X['HouseYear'] == 1977, 'HouseFloor'].mean()

##### нормализирую признак HouseYear

In [435]:
# смотрю аномалии
X.loc[(X['HouseYear'] > 2020)]

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
10814,109,1.0,37.26507,20.239714,9.0,9,12.0,20052011,0.13633,B,B,30,6141,10,262.0,3,6,B
11607,147,2.0,44.791836,28.360393,5.0,4,9.0,4968,0.319809,B,B,25,4756,16,2857.0,5,8,B


In [436]:
# поправляю опечатки
X.loc[X.index == 10814, 'HouseYear'] = 2011
X.loc[X.index == 11607, 'HouseYear'] = 1968

##### нормализирую признак Ecology_2, Ecology_3

In [437]:
# перевожу категориальный признак в численный 
X['Ecology_2'].value_counts()

B    9902
A      97
Name: Ecology_2, dtype: int64

In [438]:
X['Ecology_2'] = (X['Ecology_2'] == 'B').astype(int)

In [439]:
X['Ecology_2'].value_counts()

1    9902
0      97
Name: Ecology_2, dtype: int64

In [440]:
# перевожу категориальный признак в численный 
X['Ecology_3'].value_counts()

B    9724
A     275
Name: Ecology_3, dtype: int64

In [441]:
X['Ecology_3'] = (X['Ecology_3'] == 'B').astype(int)

In [442]:
X['Ecology_3'].value_counts()

1    9724
0     275
Name: Ecology_3, dtype: int64

##### нормализирую признак Social_3

In [443]:
# перевожу категориальный признак в численный, наполняю dummies
X['Social_3'] = X['Social_3'].astype('category')

##### нормализирую признак Healthcare_1

In [444]:
# заполняю пропущенные средним
X['Healthcare_1'] = X['Healthcare_1'].fillna(X['Healthcare_1'].mean())

In [445]:
# я не знаю смысл признака Helthcare_2, попробовал его как категорийный, метрика ухудшилась
# оставлю как количественный 
#X['Helthcare_2'] = X['Helthcare_2'].astype('category')
#X = pd.get_dummies(X)

##### нормализирую признак Shops_2

In [446]:
# перевожу категориальный признак в численный 
X['Shops_2'].value_counts()

B    9174
A     825
Name: Shops_2, dtype: int64

In [447]:
X['Shops_2'] = (X['Shops_2'] == 'B').astype(int)

In [448]:
X['Shops_2'].value_counts()

1    9174
0     825
Name: Shops_2, dtype: int64

In [449]:
# теперь заполняю dummies
X = pd.get_dummies(X)

In [450]:
X.columns

Index(['DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor',
       'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2', 'Ecology_3',
       'Social_1', 'Social_2', 'Healthcare_1', 'Helthcare_2', 'Shops_1',
       'Shops_2', 'Social_3_0', 'Social_3_1', 'Social_3_2', 'Social_3_3',
       'Social_3_4', 'Social_3_5', 'Social_3_6', 'Social_3_7', 'Social_3_8',
       'Social_3_9', 'Social_3_10', 'Social_3_11', 'Social_3_14',
       'Social_3_16', 'Social_3_19', 'Social_3_20', 'Social_3_23',
       'Social_3_24', 'Social_3_27', 'Social_3_37', 'Social_3_39',
       'Social_3_45', 'Social_3_48', 'Social_3_56', 'Social_3_59',
       'Social_3_73', 'Social_3_84', 'Social_3_87', 'Social_3_93',
       'Social_3_141'],
      dtype='object')

#### Подготовка формы для модели

In [451]:
def create_model(X_train, y_train, X_test, y_test, model):
    model.fit(X_train, y_train)
    y_pred=model.predict(X_test)
    r2 = r2_score(y_test, y_pred) 
    print(f'r2 = {r2}')

#### Обучение модели на train dataset на линейной регрессии

In [452]:
# разбиваю train на тренировочный и тестовый датасеты
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

In [453]:
model = LinearRegression()

In [454]:
create_model(X_train, y_train, X_valid, y_valid, model)

r2 = 0.5476227208250088


#### Нормализация признаков 

In [455]:
scaler = StandardScaler()

In [456]:
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [457]:
X_valid_scaled = scaler.transform(X_valid)
X_valid_scaled = pd.DataFrame(X_valid_scaled, columns=X_valid.columns)

  """Entry point for launching an IPython kernel.


In [458]:
create_model(X_train_scaled, y_train, X_valid_scaled, y_valid, model) 

r2 = 0.5476488047651473


#### Lasso, Ridge

In [459]:
model = Lasso(alpha=100) 

In [460]:
create_model(X_train_scaled, y_train, X_valid_scaled, y_valid, model)

r2 = 0.54807219552617


In [461]:
model = Ridge(alpha=0.1)

In [462]:
create_model(X_train_scaled, y_train, X_valid_scaled, y_valid, model)

r2 = 0.5476241067835449


#### Decision Tree / Random Forest

In [463]:
model = RandomForestRegressor(max_depth=400, random_state=42, n_estimators=1000)

In [464]:
y_train = y_train.values.ravel()

In [465]:
create_model(X_train_scaled, y_train, X_valid_scaled, y_valid, model) 

r2 = 0.7201805535832249


#### ExtraTreesRegressor

In [466]:
model = ExtraTreesRegressor(max_depth=400, random_state=42, n_estimators=1000)

In [467]:
create_model(X_train_scaled, y_train, X_valid_scaled, y_valid, model) 

r2 = 0.7194537565725543


#### XGBoost
Хотя мы и не проходили, применю XGBoost, параметры подбирал вручную, т.к. мало времени было сделать нормальную сетку.

In [468]:
boost = xgb.XGBRegressor(colsample_bytree=0.5, gamma=0.0, 
                             learning_rate=0.005, max_depth=7, 
                             min_child_weight=0.5, n_estimators=5800,
                             reg_alpha=0.9, reg_lambda=0.99,
                             subsample=0.99,seed=42, silent=1,
                             random_state=42)

In [469]:
%%time
create_model(X_train_scaled, y_train, X_valid_scaled, y_valid, boost)

r2 = 0.7469975831366353
Wall time: 41.5 s


#### LGBoost
Попробую еще одну модель - она быстрее

In [470]:
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.1, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =10, min_sum_hessian_in_leaf = 11)

In [471]:
%%time
create_model(X_train_scaled, y_train, X_valid_scaled, y_valid, model_lgb)

r2 = 0.7173347182002301
Wall time: 329 ms


##### итого - наилучший результат у XGBoost, его и используем для предсказания

## Предсказание модели

далее - по аналогии с обучением - очищаю признаки и применяю модель XGBoost, показавшую лучшую метрику

In [472]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,B,B,11,2748,1,,0,0,B
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,B,B,6,1437,3,,0,2,B
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.0,B,B,30,7538,87,4702.0,5,5,B
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,B,B,23,4583,3,,3,3,B
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,B,B,2,629,1,,0,0,A


In [473]:
test = test.set_index('Id')

In [474]:
# проверяю признаки по наполнению
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 725 to 12504
Data columns (total 18 columns):
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
dtypes: float64(7), int64(8), object(3)
memory usage: 742.2+ KB


In [475]:
# ищу аномалии
test.describe()

Unnamed: 0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1
count,5000.0,5000.0,5000.0,3959.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,2623.0,5000.0,5000.0
mean,51.2792,1.91,56.4495,36.15881,5.9768,8.632,12.601,1984.3926,0.119874,24.9338,5406.9,8.2626,1146.657263,1.3194,4.2428
std,44.179466,0.838594,19.092787,17.825287,9.950018,5.483228,6.789213,18.573149,0.12007,17.532202,4026.614773,23.863762,1044.744231,1.47994,4.777365
min,0.0,0.0,1.378543,0.33349,0.0,1.0,0.0,1908.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0
25%,21.0,1.0,41.906231,23.092026,1.0,4.0,9.0,1973.0,0.019509,6.0,1564.0,0.0,325.0,0.0,1.0
50%,37.0,2.0,52.92134,32.925087,6.0,7.0,12.0,1977.0,0.072158,25.0,5285.0,2.0,900.0,1.0,3.0
75%,77.0,2.0,66.285129,45.174091,9.0,12.0,17.0,2000.0,0.195781,36.0,7287.0,5.0,1548.0,2.0,6.0
max,212.0,17.0,223.453689,303.071094,620.0,78.0,99.0,2020.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0


In [476]:
#test['DistrictId'] = test['DistrictId'].astype('category')

In [477]:
test['Rooms'].unique()

array([ 2.,  1.,  3.,  4.,  5.,  6.,  0., 17.])

In [478]:
test.loc[test['Rooms'] == 0]

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3343,58,0.0,116.824201,113.692424,0.0,3,3.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B
10729,27,0.0,76.345154,42.820796,12.0,14,0.0,1977,0.017647,B,B,2,469,0,,0,0,B


In [479]:
# заменяю средними из датасета train

In [480]:
test.loc[test.index == 3343, 'Rooms'] = 5
test.loc[test.index == 10729, 'Rooms'] = 3

In [481]:
# в этом случае 6-комнатные выглядят нормально
test.loc[test['Rooms'] == 6]

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
10793,23,6.0,110.750226,,0.0,2,2.0,2015,0.014073,B,B,2,475,0,,0,0,B
4058,27,6.0,223.453689,104.113552,16.0,2,2.0,2017,0.041116,B,B,53,14892,4,,1,4,B


In [482]:
test.loc[test['Rooms'] == 17]

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1435,111,17.0,52.866107,32.528342,8.0,15,17.0,1987,0.093443,B,B,23,4635,5,3300.0,2,4,B


In [483]:
test.loc[test.index == 1435, 'Rooms'] = 2

In [484]:
test.loc[test['Rooms'] == 0]

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1


In [485]:
test['LifeSquare'] = test['LifeSquare'].fillna(test['LifeSquare'].mean())

In [486]:
test.loc[test['LifeSquare'] > 200]

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
11533,94,2.0,48.713443,303.071094,6.0,5,12.0,1974,0.521867,B,B,25,6149,0,,0,0,B


In [487]:
test.loc[test.index == 11533, 'LifeSquare'] = test.loc[test.index == 11533, 'LifeSquare'] / 10

In [488]:
test.loc[(test['KitchenSquare'] > 50)]

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
5428,27,2.0,62.326044,36.15881,61.0,12,17.0,1977,0.072158,B,B,2,629,1,,0,0,A
5260,73,3.0,69.358242,51.247581,65.0,6,6.0,1931,0.042032,B,B,37,6856,84,1940.0,2,5,B
3341,62,3.0,112.114019,112.247841,112.0,3,3.0,2017,0.072158,B,B,2,629,1,,0,0,A
14594,11,2.0,42.795304,24.22377,620.0,11,14.0,1972,0.038693,B,B,28,6533,1,1015.0,2,5,B
12612,27,2.0,60.988496,33.646726,60.0,5,17.0,2013,0.072158,B,B,2,629,1,,0,0,A
8015,27,1.0,66.099096,33.639611,62.0,3,7.0,2016,0.014058,B,B,1,290,0,,0,0,B
5199,27,2.0,59.05499,61.647531,57.0,13,12.0,2016,0.211401,B,B,9,1892,0,,0,1,B
12640,6,2.0,54.629142,31.486308,97.0,4,17.0,2015,0.243205,B,B,5,1564,0,540.0,0,0,B


In [489]:
test.loc[test.index == 14594, 'KitchenSquare'] = test.loc[test.index == 14594, 'KitchenSquare'] / 100
test.loc[(test['KitchenSquare'] > 50), 'KitchenSquare'] = test.loc[(test['KitchenSquare'] > 50), 'KitchenSquare']/10

In [490]:
test.loc[test['HouseFloor'] > 50]

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
15864,27,3.0,47.722835,47.098813,9.0,18,99.0,1977,0.072158,B,B,2,629,1,,0,0,A


In [491]:
test.loc[test.index == 15864, 'HouseFloor'] = test.loc[test['HouseYear'] == 1977, 'HouseFloor'].mean()

In [492]:
test.loc[(test['HouseYear'] > 2020)]

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1


In [493]:
test['Ecology_2'] = (test['Ecology_2'] == 'B').astype(int)
test['Ecology_2'].value_counts()

1    4952
0      48
Name: Ecology_2, dtype: int64

In [494]:
test['Ecology_3'] = (test['Ecology_3'] == 'B').astype(int)
test['Ecology_3'].value_counts()

1    4851
0     149
Name: Ecology_3, dtype: int64

In [495]:
test['Social_3'] = test['Social_3'].astype('category')

In [496]:
test['Healthcare_1'] = test['Healthcare_1'].fillna(test['Healthcare_1'].mean())

In [497]:
test['Shops_2'] = (test['Shops_2'] == 'B').astype(int)
test['Shops_2'].value_counts()

1    4588
0     412
Name: Shops_2, dtype: int64

In [498]:
test = pd.get_dummies(test)

In [499]:
test.columns

Index(['DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor',
       'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2', 'Ecology_3',
       'Social_1', 'Social_2', 'Healthcare_1', 'Helthcare_2', 'Shops_1',
       'Shops_2', 'Social_3_0', 'Social_3_1', 'Social_3_2', 'Social_3_3',
       'Social_3_4', 'Social_3_5', 'Social_3_6', 'Social_3_7', 'Social_3_8',
       'Social_3_9', 'Social_3_10', 'Social_3_11', 'Social_3_14',
       'Social_3_16', 'Social_3_19', 'Social_3_20', 'Social_3_23',
       'Social_3_24', 'Social_3_27', 'Social_3_37', 'Social_3_39',
       'Social_3_45', 'Social_3_48', 'Social_3_56', 'Social_3_59',
       'Social_3_73', 'Social_3_84', 'Social_3_87', 'Social_3_93',
       'Social_3_141'],
      dtype='object')

In [500]:
test_scaled = scaler.fit_transform(test)
test_scaled = pd.DataFrame(test_scaled, columns=test.columns)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [501]:
%%time
y_result = boost.predict(test_scaled)

Wall time: 4.46 s


In [502]:
submit = pd.DataFrame(list(zip(test.index,y_result)), columns = ['Id', 'Predicted_price'])

In [503]:
submit.shape

(5000, 2)

In [504]:
submit.head(30)

Unnamed: 0,Id,Predicted_price
0,725,157575.6875
1,15856,222225.09375
2,5480,297877.5625
3,15664,363360.3125
4,14275,143862.71875
5,7633,211490.453125
6,13329,175857.8125
7,5502,226078.078125
8,4220,285359.71875
9,11538,184513.84375


In [505]:
submit.to_csv('GKabanov_predictions.csv', sep=',', index=False)