In [1]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score as r2
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
#загрузка датасета
TRAIN_DATASET = 'Desktop/Artyom_DS/Project/train.csv'
TEST_DATASET = 'Desktop/Artyom_DS/Project/test.csv'

# Начинаю работу с train

In [3]:
#чтение, демонстрация и подсчет количества столбцов/строк датасета "train"
train_df = pd.read_csv(TRAIN_DATASET)
display(train_df.head())
print(train_df.shape)

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


(10000, 20)


In [4]:
#просмотр типов данных
train_df.dtypes

Id                 int64
DistrictId         int64
Rooms            float64
Square           float64
LifeSquare       float64
KitchenSquare    float64
Floor              int64
HouseFloor       float64
HouseYear          int64
Ecology_1        float64
Ecology_2         object
Ecology_3         object
Social_1           int64
Social_2           int64
Social_3           int64
Healthcare_1     float64
Helthcare_2        int64
Shops_1            int64
Shops_2           object
Price            float64
dtype: object

In [5]:
#смотрю только количественные переменные
train_df_num_features = train_df.select_dtypes(include=['float64', 'int64'])
train_df_num_features.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,33,7976,5,,0,11,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,46,10309,1,240.0,1,16,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,34,7759,0,229.0,1,3,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,23,5735,3,1084.0,0,5,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,35,5776,1,2078.0,2,4,150226.531644


In [6]:
#корреляция
train_df_num_features.corr()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
Id,1.0,0.012973,-0.005847,-0.010071,0.018449,0.01988,0.001348,-0.008376,0.005004,0.018097,-0.000772,-0.002033,-0.009358,-0.003879,0.001502,-0.008718,0.00988
DistrictId,0.012973,1.0,0.071432,-0.026613,-0.01991,0.040358,-0.120373,-0.149051,0.01343,0.065294,0.246463,0.167479,0.136095,0.304335,0.306147,0.174214,0.2651
Rooms,-0.005847,0.071432,1.0,0.662893,0.131336,0.005123,-0.000665,-0.029302,-0.010612,-0.032347,0.07598,0.071335,0.012811,0.042857,0.063557,0.053618,0.550291
Square,-0.010071,-0.026613,0.662893,1.0,0.196129,0.00832,0.114791,0.081505,-0.009032,-0.064479,-0.07069,-0.04312,0.035241,-0.039748,-0.02296,0.021357,0.520075
LifeSquare,0.018449,-0.01991,0.131336,0.196129,1.0,-0.001195,0.024559,0.027442,-0.00221,-0.023629,-0.048363,-0.039283,0.012763,-0.026867,-0.024762,-0.007569,0.081292
KitchenSquare,0.01988,0.040358,0.005123,0.00832,-0.001195,1.0,-0.011397,0.00078,0.000958,-0.005622,0.043379,0.037805,-0.01514,0.009472,0.04317,0.010216,0.028864
Floor,0.001348,-0.120373,-0.000665,0.114791,0.024559,-0.011397,1.0,0.418986,0.000928,-0.016133,-0.044914,-0.01656,-0.002237,-0.138294,-0.065537,0.024264,0.128715
HouseFloor,-0.008376,-0.149051,-0.029302,0.081505,0.027442,0.00078,0.418986,1.0,-0.000864,-0.004362,-0.020801,0.007194,-0.008137,-0.143973,-0.068728,0.026279,0.08828
HouseYear,0.005004,0.01343,-0.010612,-0.009032,-0.00221,0.000958,0.000928,-0.000864,1.0,0.001465,0.003026,0.00197,0.000819,-0.011969,0.011245,0.003681,0.004305
Ecology_1,0.018097,0.065294,-0.032347,-0.064479,-0.023629,-0.005622,-0.016133,-0.004362,0.001465,1.0,0.026464,0.009264,-0.124068,-0.043547,0.030873,-0.076749,-0.058381


In [7]:
#приведение столбца DistrictId к типу object
train_df['DistrictId'] = train_df['DistrictId'].astype(str)

In [8]:
#анализирую данные
train_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,10000.0,8383.4077,4859.01902,0.0,4169.5,8394.5,12592.5,16798.0
Rooms,10000.0,1.8905,0.839512,0.0,1.0,2.0,2.0,19.0
Square,10000.0,56.315775,21.058732,1.136859,41.774881,52.51331,65.900625,641.0652
LifeSquare,7887.0,37.199645,86.241209,0.370619,22.769832,32.78126,45.128803,7480.592
KitchenSquare,10000.0,6.2733,28.560917,0.0,1.0,6.0,9.0,2014.0
Floor,10000.0,8.5267,5.241148,1.0,4.0,7.0,12.0,42.0
HouseFloor,10000.0,12.6094,6.775974,0.0,9.0,13.0,17.0,117.0
HouseYear,10000.0,3990.1663,200500.261427,1910.0,1974.0,1977.0,2001.0,20052010.0
Ecology_1,10000.0,0.118858,0.119025,0.0,0.017647,0.075424,0.195781,0.5218671
Social_1,10000.0,24.687,17.532614,0.0,6.0,25.0,36.0,74.0


In [9]:
#смотрю типы комнат и их количество в train
train_df['Rooms'].value_counts()

2.0     3880
1.0     3705
3.0     2235
4.0      150
5.0       18
0.0        8
10.0       2
19.0       1
6.0        1
Name: Rooms, dtype: int64

In [10]:
#заменяю "странное" количество комнат на медиану
train_df.loc[train_df['Rooms'].isin([0, 10, 19]), 'Rooms'] = train_df['Rooms'].median()
#теперь двушек стало чуть больше
train_df['Rooms'].value_counts()

2.0    3891
1.0    3705
3.0    2235
4.0     150
5.0      18
6.0       1
Name: Rooms, dtype: int64

In [11]:
#работаю с площадью
train_df.loc[train_df['LifeSquare'] > train_df['Square'], 'LifeSquare'] = train_df['Square'].mean() / 1.55
train_df.loc[train_df['KitchenSquare'] > train_df['LifeSquare'], 'KitchenSquare'] = train_df['Square'].mean() / 8.35
train_df.loc[train_df['KitchenSquare'] > train_df['Square'], 'KitchenSquare'] = train_df['Square'].mean() / 8.35


train_df.loc[(train_df['LifeSquare'].isnull()) | (train_df['LifeSquare'] > train_df['LifeSquare'].quantile(.98)) |
             (train_df['LifeSquare'] < train_df['LifeSquare'].quantile(.02)), 'LifeSquare'] = train_df['LifeSquare'].median()

In [12]:
#заменяю подозрительные года на логичные
train_df.loc[train_df['HouseYear'] == 20052011, 'HouseYear'] = 2011
train_df.loc[train_df['HouseYear'] == 4968, 'HouseYear'] = 1968

#заменил нулевые этажи на медиану
train_df.loc[train_df['HouseFloor'].isin([0]), 'HouseFloor'] = train_df['HouseFloor'].median()

#меняю этаж
train_df.loc[train_df['HouseFloor'] == 117, 'HouseFloor'] = 17

In [13]:
#смотрю количество пропусков в данных
train_df.isnull().sum()

Id                  0
DistrictId          0
Rooms               0
Square              0
LifeSquare          0
KitchenSquare       0
Floor               0
HouseFloor          0
HouseYear           0
Ecology_1           0
Ecology_2           0
Ecology_3           0
Social_1            0
Social_2            0
Social_3            0
Healthcare_1     4798
Helthcare_2         0
Shops_1             0
Shops_2             0
Price               0
dtype: int64

In [14]:
#заменяю категориальные данные на числовые
train_df['Ecology_2_bin'] = train_df['Ecology_2'].replace({'A':0, 'B':1})
train_df['Ecology_3_bin'] = train_df['Ecology_3'].replace({'A':0, 'B':1})
train_df['Shops_2_bin'] = train_df['Shops_2'].replace({'A':0, 'B':1})

In [15]:
#проверил замену на примере Ecology_2
# pd.get_dummies(train_df['Ecology_2']).head()

In [16]:
#удалил ненужные столбцы
train_df.drop(['DistrictId', 'Ecology_2', 'Ecology_3', 'Healthcare_1', 'Shops_2'], axis=1) 

Unnamed: 0,Id,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Helthcare_2,Shops_1,Price,Ecology_2_bin,Ecology_3_bin,Shops_2_bin
0,14038,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.089040,33,7976,5,0,11,184966.930730,1,1,1
1,15053,3.0,65.683640,40.049543,8.0,7,9.0,1978,0.000070,46,10309,1,1,16,300009.450063,1,1,1
2,4765,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,34,7759,0,1,3,220925.908524,1,1,1
3,5809,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,23,5735,3,0,5,175616.227217,1,1,1
4,10783,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,35,5776,1,2,4,150226.531644,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,77,2.0,50.401785,30.476203,5.0,6,5.0,1968,0.135650,46,7960,6,3,11,196684.316040,1,1,1
9996,6159,1.0,41.521546,20.539216,9.0,13,13.0,2000,0.000000,30,5562,0,0,5,189050.289571,1,1,0
9997,5123,1.0,47.939008,32.840142,1.0,12,16.0,2015,0.072158,2,629,1,0,0,159143.805370,1,1,0
9998,5400,2.0,43.602562,33.840147,8.0,1,5.0,1961,0.307467,30,5048,9,2,5,181595.339808,1,0,1


# Начинаю работу с test

In [17]:
#чтение, демонстрация и подсчет количества столбцов/строк датасета "test"
test_df = pd.read_csv(TEST_DATASET)
display(test_df.tail())
print(test_df.shape)

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
4995,8180,11,3.0,67.133911,50.809797,6.0,5,9.0,1973,0.00017,B,B,36,5992,0,,1,1,B
4996,4695,1,1.0,40.198472,21.807061,10.0,12,17.0,2017,0.007122,B,B,1,264,0,,0,1,B
4997,5783,12,3.0,77.842178,48.282625,9.0,23,22.0,1989,0.090799,B,B,74,19083,2,,5,15,B
4998,4780,62,2.0,81.305222,,0.0,4,0.0,1977,0.072158,B,B,2,629,1,,0,0,A
4999,12504,30,2.0,60.555693,,1.0,10,17.0,1977,7.8e-05,B,B,22,6398,141,1046.0,3,23,B


(5000, 19)


In [18]:
# test_df.describe().T

In [19]:
#работаю с площадью
test_df.loc[test_df['LifeSquare'] > test_df['Square'], 'LifeSquare'] = test_df['Square'].mean() / 1.55
test_df.loc[test_df['KitchenSquare'] > test_df['LifeSquare'], 'KitchenSquare'] = test_df['Square'].mean() / 8.35
test_df.loc[test_df['KitchenSquare'] > test_df['Square'], 'KitchenSquare'] = test_df['Square'].mean() / 8.35

test_df.loc[(test_df['LifeSquare'].isnull()) | (test_df['LifeSquare'] > test_df['LifeSquare'].quantile(.98)) |
            (test_df['LifeSquare'] < test_df['LifeSquare'].quantile(.02)), 'LifeSquare'] = test_df['LifeSquare'].median()

In [20]:
#смотрю типы комнат и их количество
test_df['Rooms'].value_counts()

2.0     2030
1.0     1769
3.0     1099
4.0       90
5.0        7
0.0        2
6.0        2
17.0       1
Name: Rooms, dtype: int64

In [21]:
#заменяю "странное" количество комнат на медиану
test_df.loc[test_df['Rooms'].isin([0, 17]), 'Rooms'] = test_df['Rooms'].median()

#меняю этаж
test_df.loc[test_df['HouseFloor'].isin([0, 99]), 'HouseFloor'] = test_df['HouseFloor'].median()

In [22]:
#заменяю категориальные данные на числовые
test_df['Ecology_2_bin'] = test_df['Ecology_2'].replace({'A':0, 'B':1})
test_df['Ecology_3_bin'] = test_df['Ecology_3'].replace({'A':0, 'B':1})
test_df['Shops_2_bin'] = test_df['Shops_2'].replace({'A':0, 'B':1})

In [23]:
# test_df.loc[test_df['Healthcare_1'].isnull(), 'Healthcare_1'] = test_df['Healthcare_1'].median()

In [24]:
test_df.drop(['DistrictId', 'Ecology_2', 'Ecology_3', 'Healthcare_1', 'Shops_2'], axis=1) 

Unnamed: 0,Id,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Helthcare_2,Shops_1,Ecology_2_bin,Ecology_3_bin,Shops_2_bin
0,725,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,11,2748,1,0,0,1,1,1
1,15856,2.0,69.263183,32.995436,1.0,6,1.0,1977,0.075779,6,1437,3,0,2,1,1,1
2,5480,1.0,13.597819,36.419032,12.0,2,5.0,1909,0.000000,30,7538,87,5,5,1,1,1
3,15664,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,23,4583,3,3,3,1,1,1
4,14275,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,2,629,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,8180,3.0,67.133911,50.809797,6.0,5,9.0,1973,0.000170,36,5992,0,1,1,1,1,1
4996,4695,1.0,40.198472,21.807061,10.0,12,17.0,2017,0.007122,1,264,0,0,1,1,1,1
4997,5783,3.0,77.842178,48.282625,9.0,23,22.0,1989,0.090799,74,19083,2,5,15,1,1,1
4998,4780,2.0,81.305222,32.995436,0.0,4,12.0,1977,0.072158,2,629,1,0,0,1,1,0


# Создаю модель

In [25]:
#указываю признаки для формирования модели
feature_names = ['Rooms', 'Square', 'KitchenSquare', 'Floor', 'LifeSquare', 'HouseFloor', 'HouseYear', 'Ecology_1', 
                 'Ecology_2_bin', 'Ecology_3_bin', 'Social_1', 'Social_2', 'Social_3', 'Helthcare_2', 'Shops_1', 'Shops_2_bin']
target_name = ['Price']

In [26]:
x = train_df[feature_names]
y = train_df[target_name]

In [27]:
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.25, shuffle=True, random_state=18)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_train_scaled = pd.DataFrame(x_train_scaled, columns=feature_names)
x_valid_scaled = pd.DataFrame(x_valid_scaled, columns=feature_names)

rf = RandomForestRegressor(random_state=18, criterion='mse', n_estimators=100)
rf.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=18, verbose=0, warm_start=False)

In [28]:
#обучаю на всё датасете
rf.fit(train_df[feature_names], train_df[target_name])

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=18, verbose=0, warm_start=False)

In [29]:
def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):
    print("Train R2:\t" + str(round(r2(train_true_values, train_pred_values), 5)))
    print("Valid R2:\t" + str(round(r2(test_true_values, test_pred_values), 5)))

In [30]:
y_train_preds = rf.predict(x_train)
y_test_preds = rf.predict(x_valid)

evaluate_preds(y_train, y_train_preds, y_valid, y_test_preds)

Train R2:	0.96196
Valid R2:	0.96541


In [31]:
feature_names_test = ['Rooms', 'Square', 'KitchenSquare', 'Floor', 'LifeSquare', 'HouseFloor', 'HouseYear', 'Ecology_1', 
                 'Ecology_2_bin', 'Ecology_3_bin', 'Social_1', 'Social_2', 'Social_3', 'Helthcare_2', 'Shops_1', 'Shops_2_bin']

In [32]:
x_test = test_df[feature_names]
y_test_preds = rf.predict(x_test)

In [33]:
#важность признаков
feature_importances = pd.DataFrame(zip(x_train.columns, rf.feature_importances_), 
                                   columns=['feature_name', 'importance'])

feature_importances.sort_values(by='importance', ascending=False)

Unnamed: 0,feature_name,importance
1,Square,0.397
10,Social_1,0.102254
11,Social_2,0.100023
0,Rooms,0.077471
12,Social_3,0.065653
7,Ecology_1,0.051687
6,HouseYear,0.044055
4,LifeSquare,0.036226
3,Floor,0.03181
2,KitchenSquare,0.030653


In [34]:
Result=pd.DataFrame({'id' : test_df['Id'], 'Price': y_test_preds})

In [35]:
Result.to_csv('Desktop/Artyom_DS/Project/Artyom_Volodin_preds.csv', index=False, encoding='utf-8')

In [36]:
#train_df['Square'].describe().T

In [37]:
#train_df['LifeSquare'].describe().T

In [38]:
#train_df['KitchenSquare'].describe().T

In [39]:
#строчка для работы с площадью

#train_df['LifeSquare'].median = np.where((train_df['LifeSquare'] > train_df['Square']), train_df['LifeSquare'], train_df['Square'])

In [40]:
#train_df['Square'].mean() / train_df['LifeSquare'].mean()