In [164]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import GridSearchCV

In [165]:
df = pd.read_csv('flats_features.csv')

In [166]:
X = df.drop({'price'}, axis=1)
Y = df['price']
X.set_index(X['id'], inplace=True)
X = X.drop('id', axis=1)

In [167]:
records_count = Y.count()
kf = KFold(n=records_count, n_folds=7, shuffle=True, random_state=42)

for k in range(1, 75, 2):
    clf = RandomForestRegressor(n_estimators=k, random_state=0)
    quality = cross_val_score(clf, X, Y, scoring='r2', cv=kf).mean()
    print (k, quality)

1 0.73101160452
3 0.817924996731
5 0.837341816297
7 0.842190501307
9 0.844217131488
11 0.848141340488
13 0.848809459233
15 0.850567449424
17 0.851057965246
19 0.851885088716
21 0.853253508189
23 0.85413100133
25 0.854452277583
27 0.855231175176
29 0.856498491269
31 0.856820353712
33 0.856833165936
35 0.856827187427
37 0.85712404945
39 0.856914234498
41 0.857066924134
43 0.857390601585
45 0.85743561009
47 0.857408363676
49 0.857745662672
51 0.857616391584
53 0.857545239669
55 0.857573382665
57 0.857858607974
59 0.857827542369
61 0.858033111067
63 0.858169983219
65 0.858438320873
67 0.858412276599
69 0.858503761605
71 0.858900960252
73 0.85906458944


In [192]:
clf = RandomForestRegressor(n_estimators=73, random_state=42)
clf.fit(X, Y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=73, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [193]:
features = X.columns.values
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

indices = indices[:10]
for i in range(0, len(indices)):
    print(str(features[indices[i]]) + " " + str(importances[indices[i]]))

index 0.802349374935
area 0.155785095022
total_floor 0.00800506287623
living_area 0.00501798861488
id 0.0029108940069
underground=щукинская 0.00247128997268
kitchen_area 0.00193358152997
has_loggia 0.00190922731792
combined_bathroom_count 0.00181677589114
is_center 0.00152502396922


In [194]:
predictions = pd.DataFrame(clf.predict(X))[0]
X = X.reset_index()

In [196]:
res_info = pd.DataFrame(columns=[u'Ошибка,%',u'Ошибка,$',u'Цена м.кв.'])
for i in Y.index:
    error = (Y[i] - predictions[i])
    rel_error = error/predictions[i]*100
    res_info.loc[i] = pd.Series({
            u'Ошибка,%':round(rel_error,1),
            u'Ошибка,$':int(error),
            u'Цена м.кв.':int(Y[i] / X['area'][i])
    })

In [202]:
predictions = pd.DataFrame(predictions)

In [204]:
predictions = predictions.set_index(X['id'])

In [207]:
res_info = res_info.set_index(X['id'])

In [208]:
res_info

Unnamed: 0_level_0,"Ошибка,%","Ошибка,$",Цена м.кв.
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
158139251,-33.5,-14082.0,1555.0
165841067,5.2,2945.0,1714.0
165361751,18.1,17602.0,1575.0
164117640,10.9,13794.0,1428.0
165863255,-44.6,-14465.0,473.0
164709012,-21.7,-5000.0,600.0
164698394,-7.1,-6506.0,696.0
165858655,-4.4,-3643.0,1454.0
164196724,-6.3,-5356.0,1000.0
166018225,-2.5,-2054.0,1818.0


In [209]:
res_info.sort_values(by=u'Ошибка,%')[:5]

Unnamed: 0_level_0,"Ошибка,%","Ошибка,$",Цена м.кв.
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
165999010,-49.9,-16959.0,849.0
165842779,-48.4,-9369.0,333.0
165863255,-44.6,-14465.0,473.0
165979243,-41.0,-11808.0,425.0
158139251,-33.5,-14082.0,1555.0


In [210]:
res_info.sort_values(by=u'Ошибка,%', ascending=False)[:5]

Unnamed: 0_level_0,"Ошибка,%","Ошибка,$",Цена м.кв.
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
163828377,29.0,44931.0,1538.0
164665343,28.9,11219.0,1428.0
163556579,25.8,40949.0,2498.0
163557679,24.6,16780.0,1666.0
163557812,23.1,33602.0,716.0


In [214]:
X.set_index(X['id'], inplace=True)
X = X.drop('id', axis=1)

In [216]:
X.loc[163828377]

level_0                             29.000000
index                               29.000000
area                               130.000000
combined_bathroom_count              2.000000
construction_year                 2007.000000
kitchen_area                        12.969304
living_area                         54.743862
rooms                                4.000000
has_balcony                          0.000000
has_loggia                           1.000000
curr_floor                          15.000000
total_floor                         15.000000
is_center                            0.000000
underground=авиамоторная             0.000000
underground=автозаводская            0.000000
underground=академическая            0.000000
underground=алексеевская             0.000000
underground=алма-атинская            0.000000
underground=алтуфьево                0.000000
underground=андроновка               0.000000
underground=аннино                   0.000000
underground=арбатская             

In [221]:
predictions = predictions[0].map(lambda x: int(x))

In [222]:
predictions.to_csv('real_prices.csv')

In [223]:
predictions

id
158139251     42082
165841067     57054
165361751     97397
164117640    126205
165863255     32465
164709012     23000
164698394     91506
165858655     83643
164196724     85356
166018225     82054
164544906     83671
165503427     85219
165797607    173958
151912959     94999
159714441     94410
165798811    171780
164353346    165479
163781069     97438
163925408     85999
164022185     94315
163744750    101643
164857179     82876
164044431     82054
161331858     87232
165718690     81712
157688778     89178
161197783     84452
165648230     81780
164937883    102397
163828377    155068
              ...  
163556770    160000
152012471    160178
149297690    160547
163557128    160301
148731825    160452
163641979    166095
164013713    161123
160083530    160547
30830957     160356
163726762    160136
162940691    160246
163159099    163493
160045989    164150
164209617    163287
158773437    163013
160605953    160616
165443820    160397
162082276    160315
165721063    1657