In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib

In [7]:
df = pd.read_csv('flats_features.csv')

In [8]:
X = df.drop({'price'}, axis=1)
Y = df['price']
X.set_index(X['id'], inplace=True)
X = X.drop('id', axis=1)

In [9]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3643 entries, 158139251 to 160128723
Columns: 228 entries, area to repair=отсутствует
dtypes: float64(220), int64(8)
memory usage: 6.4 MB


In [10]:
records_count = Y.count()
kf = KFold(n=records_count, n_folds=7, shuffle=True, random_state=42)

estims = {}

for k in range(1, 75, 2):
    clf = RandomForestRegressor(n_estimators=k, random_state=0)
    quality = cross_val_score(clf, X, Y, scoring='r2', cv=kf).mean()
    estims[k] = quality

In [11]:
best_estim = sorted(estims.items(), key=lambda x: -x[1])
print(best_estim)

[(73, 0.85981233300830007), (71, 0.85971401498540168), (65, 0.85957538885450169), (69, 0.85949000373252726), (67, 0.85945514567418457), (63, 0.8594070643894115), (61, 0.85937666352516462), (57, 0.85909868036017223), (59, 0.85907693320814793), (55, 0.85870609962024458), (33, 0.85867173456543389), (49, 0.85864739046261551), (43, 0.85863059138874498), (45, 0.8585754544517844), (53, 0.85856576039654386), (31, 0.85853672931703495), (41, 0.8585030769388784), (39, 0.8584954343823431), (35, 0.85841129402271255), (47, 0.85839382409321296), (37, 0.85838742089025455), (51, 0.85838402015489945), (29, 0.85836288810074335), (27, 0.85748921420716762), (25, 0.85705040509020514), (23, 0.85678731534096697), (21, 0.8557766702889299), (19, 0.85499103942506582), (17, 0.85372827590484524), (15, 0.85360579631792), (13, 0.85189587521328947), (11, 0.85112864871905169), (9, 0.8483353330916269), (7, 0.84639906394960385), (5, 0.84075166099471488), (3, 0.82123185376103813), (1, 0.74131511429303598)]


In [13]:
clf = RandomForestRegressor(n_estimators=best_estim[0][0], random_state=42)
clf.fit(X, Y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=73, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [14]:
joblib.dump(clf, 'model_random_forest.pkl') 

['model_random_forest.pkl']

In [15]:
features = X.columns.values
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

indices = indices[:10]
for i in range(0, len(indices)):
    print(str(features[indices[i]]) + " " + str(importances[indices[i]]))

area 0.664398367176
is_center 0.0822684587285
total_floor 0.0323887885789
kitchen_area 0.0237050861557
living_area 0.0213071232131
curr_floor 0.0132585678585
construction_year 0.0130874679004
repair=косметический 0.00897010311235
repair=дизайнерский 0.00790127725865
house_type=не указано 0.00785735387028


In [16]:
predictions = pd.DataFrame(clf.predict(X))[0]
X = X.reset_index()

In [17]:
res_info = pd.DataFrame(columns=[u'Ошибка,%',u'Ошибка,$',u'Цена м.кв.'])
for i in Y.index:
    error = (Y[i] - predictions[i])
    rel_error = error/predictions[i]*100
    res_info.loc[i] = pd.Series({
            u'Ошибка,%':round(rel_error,1),
            u'Ошибка,$':int(error),
            u'Цена м.кв.':int(Y[i] / X['area'][i])
    })

In [18]:
predictions = pd.DataFrame(predictions)

In [19]:
predictions = predictions.set_index(X['id'])

In [20]:
res_info = res_info.set_index(X['id'])

In [21]:
res_info

Unnamed: 0_level_0,"Ошибка,%","Ошибка,$",Цена м.кв.
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
158139251,0.5,150.0,1555.0
165841067,-15.2,-10753.0,1714.0
165361751,0.1,68.0,1575.0
164117640,5.9,7808.0,1428.0
165863255,-31.0,-8082.0,473.0
164709012,-8.7,-1726.0,600.0
164698394,-5.5,-4945.0,696.0
165858655,1.7,1301.0,1454.0
164196724,-10.1,-9027.0,1000.0
166018225,2.9,2216.0,1818.0


In [22]:
res_info.sort_values(by=u'Ошибка,%')[:5]

Unnamed: 0_level_0,"Ошибка,%","Ошибка,$",Цена м.кв.
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
165952783,-49.4,-13684.0,424.0
165979243,-40.0,-11342.0,425.0
165915719,-39.4,-37767.0,504.0
165842779,-39.3,-6479.0,333.0
165323818,-37.6,-9041.0,375.0


In [23]:
res_info.sort_values(by=u'Ошибка,%', ascending=False)[:5]

Unnamed: 0_level_0,"Ошибка,%","Ошибка,$",Цена м.кв.
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
163039230,32.1,41342.0,1666.0
159410103,28.3,37465.0,3695.0
165715483,27.9,37068.0,2125.0
163091975,27.0,34054.0,2000.0
163557797,25.1,35530.0,2461.0


In [24]:
X.set_index(X['id'], inplace=True)
X = X.drop('id', axis=1)

In [26]:
predictions = predictions[0].map(lambda x: int(x))

In [27]:
predictions.to_csv('real_prices.csv')

In [28]:
predictions

id
158139251     27849
165841067     70753
165361751    114931
164117640    132191
165863255     26082
164709012     19726
164698394     89945
165858655     78698
164196724     89027
166018225     77783
164544906     83890
165503427     84246
165797607    183698
151912959     99520
159714441    100205
165798811    176575
164353346    184301
163781069    103767
163925408    121205
164022185     93191
163744750    102191
164857179     73808
164044431     83835
161331858     86383
165718690     80342
157688778     88547
161197783     85041
165648230     83671
164937883    100000
163828377    161849
              ...  
163556770    165753
152012471    161118
149297690    149383
163557128    169474
148731825    167931
163641979    166027
164013713    164794
160083530    157684
30830957     162945
163726762    163424
162940691    152369
163159099    149465
160045989    143273
164209617    166506
158773437    165191
160605953    136424
165443820    143109
162082276    162438
165721063    1649