In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error



In [2]:
df = pd.read_csv('../features_extractor/flats_data.csv')

X = df.drop(['price'], axis=1)
Y = df['price'].map(lambda x: np.log(x))
X.set_index(X['_id'], inplace=True)
X = X.drop('_id', axis=1)

In [3]:
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

## Random Forest

In [4]:
records_count = Y.count()
kf = KFold(n=records_count, n_folds=7, shuffle=True, random_state=42)

estims = {}

for k in range(1, 100, 1):
    clf = RandomForestRegressor(n_estimators=k, random_state=0)
    quality = cross_val_score(clf, X, Y, scoring='r2', cv=kf).mean()
    print(k)
    estims[k] = quality

best_estim = sorted(estims.items(), key=lambda x: -x[1])
print(best_estim[0])

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
(97, 0.84866218592625575)


In [5]:
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# forest_mse = mean_squared_error(y_pred, y_test)
# forest_rmse = np.sqrt(forest_mse)
# print('Random Forest RMSE: %.4f' % forest_rmse)

In [9]:
clf = RandomForestRegressor(n_estimators=11, random_state=42)
clf.fit(X, Y)
joblib.dump(clf, 'model_random_forest.pkl') 

['model_random_forest.pkl']

In [10]:
features = X.columns.values
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

indices = indices[:10]
for i in range(0, len(indices)):
    print(str(features[indices[i]]) + " " + str(importances[indices[i]]))

area 0.642361373296
house_type=панельный 0.0529477046736
repair=косметический 0.0466349488262
underground_time 0.0261422928911
living_area 0.024465243478
construction_year 0.0208864312481
underground_way 0.0205971023672
total_floor 0.0198949248184
kitchen_area 0.0159243411754
curr_floor 0.0147814443935


In [8]:
def determine_forest_quality(trees_count):
    clf = RandomForestRegressor(n_estimators = trees_count, random_state=1)
    return cross_val_score(clf, X, Y, scoring='r2', cv=kf).mean()

for k in range(1,75,5):
    quality = determine_forest_quality(k)
    print (k, quality)

1 0.720604407543
6 0.830368044059
11 0.838535690564
16 0.842318423207
21 0.844704669274
26 0.845012558499
31 0.846019658097
36 0.846022954838
41 0.846517775097
46 0.847194903361
51 0.847764569028
56 0.848022025893
61 0.847916731436
66 0.848003189495
71 0.848171344753


In [12]:
features = X.columns.values
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

num_to_plot = 10
feature_indices = [ind+1 for ind in indices[:num_to_plot]]

for i in range(num_to_plot):
    print (i, features[feature_indices[i]], round(importances[indices[i]],2))

0 combined_bathroom_count 0.64
1 house_type=сталинский 0.05
2 repair=отсутствует 0.05
3 underground_name=авиамоторная 0.03
4 rooms 0.02
5 kitchen_area 0.02
6 underground_time 0.02
7 is_center 0.02
8 living_area 0.02
9 total_floor 0.01
