In [60]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [61]:
df = pd.read_csv('../features_extractor/flats_data.csv')

# Target: the raw `price` column.
# (Alternative kept for reference: Y = df['price'].map(lambda x: np.log(x)))
Y = df['price']

# Features: every remaining column, with `_id` moved out of the columns and
# into the index so it is not passed to the model as a predictor.
X = df.drop(['price'], axis=1).set_index('_id')

In [62]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

## Random Forest

In [63]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20. Use the `sklearn.model_selection` equivalents (the module this
# notebook already imports GridSearchCV from): the splitter no longer takes
# the sample count up front and `n_folds` is now called `n_splits`.
from sklearn.model_selection import KFold, cross_val_score

# 7 shuffled folds with a fixed seed, so every forest size is scored on the
# same partitions of the data.
kf = KFold(n_splits=7, shuffle=True, random_state=42)

# Maps each candidate number of trees to its mean cross-validated R^2.
estims = {}

for k in range(1, 102, 3):
    clf = RandomForestRegressor(n_estimators=k, random_state=0)
    quality = cross_val_score(clf, X, Y, scoring='r2', cv=kf).mean()
    print(k, quality)
    estims[k] = quality

# Candidates ordered from best to worst score; the first entry is the winner.
best_estim = sorted(estims.items(), key=lambda item: item[1], reverse=True)
print(best_estim[0])

1 0.609883448766
4 0.75964269008
7 0.782880888876
10 0.791748530452
13 0.797337565664
16 0.800504961011
19 0.801913834286
22 0.802718360377
25 0.804321315072
28 0.80557328692
31 0.806125273078
34 0.806737770683
37 0.807513359199
40 0.807897334014
43 0.808096720911
46 0.808224255229
49 0.80887406181
52 0.809054652177
55 0.809350835105
58 0.809518851927
61 0.809603156109
64 0.809482362063
67 0.809727821262
70 0.809421423462
73 0.809701506208
76 0.809925277936
79 0.809955206658
82 0.810026384769
85 0.809807247548
88 0.809989579063
91 0.81008962415
94 0.810261675023
97 0.810234650228
100 0.810304315837
(100, 0.81030431583731932)


In [None]:
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# forest_mse = mean_squared_error(y_pred, y_test)
# forest_rmse = np.sqrt(forest_mse)
# print('Random Forest RMSE: %.4f' % forest_rmse)

In [64]:
# Final model: 100 trees fitted on the full data set, then persisted to disk.
clf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X, Y)
joblib.dump(clf, 'model_random_forest.pkl')

# NOTE: this R^2 is measured on the very rows the forest was fitted on, so it
# is an in-sample figure rather than an estimate of out-of-sample quality.
# NOTE(review): the label below contains a stray double quote; it is left
# as-is so the recorded cell output still matches.
train_r2 = clf.score(X, Y)
print('Random Forest R squared": %.4f' % train_r2)

Random Forest R squared": 0.9751


In [65]:
features = X.columns.values
importances = clf.feature_importances_

# Positions of the ten most important features, strongest first.
indices = np.argsort(importances)[::-1][:10]

# One "<feature name> <importance>" line per selected feature.
for feature_idx in indices:
    print("%s %s" % (features[feature_idx], importances[feature_idx]))

area 0.588580643472
house_type=панельный 0.0666016079381
total_floor 0.0338000677503
construction_year 0.0313830618577
living_area 0.0282699016693
underground_time 0.0253086440353
repair=косметический 0.0231080768388
kitchen_area 0.0221894125457
curr_floor 0.0191888554519
repair=дизайнерский 0.0154844377289


In [None]:
# records_count = Y.count()
# kf = KFold(n=records_count, n_folds=7, shuffle=True, random_state=42)

# def determine_forest_quality(trees_count):
#     clf = RandomForestRegressor(n_estimators = trees_count, random_state=1)
#     return cross_val_score(clf, X, Y, scoring='r2', cv=kf).mean()

# for k in range(1,75,5):
#     quality = determine_forest_quality(k)
#     print (k, quality)

In [None]:
# clf = RandomForestRegressor(n_estimators = 71, random_state=1)
# clf.fit(X, Y)

In [None]:
# joblib.dump(clf, 'model_random_forest.pkl') 

In [None]:
# features = X.columns.values
# importances = clf.feature_importances_
# indices = np.argsort(importances)[::-1]

# num_to_plot = 10
# feature_indices = [ind+1 for ind in indices[:num_to_plot]]

# for i in range(num_to_plot):
#     print (i, features[feature_indices[i]], round(importances[indices[i]],2))

In [66]:
# Single-flat sample used below to try out the fitted model.
# NOTE(review): this is an absolute path on one developer's machine; the
# notebook will not run elsewhere until it is pointed at a shared location.
ONE_FLAT_CSV = '/home/alena/Documents/underpriced/one_flat.csv'
flat = pd.read_csv(ONE_FLAT_CSV)

In [67]:
# Inspect one of the one-hot underground-station columns of the sample flat.
flat.loc[:, 'underground_name=охотный ряд']

0    1.0
Name: underground_name=охотный ряд, dtype: float64

In [68]:
# Remove the `Unnamed: 0` column before predicting (presumably the index
# that was written out when the CSV was saved -- confirm against the export).
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
flat = flat.drop('Unnamed: 0', axis=1, errors='ignore')

In [69]:
# Price predicted by the fitted forest for the single sample flat.
flat_prediction = clf.predict(flat)
flat_prediction

array([ 43320.])

In [70]:
# The same prediction as a plain Python number, rounded to the nearest integer.
round(float(clf.predict(flat)[0]))

43320