In [19]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [20]:
df = pd.read_csv('../features_extractor/flats_data.csv')

X = df.drop(['price'], axis=1)
# Y = df['price'].map(lambda x: np.log(x))
Y = df['price']
X.set_index(X['_id'], inplace=True)
X = X.drop('_id', axis=1)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

## Random Forest

In [22]:
records_count = Y.count()
kf = KFold(n=records_count, n_folds=7, shuffle=True, random_state=42)

estims = {}

for k in range(1, 75, 5):
    clf = RandomForestRegressor(n_estimators=k, random_state=0)
    quality = cross_val_score(clf, X, Y, scoring='r2', cv=kf).mean()
    print(k, quality)
    estims[k] = quality

best_estim = sorted(estims.items(), key=lambda x: -x[1])
print(best_estim[0])

1 0.609883448766
6 0.778076861379
11 0.79419967257
16 0.800504961011
21 0.80300883213
26 0.804648296269
31 0.806125273078
36 0.807225654287
41 0.808010645788
46 0.808224255229
51 0.808888959213
56 0.809275362093
61 0.809603156109
66 0.809695553473
71 0.809592349726
(66, 0.8096955534732031)


In [None]:
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# forest_mse = mean_squared_error(y_pred, y_test)
# forest_rmse = np.sqrt(forest_mse)
# print('Random Forest RMSE: %.4f' % forest_rmse)

In [30]:
clf = RandomForestRegressor(random_state=42)
clf.fit(X, Y)
joblib.dump(clf, 'model_random_forest.pkl') 
print('Random Forest R squared": %.4f' % clf.score(X, Y))

Random Forest R squared": 0.9647


In [24]:
features = X.columns.values
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

indices = indices[:10]
for i in range(0, len(indices)):
    print(str(features[indices[i]]) + " " + str(importances[indices[i]]))

area 0.588482397332
house_type=панельный 0.0663301281207
total_floor 0.0340522479956
construction_year 0.0315176963583
living_area 0.0279902120439
underground_time 0.0251464317988
repair=косметический 0.0234735251906
kitchen_area 0.0220028651872
curr_floor 0.0189931743875
repair=дизайнерский 0.0153549429103


In [None]:
# records_count = Y.count()
# kf = KFold(n=records_count, n_folds=7, shuffle=True, random_state=42)

# def determine_forest_quality(trees_count):
#     clf = RandomForestRegressor(n_estimators = trees_count, random_state=1)
#     return cross_val_score(clf, X, Y, scoring='r2', cv=kf).mean()

# for k in range(1,75,5):
#     quality = determine_forest_quality(k)
#     print (k, quality)

In [None]:
# clf = RandomForestRegressor(n_estimators = 71, random_state=1)
# clf.fit(X, Y)

In [None]:
# joblib.dump(clf, 'model_random_forest.pkl') 

In [None]:
# features = X.columns.values
# importances = clf.feature_importances_
# indices = np.argsort(importances)[::-1]

# num_to_plot = 10
# feature_indices = [ind+1 for ind in indices[:num_to_plot]]

# for i in range(num_to_plot):
#     print (i, features[feature_indices[i]], round(importances[indices[i]],2))

In [6]:
flat = pd.read_csv('/home/alena/Documents/underpriced/one_flat.csv')

In [7]:
flat['underground_name=охотный ряд']

0    1.0
Name: underground_name=охотный ряд, dtype: float64

In [8]:
flat = flat.drop('Unnamed: 0', axis=1)

In [9]:
clf.predict(flat)

array([ 53100.])

In [10]:
round(clf.predict(flat).tolist()[0])

53100

In [16]:
flat

Unnamed: 0,area,combined_bathroom_count,construction_year,curr_floor,has_balcony,has_loggia,house_type=блочный,house_type=деревянный,house_type=кирпично-монолитный,house_type=кирпичный,...,underground_name=шипиловская,underground_name=шоссе энтузиастов,underground_name=щелковская,underground_name=щукинская,underground_name=электрозаводская,underground_name=юго-западная,underground_name=южная,underground_name=ясенево,underground_time,underground_way
0,50.0,1,1976,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,12,-1


In [18]:
flat['underground_time']

0    12
Name: underground_time, dtype: int64