In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [6]:
train_2016 = pd.read_csv('./../preprocessed data/df_train_2016.csv')
predict_2016 = pd.read_csv('./../preprocessed data/df_predict_2016.csv')

train_2017 = pd.read_csv('./../preprocessed data/df_train_2017.csv')
predict_2017 = pd.read_csv('./../preprocessed data/df_predict_2017.csv')

sample = pd.read_csv("./../data/sample_submission.csv")

In [14]:
def outliner_drop(df):
    Q1 = df['logerror'].quantile(0.25)
    Q3 = df['logerror'].quantile(0.75)
    IQR = Q3-Q1
    lower_lim = Q1 - 1.5*IQR
    upper_lim = Q3 + 1.5*IQR
    df=df[df.logerror >= lower_lim ]
    df=df[df.logerror <= upper_lim ]
    return df

In [7]:
# Fraction of original dataset given to any individual tree
max_samples = [0.1, 0.2]
# Number of features to consider at every split
max_features = ['auto', 0.4, 0.6, 0.8, 0.9]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(50, 100, num = 4)]
# max_depth.append(None)
# Minimum number of samples required at each leaf node
min_samples_leaf = [16, 32, 48]

# Create the tuning grid
param_grid = {
    'max_samples': max_samples,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_leaf': min_samples_leaf,
}

In [15]:
train_16 = train_2016.copy()
train_16 = outliner_drop(train_16)
y_16 = train_16['logerror']
train_16 = train_16.drop(['logerror'],axis=1)

predict_16 = predict_2016.drop(['parcelid'],axis=1)

In [17]:
rf = RandomForestRegressor()

# Grid search of parameters, using 3 fold cross validation
rf_grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the random search model
rf_grid.fit(train_16, y_16)

best_rf = rf_grid.best_estimator_
best_rf

Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   36.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  7.9min finished


RandomForestRegressor(max_depth=50, max_features=0.9, max_samples=0.2,
                      min_samples_leaf=16)

In [18]:
best_params = {
    'n_estimators': 400,
    'max_samples': 0.2,
    'max_depth': 50,
    'max_features': 0.9,
    'min_samples_leaf': 16,    
    'random_state': 42, 
}

ran_forest = RandomForestRegressor(**best_params)
ran_forest.fit(train_16, y_16)
predict_y = ran_forest.predict(predict_16)

In [19]:
sample['201610'] = predict_y 
sample['201611'] = predict_y 
sample['201612'] = predict_y 

In [20]:
train_17 = train_2017.copy()
train_17 = outliner_drop(train_17)
y_17 = train_17['logerror']
train_17 = train_17.drop(['logerror'],axis=1)

predict_17 = predict_2017.drop(['parcelid'],axis=1)

In [21]:
rf = RandomForestRegressor()

# Grid search of parameters, using 3 fold cross validation
rf_grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the random search model
rf_grid.fit(train_17, y_17)

best_rf = rf_grid.best_estimator_
best_rf

Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   29.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  6.6min finished


RandomForestRegressor(max_depth=83, max_features=0.4, max_samples=0.2,
                      min_samples_leaf=16)

In [22]:
best_params = {
    'n_estimators': 400,
    'max_samples': 0.2,
    'max_depth': 83,
    'max_features': 0.4,
    'min_samples_leaf': 16,    
    'random_state': 42, 
}

ran_forest = RandomForestRegressor(**best_params)
ran_forest.fit(train_17, y_17)
predict_y = ran_forest.predict(predict_17)

In [23]:
sample['201710'] = predict_y 
sample['201711'] = predict_y 
sample['201712'] = predict_y 

In [25]:
sample.to_csv('./../submission/rdf.csv', index=False, float_format='%.6f')

In [24]:
sample

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0.008671,0.008671,0.008671,0.013351,0.013351,0.013351
1,10759547,0.009578,0.009578,0.009578,0.014124,0.014124,0.014124
2,10843547,0.028944,0.028944,0.028944,0.022783,0.022783,0.022783
3,10859147,0.020052,0.020052,0.020052,0.018031,0.018031,0.018031
4,10879947,0.005760,0.005760,0.005760,0.005913,0.005913,0.005913
...,...,...,...,...,...,...,...
2985212,168176230,-0.004642,-0.004642,-0.004642,0.009699,0.009699,0.009699
2985213,14273630,-0.004642,-0.004642,-0.004642,0.010581,0.010581,0.010581
2985214,168040630,-0.004642,-0.004642,-0.004642,0.001412,0.001412,0.001412
2985215,168040830,-0.004642,-0.004642,-0.004642,0.009686,0.009686,0.009686
