# Random Forest

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from utils.transform_scale import transform_v2_scale_df, TARGET_VARIABLE_COLUMN
import torch

DATA_PATH = Path("data")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load augmented data
train_augmented = pd.read_csv(DATA_PATH / "train-augmented.csv", parse_dates=["month"])
test_augmented = pd.read_csv(DATA_PATH / "test-augmented.csv", parse_dates=["month"])

train_augmented.head()

Unnamed: 0,month,town,flat_type,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,...,mean_age_m,std_age_f,std_age_m,pri_sch_dist,pri_sch,sec_sch_dist,sec_sch,mall_dist,mrt_name,mrt_dist
0,2001-08-01,pasir ris,4 room,440,pasir ris drive 4,118.0,model a,uncategorized,1989,1.369008,...,36.16763,20.331631,19.999478,0.344087,Loyang Primary School,0.428301,Pasir Ris Crest Secondary School,1.033216,Pasir Ris,1.137522
1,2014-10-01,punggol,5 room,196B,punggol field,110.0,improved,uncategorized,2003,1.399007,...,31.967676,20.103889,19.793305,0.160852,Edgefield Primary School,0.312383,Meridian Secondary School,0.80604,Cove,0.118373
2,2020-09-01,sengkang,5 room,404A,fernvale lane,112.0,premium apartment,uncategorized,2004,1.388348,...,34.164736,20.311337,19.94782,0.184906,Fernvale Primary School,0.55838,Pei Hwa Secondary School,0.452556,Fernvale,0.481153
3,2000-10-01,clementi,3 room,375,clementi avenue 4,67.0,new generation,uncategorized,1980,1.318493,...,40.577282,21.625967,21.440329,0.304561,Pei Tong Primary School,0.619132,Clementi Town Secondary School,0.456499,Clementi,0.42332
4,2013-01-01,bukit batok,3 room,163,bukit batok street 11,73.0,model a,uncategorized,1985,1.348149,...,38.318241,20.497124,20.287059,0.233809,Princess Elizabeth Primary School,0.217911,Bukit Batok Secondary School,0.764172,Bukit Batok,0.77422


In [5]:
# See linear.ipynb for details - code copied from there
# Split the train data into train and test
X = train_augmented.drop(columns=TARGET_VARIABLE_COLUMN)
y = train_augmented[TARGET_VARIABLE_COLUMN]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Transform and scale the data
# See utils/transform_scale.py for details
X_train = transform_v2_scale_df(X_train)
X_test = transform_v2_scale_df(X_test)

In [18]:
# Convert the data to tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Construct a basic Decision Tree
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_estimators=100, random_state=0)
clf = clf.fit(X_train_tensor, y_train_tensor)

In [19]:
# Evaluate the model
with torch.no_grad():
    y_pred = clf.predict(X_test_tensor)
    print(f"Mean squared error: {mean_squared_error(y_test_tensor, y_pred)}")
    print(f"Mean absolute error: {mean_absolute_error(y_test_tensor, y_pred)}")
    print(f"R2 score: {r2_score(y_test_tensor, y_pred)}")

Mean squared error: 326323831.3390358
Mean absolute error: 12750.75682325681
R2 score: 0.9805186236819932


## Hyperparameter Tuning

In [24]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'n_estimators':[100, 150, 200, 250],
    'max_features': ['sqrt', 'log2', 'auto'],
    'min_samples_split': [2, 3, 5]
}
model = RandomForestRegressor()
clf = GridSearchCV(model, parameters, verbose=3)
model = clf.fit(X_train_tensor, y_train_tensor)

# Store the parameters of the best model
best_params = model.best_params_

# Predict class labels of test data on the model with the best found parameters
y_pred = model.predict(X_test_tensor)

print(f"Mean squared error: {mean_squared_error(y_test_tensor, y_pred)}")
print(f"Mean absolute error: {mean_absolute_error(y_test_tensor, y_pred)}")
print(f"R2 score: {r2_score(y_test_tensor, y_pred)}")

print('Best Random Forest regressor: ', best_params)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END max_features=sqrt, min_samples_split=2, n_estimators=100;, score=0.979 total time= 1.6min
[CV 2/5] END max_features=sqrt, min_samples_split=2, n_estimators=100;, score=0.979 total time= 1.7min
[CV 3/5] END max_features=sqrt, min_samples_split=2, n_estimators=100;, score=0.979 total time= 1.6min
[CV 4/5] END max_features=sqrt, min_samples_split=2, n_estimators=100;, score=0.979 total time= 1.7min
[CV 5/5] END max_features=sqrt, min_samples_split=2, n_estimators=100;, score=0.979 total time= 1.8min
[CV 1/5] END max_features=sqrt, min_samples_split=2, n_estimators=150;, score=0.979 total time= 3.7min
[CV 2/5] END max_features=sqrt, min_samples_split=2, n_estimators=150;, score=0.979 total time= 2.6min
[CV 3/5] END max_features=sqrt, min_samples_split=2, n_estimators=150;, score=0.979 total time= 2.6min
[CV 4/5] END max_features=sqrt, min_samples_split=2, n_estimators=150;, score=0.979 total time= 2.4min
[CV 5/5] EN

  warn(


[CV 1/5] END max_features=auto, min_samples_split=2, n_estimators=100;, score=0.981 total time= 6.6min


  warn(


[CV 2/5] END max_features=auto, min_samples_split=2, n_estimators=100;, score=0.981 total time= 6.6min


  warn(


[CV 3/5] END max_features=auto, min_samples_split=2, n_estimators=100;, score=0.981 total time= 6.6min


  warn(


[CV 4/5] END max_features=auto, min_samples_split=2, n_estimators=100;, score=0.981 total time= 6.5min


  warn(


[CV 5/5] END max_features=auto, min_samples_split=2, n_estimators=100;, score=0.981 total time= 6.6min


  warn(


[CV 1/5] END max_features=auto, min_samples_split=2, n_estimators=150;, score=0.981 total time= 9.9min


  warn(


[CV 2/5] END max_features=auto, min_samples_split=2, n_estimators=150;, score=0.981 total time= 9.8min


  warn(


[CV 3/5] END max_features=auto, min_samples_split=2, n_estimators=150;, score=0.981 total time=10.3min


  warn(


[CV 4/5] END max_features=auto, min_samples_split=2, n_estimators=150;, score=0.981 total time=10.8min


  warn(


[CV 5/5] END max_features=auto, min_samples_split=2, n_estimators=150;, score=0.981 total time=10.3min


  warn(


[CV 1/5] END max_features=auto, min_samples_split=2, n_estimators=200;, score=0.981 total time=13.9min


  warn(


[CV 2/5] END max_features=auto, min_samples_split=2, n_estimators=200;, score=0.981 total time=12.9min


  warn(


[CV 3/5] END max_features=auto, min_samples_split=2, n_estimators=200;, score=0.981 total time=13.2min


  warn(


[CV 4/5] END max_features=auto, min_samples_split=2, n_estimators=200;, score=0.981 total time=13.1min


  warn(


[CV 5/5] END max_features=auto, min_samples_split=2, n_estimators=200;, score=0.981 total time=13.1min


  warn(


[CV 1/5] END max_features=auto, min_samples_split=2, n_estimators=250;, score=0.981 total time=16.5min


  warn(


[CV 2/5] END max_features=auto, min_samples_split=2, n_estimators=250;, score=0.981 total time=16.4min


  warn(


[CV 3/5] END max_features=auto, min_samples_split=2, n_estimators=250;, score=0.981 total time=16.5min


  warn(


[CV 4/5] END max_features=auto, min_samples_split=2, n_estimators=250;, score=0.981 total time=16.7min


  warn(


[CV 5/5] END max_features=auto, min_samples_split=2, n_estimators=250;, score=0.981 total time=16.6min


  warn(


[CV 1/5] END max_features=auto, min_samples_split=3, n_estimators=100;, score=0.981 total time= 6.4min


  warn(


[CV 2/5] END max_features=auto, min_samples_split=3, n_estimators=100;, score=0.981 total time= 6.4min


  warn(


[CV 3/5] END max_features=auto, min_samples_split=3, n_estimators=100;, score=0.981 total time= 6.4min


  warn(


[CV 4/5] END max_features=auto, min_samples_split=3, n_estimators=100;, score=0.981 total time= 6.4min


  warn(


[CV 5/5] END max_features=auto, min_samples_split=3, n_estimators=100;, score=0.981 total time= 6.4min


  warn(


[CV 1/5] END max_features=auto, min_samples_split=3, n_estimators=150;, score=0.981 total time= 9.7min


  warn(


[CV 2/5] END max_features=auto, min_samples_split=3, n_estimators=150;, score=0.981 total time= 9.7min


  warn(


[CV 3/5] END max_features=auto, min_samples_split=3, n_estimators=150;, score=0.981 total time= 9.7min


  warn(


[CV 4/5] END max_features=auto, min_samples_split=3, n_estimators=150;, score=0.981 total time= 9.6min


  warn(


[CV 5/5] END max_features=auto, min_samples_split=3, n_estimators=150;, score=0.981 total time= 9.8min


  warn(


[CV 1/5] END max_features=auto, min_samples_split=3, n_estimators=200;, score=0.981 total time=13.3min


  warn(


[CV 2/5] END max_features=auto, min_samples_split=3, n_estimators=200;, score=0.981 total time=13.1min


  warn(


[CV 3/5] END max_features=auto, min_samples_split=3, n_estimators=200;, score=0.981 total time=13.1min


  warn(


[CV 4/5] END max_features=auto, min_samples_split=3, n_estimators=200;, score=0.981 total time=13.1min


  warn(


[CV 5/5] END max_features=auto, min_samples_split=3, n_estimators=200;, score=0.981 total time=13.2min


  warn(


[CV 1/5] END max_features=auto, min_samples_split=3, n_estimators=250;, score=0.981 total time=20.1min


  warn(


[CV 2/5] END max_features=auto, min_samples_split=3, n_estimators=250;, score=0.981 total time=22.0min


  warn(


[CV 3/5] END max_features=auto, min_samples_split=3, n_estimators=250;, score=0.981 total time=26.8min


  warn(


[CV 4/5] END max_features=auto, min_samples_split=3, n_estimators=250;, score=0.981 total time=29.4min


  warn(


KeyboardInterrupt: 

In [25]:
X = transform_v2_scale_df(test_augmented)
y_pred = model.predict(X)
df = pd.DataFrame(y_pred, columns=['Predicted'])
# df['Id'] = df.index

import os  
os.makedirs('data', exist_ok=True)  
df.to_csv('data/rf_result.csv', index=True, header=True) 

NotFittedError: This RandomForestRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
print(f"Mean squared error: {mean_squared_error(y_test_tensor, y_pred)}")
print(f"Mean absolute error: {mean_absolute_error(y_test_tensor, y_pred)}")
print(f"R2 score: {r2_score(y_test_tensor, y_pred)}")