In [2]:
# Import libraries

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

# Import random forest Regressor
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Load the dataset

# We will use the California housing data set that we used when learning about decision trees.
df = pd.read_csv('C:\\Users\\User\\github_projects\\Machine_Learning_with_Python\\datasets\\cali_housing.csv')
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [4]:
# Arrange data into a features matrix and target vector and train-test-split
# Arrange Data into Features Matrix and Target Vector
y = df['MedHouseVal']
X = df.drop(columns = 'MedHouseVal')

# Split the data for validation
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [5]:
# Step 1: Import the model you want to use
# In sklearn, all machine learning models are implemented as Python classes
# This was already imported earlier so commenting it out
#from sklearn.ensemble import RandomForestRegressor

# Step 2: Make an instance of the Model
# This is a place where we can tune the hyperparameters of a model. At the moment, let's use the defaults parameters.  
# You can see that this will be Max_depth = None and n_estimators = 100.  
# These are just a few of the important parameters to explore!
rf = RandomForestRegressor(random_state = 42)
# Looking at some hyperparameters that seem tunable
rf.get_params()


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [6]:
# Step 3: Training the model on the data, storing the information learned from the data
# Model is learning the relationship between X and y. Note that this may take some time to run!
rf.fit(X_train, y_train)

# Step 4: Predict the values for y (remember that this step just allows you to see the actual predictions, but is not necessary for evaluating or tuning your model)
rf.predict(X_test)

# Step 5: Evaluate your model performance
rf_train_score = rf.score(X_train, y_train)
rf_test_score = rf.score(X_test, y_test)
print(rf_train_score)
print(rf_test_score)


0.9726171894816914
0.8078595561901133


In [7]:
# Step 6: Tune your model
# Tuning the max_depth
# As we did previously, we can tune the max_depth we allow for each tree in our random forest. 
# Let's set the max_depth to 9 and evaluate our results.

rf_9 = RandomForestRegressor(max_depth = 9, random_state = 42)
rf_9.fit(X_train, y_train)
rf_9_train_score = rf_9.score(X_train, y_train)
rf_9_test_score = rf_9.score(X_test, y_test)
print(rf_9_train_score)
print(rf_9_test_score)


0.8422908368094096
0.7659811625575986


In [8]:
# Notice that while our results with a max_depth of 9 were optimal for the single tree, this is NOT the case for the random forest! (Our test score decreased)
# To see what the depth of each tree in your random forest was when the max_depth was unlimited, you can use the following code:

[estimator.get_depth() for estimator in rf.estimators_]

# To save space, the output is not shown here,  however notice that the depth of each tree varies.  
# You can try different values for max_depth or other parameters to see if you can make improvements on the default model.


[34,
 32,
 35,
 32,
 32,
 31,
 30,
 33,
 33,
 33,
 32,
 29,
 32,
 34,
 32,
 32,
 30,
 31,
 31,
 32,
 34,
 30,
 31,
 33,
 32,
 32,
 32,
 35,
 30,
 37,
 31,
 31,
 34,
 32,
 29,
 33,
 33,
 31,
 32,
 29,
 34,
 36,
 34,
 31,
 32,
 31,
 32,
 33,
 33,
 33,
 32,
 35,
 36,
 32,
 37,
 32,
 32,
 35,
 31,
 33,
 34,
 32,
 31,
 32,
 33,
 31,
 34,
 30,
 30,
 32,
 31,
 33,
 32,
 33,
 33,
 30,
 34,
 32,
 42,
 36,
 31,
 33,
 30,
 32,
 31,
 37,
 37,
 34,
 33,
 31,
 33,
 33,
 30,
 33,
 32,
 34,
 31,
 31,
 33,
 34]

In [9]:
# Tuning n_estimators (# of decision trees)
# Another tuning parameter is n_estimators, which represents the number of trees that should be grown. 
# The code below can take some time to run. The reason is that when you train an ensemble you are training more than one model (in this case tree).  
# Let's see if we can improve our score by doubling the amount of trees from 100 to 200.  

# Try 200 trees
rf_200 = RandomForestRegressor(n_estimators = 200, random_state = 42)

# Fit the model
rf_200.fit(X_train, y_train)

# Obtain the scores
rf_200_train_score = rf_200.score(X_train, y_train)
rf_200_test_score = rf_200.score(X_test, y_test)
print(rf_200_train_score)
print(rf_200_test_score)


0.9736513985866985
0.8094548562033245


In [None]:
# Notice that, in this case,  we hardly had any noticeable improvement with 200 trees compared to the default 100 trees.  
# This will depend on your data.  