In [None]:
#Data Processing
import pandas as pd
import numpy as np

# Modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image

We will use the Possum dataset to predict a possum’s sex based on its characteristics, such as belly size, skull width, etc. Read the data in, and inspect five rows from it at random with:

In [None]:
# Load the possum measurements from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="possum.csv")
# Preview five rows; the fixed seed makes the same rows appear on every run
df.sample(n=5, random_state=44)

Let’s do a quick clean up and remove any rows with missing data with:


In [None]:
# Discard every row that has a missing value in any column
df = df.dropna(axis="index", how="any")

Now let’s remove the unnecessary columns, then store the features and the label data in separate variables:

In [None]:
# Features: everything except the columns treated as unnecessary and the label itself
X = df.drop(columns=["case", "site", "Pop", "sex"])
# Label: the column the classifier is trained to predict
y = df.loc[:, "sex"]

We will split our data into training and testing sets. Then we’ll create our first random forest:

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)

# max_features="sqrt" is the explicit spelling of what "auto" meant for classifiers.
# "auto" was deprecated in scikit-learn 1.1 and removed in 1.3, where it is rejected
# as an invalid parameter, so the explicit value keeps this cell runnable.
rf_model = RandomForestClassifier(n_estimators=50, max_features="sqrt", random_state=44)
rf_model.fit(X_train, y_train)

Here are the parameters of the RandomForestClassifier model used above:
 - n_estimators determines the number of decision trees that make up our random forest. 
 - max_features defines the number of features that each decision tree takes into consideration at each split.

In [None]:
# Predict a label for every row of the held-out test features
predictions = rf_model.predict(X_test)
# Bare last expression so the notebook displays the predicted labels
predictions

In [None]:
# Score the test-set predictions against the true labels
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)

If you’d like to know how important each feature is in predicting a possum’s sex, that’s also possible with feature_importances_:

In [None]:
# Report each feature's importance as a percentage.
# X_train keeps the column order of X, so pairing X.columns with
# feature_importances_ labels every score with its feature name.
importances = rf_model.feature_importances_
columns = X.columns
# Iterate over the pairs directly instead of maintaining a manual index counter
for name, importance in zip(columns, importances):
    print(f" The importance of feature '{name}' is {round(importance*100,2)}%.")

The random forest model is ready, so feel free to feed any new data to it to make a new prediction:

In [None]:
# One new possum, with values listed in the same order as the columns of X.
# The model was fitted on a DataFrame, so the new sample is wrapped in a DataFrame
# with the same column names; passing a bare nested list makes scikit-learn warn
# that X lacks valid feature names and hides which value belongs to which feature.
new_possum = pd.DataFrame(
    [[7.0, 83.2, 54.3, 81.0, 37.0, 70.0, 46.3, 14.7, 25.0, 32.0]],
    columns=X.columns,
)
rf_model.predict(new_possum)












Now we are going to discuss Random Forest Regression.
First, import the random forest regressor and the error metric from scikit-learn.


In [None]:
from sklearn.ensemble import RandomForestRegressor # for building the model
from sklearn.metrics import mean_squared_error # for calculating the RMSE

In [None]:
# Load the petrol consumption data from the CSV file in the working directory
dataset = pd.read_csv(filepath_or_buffer="petrol_consumption.csv")
# Preview five rows; the fixed seed makes the same rows appear on every run
dataset.sample(n=5, random_state=44)

We will separate the independent variables from the dependent variable. Our goal is to model the relationship between the features (Petrol_tax, Average_income, etc.) and the target variable (Petrol_Consumption) in the dataset.

In [None]:
# Features: every column except the target
x = dataset.drop(columns='Petrol_Consumption')
# Target: the column the regressor is trained to predict
y = dataset.loc[:, 'Petrol_Consumption']

In [None]:
# Keep 20% of the rows as the test set (80/20 split); random_state pins the shuffle
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=28,
)

In [None]:
# Build a random forest regressor of 10 trees (seeded for reproducibility)
# and fit it to the training data in a single chained call; fit() returns the estimator
model = RandomForestRegressor(random_state=0, n_estimators=10).fit(x_train, y_train)
# Bare last expression so the notebook still displays the fitted estimator
model

In [None]:
# Predicting the target values of the test set
y_pred = model.predict(x_test)
# Bare last expression so the notebook displays the predicted values
y_pred

In [None]:
# RMSE (Root Mean Square Error): square root of the mean squared error,
# rounded to three decimals by formatting and converted back to a float
mse = mean_squared_error(y_test, y_pred)
rmse = float(f"{np.sqrt(mse):.3f}")
print("\nRMSE:\n",rmse)