In [1]:
# Ensemble Models

# Ensemble Models
# Ensemble models are machine learning methods that combine several base models to produce one optimal predictive model.
# They combine decisions from multiple models to improve the overall performance.

# An ensemble can be composed of the same algorithm used more than once.
# For example, Random Forest is an ensemble of decision trees.

# Types of Ensemble Methods
# Max Voting: Picks the result based on the majority votes from different models. Generally used in classification problems.
# Averaging: It runs multiple models and then averages the predictions. It can be used in both classification (calculate average of the probabilities) and regression problems.
# Weighted Averaging: It uses multiple models to make predictions by allocating weights to the different models' predictions and averaging them out.
# Bagging: It takes results from multiple models and combines them to get a final result. Decision trees are used frequently with bagging. The main idea of bagging is to create subsets of the original data, run different models on the subsets in parallel, and aggregate the results.
# Boosting: It takes results from multiple models and combines them to get a final result. It creates subsets of the original data and runs different models on the subsets; it runs the models sequentially, so each model can focus on the errors of the previous ones.

In [2]:
# Random Forest

# Random Forest is an ensemble model which follows the bagging method.
# This model uses decision trees to form ensembles.
# This approach is useful for both classification and regression problems.

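# How do Random Forests work?
# When predicting a new value for a target feature, each tree uses either regression or classification to come up with a value that serves as a "vote".
# For regression, the forest takes an average of all the votes from all the trees in the ensemble; for classification, the votes are combined into a single class (scikit-learn averages the trees' class probabilities and picks the most probable class).
# This combined result is the predicted value of the target feature for the observation in question.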

In [3]:
# Processes in a Random Forest
# 1) Create a random subset from the original data.
# 2) Randomly select a set of features at each node in the decision tree.
# 3) Decide the best split.
# 4) For each subset of data, create a separate model (a "base learner").
# 5) Compute the final prediction by averaging the predictions from all the individual models.

In [4]:
# Advantages of Random Forest
# Easy to understand
# Useful for data exploration
# Reduced data cleaning (scaling not required)
# Handle multiple data types
# Highly flexible and gives a good accuracy
# Works well on large datasets
# Overfitting is reduced compared with a single decision tree (due to averaging)

# Disadvantages of Random Forest
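# Overfitting (still possible, e.g. on noisy data or with very deep trees)
# Not ideal for continuous targets (regression predictions cannot extrapolate beyond the range seen in training)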
# Does not work well with sparse datasets
# Computationally expensive
# Limited interpretability (the ensemble as a whole is much harder to inspect than a single decision tree)

In [5]:
# Ensemble methods with Random Forest

import numpy as np
import pandas as pd

import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [7]:
# Load the Iris dataset through sklearn.datasets.
iris = datasets.load_iris()

# Feature frame: columns are labelled with iris.feature_names.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Target as a single-column frame. The column is named 'labels' at creation
# instead of being left with the default integer column name 0.
y = pd.DataFrame(iris.target, columns=['labels'])

In [9]:
# Name the single target column, then preview the first five rows of the
# features (printed as text) and of the target (shown as the cell's output).
y.columns = ["labels"]
print(df.head(5))
y.head(5)

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


Unnamed: 0,labels
0,0
1,0
2,0
3,0
4,0


In [10]:
# Per-column flag: True if the feature column contains at least one missing value.
# `.any()` already returns booleans, so no `== True` comparison is needed.
df.isnull().any()

sepal length (cm)    False
sepal width (cm)     False
petal length (cm)    False
petal width (cm)     False
dtype: bool

In [11]:
# Count how many samples carry each class label.
label_counts = y["labels"].value_counts()
print(label_counts)

0    50
1    50
2    50
Name: labels, dtype: int64


In [13]:
# Split features and target into training and test sets: 20% of the rows are
# held out for evaluation, and the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified (the recorded report shows test
# support of 7/11/12 per class) -- consider `stratify=y` if balance matters.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=17,
)

In [14]:
# Flatten the single-column target frame into a 1-D array of labels so that
# fit() is not handed a column vector.
y_train_array = y_train.values.ravel()

# Random forest of 200 trees; the fixed random_state makes training repeatable.
classifier = RandomForestClassifier(n_estimators=200, random_state=0)
classifier.fit(X_train, y_train_array)

# Predicted class label for every row of the held-out test features.
y_pred = classifier.predict(X_test)

In [16]:
# Evaluate the model on the test data: precision, recall, F1 and support
# per class, comparing the true test labels with the predictions.
report = metrics.classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.92      1.00      0.96        11
           2       1.00      0.92      0.96        12

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



In [17]:
# Flatten the true test labels to a 1-D array so they print on one line and
# can be compared position by position with the predictions.
y_test_array = y_test.values.ravel()
print(y_test_array)

[0 1 2 1 2 2 1 2 1 2 2 0 1 0 2 0 0 2 2 2 2 0 2 1 1 1 1 1 0 1]


In [18]:
# Predicted class labels for the test rows (output of classifier.predict on X_test),
# printed for a side-by-side comparison with the true test labels shown above.
print(y_pred)

[0 1 2 1 2 2 1 2 1 2 2 0 1 0 2 0 0 2 2 2 1 0 2 1 1 1 1 1 0 1]
