In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

        
# https://www.kaggle.com/hoonkaiwei/ensemble-learning-basics-with-cars-dataset
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### <span style="color:red;">This is only a draft, Final to be exported in a folder with dataset</span>

# Week 4 Co-Learning: Ensemble Methods with Cars Dataset 

___

So after covering the theory behind ensemble methods, let us dive into the coding segments of today's co-learning 

___ 

We'll first have a quick recap on how to implement decision trees in Python. After that we'll move on to some data cleaning and preparation, followed by implementations of various ensemble methods and tweaking of their parameters. 

___

Now we'll first import all the necessary modules and packages. 

We'll be using Scikit-Learn as our main library for all the various classifier models we'll work with today 

In [None]:
# General modules

import numpy as np
import pandas as pd
import matplotlib
from matplotlib.legend_handler import HandlerLine2D 
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil

# Required Classifiers from SKLearn 

from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier, export_graphviz, plot_tree
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split 

# Configuring Matplotlib 
%matplotlib inline 

# Let's have a quick recap of Decision Trees

We will use the iris dataset for a quick recap. 

The iris dataset contains various features, such as petal length, petal width and sepal width, with the target being classifying a given iris flower under its correct species. 

We would use petal length and petal width as features to predict the species of iris flower for our recap on decision trees. 

In [None]:
# Importing the dataset 

from sklearn.datasets import load_iris

In [None]:
# Loading the dataset
# The returned object exposes .data, .target, .feature_names and .target_names,
# all of which are used by the cells below.

iris = load_iris()

In [None]:
# Keep only the last two feature columns (petal length and petal width) for this recap
# and experiment with a tree of only two levels (not including the root node)

X, y = iris.data[:, 2:], iris.target

iris_tree = DecisionTreeClassifier(max_depth=2).fit(X, y)
iris_tree

So actually creating and fitting a decision tree is relatively simple. 

Earlier we mentioned that at each non-leaf node, the decision tree makes a decision that splits the samples into corresponding children nodes. If we want to know what decision each node made, we can actually visualize it. 

## Visualizing the decision tree

___ 

Let's display the decision tree in a more visually accessible format. 

___ 

We can do this in 2 ways, but let's first use the method the textbook advocates.

In [None]:
# Method 1 (textbook): Using export_graphviz
# Write the fitted tree to a .dot file so that it can be rendered as an image

graphviz_options = dict(
    out_file='iris_tree.dot',
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True,
)
export_graphviz(iris_tree, **graphviz_options)

In [None]:
# Shell escape: convert the exported iris_tree.dot file to a PNG using the `dot`
# command-line tool (it must be available in the notebook environment).
!dot -Tpng iris_tree.dot -o iris_tree.png

In [None]:
# Loading the image of the visualized decision tree to display it using Matplotlib.
# plt.imread is used so that no deep-learning framework is needed just to read a
# PNG file; the array it returns can be passed to imshow directly.

fig, axes = plt.subplots(figsize = (20, 8))
iris_img = plt.imread('./iris_tree.png')
fig.suptitle("Decision tree structure")
axes.imshow(iris_img)
plt.show()

In [None]:
# Method 2 (by Scikit-Learn): draw the fitted tree directly with plot_tree()

fig, axes = plt.subplots(figsize=(20, 8))
plot_tree(iris_tree, filled=True, ax=axes)
plt.show()

From this image, we can see that Scikit-Learn's decision tree algorithm, CART, is a greedy algorithm: at each decision node it picks the split that places as many samples of one class as possible into a single child node.

In [None]:
# Plot the decision boundary of the fitted tree over the two selected features

fig, axes = plt.subplots(figsize = (20, 8))

# Determine the axes' ranges: the data range padded by one unit on each side
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))

# Predict a class for every grid point and draw the predictions as filled contours
Z = iris_tree.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
cs = axes.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)

# Determining the axes labels
axes.set_xlabel(iris.feature_names[2])
axes.set_ylabel(iris.feature_names[3])

# Plotting the training points in the graph, one colour per class.
# A boolean mask keeps the selected coordinates 1-D, and no cmap is passed because
# each class is drawn in one explicit colour (a cmap would be ignored with a warning).
for i, color in zip(range(3), 'ryb'):
    mask = y == i
    axes.scatter(X[mask, 0], X[mask, 1], c=color, label=iris.target_names[i], edgecolor='black', s=15)

fig.suptitle("Decision surface of a decision tree")
axes.legend(loc='lower right', borderpad=0, handletextpad=0)
axes.axis("tight")

# Adjust the layout only after everything has been drawn
plt.tight_layout(h_pad=1.0, w_pad=1.0, pad=2.5)
plt.show()

In this sense of being able to see how the model decides, decision trees are \_\_\_\_\_\_\_\_ models unlike Neural Networks

## Adding more layers

Instead of having 2 layers, excluding the root node, let's try a decision tree with 3 layers. We'll then visualize what differences more layers create in our model

In [None]:
# Refit with a maximum depth of 3 instead of 2, then draw the resulting tree
# ICE

iris_tree = DecisionTreeClassifier(max_depth=3).fit(X, y)

fig, axes = plt.subplots(figsize=(20, 8))
plot_tree(iris_tree, filled=True, ax=axes)
plt.show()

In [None]:
# Plotting the new decision boundaries
# ICE

fig, axes = plt.subplots(figsize = (20, 8))

# Determine the axes' ranges: the data range padded by one unit on each side
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))

# Predict a class for every grid point and draw the predictions as filled contours
Z = iris_tree.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
cs = axes.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)

# Determining the axes labels
axes.set_xlabel(iris.feature_names[2])
axes.set_ylabel(iris.feature_names[3])

# Plotting the training points in the graph, one colour per class.
# A boolean mask keeps the selected coordinates 1-D, and no cmap is passed because
# each class is drawn in one explicit colour (a cmap would be ignored with a warning).
for i, color in zip(range(3), 'ryb'):
    mask = y == i
    axes.scatter(X[mask, 0], X[mask, 1], c=color, label=iris.target_names[i], edgecolor='black', s=15)

fig.suptitle("Decision surface of a decision tree")
axes.legend(loc='lower right', borderpad=0, handletextpad=0)
axes.axis("tight")

# Adjust the layout only after everything has been drawn
plt.tight_layout(h_pad=1.0, w_pad=1.0, pad=2.5)
plt.show()

The model becomes slightly more precise but this can easily become a case of \_\_\_\_\_\_\_\_ which leads to \_\_\_\_\_\_\_\_ 

## Tweaking other hyper parameters 

What are the other hyperparameters we can tweak? 

In [None]:
# Tweaking the other hyperparameters
# ICE
# NOTE(review): only max_depth=3 is set here, so this refits the same configuration
# as the depth-3 cell; pass further DecisionTreeClassifier arguments to experiment.

iris_tree = DecisionTreeClassifier(max_depth=3).fit(X, y)

fig, axes = plt.subplots(figsize=(20, 8))
plot_tree(iris_tree, filled=True, ax=axes)
plt.show()

In [None]:
# Plotting the new decision boundaries
# ICE

fig, axes = plt.subplots(figsize = (20, 8))

# Determine the axes' ranges: the data range padded by one unit on each side
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))

# Predict a class for every grid point and draw the predictions as filled contours
Z = iris_tree.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
cs = axes.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)

# Determining the axes labels
axes.set_xlabel(iris.feature_names[2])
axes.set_ylabel(iris.feature_names[3])

# Plotting the training points in the graph, one colour per class.
# A boolean mask keeps the selected coordinates 1-D, and no cmap is passed because
# each class is drawn in one explicit colour (a cmap would be ignored with a warning).
for i, color in zip(range(3), 'ryb'):
    mask = y == i
    axes.scatter(X[mask, 0], X[mask, 1], c=color, label=iris.target_names[i], edgecolor='black', s=15)

fig.suptitle("Decision surface of a decision tree")
axes.legend(loc='lower right', borderpad=0, handletextpad=0)
axes.axis("tight")

# Adjust the layout only after everything has been drawn
plt.tight_layout(h_pad=1.0, w_pad=1.0, pad=2.5)
plt.show()

## Gini index vs. Entropy

___

Previously Jason mentioned that there are 2 ways that a decision tree can decide to split its node by: 
  1. Gini index
  2. Entropy 

Now let's see what difference using each makes. We'll be comparing two models of 3 layers each.

In [None]:
# Gini-using model
# ICE

# Fit the Gini tree explicitly instead of plotting whatever `iris_tree` holds from
# an earlier cell, so this cell always shows a Gini-based tree of depth 3
# regardless of which cells were run before it.
iris_tree = DecisionTreeClassifier(criterion='gini', max_depth=3)
iris_tree.fit(X, y)

fig, axes = plt.subplots(figsize = (20, 8))
plot_tree(iris_tree, filled=True)
plt.show()

In [None]:
# Entropy-using model: same maximum depth, but splits are scored with entropy
# ICE

iris_tree = DecisionTreeClassifier(criterion='entropy', max_depth=3).fit(X, y)

fig, axes = plt.subplots(figsize=(20, 8))
plot_tree(iris_tree, filled=True, ax=axes)
plt.show()

As can be seen from this example, there are only minor differences between the models --- the result is the same in this case, but the decision criteria are somewhat different.

___

The models we have built so far are used for classification problems, due to the nature of the dataset, but decision trees can also be used for regression problems. In that case, the __DecisionTreeRegressor__ class is used, and the loss measured is \_\_\_\_\_\_\_\_

## Now, we'll move on to Ensemble Learning methods proper

# Ensemble Learning 

### In the following part, we'll go through the implementation of the various ensemble methods that were introduced, namely: 
  1. Voting Classifier
  2. Bagging Classifier
  3. Random Forests
  4. Adaboost
  5. Gradient Boosting 

___

But as always, before we start fitting the models, we have to clean and prepare the data, as well as conduct some basic Exploratory Data Analysis (EDA). 

## Data Cleaning, EDA and Data Preparation

Data Cleaning: 
  1. Reviewing cars data and data types
  2. Removing null values

EDA: 
  1. Looking at the statistical summary of our data
  2. Observing correlation in our data
  3. Observing spread of each feature in our data
  4. Observing the distribution of our target class

Data Preparation: 
  1. Split into training and testing datasets

In [None]:
# Loading our data from storage
# skipinitialspace=True skips the spaces that follow each delimiter, and
# na_values=' ' makes a field holding a single space count as a null value

df = pd.read_csv(
    '../input/carsdata/cars.csv',
    skipinitialspace=True,
    na_values=' ',
)

### Reviewing Cars_data

In [None]:
# Initial look at data: display the first rows of the dataframe

df.head()

In [None]:
# Structural summary of our data: column dtypes and non-null counts
# (the non-null counts are what reveal the missing values discussed below)

df.info()

It seems our dataset contains quite a few null values, let's take a look at the null values and determine whether to fill or drop them 

### Dropping null values 

In [None]:
# Show the entries of the 'cubicinches' column that are null
# ICE

df.loc[df['cubicinches'].isna(), 'cubicinches']

In [None]:
# Show the entries of the 'weightlbs' column that are null
# ICE

df.loc[df['weightlbs'].isna(), 'weightlbs']

Given the small number of null records compared to the whole dataset and the difficulty in giving a suitable fill value for these null values, we should proceed to drop them. 

In [None]:
# Drop every row that contains a null value in any column.
# Note that `df` is rebound, so the unfiltered frame is no longer available afterwards.
df = df.dropna(how='any')

Let us have a quick look at the dataset again

In [None]:
# Re-check dtypes and non-null counts after dropping the incomplete rows
df.info()

In [None]:
# Descriptive statistics of the cleaned data
df.describe()

### Observing the correlation of the data, the distribution of features and target classes

In [None]:
# Plotting correlation map

fig, axes = plt.subplots(figsize = (12, 6))

# Correlation is only defined for numeric columns. Restrict the frame explicitly:
# recent pandas versions raise in DataFrame.corr() when a non-numeric column is
# present (presumably 'brand' here - verify against the dataset).
correlation = df.select_dtypes(include='number').corr()

corr_m = sns.heatmap(round(correlation, 2), annot=True, cmap='Blues', ax=axes, fmt='.2f')

Correlation of data can be used to help reduce the dimensionality of models in some cases by removing features that are redundant (of very high correlation)

In [None]:
# Helper that plots the distribution of one or more dataframe columns

def features_hist(df, features, fig_size, xsize=8, ysize=8):
    """Draw 20-bin blue histograms of ``df[features]`` and show the figure.

    Parameters
    ----------
    df : dataframe whose columns are plotted.
    features : column label, or list of labels, used to index ``df``.
    fig_size : figure size, forwarded to ``hist`` as ``figsize``.
    xsize, ysize : tick-label sizes, forwarded as ``xlabelsize`` / ``ylabelsize``.
    """
    hist_options = dict(
        bins=20,
        xlabelsize=xsize,
        ylabelsize=ysize,
        grid=False,
        figsize=fig_size,
        color='blue',
    )
    selected = df[features]
    selected.hist(**hist_options)

    # The layout rectangle's upper bounds are 1.2, i.e. it extends past the unit square
    plt.tight_layout(rect=(0, 0, 1.2, 1.2))
    plt.show()

In [None]:
# Separating the dataframe into features and labels
# ICE 
# The first seven columns (selected by position) form the feature matrix.
# NOTE: this rebinds X, which held the iris features in the recap section above.

X = df.iloc[:,:7] 

X

In [None]:
# The 'brand' column is the label to predict.
# NOTE: this rebinds y, which held the iris target in the recap section above.
y = df['brand']

y

In [None]:
# Names of the feature columns (the first seven columns of the dataframe)

features = df.columns[:7].tolist()

features

In [None]:
# Histograms of every feature column
features_hist(df, features, (20,8))

In [None]:
# Distribution of the target column 'brand', drawn with larger tick labels
features_hist(df, 'brand', (20, 8), xsize=20, ysize=10)

In [None]:
# Number of records per brand (the class balance of the target)
df['brand'].value_counts()

### Now let us split the cars dataset into training and test datasets

In [None]:
# Hold out 15% of the rows as the test set.
# random_state fixes the shuffle so that the split is the same on every run, and
# stratify=y keeps the brand proportions equal in the training and test subsets.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, shuffle=True, random_state=42, stratify=y
)

In [None]:
# Collects the test-set scores of the single decision tree fitted below
decisiontree_results = []

In [None]:
# Baseline: a single decision tree whose leaves must each hold at least 7 training samples
cars_tree = DecisionTreeClassifier(min_samples_leaf=7).fit(X_train, y_train)
cars_tree

In [None]:
# Record the baseline tree's score on the test set, then show all scores so far
baseline_score = cars_tree.score(X_test, y_test)
decisiontree_results.append(baseline_score)

print(decisiontree_results)

## Voting Classifier

Before we delve into specific ensemble techniques, let us construct a basic ensemble method model from scratch, using a voting classifier, as covered by Jiaying earlier. We'll be using a hard voting classifier. 

___

As mentioned earlier, a hard-voting classifier is made up of a collection of \_\_\_\_\_\_\_\_, with the final decision being decided by \_\_\_\_\_\_\_\_. 

___

We'll be using a logistic regression model, a decision tree classifier and a support vector classifier (SVC) in our Voting classifier

In [None]:
# Forming the hard-voting classifier
# The decision tree is limited to max_depth=3 so that the ensemble is built from the
# same base learners that are evaluated individually in the comparison cell.

voter = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression()),
        ('dt', DecisionTreeClassifier(max_depth=3)),
        ('svc', SVC()),
    ],
    voting='hard',
    n_jobs=-1,
)

# One formatted accuracy line per model; reset here so that re-running this cell
# starts from an empty list
results = []

# Fitting the VotingClassifier

voter.fit(X_train, y_train)

# VotingClassifier accuracy

results.append(f"VotingClassifier Accuracy: {accuracy_score(y_test, voter.predict(X_test))}")

In [None]:
# Fit each base classifier on its own so it can be compared with the ensemble

log_reg = LogisticRegression().fit(X_train, y_train)
dec_tree = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
svc = SVC().fit(X_train, y_train)

# Append one accuracy line per classifier, then print every line collected so far

results.extend([
    f"LogisticRegression Accuracy: {accuracy_score(y_test, log_reg.predict(X_test))}",
    f"DecisionTreeClassifier Accuracy: {accuracy_score(y_test, dec_tree.predict(X_test))}",
    f"Support Vector Machine Accuracy: {accuracy_score(y_test, svc.predict(X_test))}",
])

print("\n".join(results))

If we compare the accuracy of the Voting Classifier, we can see the difference an ensemble of weak learners can make. 

--- 

If we used soft-voting instead, which means \_\_\_\_\_\_\_\_, the accuracy would potentially be even higher. 

## Bagging Classifier

When using a Bagging Classifier, we use random sampling to train various weak learners, forming an ensemble. The randomness of the sample each learner is fitted on helps the ensemble make better and more accurate predictions. 

___ 

The BaggingClassifier class can be used for both bagging and pasting, we just need to change the parameter \_\_\_\_\_\_\_\_

In [None]:
# Fitting a BaggingClassifier: 100 decision trees, each fitted on 100 training rows
# drawn with replacement (bootstrap=True)
# ICE

bag = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
).fit(X_train, y_train)

# ICE

bag_accuracy = accuracy_score(y_test, bag.predict(X_test))
print(f"Accuracy: {bag_accuracy}")

If we're just using a bagging classifier with decision trees as its weak learners, we can directly use the RandomForestClassifier, which is essentially an ensemble method working in a similar way. 

___

Bagging classifiers can, of course, use different weak learner models. 

## Bagging - Random Forests

Now let's have a look at Bagging ensemble methods using Random Forests

In [None]:
# Fitting a Random Forest model with scikit-learn's default settings

rf = RandomForestClassifier().fit(X_train, y_train)
rf

In [None]:
# Random Forest accuracy on the test set

y_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
print(f"accuracy score: {rf_accuracy}")

We can actually access the importance of a feature across all the estimators trained in a random forest using sklearn's feature_importances_ variable

In [None]:
# Print each feature name next to its importance in the fitted forest

for name_and_importance in zip(features, rf.feature_importances_):
    print(*name_and_importance)

Let's tweak some hyperparameters to see the differences in performance. 

In [None]:
# Train/test accuracy of a random forest for several numbers of trees
# ICE

n_estimators = [5, 10, 50, 100, 150, 300]
train_results, test_results = [], []

for n_trees in n_estimators:
    rf = RandomForestClassifier(n_estimators=n_trees, n_jobs=-1).fit(X_train, y_train)

    train_results.append(accuracy_score(y_train, rf.predict(X_train)))
    test_results.append(accuracy_score(y_test, rf.predict(X_test)))

# One row per setting, numbered from 1 in the order of n_estimators
print('Train\t\t\tTest')
for row, (train_acc, test_acc) in enumerate(zip(train_results, test_results), start=1):
    print(f"{row}: {round(train_acc, 5):.5f}\t\t{round(test_acc,5):.5f}")

In [None]:
# Graphical representation

# plt.plot returns a list of Line2D objects, so the single line is unpacked; the
# legend's handler_map must be keyed by that artist (not by a string) for the
# HandlerLine2D settings to take effect.
line1, = plt.plot(n_estimators, train_results, 'b', label='Training')
line2, = plt.plot(n_estimators, test_results, 'r', label='Testing')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.xlabel('n_estimators')
plt.ylabel('accuracy')
plt.show()

## Boosting - AdaBoost 

For boosting techniques, we have 2 common kinds, the first will be AdaBoost. 

In [None]:
# Fitting an AdaBoost model (100 estimators, learning rate 1) and scoring it on the test set

am = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=1,
)
adaboostmodel = am.fit(X_train, y_train)

y_pred = adaboostmodel.predict(X_test)
ada_accuracy = accuracy_score(y_test, y_pred)
print(f"accuracy score: {ada_accuracy}")

In [None]:
# Train/test accuracy of AdaBoost for several numbers of estimators (learning rate 1)
# ICE

n_estimators = [5, 10, 50, 100, 150, 300]
train_results, test_results = [], []

for n_rounds in n_estimators:
    am = AdaBoostClassifier(n_estimators=n_rounds, learning_rate=1)
    adamodel = am.fit(X_train, y_train)

    train_results.append(accuracy_score(y_train, adamodel.predict(X_train)))
    test_results.append(accuracy_score(y_test, adamodel.predict(X_test)))

# One row per setting, numbered from 1 in the order of n_estimators
print('Train\t\t\tTest')
for row, (train_acc, test_acc) in enumerate(zip(train_results, test_results), start=1):
    print(f"{row}: {round(train_acc, 5):.5f}\t\t{round(test_acc,5):.5f}")

In [None]:
# Accuracy against the number of estimators.
# plt.plot returns a list of Line2D objects, so the single line is unpacked; the
# legend's handler_map must be keyed by that artist (not by a string) for the
# HandlerLine2D settings to take effect.
line1, = plt.plot(n_estimators, train_results, 'b', label='Training')
line2, = plt.plot(n_estimators, test_results, 'r', label='Testing')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.xlabel('n_estimators')
plt.ylabel('accuracy')
plt.show()

How about tweaking the learning rate? 

In [None]:
# Train/test accuracy of AdaBoost (300 estimators) for several learning rates
# ICE

n_lr = [0.4, 0.8, 1.0, 1.4, 1.8, 2.0]
train_results, test_results = [], []

for rate in n_lr:
    am = AdaBoostClassifier(n_estimators=300, learning_rate=rate)
    adamodel = am.fit(X_train, y_train)

    train_results.append(accuracy_score(y_train, adamodel.predict(X_train)))
    test_results.append(accuracy_score(y_test, adamodel.predict(X_test)))

# One row per setting, numbered from 1 in the order of n_lr
print('Train\t\t\tTest')
for row, (train_acc, test_acc) in enumerate(zip(train_results, test_results), start=1):
    print(f"{row}: {round(train_acc, 5):.5f}\t\t{round(test_acc,5):.5f}")

In [None]:
# Accuracy against the learning rate.
# plt.plot returns a list of Line2D objects, so the single line is unpacked; the
# legend's handler_map must be keyed by that artist (not by a string) for the
# HandlerLine2D settings to take effect.
line1, = plt.plot(n_lr, train_results, 'b', label='Training')
line2, = plt.plot(n_lr, test_results, 'r', label='Testing')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.xlabel('n_lr')
plt.ylabel('accuracy')
plt.show()

## Boosting - Gradient Boosting

The other boosting technique is Gradient Boosting. 

___

We'll be using a Classifier instead of a Regressor due to the nature of our dataset, but both regressors and classifiers can use the gradient boosting method.

In [None]:
# Fitting the Gradient Boosting model (10 trees of depth 2, learning rate 1)
# and printing its accuracy on the test set

gm = GradientBoostingClassifier(
    max_depth=2,
    n_estimators=10,
    learning_rate=1,
)
gradboostmodel = gm.fit(X_train, y_train)

y_pred = gradboostmodel.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred)
print(gb_accuracy)

In [None]:
# Train/test accuracy of gradient boosting (depth 2, learning rate 1) for several
# numbers of estimators
# ICE

n_estimators = [5, 10, 50, 100, 150, 300]
train_results, test_results = [], []

for n_stages in n_estimators:
    gm = GradientBoostingClassifier(max_depth=2, n_estimators=n_stages, learning_rate=1)
    gradmodel = gm.fit(X_train, y_train)

    train_results.append(accuracy_score(y_train, gradmodel.predict(X_train)))
    test_results.append(accuracy_score(y_test, gradmodel.predict(X_test)))

# One row per setting, numbered from 1 in the order of n_estimators
print('Train\t\t\tTest')
for row, (train_acc, test_acc) in enumerate(zip(train_results, test_results), start=1):
    print(f"{row}: {round(train_acc, 5):.5f}\t\t{round(test_acc,5):.5f}")

In [None]:
# Accuracy against the number of estimators.
# plt.plot returns a list of Line2D objects, so the single line is unpacked; the
# legend's handler_map must be keyed by that artist (not by a string) for the
# HandlerLine2D settings to take effect.
line1, = plt.plot(n_estimators, train_results, 'b', label='Training')
line2, = plt.plot(n_estimators, test_results, 'r', label='Testing')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.xlabel('n_estimators')
plt.ylabel('accuracy')
plt.show()

In [None]:
# Train/test accuracy of gradient boosting (depth 2, 100 estimators) for several
# learning rates
# ICE

n_lr = [0.4, 0.8, 1.0, 1.4, 1.8, 2.0]
train_results, test_results = [], []

for rate in n_lr:
    gm = GradientBoostingClassifier(max_depth=2, n_estimators=100, learning_rate=rate)
    gradmodel = gm.fit(X_train, y_train)

    train_results.append(accuracy_score(y_train, gradmodel.predict(X_train)))
    test_results.append(accuracy_score(y_test, gradmodel.predict(X_test)))

# One row per setting, numbered from 1 in the order of n_lr
print('Train\t\t\tTest')
for row, (train_acc, test_acc) in enumerate(zip(train_results, test_results), start=1):
    print(f"{row}: {round(train_acc, 5):.5f}\t\t{round(test_acc,5):.5f}")

In [None]:
# Accuracy against the learning rate.
# plt.plot returns a list of Line2D objects, so the single line is unpacked; the
# legend's handler_map must be keyed by that artist (not by a string) for the
# HandlerLine2D settings to take effect.
line1, = plt.plot(n_lr, train_results, 'b', label='Training')
line2, = plt.plot(n_lr, test_results, 'r', label='Testing')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.xlabel('n_lr')
plt.ylabel('accuracy')
plt.show()

# That's all for today's Co-Learning session and hope this session was useful for you all!