## Downloading source code from the github

In [1]:
!git clone https://github.com/shivaditya-meduri/ensembleLearning.git

Cloning into 'ensembleLearning'...
remote: Enumerating objects: 133, done.[K
remote: Counting objects: 100% (133/133), done.[K
remote: Compressing objects: 100% (96/96), done.[K
remote: Total 133 (delta 60), reused 90 (delta 34), pack-reused 0[K
Receiving objects: 100% (133/133), 331.27 KiB | 1.76 MiB/s, done.
Resolving deltas: 100% (60/60), done.


## Decision Tree Classifier - Implemented from scratch
Using the gini-impurity cost function to split the data by feature and rank them based on importance, we created a Decision Tree classifier from scratch. We included 2 hyper-parameters which are max depth of the tree and the minimum samples count per leaf to tune the model. Below, we tested the model on 2 datasets which are the breast cancer dataset and the iris dataset for the task of classification

#### Testing on Breast Cancer dataset

In [2]:
## Testing on breast cancer dataset which predcits if a sample to "Benign" or "Malignant" case of cancer
import pandas as pd
from ensembleLearning.src.decisionTree import decisionTree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
data = pd.read_csv("ensembleLearning/data/bcan.csv")
X = data.drop(["diagnosis", "id"], axis=1).values
y = data["diagnosis"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
dt = decisionTree(max_depth=50, min_samples_leaf=1)
dt.train(X_train, y_train)
ypred = dt.predict(X_test)
print("Accuracy on the breast cancer dataset is ", accuracy_score(ypred, y_test))

Accuracy on the breast cancer dataset is  0.9298245614035088


Hyper-Parameter tuning for breast cancer dataset:

In [6]:
import pandas as pd
from ensembleLearning.src.decisionTree import decisionTree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
data = pd.read_csv("ensembleLearning/data/bcan.csv")
X = data.drop(["diagnosis", "id"], axis=1).values
y = data["diagnosis"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# hyper params
max_depth = [30, 40, 50, 60]
min_samples_leaf = [1, 2, 3, 4]
threshold = [0.01, 0.03, 0.1]
results = {}
for depth in max_depth:
    for sample in min_samples_leaf:
        for thresh in threshold:          
          dt = decisionTree(max_depth=depth, min_samples_leaf=sample, threshold=thresh)
          dt.train(X_train, y_train)
          ypred = dt.predict(X_test)
          results[(depth, sample, thresh)] = accuracy_score(ypred, y_test)

accuracies = list(results.values())
params = list(results.keys())
best_params = params[accuracies.index(max(accuracies))]

print("Best accuracy = {}. Model params: (max_depth={}, min_samples_leaf={}, threshold={})".format(max(accuracies), best_params[0], best_params[1], best_params[2]))

Best accuracy = 0.9473684210526315. Model params: (max_depth=30, min_samples_leaf=1, threshold=0.01)


#### Testing on Iris dataset

In [9]:
## Testing on iris dataset which classifies flower physical charecteristics to type of flower : Setosa, Verginica and Versicolor
import pandas as pd
from ensembleLearning.src.decisionTree import decisionTree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
data = pd.read_csv("ensembleLearning/data/iris.csv")
X = data.drop(["variety"], axis=1).values
y = data["variety"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
dt = decisionTree(type="classification", max_depth=100, min_samples_leaf=1)
dt.train(X_train, y_train)
ypred = dt.predict(X_test)
print("Accuracy on iris dataset is ", accuracy_score(ypred, y_test))

Accuracy on iris dataset is  1.0


Hyper-Parameter tuning on iris dataset:

In [11]:
import pandas as pd
from ensembleLearning.src.decisionTree import decisionTree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
data = pd.read_csv("ensembleLearning/data/iris.csv")
X = data.drop(["variety"], axis=1).values
y = data["variety"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

max_depth = [3, 10, 50, 100]
min_samples_leaf = [1, 2, 3, 4]
threshold = [0.01, 0.03, 0.1]
results = {}
for depth in max_depth:
    for sample in min_samples_leaf:
        for thresh in threshold:
          dt = decisionTree(max_depth=depth, min_samples_leaf=sample, threshold=thresh)
          dt.train(X_train, y_train)
          ypred = dt.predict(X_test)
          results[(depth, sample, thresh)] = accuracy_score(ypred, y_test)

accuracies = list(results.values())
params = list(results.keys())
best_params = params[accuracies.index(max(accuracies))]

print("Best accuracy = {}. Model params: (max_depth={}, min_samples_leaf={}, threshold={})".format(max(accuracies), best_params[0], best_params[1], best_params[2]))

Best accuracy = 0.9333333333333333. Model params: (max_depth=3, min_samples_leaf=4, threshold=0.01)


### Decision Tree Regressor

Using variance as cost function to split the data by feature and rank them by importance, and then using the average of all the labels in the leaf node, we make a prediction for a given set of features after training through traversal of the binary tree created. 

In [None]:
import pandas as pd
from ensembleLearning.src.decisionTree import decisionTree
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math
reg_data = pd.read_csv("ensembleLearning/data/regression_housing.csv")[["MSSubClass", "LotFrontage", "LotArea", "OverallQual", "OverallCond", "SalePrice"]]
X = reg_data.drop(["SalePrice"], axis=1).values
y = reg_data["SalePrice"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
dt = decisionTree(type = "regression")
dt.train(X_train, y_train)
ypred = dt.predict(X_test)
print("Root Mean Squared Error of the model is : ", math.sqrt(mean_squared_error(ypred, y_test)))

Root Mean Squared Error of the model is :  53117.38009733844


Hyper-Parameter tuning for regression tree on housing dataset:

In [12]:
import pandas as pd
from ensembleLearning.src.decisionTree import decisionTree
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math
reg_data = pd.read_csv("ensembleLearning/data/regression_housing.csv")[["MSSubClass", "LotFrontage", "LotArea", "OverallQual", "OverallCond", "SalePrice"]]
X = reg_data.drop(["SalePrice"], axis=1).values
y = reg_data["SalePrice"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

max_depth = [3, 10, 50, 100]
min_samples_leaf = [1, 2, 3, 4]
threshold = [0.01, 0.03, 0.1]
results = {}
for depth in max_depth:
    for sample in min_samples_leaf:
        for thresh in threshold:   
          dt = decisionTree(type = "regression", max_depth=depth, min_samples_leaf=sample, threshold=thresh)
          dt.train(X_train, y_train)
          ypred = dt.predict(X_test)
          results[(depth, sample, thresh)] = math.sqrt(mean_squared_error(ypred, y_test))


errors = list(results.values())
params = list(results.keys())
best_params = params[errors.index(min(errors))]

print("Lowest root mean squared error = {}. Model params: (max_depth={}, min_samples_leaf={}, threshold={})".format(min(errors), best_params[0], best_params[1], best_params[2]))

Lowest root mean squared error = 50450.84499744902. Model params: (max_depth=10, min_samples_leaf=1, threshold=0.01)


### Random Forest Classifier
Using bagging method, we created an ensemble of Decision Tree classifiers and used a voting mechnaism to decide what the class of a set of features will be.

In [None]:
import pandas as pd
from ensembleLearning.src.randomForest import randomForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
data = pd.read_csv("ensembleLearning/data/iris.csv")
X = data.drop(["variety"], axis=1).values
y = data["variety"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
rf = randomForest(type = "classification", n_trees=50, max_depth=100, min_samples_leaf=1)
rf.train(X_train, y_train)
ypred = rf.predict(X_test)
print("Accuracy on iris dataset using a random forest model is ", accuracy_score(ypred, y_test))

Accuracy on iris dataset using a random forest model is  1.0


Hyper-Parameter tuning for Random Forest on iris dataset:

In [9]:
import pandas as pd
from ensembleLearning.src.randomForest import randomForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
data = pd.read_csv("ensembleLearning/data/iris.csv")
X = data.drop(["variety"], axis=1).values
y = data["variety"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

n_trees = [10, 30, 50 ,100]
max_depth = [3, 10, 50, 100]
min_samples_leaf = [1, 2, 3, 4]

results = {}
for tree in n_trees:
  for depth in max_depth:
      for sample in min_samples_leaf:
          rf = randomForest(type = "classification", n_trees=tree, max_depth=depth, min_samples_leaf=sample)
          rf.train(X_train, y_train)
          ypred = rf.predict(X_test)
          results[(tree, depth, sample)] = accuracy_score(ypred, y_test)

accuracies = list(results.values())
params = list(results.keys())
best_params = params[accuracies.index(max(accuracies))]

print("Best accuracy = {}. Model params: (n_trees={}, max_depth={}, min_samples_leaf={})".format(max(accuracies), best_params[0], best_params[1], best_params[2]))

Best accuracy = 0.9333333333333333. Model params: (n_trees=10, max_depth=3, min_samples_leaf=1)


### Random Forest Regressor
Using Decision Tree Regressor base regression model, an ensemble method using bagging is created which is nothing but the Random Forest Regressor. Instead of voting mechanism like in the case of Random Forest Classifier, we take the average of all the predictions by all the base estimators

In [None]:
import pandas as pd
from ensembleLearning.src.randomForest import randomForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math
reg_data = pd.read_csv("ensembleLearning/data/regression_housing.csv")[["MSSubClass", "LotFrontage", "LotArea", "OverallQual", "OverallCond", "SalePrice"]]
X = reg_data.drop(["SalePrice"], axis=1).values
y = reg_data["SalePrice"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
rf = randomForest(type = "regression", n_trees=50, max_depth=100, min_samples_leaf=1)
rf.train(X_train, y_train)
ypred = rf.predict(X_test)
print("Root Mean Squared Error of the model is : ", math.sqrt(mean_squared_error(ypred, y_test)))

Root Mean Squared Error of the model is :  52611.77811114424


In [13]:
import pandas as pd
from ensembleLearning.src.randomForest import randomForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math
reg_data = pd.read_csv("ensembleLearning/data/regression_housing.csv")[["MSSubClass", "LotFrontage", "LotArea", "OverallQual", "OverallCond", "SalePrice"]]
X = reg_data.drop(["SalePrice"], axis=1).values
y = reg_data["SalePrice"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

n_trees = [10, 30, 50 ,100]
max_depth = [3, 10, 50, 100]
min_samples_leaf = [1, 2, 3, 4]

results = {}
for tree in n_trees:
  for depth in max_depth:
      for sample in min_samples_leaf:
        rf = randomForest(type = "regression", n_trees=50, max_depth=100, min_samples_leaf=1)
        rf.train(X_train, y_train)
        ypred = rf.predict(X_test)
        results[(tree, depth, sample)] = math.sqrt(mean_squared_error(ypred, y_test))


errors = list(results.values())
params = list(results.keys())
best_params = params[errors.index(min(errors))]

print("Lowest root mean squared error = {}. Model params: (n_trees={}, max_depth={}, min_samples_leaf={})".format(min(errors), best_params[0], best_params[1], best_params[2]))

Lowest root mean squared error = 55173.19230148441. Model params: (n_trees=50, max_depth=3, min_samples_leaf=2)
