In [None]:
import pathlib
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Locate the dataset folder relative to the parent of the working directory,
# read the CSV and discard every row that contains a missing value.
path = pathlib.Path.cwd().parent / "input" / "water-potability"
df = pd.read_csv(path / "water_potability.csv").dropna()
# Column labels, reused below to separate features from the target.
keys = df.columns

In [None]:
# Summarise the frame after the missing-value rows were dropped:
# column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Every column except the last one is used as a feature; the last column is
# the target. Both are converted to plain NumPy arrays.
x = df[keys[:-1]].to_numpy()
y = df[keys[-1]].to_numpy()

In [None]:
# Split dataset: hold out 10% of the rows as unseen test data. The fixed
# random_state makes the shuffle, and therefore the split, repeatable.
# `train_test_split` comes from sklearn.model_selection.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, shuffle=True, random_state=98
)
# Sanity check: the split must neither lose nor duplicate rows.
assert x_train.shape[0] + x_test.shape[0] == x.shape[0]
print(f"Data for Modeling: {x_train.shape[0]}")
print(f"Unseen Data For Predictions: {x_test.shape[0]}")


In [None]:
# Fit five tree-based classifiers on the training split.
# Every stochastic estimator receives the same fixed random_state (the value
# the decision tree already used) so that a kernel restart reproduces the
# same fitted models and the same accuracy figures.
rf = RandomForestClassifier(n_estimators=1000, random_state=0)
rf.fit(x_train, y_train)

gb = GradientBoostingClassifier(n_estimators=1000, random_state=0)
gb.fit(x_train, y_train)

ab = AdaBoostClassifier(n_estimators=1000, random_state=0)
ab.fit(x_train, y_train)

dt = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0)
dt.fit(x_train, y_train)

et = ExtraTreesClassifier(n_estimators=1000, random_state=0)
et.fit(x_train, y_train)

In [None]:
# Report training and testing accuracy (as a percentage, 3 significant
# digits) for each fitted model. One loop replaces five copy-pasted stanzas;
# the printed text is unchanged.
fitted_models = [
    ("Random Forest", rf),
    ("Gradient boosting", gb),
    ("Adaptative Boosting", ab),
    ("Decision Tree", dt),
    ("Extra Trees", et),
]

for label, model in fitted_models:
    print(f"\nPredictions of {label} algorithm:")
    print(f"\tTraining Accuracy : {100 * model.score(x_train, y_train):.3g}%")
    print(f"\tTesting Accuracy : {100 * model.score(x_test, y_test):.3g}%")


# Hyperparameter tuning

In [None]:
# Look at the parameters used by the current Extra Trees model, as a baseline
# before tuning. `pprint` is imported in the notebook's imports cell.
print('Parameters currently in use:\n')
pprint(et.get_params())

In [None]:
# Candidate values for each ExtraTreesClassifier hyperparameter, collected
# into `random_grid`.

# Number of estimators: 200, 400, ..., 2000
n_estimators = [int(v) for v in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 'auto' is deliberately not listed: for classifiers it was only an alias of
# 'sqrt', was deprecated in scikit-learn 1.1 and removed in 1.3, where any
# candidate using it fails to fit.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree: 10, 29, ..., 200, plus None (unlimited)
max_depth = [int(v) for v in np.linspace(10, 200, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Function used to measure the quality of a split
criterion = ["gini", "entropy"]

# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
    'criterion': criterion,
}
pprint(random_grid)

In [None]:
# Use the random grid to search for the best hyperparameters.
# The base model is seeded: RandomizedSearchCV's own random_state only fixes
# which parameter combinations are sampled, so without a seed on the
# estimator the cross-validated scores (and the winning combination) could
# differ between runs.
et = ExtraTreesClassifier(random_state=42)
# Random search over 100 sampled combinations, scored with 3-fold cross
# validation, using all available cores.
et_random = RandomizedSearchCV(
    estimator=et,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model
et_random.fit(x_train, y_train)

In [None]:
# Display the best hyperparameter combination found by the randomized search.
pprint(et_random.best_params_)

In [None]:
# Refit Extra Trees with a fixed set of tuned hyperparameters and report its
# accuracy on both splits.
# NOTE(review): these values are presumably copied from the best_params_
# printed by the randomized search; confirm they still match after any
# re-run of the search.
tuned_params = dict(
    n_estimators=2000,
    criterion="gini",
    max_depth=29,
    max_features="log2",
    min_samples_split=5,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=42,
)
et = ExtraTreesClassifier(**tuned_params)
et.fit(x_train, y_train)

print("\nPredictions of Extra Trees algorithm:")
print(f"\tTraining Accuracy : {100 * et.score(x_train, y_train):.3g}%")
print(f"\tTesting Accuracy : {100 * et.score(x_test, y_test):.3g}%")