In [None]:
import zipfile
import time
import multiprocessing

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import plot_roc_curve
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, BaggingClassifier, IsolationForest, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Preparing dataset

Data Set Information:

Three data sets are submitted, for training and testing. Ground-truth occupancy was obtained from time stamped pictures that were taken every minute.

Attribute Information:

- date time year-month-day hour:minute:second
- Temperature, in Celsius
- Relative Humidity, %
- Light, in Lux
- CO2, in ppm
- Humidity Ratio, Derived quantity from temperature and relative humidity, in kgwater-vapor/kg-air
- Occupancy, 0 or 1, 0 for not occupied, 1 for occupied status

Dataset available at: https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+

In [None]:
# Download the occupancy-detection zip archive and write it (-O) to the Colab Notebooks folder.
# NOTE(review): the target path presumably requires Google Drive to be mounted at /content/drive — confirm.
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00357/occupancy_data.zip" -O "/content/drive/MyDrive/Colab Notebooks/occupancy_data.zip"

In [None]:
# Open the downloaded archive (kept open because the next cell reads members
# from it) and show the entries it contains.
zf = zipfile.ZipFile(file='/content/drive/MyDrive/Colab Notebooks/occupancy_data.zip')
zf.infolist()

In [None]:
# Read the three text files from the archive and stack them into a single
# frame with a fresh 0..n-1 index.
dataset = pd.concat(
    [
        pd.read_csv(zf.open(member))
        for member in ("datatest.txt", "datatest2.txt", "datatraining.txt")
    ],
    ignore_index=True,
)
dataset

In [None]:
# Shuffle the rows and renumber the index. The fixed seed (same value the
# train/test split uses) keeps the row order, and therefore every result
# computed from it, identical across kernel restarts.
dataset = dataset.sample(frac=1, random_state=4).reset_index(drop=True)

In [None]:
# Remove the timestamp column; it is not used as a feature below.
# Rebinding instead of dropping in place, together with errors="ignore",
# makes the cell safe to re-run: a second execution no longer raises KeyError
# because "date" is already gone.
dataset = dataset.drop(columns="date", errors="ignore")
dataset

In [None]:
# Print a summary of the frame: column dtypes, non-null counts and memory usage.
dataset.info()

In [None]:
# Number of rows per Occupancy value, to check how balanced the two classes are.
dataset["Occupancy"].value_counts()

In [None]:
# Separate the label vector from the feature matrix (every other column),
# both as NumPy arrays, and show their shapes.
y = dataset["Occupancy"].to_numpy()
x = dataset.drop(columns="Occupancy").to_numpy()
x.shape, y.shape

In [None]:
# x_scaled = MinMaxScaler().fit_transform(x)
# x_scaled

In [None]:
# Count missing entries in the feature matrix with three equivalent detectors;
# one count is printed per detector.
for find_missing in (np.isnan, pd.isna, pd.isnull):
  print(np.sum(find_missing(x)))

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=4,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

# Decision Tree

Sample run of decision tree:

In [None]:
# 5-fold cross-validated accuracy of a single tree with hand-picked settings.
dec_tree = DecisionTreeClassifier(
    criterion="gini",
    min_samples_split=20,
    max_features=None,
)
cv_result = cross_val_score(estimator=dec_tree, X=X_train, y=y_train, cv=5, scoring="accuracy")

# Per-fold scores first, then their mean.
print(cv_result)
print(cv_result.mean())

Let's find the optimal hyperparameters of the decision tree:

In [None]:
# Exhaustive 5-fold grid search over decision-tree hyperparameters, scored by
# accuracy. The wall-clock time of the whole search is kept in duration_dt.
temp_start_time = time.time()

gs_dt = GridSearchCV(estimator=DecisionTreeClassifier(),
                  param_grid={
                      "criterion": ["gini", "entropy"],
                      "splitter": ["best", "random"],
                      # Floats are read as a fraction of the features.
                      "max_features": np.linspace(0.1, 1., 10),
                      # Start at 2**1: an integer min_samples_split of 1 is rejected
                      # by scikit-learn, so those candidates would only fail to fit
                      # and be recorded with a NaN score.
                      "min_samples_split": [2**i for i in range(1, 8)],
                      "max_depth": [2**i for i in range(4)],
                      # "min_impurity_split": np.linspace(0.1, 1., 10), 
                      # "min_samples_leaf": [2**i for i in range(4)],
                      # "max_leaf_nodes": [2**i for i in range(6)],
                  }, scoring="accuracy", cv=5, n_jobs=-1, verbose=1)

gs_dt.fit(X_train, y_train)

duration_dt = time.time() - temp_start_time

In [None]:
# Show the tree configured with the best-scoring parameter combination.
gs_dt.best_estimator_

In [None]:
# Mean cross-validated accuracy (over the 5 folds) of the best parameter combination
gs_dt.best_score_

In [None]:
# Accuracy of the selected tree on the held-out test split.
gs_dt.best_estimator_.score(X_test, y_test)

In [None]:
# Per-candidate search results (parameters, fold scores, timings) as a table.
pd.DataFrame.from_dict(gs_dt.cv_results_)

In [None]:
# fig, ax = plt.subplots(figsize=(50, 50))
# plot_tree(gs_dt.best_estimator_, feature_names=dataset.columns[:-1], class_names=["pos", "neg"], impurity=True, ax=ax)

In [None]:
print("Feature importances:\n")

# Pair each feature column with its importance in the tuned tree and list the
# pairs from most to least important.
# NOTE(review): columns[:-1] assumes the label is the last column — confirm.
dt_ranked = sorted(
    zip(dataset.columns[:-1], gs_dt.best_estimator_.feature_importances_),
    key=lambda pair: pair[1],
)
for name, weight in dt_ranked[::-1]:
  print(name, "=", weight)

In [None]:
# Class-probability estimate for the first test row (wrapped in a list so it is passed as a one-row batch).
gs_dt.best_estimator_.predict_proba([X_test[0]])

In [None]:
# ROC curve of the tuned decision tree, evaluated on the test split.
fig, ax = plt.subplots(figsize=(5, 5))

plot_roc_curve(gs_dt.best_estimator_, X_test, y_test, ax=ax)

# Dashed red diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], "r--")

plt.show()

## Pruning the tree

Cost complexity pruning provides another option to control the size of a tree. Greater values of ccp_alpha increase the number of nodes pruned. 

Minimal cost complexity pruning recursively finds the node with the “weakest link”. The weakest link is characterized by an effective alpha, where the nodes with the smallest effective alpha are pruned first.

In [None]:
# Compute the cost-complexity pruning path on the training data.
# The path is taken from a default tree with random_state=0, which is exactly
# the configuration the pruning study below refits once per alpha. Deriving it
# from `dec_tree` (min_samples_split=20) would yield alphas that belong to a
# different tree than the ones being pruned.
path = DecisionTreeClassifier(random_state=0).cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

In [None]:
# Total leaf impurity as a function of the effective alpha, leaving out the
# last point of the pruning path.
fig, ax = plt.subplots()

# Slice `path` directly instead of the `ccp_alphas` variable: a later cell
# rebinds `ccp_alphas` to an already-trimmed array, so re-running this cell
# afterwards would otherwise pass x and y arrays of different lengths.
ax.plot(path.ccp_alphas[:-1], path.impurities[:-1], marker='o', drawstyle="steps-post")

ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")

plt.show()

In [None]:
# Fit one tree per effective alpha on the pruning path, in path order.
clfs = [
    DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(X_train, y_train)
    for alpha in ccp_alphas
]

# Report the size of the tree fitted with the last alpha on the path.
last_tree, last_alpha = clfs[-1], ccp_alphas[-1]
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(last_tree.tree_.node_count, last_alpha))

In [None]:
# Exclude the last alpha on the pruning path and its tree from the plots.
# Both names are derived from `path.ccp_alphas` rather than sliced from
# themselves, so re-running this cell does not trim one more element each time.
ccp_alphas = path.ccp_alphas[:-1]
clfs = clfs[:len(ccp_alphas)]

node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]

# Two stacked panels: tree size (top) and tree depth (bottom) against alpha.
fig, ax = plt.subplots(2, 1)

ax[0].plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post")
ax[0].set_xlabel("alpha")
ax[0].set_ylabel("number of nodes")
ax[0].set_title("Number of nodes vs alpha")

ax[1].plot(ccp_alphas, depth, marker='o', drawstyle="steps-post")
ax[1].set_xlabel("alpha")
ax[1].set_ylabel("depth of tree")
ax[1].set_title("Depth vs alpha")

fig.tight_layout()

plt.show()

In [None]:
# Accuracy of every pruned tree on the training and on the test split.
train_scores = [tree.score(X_train, y_train) for tree in clfs]
test_scores = [tree.score(X_test, y_test) for tree in clfs]

fig, ax = plt.subplots()

# One step curve per split, train first so the colours and legend order stay the same.
for split_label, split_scores in (("train", train_scores), ("test", test_scores)):
  ax.plot(ccp_alphas, split_scores, marker='o', label=split_label, drawstyle="steps-post")

ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")

ax.legend()
plt.show()

# Bagging

## Random Forest & Extra Trees

A random forest is an ensemble of decision trees. At each node, only a random subset of the features is considered for splitting, and the best threshold is searched for each feature in that subset.

In [None]:
# Mean 5-fold CV accuracy of a random forest for every ensemble size from 1 to 50.
rf_accs = [
    cross_val_score(
        RandomForestClassifier(n_estimators=n_trees),
        X_train, y_train,
        scoring="accuracy", cv=5,
    ).mean()
    for n_trees in range(1, 51)
]

Extra Trees draw a random threshold for each candidate feature instead of searching for the best possible threshold. This makes them faster to train than a random forest, because searching for the best threshold is one of the most time-consuming parts of growing a tree.

In [None]:
# Mean 5-fold CV accuracy of extra trees for every ensemble size from 1 to 50.
et_accs = [
    cross_val_score(
        ExtraTreesClassifier(n_estimators=n_trees),
        X_train, y_train,
        scoring="accuracy", cv=5,
    ).mean()
    for n_trees in range(1, 51)
]

In [None]:
# Accuracy against ensemble size for both tree ensembles on one set of axes;
# the x values start at 1 because the first score belongs to n_estimators=1.
for accs, curve_label in ((rf_accs, "RandomForest"), (et_accs, "ExtraTrees")):
  plt.plot(range(1, len(accs) + 1), accs, label=curve_label)

plt.xlabel("n_estimators")
plt.ylabel("accuracy")

plt.legend()
plt.show()

In [None]:
# Time an exhaustive 5-fold grid search over random-forest hyperparameters.
temp_start_time = time.time()

rf_param_grid = {
    "n_estimators": [2, 5, 10, 20, 40],
    "criterion": ["gini", "entropy"],
    "max_features": np.linspace(0.1, 1., 10),  # fractions of the feature count
    "min_samples_split": [2, 4, 8, 16, 32, 64, 128],
    "max_depth": [1, 2, 4, 8],
    # "min_impurity_split": np.linspace(0.1, 1., 10),
    # "min_samples_leaf": [2**i for i in range(4)],
    # "max_leaf_nodes": [2**i for i in range(2, 6)],
}

# oob_score=True makes every fitted forest also expose an out-of-bag score.
gs_rf = GridSearchCV(
    estimator=RandomForestClassifier(n_jobs=-1, oob_score=True),
    param_grid=rf_param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gs_rf.fit(X_train, y_train)

duration_rf = time.time() - temp_start_time

In [None]:
# Show the forest configured with the best-scoring parameter combination.
gs_rf.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best random-forest candidate.
gs_rf.best_score_

In [None]:
# Out-of-bag score of the selected forest (available because the search estimator was built with oob_score=True).
gs_rf.best_estimator_.oob_score_

In [None]:
# Per-candidate search results for the random forest as a table.
pd.DataFrame.from_dict(gs_rf.cv_results_)

In [None]:
print("Feature importances:\n")

# Pair each feature column with its importance in the tuned forest and list
# the pairs from most to least important.
# NOTE(review): columns[:-1] assumes the label is the last column — confirm.
rf_ranked = sorted(
    zip(dataset.columns[:-1], gs_rf.best_estimator_.feature_importances_),
    key=lambda pair: pair[1],
)
for name, weight in rf_ranked[::-1]:
  print(name, "=", weight)

In [None]:
# ROC curve of the tuned random forest, evaluated on the test split.
fig, ax = plt.subplots(figsize=(5, 5))

plot_roc_curve(gs_rf.best_estimator_, X_test, y_test, ax=ax)

# Dashed red diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], "r--")

plt.show()

In [None]:
# Exhaustive 5-fold grid search over extra-trees hyperparameters, scored by
# accuracy. The wall-clock time of the whole search is kept in duration_et.
temp_start_time = time.time()

# bootstrap=True is set explicitly alongside oob_score=True so that each tree
# has out-of-bag rows to be scored on.
gs_et = GridSearchCV(estimator=ExtraTreesClassifier(n_jobs=-1, bootstrap=True, oob_score=True),
                  param_grid={
                      "n_estimators": [2, 5, 10, 20, 40],
                      "criterion": ["gini", "entropy"],
                      "max_features": np.linspace(0.1, 1., 10),
                      # Start at 2**1, as the random-forest grid does: an integer
                      # min_samples_split of 1 is rejected by scikit-learn, so those
                      # candidates would only fail to fit and score NaN.
                      "min_samples_split": [2**i for i in range(1, 8)],
                      "max_depth": [2**i for i in range(4)],
                      # "min_impurity_split": np.linspace(0.1, 1., 10), 
                      # "min_samples_leaf": [2**i for i in range(4)], 
                      # "max_leaf_nodes": [2**i for i in range(6)], 
                  }, scoring="accuracy", cv=5, n_jobs=-1, verbose=1)

gs_et.fit(X_train, y_train)

duration_et = time.time() - temp_start_time

In [None]:
# Show the extra-trees ensemble configured with the best-scoring parameter combination.
gs_et.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best extra-trees candidate.
gs_et.best_score_

In [None]:
# Out-of-bag score of the selected ensemble (available because the search estimator was built with bootstrap=True, oob_score=True).
gs_et.best_estimator_.oob_score_

In [None]:
# Per-candidate search results for extra trees as a table.
pd.DataFrame.from_dict(gs_et.cv_results_)

In [None]:
print("Feature importances:\n")

# Pair each feature column with its importance in the tuned extra-trees model
# and list the pairs from most to least important.
# NOTE(review): columns[:-1] assumes the label is the last column — confirm.
et_ranked = sorted(
    zip(dataset.columns[:-1], gs_et.best_estimator_.feature_importances_),
    key=lambda pair: pair[1],
)
for name, weight in et_ranked[::-1]:
  print(name, "=", weight)

In [None]:
# ROC curve of the tuned extra-trees model, evaluated on the test split.
fig, ax = plt.subplots(figsize=(5, 5))

plot_roc_curve(gs_et.best_estimator_, X_test, y_test, ax=ax)

# Dashed red diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], "r--")

plt.show()

## Voting Classifier

In [None]:
# Three heterogeneous base models. The SVC is wrapped in a pipeline so its
# inputs are min-max scaled first, and probability=True lets it contribute
# class probabilities to soft voting.
lr = LogisticRegression()
dt = DecisionTreeClassifier()
svc_pipe = Pipeline(steps=[
    ('scale', MinMaxScaler()),
    ('svc', SVC(kernel="poly", probability=True)),
])

classifiers = [("lr", lr), ("dt", dt), ("svc", svc_pipe)]

# Soft voting with one weight per model, in the order lr, dt, svc.
# vc_clf = VotingClassifier(estimators=classifiers, voting="hard", n_jobs=-1)
vc_clf = VotingClassifier(
    estimators=classifiers,
    voting="soft",
    weights=[1.5, .5, .75],
    n_jobs=-1,
)

In [None]:
# Mean 5-fold CV accuracy of each base model, followed by the voting ensemble.
for label, model in classifiers + [("vc", vc_clf)]:
  print(label, cross_val_score(model, X_train, y_train, scoring="accuracy", cv=5).mean())

In [None]:
# Time a 5-fold grid search over the voting mode and the per-model weights.
temp_start_time = time.time()

# Fresh, unfitted copies of the three base models, in the order the weights refer to.
voting_members = [
    ("lr", LogisticRegression()),
    ("dt", DecisionTreeClassifier()),
    ("svc", make_pipeline(MinMaxScaler(), SVC(kernel="poly", probability=True))),
]

gs_vc = GridSearchCV(
    estimator=VotingClassifier(estimators=voting_members, n_jobs=-1),
    param_grid={
        "voting": ["hard", "soft"],
        "weights": [[1., 1., 1.], [.5, .5, .5], [1.5, .5, .75], [5.5, .5, 1.5]],
    },
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gs_vc.fit(X_train, y_train)

duration_vc = time.time() - temp_start_time

In [None]:
# Show the voting ensemble configured with the best-scoring voting mode and weights.
gs_vc.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best voting-classifier candidate.
gs_vc.best_score_

In [None]:
# Per-candidate search results for the voting classifier as a table.
pd.DataFrame.from_dict(gs_vc.cv_results_)

In [None]:
# ROC curve of the tuned voting ensemble, evaluated on the test split.
fig, ax = plt.subplots(figsize=(5, 5))

# NOTE(review): the search grid includes voting="hard"; a hard-voting ensemble
# presumably exposes no predict_proba, in which case this call would fail —
# confirm which mode was selected.
plot_roc_curve(gs_vc.best_estimator_, X_test, y_test, ax=ax)

# Dashed red diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], "r--")

plt.show()

## Bagging Classifier

BaggingClassifier with Decision tree as base estimator:

In [None]:
# 5-fold CV accuracy of 100 bagged decision trees; max_samples=100 draws 100
# rows (with replacement, since bootstrap=True) for each tree.
bag_clf = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
)
cv_result = cross_val_score(estimator=bag_clf, X=X_train, y=y_train, cv=5, scoring="accuracy")

# Per-fold scores first, then their mean.
print(cv_result)
print(cv_result.mean())

BaggingClassifier with SVM as base estimator:

In [None]:
# Same bagging setup as for the decision tree, but with a default SVC as the
# base model: 100 estimators, each fitted on a bootstrap sample of 100 rows.
bag_clf = BaggingClassifier(
    base_estimator=SVC(),
    n_estimators=100,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
)
cv_result = cross_val_score(estimator=bag_clf, X=X_train, y=y_train, cv=5, scoring="accuracy")

# Per-fold scores first, then their mean.
print(cv_result)
print(cv_result.mean())

In [None]:
# Time a 10-fold grid search over the bagging base model and ensemble settings.
temp_start_time = time.time()

bag_param_grid = {
    "base_estimator": [DecisionTreeClassifier(), SVC(), LogisticRegression()],
    "n_estimators": [50, 100, 200, 500, 1000],
    "max_features": np.linspace(0.1, 1., 10),  # fractions of the feature count
    # "max_samples": [2**i for i in range(8)],
    "bootstrap_features": [True, False],
}

# pre_dispatch=1 limits how many fit jobs are queued up at once.
gs_bag = GridSearchCV(
    estimator=BaggingClassifier(n_jobs=-1, bootstrap=True, oob_score=True),
    param_grid=bag_param_grid,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
    verbose=1,
    pre_dispatch=1,
)
gs_bag.fit(X_train, y_train)

duration_bag = time.time() - temp_start_time

In [None]:
# Show the bagging ensemble configured with the best-scoring parameter combination.
gs_bag.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best bagging candidate.
gs_bag.best_score_

In [None]:
# Out-of-bag score of the selected ensemble (available because the search estimator was built with bootstrap=True, oob_score=True).
gs_bag.best_estimator_.oob_score_

In [None]:
# Per-candidate search results for the bagging classifier as a table.
pd.DataFrame.from_dict(gs_bag.cv_results_)

In [None]:
# ROC curve of the tuned bagging ensemble, evaluated on the test split.
fig, ax = plt.subplots(figsize=(5, 5))

plot_roc_curve(gs_bag.best_estimator_, X_test, y_test, ax=ax)

# Dashed red diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], "r--")

plt.show()

# Boosting

## AdaBoost

In [None]:
# Mean 5-fold CV accuracy of AdaBoost (learning rate fixed at 0.1) for every
# ensemble size from 1 to 50.
ab_accs = [
    cross_val_score(
        AdaBoostClassifier(n_estimators=n_rounds, learning_rate=0.1),
        X_train, y_train,
        scoring="accuracy", cv=5,
    ).mean()
    for n_rounds in range(1, 51)
]

In [None]:
# Accuracy against ensemble size for AdaBoost; the x values start at 1 because
# the first score belongs to n_estimators=1.
# The curve is given a label so that plt.legend() has an entry to show; called
# with no labelled artist it only emits a warning and draws no legend.
plt.plot(range(1, len(ab_accs)+1), ab_accs, label="AdaBoost")

plt.xlabel("n_estimators")
plt.ylabel("accuracy")

plt.legend()
plt.show()

In [None]:
# Time an exhaustive 5-fold grid search over AdaBoost hyperparameters.
temp_start_time = time.time()

ab_param_grid = {
    "n_estimators": [2, 5, 10, 20, 40],
    "learning_rate": np.linspace(0.1, 1., 10),
    "algorithm": ["SAMME", "SAMME.R"],
}

gs_ab = GridSearchCV(
    estimator=AdaBoostClassifier(),
    param_grid=ab_param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gs_ab.fit(X_train, y_train)

duration_ab = time.time() - temp_start_time

In [None]:
# Show the AdaBoost model configured with the best-scoring parameter combination.
gs_ab.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best AdaBoost candidate.
gs_ab.best_score_

In [None]:
# Per-candidate search results for AdaBoost as a table.
pd.DataFrame.from_dict(gs_ab.cv_results_)

In [None]:
# ROC curve of the tuned AdaBoost model, evaluated on the test split.
fig, ax = plt.subplots(figsize=(5, 5))

plot_roc_curve(gs_ab.best_estimator_, X_test, y_test, ax=ax)

# Dashed red diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], "r--")

plt.show()

## GradientBoosting

In [None]:
# Mean 5-fold CV accuracy of gradient boosting for every ensemble size from 1
# to 50. The learning rate is tied to the size as 10 / n_estimators, so it
# shrinks as more stages are added.
gb_accs = [
    cross_val_score(
        GradientBoostingClassifier(n_estimators=n_stages, learning_rate=10/n_stages),
        X_train, y_train,
        scoring="accuracy", cv=5,
    ).mean()
    for n_stages in range(1, 51)
]

In [None]:
# Accuracy against ensemble size for gradient boosting; the x values start at
# 1 because the first score belongs to n_estimators=1.
stage_counts = range(1, len(gb_accs) + 1)
plt.plot(stage_counts, gb_accs)

plt.xlabel("n_estimators")
plt.ylabel("accuracy")

plt.show()

In [None]:
# Time an exhaustive 5-fold grid search over gradient-boosting hyperparameters.
# The grid below has 2 * 5 * 10 * 10 * 3 * 10 * 4 = 120,000 candidates, i.e.
# 600,000 fits with cv=5.
temp_start_time = time.time()

# NOTE(review): "deviance", "mse" and "mae" look like option names from an
# older scikit-learn release — confirm they are accepted by the installed version.
gb_param_grid = {
    "loss": ["deviance", "exponential"],
    "n_estimators": [2, 5, 10, 20, 40],
    "learning_rate": np.linspace(0.1, 1., 10),
    "subsample": np.linspace(0.1, 1., 10),
    "criterion": ["friedman_mse", "mse", "mae"],
    "max_features": np.linspace(0.1, 1., 10),
    # "min_samples_split": [2**i for i in range(8)],
    "max_depth": [1, 2, 4, 8],
    # "min_impurity_split": np.linspace(0.1, 1., 10),
    # "min_samples_leaf": [2**i for i in range(4)],
}

gs_gb = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=gb_param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gs_gb.fit(X_train, y_train)

duration_gb = time.time() - temp_start_time

In [None]:
# Show the gradient-boosting model configured with the best-scoring parameter combination.
gs_gb.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best gradient-boosting candidate.
gs_gb.best_score_

In [None]:
# Per-candidate search results for gradient boosting as a table.
pd.DataFrame.from_dict(gs_gb.cv_results_)

In [None]:
# ROC curve of the tuned gradient-boosting model, evaluated on the test split.
fig, ax = plt.subplots(figsize=(5, 5))

plot_roc_curve(gs_gb.best_estimator_, X_test, y_test, ax=ax)

# Dashed red diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], "r--")

plt.show()

# Isolation Forest

# Stacking Classifier

# XGBoost

# Results

In [None]:
# Summary table: one row per method with its search duration, best mean CV
# accuracy and, where the model was fitted with oob_score=True, its
# out-of-bag score. "-" marks values that do not apply or were not computed.
summary_columns = ["Method", "Bagging/Pasting?", "duration", "score", "oob score (bagging only)"]
summary_rows = [
    ["Decision Tree", "No", duration_dt, gs_dt.best_score_, "-"],
    ["Random Forest", "Yes", duration_rf, gs_rf.best_score_, gs_rf.best_estimator_.oob_score_],
    ["Extra Trees", "Yes", duration_et, gs_et.best_score_, gs_et.best_estimator_.oob_score_],
    ["Bagging Classifier", "Yes", duration_bag, gs_bag.best_score_, gs_bag.best_estimator_.oob_score_],
    ["Voting classifier", "Yes", duration_vc, gs_vc.best_score_, "-"],
    ["AdaBoost", "No", duration_ab, gs_ab.best_score_, "-"],
    ["GradientBoosting", "No", duration_gb, gs_gb.best_score_, "-"],
    ["Isolation Forest", "-", "-", "-", "-"],
    ["Stacking Classifier", "-", "-", "-", "-"],
]
pd.DataFrame(summary_rows, columns=summary_columns)

In [None]:
# ROC curves of every tuned model on the test split, overlaid on one set of axes.
fig, ax = plt.subplots(figsize=(5, 5))

# Same drawing order as the sections above, so colours and legend order are unchanged.
for search in (gs_dt, gs_rf, gs_et, gs_vc, gs_bag, gs_ab, gs_gb):
  plot_roc_curve(search.best_estimator_, X_test, y_test, ax=ax)

# Dashed red diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], "r--")

plt.show()