# Imports

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from IPython.display import clear_output
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_iris, load_digits

# Constants

In [None]:
# Single random seed passed as `random_state` to the train/dev splits and to the
# seeded classifiers, so that results are repeatable between runs.
SEED = 42

# Loading the dataset

In [None]:
! rm -rf *
! wget https://archive.ics.uci.edu/static/public/53/iris.zip
! unzip iris.zip
! rm iris.zip Index iris.names
clear_output()

In [None]:
# The same data can also be obtained directly from scikit-learn:
# X, y = load_iris(as_frame=True)['data'], load_iris(as_frame=True)['target']

# The data file ships without a header row, so the column names are supplied here.
df = pd.read_csv(
    'bezdekIris.data',
    header=None,
    names=[
        'sepal length',
        'sepal width',
        'petal length',
        'petal width',
        'class',
    ],
)

# Preparing the dataset

In [None]:
def data_preparation_before_split(df: pd.DataFrame, is_test: bool):
    """
    Preparation steps that are applied before the train/dev split.

    Args:
        df: raw frame; when ``is_test`` is False it must contain a 'class' column.
        is_test: True for unlabeled/test data, False for labeled data.

    Returns:
        A copy of ``df`` when ``is_test`` is True. Otherwise the tuple
        ``(X, y, parameters)``: the features without the 'class' column, the
        label-encoded 'class' values, and a dict holding the fitted encoder
        under the key 'label_encoder'.
    """
    frame = df.copy()

    # Nothing is fitted or split off for test data: hand back the untouched copy.
    if is_test:
        return frame

    labels = frame['class']
    encoder = LabelEncoder().fit(labels)

    features = frame.drop(columns='class')
    encoded_labels = encoder.transform(labels)
    return features, encoded_labels, {'label_encoder': encoder}

def data_preparation_after_split(X: pd.DataFrame, y: pd.DataFrame, is_train: bool, parameters: dict = None):
    """
    Standardize the features of one split, fitting the scaler on training data only.

    parameters dict may have these keys:
    scaler, ordinal_encoder, label_encoder, mapping_dicts,
    categorical_imputer, numerical_imputer, etc...
    Only 'scaler' is read or written by this function.

    Args:
        X: feature frame of the split.
        y: targets of the split; only used (copied) when ``is_train`` is True.
        is_train: True to fit a new scaler on ``X``; False to reuse
            ``parameters['scaler']``.
        parameters: fitted preprocessing objects. When given and ``is_train`` is
            True, the dict is updated in place with the new scaler. When omitted,
            a fresh dict is created for this call.

    Returns:
        ``(X_scaled, y_copy, parameters)`` when ``is_train`` is True,
        otherwise only ``X_scaled``. ``X_scaled`` keeps the row index of ``X``.

    Raises:
        KeyError: if ``is_train`` is False and no 'scaler' is in ``parameters``.
    """
    # A `dict()` default is created once and shared by every call, so a scaler
    # fitted in one call would silently leak into the next; build it per call.
    if parameters is None:
        parameters = {}

    X_temp = X.copy()
    y_temp = y.copy() if is_train else None

    # scale
    if is_train:
        parameters['scaler'] = StandardScaler().fit(X_temp)
    elif 'scaler' not in parameters:
        raise KeyError("parameters['scaler'] is missing: fit on the training split first (is_train=True)")

    scaler = parameters['scaler']
    # Keep the original row index so the scaled features stay aligned with `y`.
    X_temp = pd.DataFrame(scaler.transform(X_temp), columns=scaler.feature_names_in_, index=X_temp.index)

    return (X_temp, y_temp, parameters) if is_train else X_temp



# Separate features from the label-encoded target; `parameters` collects every
# fitted preprocessing object (starting with the label encoder).
X, y, parameters = data_preparation_before_split(df, is_test=False)

X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.3, random_state=SEED)

# The scaler is fitted on the training split and stored in `parameters`, which is
# passed explicitly (instead of relying on the function's default dict and the
# IPython `_` name) and then reused unchanged to transform the dev split.
X_train, y_train, parameters = data_preparation_after_split(X=X_train, y=y_train, is_train=True, parameters=parameters)
X_dev = data_preparation_after_split(X=X_dev, y=None, is_train=False, parameters=parameters)

# Comparing models with their default hyperparameters

In [None]:
# Candidate models, all left at their default hyperparameters (seeded where possible).
classifiers = {
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=SEED),
    'Random Forest': RandomForestClassifier(random_state=SEED, n_jobs=-1),
    'Neural network': MLPClassifier(random_state=SEED),
}

# Fit each model on the training split and report accuracy plus the per-class
# report on both the training and the dev split.
for model_name, clf in classifiers.items():
    clf.fit(X_train, y_train)

    train_y_pred, validation_y_pred = clf.predict(X_train), clf.predict(X_dev)

    train_accuracy = accuracy_score(y_train, train_y_pred)
    validation_accuracy = accuracy_score(y_dev, validation_y_pred)

    train_classification_report = classification_report(y_train, train_y_pred)
    validation_classification_report = classification_report(y_dev, validation_y_pred)

    print(
        f"\n\n{model_name} model:\n"
        f"Train Accuracy: {train_accuracy:.4f}, Validation Accuracy: {validation_accuracy:.4f}\n"
        f"Train Classification Report:\n{train_classification_report}\n"
        f"Validation Classification Report:\n{validation_classification_report}"
    )




Naive Bayes model:
Train Accuracy: 0.9429, Validation Accuracy: 0.9778
Train Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        31
           1       0.92      0.92      0.92        37
           2       0.92      0.92      0.92        37

    accuracy                           0.94       105
   macro avg       0.95      0.95      0.95       105
weighted avg       0.94      0.94      0.94       105

Validation Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      0.92      0.96        13
           2       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.97      0.97        45
weighted avg       0.98      0.98      0.98        45



Decision Tree model:
Train Accuracy: 1.0000, Validation Accuracy: 1.0000
Train Classification Report



# Hyperparameter tuning

In [None]:
# Hyperparameter search space per model. The keys must match the keys of
# `classifiers`, because the loop below looks the estimator up by model name.
models_param_grid = {
    'Naive Bayes': {
        # 100 log-spaced values from 1 down to 1e-9
        'var_smoothing': np.logspace(0, -9, num=100)
    },
    'Decision Tree': {
        'max_features': ['sqrt', 'log2'],
        'max_depth' : [5, 7, 9],
        'criterion' : ['gini', 'entropy']
    },
    'Random Forest': {
        'n_estimators': np.arange(20, 100, 10),
        # fraction of the features considered at each split
        'max_features': np.arange(0.5, 1.0, 0.1),
        'max_depth': np.arange(1, 20, 5)
    },
    'Neural network': {
        'solver': ['lbfgs', 'adam', 'sgd'],
        'max_iter': [1000, 1500, 2000],
        # 1e-2 ... 1e-5
        'alpha': 10.0 ** -np.arange(2, 6),
        # a single hidden layer of 8, 16, 32, 64 or 128 units
        'hidden_layer_sizes': 2 ** np.arange(3, 8)
    }
}

# Exhaustive 5-fold cross-validated search on the training split for every model;
# only the best parameter set and its mean CV accuracy are reported.
for model_name, param_grid in models_param_grid.items():
    grid_search_result = GridSearchCV(estimator=classifiers[model_name], param_grid=param_grid, n_jobs=-1, cv=5, scoring="accuracy")
    grid_search_result.fit(X_train, y_train)
    print(f"\n\n{model_name} model:\nBest Hyperparameters: {grid_search_result.best_params_}\nMean cross-validated accuracy score: {grid_search_result.best_score_:.4f}")



Naive Bayes model:
Best Hyperparameters: {'var_smoothing': 0.03511191734215131}
Mean cross-validated acuuracy score: 0.9333


Decision Tree model:
Best Hyperparameters: {'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt'}
Mean cross-validated acuuracy score: 0.9143


Random Forest model:
Best Hyperparameters: {'max_depth': 1, 'max_features': 0.5, 'n_estimators': 20}
Mean cross-validated acuuracy score: 0.9429


Neural network model:
Best Hyperparameters: {'alpha': 0.01, 'hidden_layer_sizes': 8, 'max_iter': 1000, 'solver': 'lbfgs'}
Mean cross-validated acuuracy score: 0.9524


# Testing on another dataset: Optical recognition of handwritten digits

In [None]:
# Load the digits dataset once and reuse it for both the features and the target.
digits = load_digits(as_frame=True)
X_train, X_dev, y_train, y_dev = train_test_split(digits['data'], digits['target'], test_size=0.2, random_state=SEED)

# Fit a scaler for this dataset in its own, explicitly created dict instead of
# relying on the function's default argument and the IPython `_` name.
X_train, y_train, digits_parameters = data_preparation_after_split(X=X_train, y=y_train, is_train=True, parameters=dict())
# As before, `parameters['scaler']` ends up holding the scaler fitted on the digits.
parameters.update(digits_parameters)
X_dev = data_preparation_after_split(X=X_dev, y=None, is_train=False, parameters=digits_parameters)

In [None]:
# Refit every default model on the current training split and report accuracy
# plus the per-class report for both the training and the dev split.
for model_name, clf in classifiers.items():
    clf.fit(X_train, y_train)

    train_y_pred, validation_y_pred = clf.predict(X_train), clf.predict(X_dev)

    train_accuracy = accuracy_score(y_train, train_y_pred)
    validation_accuracy = accuracy_score(y_dev, validation_y_pred)

    train_classification_report = classification_report(y_train, train_y_pred)
    validation_classification_report = classification_report(y_dev, validation_y_pred)

    print(
        f"\n\n{model_name} model:\n"
        f"Train Accuracy: {train_accuracy:.4f}, Validation Accuracy: {validation_accuracy:.4f}\n"
        f"Train Classification Report:\n{train_classification_report}\n"
        f"Validation Classification Report:\n{validation_classification_report}"
    )




Naive Bayes model:
Train Accuracy: 0.7996, Validation Accuracy: 0.7667
Train Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       145
           1       0.78      0.73      0.75       154
           2       0.99      0.49      0.65       144
           3       0.99      0.72      0.84       149
           4       0.99      0.59      0.74       135
           5       0.84      0.94      0.89       135
           6       0.89      0.99      0.94       146
           7       0.61      0.99      0.76       145
           8       0.54      0.85      0.66       144
           9       0.91      0.70      0.79       140

    accuracy                           0.80      1437
   macro avg       0.85      0.80      0.80      1437
weighted avg       0.85      0.80      0.80      1437

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.95       

In [None]:
# Same 5-fold cross-validated grid search as before, now on the current training
# split; only the best parameter set and its mean CV accuracy are reported.
for model_name, param_grid in models_param_grid.items():
    grid_search_result = GridSearchCV(estimator=classifiers[model_name], param_grid=param_grid, n_jobs=-1, cv=5, scoring="accuracy")
    grid_search_result.fit(X_train, y_train)
    print(f"\n\n{model_name} model:\nBest Hyperparameters: {grid_search_result.best_params_}\nMean cross-validated accuracy score: {grid_search_result.best_score_:.4f}")



Naive Bayes model:
Best Hyperparameters: {'var_smoothing': 0.0657933224657568}
Mean cross-validated acuuracy score: 0.8956


Decision Tree model:
Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 9, 'max_features': 'sqrt'}
Mean cross-validated acuuracy score: 0.7975


Random Forest model:
Best Hyperparameters: {'max_depth': 16, 'max_features': 0.5, 'n_estimators': 60}
Mean cross-validated acuuracy score: 0.9617


Neural network model:
Best Hyperparameters: {'alpha': 0.01, 'hidden_layer_sizes': 64, 'max_iter': 1000, 'solver': 'adam'}
Mean cross-validated acuuracy score: 0.9770
