## Prediction with Different Classification Methods on the Dataset

In [1]:
# Install scikeras into the running kernel's environment; it provides the
# KerasClassifier wrapper imported from scikeras.wrappers in the imports cell.
# NOTE(review): the version is not pinned, so a rerun may pick up a different release.
%pip install scikeras




In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.exceptions import ConvergenceWarning
import warnings

from tensorflow.keras.utils import to_categorical

from sklearn.ensemble import IsolationForest

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from scikeras.wrappers import KerasClassifier

# Ensemble Methods
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.decomposition import PCA

from sklearn import metrics
from sklearn.metrics import classification_report

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

import os

In [3]:
# Silence warnings for the rest of the session. ConvergenceWarning is listed
# explicitly, although the blanket Warning filter registered right after it
# already covers every warning category.
for warning_category in (ConvergenceWarning, Warning):
    warnings.filterwarnings("ignore", category=warning_category)

# Mirror the same policy in the environment -- presumably so that Python
# processes started from this kernel also begin with warnings ignored.
os.environ["PYTHONWARNINGS"] = "ignore"

In [4]:
# Register a custom plain-text printer for `type` objects (i.e. classes): when a
# class is displayed as a cell result, its repr() is emitted truncated to the
# first 10000 characters.
# NOTE(review): the previous comment described this as setting the maximum number
# of output lines before scrolling; the code below only registers this formatter.
from IPython.core.interactiveshell import InteractiveShell

# for_type() returns the printer that was previously registered for `type`;
# because it is the last expression of the cell, that old printer is displayed.
InteractiveShell.instance().display_formatter.formatters['text/plain'].for_type(
    type, lambda obj, p, cycle: p.text(repr(obj)[:10000])
)

<function IPython.lib.pretty._type_pprint(obj, p, cycle)>

### Metrics Generation

In [5]:
#Metrics Calculations

def calculate_metrics(classifier, y_val, y_pred):
    """Print a heading for ``classifier`` followed by its classification report.

    Args:
        classifier: Label interpolated into the heading line.
        y_val: True target values.
        y_pred: Predicted target values, aligned with ``y_val``.
    """
    heading = f"{classifier} metrics: "
    print(heading)

    report = classification_report(y_val, y_pred)
    print(report)

In [6]:
def train_and_accuracy_gen(model, label, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print its metrics on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        label: Name printed in the metrics heading.
        X_train, y_train: Data the model is fitted on.
        X_test, y_test: Data the fitted model is evaluated on.
    """
    model.fit(X_train, y_train)

    test_predictions = model.predict(X_test)
    calculate_metrics(label, y_test, test_predictions)

In [7]:
class ModelEvaluationPipeline:
    """Load a feature CSV, then tune, fit and cross-validate several classifiers.

    Construction reads the CSV, drops its first column, removes rows flagged by
    an IsolationForest, splits features from the target (last column), maps the
    target values to integer codes and builds a standardized train/test split.
    Each ``run_*`` method then tunes one model family on the training split,
    reports metrics on the test split and prints cross-validation scores
    computed on the full (outlier-filtered) dataset.
    """

    # Hyper-parameter search spaces, one per model family. Every value is a plain
    # list, and the largest grid defined here has 108 combinations.
    param_grid_logistic_regression = {
        'C': [0.01, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear', 'saga'],
        'penalty': ['l2'],
        'max_iter': [100, 500, 1000]
    }

    param_grid_decission_tree_classifier = {
        'max_depth': [None, 5, 20, 50],
        'min_samples_split': [2, 5, 10, 20],
        'criterion': ['gini', 'entropy'],
    }

    param_grid_random_forest_classifier = {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 10, 20, 50],
        'bootstrap': [True, False],
        'criterion': ['gini', 'entropy']
    }

    param_grid_gaussian_naive_bias = {
        'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4]
    }

    param_grid_svc = {
        'C': [0.1, 1, 10, 100, 1000],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
        'kernel': ['rbf', 'poly']
    }

    param_grid_knn = {
        'n_neighbors': [100, 500, 700, 900, 1100, 1500],
        'weights': ['uniform', 'distance'],
        'metric': ['minkowski'],
        'p': [1, 2]
    }

    param_grid_ada_boost = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.5, 1.0],
        'estimator': [
            DecisionTreeClassifier(max_depth=1),
            DecisionTreeClassifier(max_depth=3)
        ],
    }

    param_grid_xgb = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'gamma': [0, 0.1, 0.3, 0.5],
    }

    param_grid_grad_boost = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'random_state': [42]
    }

    # 'model__*' keys are routed to build_ann; 'epochs' and 'batch_size' are
    # parameters of the KerasClassifier wrapper itself.
    param_grid_ann = {
        'model__n_neurons': [64],
        'model__activation': ['relu', 'tanh'],
        'epochs': [100, 150],
        'batch_size': [50, 100]
    }

    def __init__(self, file_path):
        """Read ``file_path`` and prepare every dataset attribute used by the models.

        Args:
            file_path: Path of the feature CSV passed to ``pandas.read_csv``.
        """
        self.feature_path = file_path
        self.feature_df = self.get_feture()
        self.X, self.y = self.split_feture_and_target()
        # Map the target values to 0..n_classes-1 so that models which require
        # zero-based integer labels (e.g. XGBoost) can consume them.
        self.y = self.map_zero_to_n()
        self.number_of_categories = self.get_number_of_categories()
        self.X_train, self.X_test, self.y_train, self.y_test = self.get_scale_and_test_train_split()

    # data read and processing section
    def remove_outliear(self, feature_df):
        """Return ``feature_df`` without the rows an IsolationForest marks as outliers.

        NOTE(review): the forest is fitted on every column it receives, which at
        the call site in ``get_feture`` still includes the target column, and it
        runs before the train/test split -- confirm that both are intended.
        """
        iso = IsolationForest(contamination=0.01, random_state=42)
        outliers = iso.fit_predict(feature_df)
        # fit_predict labels inliers 1 and outliers -1; keep the inliers only.
        data_cleaned = feature_df[outliers == 1]

        return data_cleaned

    def get_feture(self):
        """Read the feature CSV, drop its first column and remove outlier rows."""
        feature_df = pd.read_csv(self.feature_path)
        feature_df = feature_df.iloc[:, 1:]  # remove index

        return self.remove_outliear(feature_df)

    def split_feture_and_target(self):
        """Return ``(X, y)``: all columns but the last, and the last column."""
        X = self.feature_df.iloc[:, :-1]
        y = self.feature_df.iloc[:, -1]

        return X, y

    def get_scale_and_test_train_split(self, train_size=.20):
        """Split the data (stratified) and standardize it without test leakage.

        The scaler is fitted on the training split only and then applied to both
        splits, so no statistics of the test rows influence training. The fitted
        scaler is kept on ``self.scaler``.

        Args:
            train_size: Fraction of rows placed in the training split.
                NOTE(review): the default keeps only 20% of the rows for
                training -- confirm this was not meant to be the test fraction.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` with both X parts standardized.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, train_size=train_size, random_state=42, stratify=self.y
        )

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        self.scaler = scaler

        return X_train_scaled, X_test_scaled, y_train, y_test

    def map_zero_to_n(self):
        """Return ``self.y`` with each distinct value replaced by an integer code.

        Codes follow the order in which the values first appear in ``self.y``.
        """
        unique_values = {val: idx for idx, val in enumerate(self.y.unique())}
        y_mapped = self.y.map(unique_values)

        return y_mapped

    def get_number_of_categories(self):
        """Return the number of distinct values in ``self.y``."""
        return len(self.y.unique())

    def onehot_encode(self):
        """Replace ``self.y_train`` with its one-hot encoding and print its shape."""
        self.y_train = to_categorical(self.y_train, num_classes=self.number_of_categories)

        print(self.y_train.shape)

    # Cross validation section
    def _cross_validate(self, model, splitter):
        """Score ``model`` over ``splitter`` on the full dataset, scaling per fold.

        The hyper-parameters are tuned on standardized features, so the model is
        wrapped in a pipeline that refits a StandardScaler on each training fold;
        cross_val_score clones the pipeline, leaving ``model`` itself untouched.
        """
        from sklearn.pipeline import make_pipeline

        scaled_model = make_pipeline(StandardScaler(), model)
        return cross_val_score(scaled_model, self.X, self.y, cv=splitter)

    def kfold_cross_validation(self, model, n_splits):
        """Print the per-fold and mean scores of a shuffled K-fold CV."""
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        kfold_score = self._cross_validate(model, kf)
        mean_score = np.mean(kfold_score)
        print("K-fold cross-validation scores:", kfold_score)
        print("Mean K-fold cross-validation score:", mean_score)

    def stratified_cross_validation(self, model, n_splits):
        """Print the per-fold and mean scores of a shuffled stratified K-fold CV."""
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        skfold_score = self._cross_validate(model, skf)

        mean_score = np.mean(skfold_score)
        print("Straified cross validation scores:", skfold_score)
        print("Mean Straified cross-validation score:", mean_score)

    def cross_validation(self, model, n_splits):
        """Run the plain and the stratified K-fold cross-validation in turn."""
        self.kfold_cross_validation(model, n_splits)
        self.stratified_cross_validation(model, n_splits)

    # Hyper parameter tuning

    def gridSerach(self, estimator, param_grid):
        """Exhaustively search ``param_grid`` with 3-fold CV on the training split.

        Returns:
            The fitted ``GridSearchCV`` object.
        """
        print("==== Grid Search: =====")

        grid_search = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=3, verbose=0)
        grid_search.fit(self.X_train, self.y_train)

        print("Best parameters found: ", grid_search.best_params_)
        print("Best score found: ", grid_search.best_score_)

        return grid_search

    def randomSearch(self, estimator, param_grid):
        """Randomized search over ``param_grid`` with 3-fold CV on the training split.

        NOTE(review): n_iter=500 exceeds the size of every grid defined on this
        class, so this search presumably ends up visiting each combination, much
        like the grid search -- confirm whether the second pass is wanted.

        Returns:
            The fitted ``RandomizedSearchCV`` object.
        """
        print("\n==== Random Search: =====")

        random_search = RandomizedSearchCV(estimator=estimator, param_distributions=param_grid, n_iter=500, cv=3, random_state=42)
        random_search.fit(self.X_train, self.y_train)

        print("Best parameters found: ", random_search.best_params_)
        print("Best score found: ", random_search.best_score_)

        return random_search

    def hyper_parameter_tuning(self, model, param_grid):
        """Run both searches and return the one with the higher best score.

        The randomized search is returned on ties.
        """
        grid_search = self.gridSerach(model, param_grid)
        random_search = self.randomSearch(model, param_grid)

        return grid_search if grid_search.best_score_ > random_search.best_score_ else random_search

    # Models section
    def _tune_fit_and_validate(self, estimator, param_grid, label, n_splits=10):
        """Tune ``estimator``, report test-split metrics, then cross-validate it."""
        tuned_model = self.hyper_parameter_tuning(estimator, param_grid)
        best_estimator = tuned_model.best_estimator_

        train_and_accuracy_gen(best_estimator, label, self.X_train, self.X_test, self.y_train, self.y_test)
        self.cross_validation(best_estimator, n_splits)

    def run_logistic_regression_model(self):
        """Tune, evaluate and cross-validate a LogisticRegression model."""
        print("=============== 1. Logistic Regression Section: ==================")

        self._tune_fit_and_validate(
            LogisticRegression(), self.param_grid_logistic_regression, "1. Logistic regression")

    def run_decission_tree_classifier_model(self):
        """Tune, evaluate and cross-validate a DecisionTreeClassifier model."""
        print("=================2. Decission Tree Classifier Section: ================")

        self._tune_fit_and_validate(
            DecisionTreeClassifier(), self.param_grid_decission_tree_classifier, "2. Decission Tree Classifier")

    def run_random_forest_classifier_model(self):
        """Tune, evaluate and cross-validate a RandomForestClassifier model."""
        print("=================== 3. Random Forest Classifier Section: ==================")

        self._tune_fit_and_validate(
            RandomForestClassifier(), self.param_grid_random_forest_classifier, "3.  Random Forest Classifier")

    def run_gaussian_naive_bias_classifier_model(self):
        """Tune, evaluate and cross-validate a GaussianNB model."""
        print("=================== 4. Gaussian Naive Bias Classifier Section: ===================")

        self._tune_fit_and_validate(
            GaussianNB(), self.param_grid_gaussian_naive_bias, "4. Gaussian Naive Bias Classifier")

    def run_support_vector_classifier_model(self):
        """Tune, evaluate and cross-validate an SVC model."""
        print("=================== 5. Support Vector Classifier Section: ===================")

        self._tune_fit_and_validate(
            SVC(), self.param_grid_svc, "5. Support Vector Classifier")

    def run_knn_classifier_model(self):
        """Tune, evaluate and cross-validate a KNeighborsClassifier model."""
        print("=================== 6. K-Nearest Neighbors Classifier Section: ===================")

        self._tune_fit_and_validate(
            KNeighborsClassifier(), self.param_grid_knn, "6. K-Nearest Neighbors")

    def run_ada_boost_classifier_model(self):
        """Tune, evaluate and cross-validate an AdaBoostClassifier model."""
        print("=================== 7. Ada Boost Classifier Section: ===================")

        self._tune_fit_and_validate(
            AdaBoostClassifier(), self.param_grid_ada_boost, "7. Ada Boost Classifier")

    def run_xg_boost_classifier_model(self):
        """Tune, evaluate and cross-validate an XGBClassifier model."""
        print("=================== 8. XG Boost Classifier Section: ===================")

        self._tune_fit_and_validate(
            XGBClassifier(), self.param_grid_xgb, "8. XG Boost Classifier")

    def run_gradient_boost_classifier_model(self):
        """Tune, evaluate and cross-validate a GradientBoostingClassifier model."""
        print("=================== 9. Gradient Boost Classifier Section: ===================")

        self._tune_fit_and_validate(
            GradientBoostingClassifier(), self.param_grid_grad_boost, "9. Gradient Boost Classifier")

    @staticmethod
    def build_ann(n_neurons=128, activation='relu', n_features=24, n_classes=15):
        """Build and compile a fully connected multi-class network.

        Args:
            n_neurons: Width of each of the four hidden layers.
            activation: Activation function of the hidden layers.
            n_features: Number of input features (size of the input layer).
            n_classes: Number of target classes (size of the softmax output).

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        model = Sequential()
        # Input layer
        model.add(Dense(n_neurons, activation=activation, input_shape=(n_features,)))

        model.add(Dense(n_neurons, activation=activation))
        model.add(Dense(n_neurons, activation=activation))
        model.add(Dense(n_neurons, activation=activation))

        # Output layer: one softmax unit per class (multi-class classification)
        model.add(Dense(units=n_classes, activation='softmax'))

        # categorical_crossentropy expects one-hot encoded targets.
        model.compile(optimizer='adam',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])

        return model

    def run_ann_model(self):
        """Tune, evaluate and cross-validate the neural network.

        ``self.y_train`` is one-hot encoded while the network is tuned and fitted
        and is always restored to its integer labels afterwards, even on error.
        """
        print("=================== 10. Artificial Neural Net Section: ===================")

        integer_y_train = self.y_train

        try:
            self.onehot_encode()

            # Size the network from the data instead of relying on fixed constants.
            model = KerasClassifier(
                model=self.build_ann,
                model__n_features=self.X_train.shape[1],
                model__n_classes=self.number_of_categories,
                verbose=0,
                epochs=50,
                batch_size=100,
            )

            tuned_model = self.hyper_parameter_tuning(model, self.param_grid_ann)
            ann = tuned_model.best_estimator_

            # Evaluate the tuned estimator, as every other run_* method does.
            ann.fit(self.X_train, self.y_train)
            y_pred = ann.predict(self.X_test)
            # assumes predict() returns one row of class indicators per sample
            # because the model was fitted on one-hot targets -- TODO confirm
            y_pred_classes = np.argmax(y_pred, axis=1)

            calculate_metrics("10. Artificial Neuralnet", self.y_test, y_pred_classes)
        finally:
            self.y_train = integer_y_train

        # NOTE(review): cross-validation uses the integer labels in self.y while
        # the network is compiled for one-hot targets -- confirm the wrapper copes.
        self.cross_validation(ann, 2)


In [8]:
# Directory prefix (relative path) that is prepended to the feature CSV file name
# when the evaluation pipeline is constructed.
ROOT = '../data/Processed_Features/'

In [15]:
# Build the pipeline for the "W500_O50" feature file. Construction reads the CSV,
# removes outlier rows, encodes the target and creates the scaled train/test split.
model_evaluation_pipeline = ModelEvaluationPipeline(ROOT + "W500_O50_features.csv")
#model_evaluation_pipeline.run_ann_model()

In [16]:
# Tune the decision tree (grid and randomized search), print its test-split
# classification report and then its cross-validation scores.
model_evaluation_pipeline.run_decission_tree_classifier_model()

==== Grid Search: =====
Best parameters found:  {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 2}
Best score found:  0.4526228099448337

==== Random Search: =====
Best parameters found:  {'min_samples_split': 2, 'max_depth': None, 'criterion': 'entropy'}
Best score found:  0.4521723611462547
2. Decission Tree Classifier metrics: 
              precision    recall  f1-score   support

           0       0.23      0.30      0.26        44
           1       0.95      0.80      0.87        45
           2       0.42      0.59      0.49        44
           3       0.72      0.70      0.71        44
           4       0.85      0.80      0.82        44
           5       0.85      0.77      0.81        44
           6       0.43      0.36      0.39        45
           7       0.79      0.95      0.87        44
           8       0.51      0.57      0.54        44
           9       0.50      0.41      0.45        44
          10       0.87      0.75      0.80        44
 