### **Import Required Libraries**

In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.preprocessing import MinMaxScaler

In [4]:
# for hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

In [5]:
from sklearn.metrics import (accuracy_score,
                             precision_score,
                             recall_score,
                             f1_score,
                             confusion_matrix,
                             roc_auc_score)

### **Functions for Input Preparation**

In [6]:
def sequential_split(data, test_prop=0.2):
    """
    Split ordered data into a leading train part and a trailing test part.

    No shuffling is performed: the first ``int(len(data) * (1 - test_prop))``
    rows become the train set and all remaining rows become the test set, so
    the original (chronological) row order is preserved.

    Parameters:
    data (pd.DataFrame, pd.Series or np.ndarray): Either X or y.
    test_prop (float): Fraction of rows, between 0 and 1, assigned to the test set.

    Returns:
    tuple: (train_data, test_data), each of the same type as `data`.

    Raises:
    ValueError: If `test_prop` is outside [0, 1] or `data` is not a supported type.
    """
    # A proportion above 1 would yield a negative slice bound and a nonsensical split
    if not 0 <= test_prop <= 1:
        raise ValueError("test_prop must be between 0 and 1.")

    # Number of leading rows that go to the train split
    train_length = int(len(data) * (1 - test_prop))

    # pandas objects: slice by position so a non-default index cannot interfere
    if isinstance(data, (pd.DataFrame, pd.Series)):
        train_data = data.iloc[:train_length]
        test_data = data.iloc[train_length:]
    # NumPy arrays of any dimensionality: slice along the first axis
    elif isinstance(data, np.ndarray):
        train_data = data[:train_length]
        test_data = data[train_length:]
    else:
        raise ValueError("The input data must be a pandas DataFrame, a pandas Series, or a NumPy array.")

    return train_data, test_data

In [7]:
def min_max_transform(data, feature_range=(0, 1)):
    """
    Rescale `data` with a MinMaxScaler fitted on that same data.

    A new scaler is created on every call, so two calls never share
    fitted statistics.

    Parameters:
    data: Array-like passed to MinMaxScaler.fit_transform.
    feature_range (tuple): Target (min, max) handed to MinMaxScaler.

    Returns:
    The result of fit_transform on `data`.
    """
    return MinMaxScaler(feature_range).fit_transform(data)

In [8]:
def Dataset_Creation(X, y):
    """
    Pair each period's features with the following period's target.

    Row i of the returned inputs is row i of X and element i of the returned
    outputs is element i + 1 of y. The last row of X and the first element of
    y therefore have no partner and are dropped.

    Parameters:
    X (pd.DataFrame or np.ndarray): The input features, one row per period.
    y (pd.Series or np.ndarray): The target variable, aligned row-for-row with X.

    Returns:
    np.ndarray: Inputs with one row fewer than X, e.g. (n_samples - 1, n_features).
    np.ndarray: Next-period targets with one element fewer than y.

    Raises:
    ValueError: If X and y do not contain the same number of rows.
    """
    # np.array copies, so the returned slices never alias the caller's objects
    X = np.array(X)
    y = np.array(y)

    # Misaligned inputs would silently pair features with the wrong targets
    if len(X) != len(y):
        raise ValueError("X and y must have the same number of rows.")

    # Shift by one: features of rows 0..n-2 predict targets of rows 1..n-1
    return X[:-1], y[1:]


### **Loading Data**

In [9]:
# Mount Google Drive at /content/drive (Colab only) so files under it can be read
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
# Input CSV on the mounted Drive
data_path = '/content/drive/MyDrive/Sebastian_Honor Thesis/business_cycle.csv'
# Results folder on the mounted Drive (not referenced in the cells visible here)
output_path = '/content/drive/MyDrive/Sebastian_Honor Thesis/Results/'

In [11]:
# Load the raw CSV, then give the column pandas labelled 'Unnamed: 0' its real name
data_original = pd.read_csv(data_path)
data_original = data_original.rename(columns={'Unnamed: 0': 'Date'})

In [12]:
# Preview the first rows of the loaded data
data_original.head()

Unnamed: 0,Date,SP_500,unemp_rate,fed_funds_rate,man_hrs,con_sent_index,home_price,ind_prod_index,housing_permits,treasury_3,treasury_10,gdp,cpi,unemp_dur,business_cycle
0,1960-01-01,58.029,5.2,3.99,40.6,100.0,1460.0,24.1712,1092.0,0.36,0.73,542.648,-0.340136,13.5,1.0
1,1960-02-01,55.775,4.8,3.97,40.3,100.0,1503.0,23.9561,1088.0,-0.01,0.52,542.648,0.341297,13.1,1.0
2,1960-03-01,55.015217,5.4,3.84,40.0,93.3,1109.0,23.741,955.0,-0.53,0.41,542.648,0.0,13.0,1.0
3,1960-04-01,55.7,5.2,3.92,40.0,93.3,1289.0,23.5528,1016.0,-0.69,0.36,541.08,0.340136,12.6,1.0
4,1960-05-01,55.215238,5.1,3.85,40.1,93.3,1271.0,23.5259,1052.0,-0.56,0.5,541.08,0.0,11.9,0.0


In [13]:
# Modelling frame without the 'Date' column; drop() returns a new frame,
# so data_original itself is left unchanged
data = data_original.drop(columns=['Date'])
data.head()

Unnamed: 0,SP_500,unemp_rate,fed_funds_rate,man_hrs,con_sent_index,home_price,ind_prod_index,housing_permits,treasury_3,treasury_10,gdp,cpi,unemp_dur,business_cycle
0,58.029,5.2,3.99,40.6,100.0,1460.0,24.1712,1092.0,0.36,0.73,542.648,-0.340136,13.5,1.0
1,55.775,4.8,3.97,40.3,100.0,1503.0,23.9561,1088.0,-0.01,0.52,542.648,0.341297,13.1,1.0
2,55.015217,5.4,3.84,40.0,93.3,1109.0,23.741,955.0,-0.53,0.41,542.648,0.0,13.0,1.0
3,55.7,5.2,3.92,40.0,93.3,1289.0,23.5528,1016.0,-0.69,0.36,541.08,0.340136,12.6,1.0
4,55.215238,5.1,3.85,40.1,93.3,1271.0,23.5259,1052.0,-0.56,0.5,541.08,0.0,11.9,0.0


### **Input Preparation**

In [14]:
# Target: the 'business_cycle' label as a NumPy array
y = np.array(data['business_cycle'])
# Features: every remaining column
X = data.drop(columns=['business_cycle'])

In [15]:
# Sequential (unshuffled) split: the last 20% of rows become the test set
X_train, X_test = sequential_split(X, test_prop=0.2)
y_train, y_test = sequential_split(y, test_prop=0.2)

In [16]:
# Data set creation: pair each row's features with the NEXT row's label,
# since we are predicting next month's cycle/value.
# NOTE: this cell overwrites X_train/y_train/X_test/y_test with its own output,
# so running it more than once shifts the targets again and drops one more row
# from each split every time. Re-run the split cell first if you need to repeat it.

X_train, y_train =  Dataset_Creation(X_train, y_train)
X_test, y_test =  Dataset_Creation(X_test, y_test)


In [17]:
# Sanity check: features and targets must have matching row counts in each split
X_train.shape,y_train.shape,  X_test.shape, y_test.shape

((613, 13), (613,), (153, 13), (153,))

In [18]:
# Inspect the training labels (full array is printed)
y_train

array([1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

### **Min/Max normalization**

Time-series data has different characteristics from regular (cross-sectional) data, and we split it sequentially, so we normalize the train and test sets independently.

In [19]:
# Each call fits its own MinMaxScaler on the array it receives, so the test set is
# rescaled using the test set's min/max rather than the training set's.
# NOTE(review): a model trained on train-scaled features then sees test features on a
# different scale mapping; consider fitting the scaler on X_train only — confirm intent.
X_train_scaled=min_max_transform(X_train)
X_test_scaled=min_max_transform(X_test)

## **Building ML Models**

### **Model 1: Logistic Regression**

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
def tune_logistic_model(X_train_scaled, y_train):
    """
    Perform hyperparameter tuning for the Logistic Regression model
    using RandomizedSearchCV.

    Parameters:
    X_train_scaled: Training data (features), already scaled
    y_train: Training data (target)

    Returns:
    best_model: Logistic Regression model with the best hyperparameters
                (the search's best_estimator_)
    best_params: Best hyperparameters found by RandomizedSearchCV
    """

    # Candidate values sampled by the randomized search
    param_distributions = {
        'penalty': ['l1', 'l2', 'elasticnet'],  # Regularization technique
        'C': [1, 2, 4],                        # Inverse of regularization strength
        'class_weight': ['balanced', None],          # Handling imbalanced classes
        'solver': ['saga'],                    # Solver for 'l1' and 'elasticnet'
        'l1_ratio': [0.5, 0.7, 0.9]            # Only used with 'elasticnet' penalty
    }

    # The 'saga' solver shuffles the data, so the estimator is seeded to make
    # repeated runs produce the same fitted model; max_iter is raised to help convergence.
    model = LogisticRegression(max_iter=1000, random_state=42)

    # NOTE(review): an integer cv uses k-fold splits that ignore temporal order;
    # consider sklearn.model_selection.TimeSeriesSplit for sequential data — confirm intent.
    random_search = RandomizedSearchCV(estimator=model,
                                       param_distributions=param_distributions,
                                       n_iter=10,  # Number of parameter settings sampled
                                       scoring='accuracy',  # Metric for evaluation
                                       cv=5,  # Number of cross-validation folds
                                       verbose=2,  # Show process logs
                                       random_state=42,
                                       n_jobs=-1)  # Use all available cores

    # Fit random search model
    random_search.fit(X_train_scaled, y_train)

    # Best model and hyperparameters
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_

    return best_model, best_params

In [22]:
def logistic_regression_model(X_train_scaled, y_train, X_test_scaled, y_test):
    """
    Tune a Logistic Regression classifier on the training data and
    evaluate the selected model on the test data.

    Parameters:
    X_train_scaled: Training features dataset
    y_train: Training target variable
    X_test_scaled: Testing features dataset
    y_test: Testing target variable

    Returns:
    accuracy: accuracy score of the tuned model on the test set
    precision: precision score of the tuned model on the test set
    recall: recall (sensitivity) score of the tuned model on the test set
    f1: f1-score of the tuned model on the test set
    y_pred: test-set predictions of the tuned model
    best_params: hyperparameters selected by tune_logistic_model
    """
    # Hyperparameter search, then prediction with the selected estimator
    tuned_model, tuned_params = tune_logistic_model(X_train_scaled, y_train)
    predictions = tuned_model.predict(X_test_scaled)

    # zero_division=0 makes an undefined ratio count as 0
    scores = (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions, zero_division=0),
        recall_score(y_test, predictions, zero_division=0),  # Sensitivity
        f1_score(y_test, predictions, zero_division=0),
    )

    return (*scores, predictions, tuned_params)

In [23]:
# Tune, fit and evaluate Logistic Regression
(lr_accuracy,
 lr_precision,
 lr_recall,
 lr_f1,
 lr_best,
 lr_best_model_params) = logistic_regression_model(X_train_scaled, y_train,
                                                   X_test_scaled, y_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [24]:
# Test-set accuracy, precision, recall, F1, predictions and selected hyperparameters
lr_accuracy, lr_precision, lr_recall, lr_f1, lr_best, lr_best_model_params

(0.8888888888888888,
 0.9855072463768116,
 0.9006622516556292,
 0.9411764705882353,
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 {'solver': 'saga',
  'penalty': 'elasticnet',
  'l1_ratio': 0.9,
  'class_weight': None,
  'C': 1})

### **Model 2: Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
def tune_randforest_model(X_train_scaled, y_train):
    """
    Perform hyperparameter tuning for the Random Forest model
    using RandomizedSearchCV.

    Parameters:
    X_train_scaled: Training data (features)
    y_train: Training data (target)

    Returns:
    best_model: Random Forest model with the best hyperparameters
                (the search's best_estimator_)
    best_params: Best hyperparameters found by RandomizedSearchCV
    """

    # Define the parameter grid for Random Forest
    param_distributions = {
        'n_estimators': [100, 150, 200, 250, 300],          # Number of trees in the forest
        'max_features': ['sqrt', 'log2', None],   # Number of features to consider at each split
        'max_depth': [10, 20, None],              # Maximum depth of the tree
        'min_samples_split': [2, 5, 10],          # Minimum number of samples required to split an internal node
        'min_samples_leaf': [1, 2, 4],            # Minimum number of samples required to be at a leaf node
        'bootstrap': [True, False],               # Whether bootstrap samples are used when building trees
        'class_weight': ['balanced', 'balanced_subsample', None]  # Handling imbalanced classes
    }

    # Instantiate the Random Forest model. n_estimators, max_depth and
    # min_samples_leaf are always set by the search space above, so no
    # constructor values are given for them. The forest is seeded so repeated
    # runs build the same trees.
    model = RandomForestClassifier(random_state=42)

    # Perform RandomizedSearchCV to find the best hyperparameters
    # NOTE(review): an integer cv uses k-fold splits that ignore temporal order;
    # consider sklearn.model_selection.TimeSeriesSplit for sequential data — confirm intent.
    random_search = RandomizedSearchCV(estimator=model,
                                       param_distributions=param_distributions,
                                       n_iter=10,  # Number of parameter settings sampled
                                       scoring='accuracy',  # Metric for evaluation
                                       cv=5,  # Number of cross-validation folds
                                       verbose=2,  # Show process logs
                                       random_state=42,
                                       n_jobs=-1)  # Use all available cores

    # Fit random search model
    random_search.fit(X_train_scaled, y_train)

    # Best model and hyperparameters
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_

    return best_model, best_params

In [None]:
def random_forest_model(X_train_scaled, y_train, X_test_scaled, y_test):
    """
    Tune a Random Forest classifier on the training data and
    evaluate the selected model on the test data.

    Parameters:
    X_train_scaled: Training features dataset
    y_train: Training target variable
    X_test_scaled: Testing features dataset
    y_test: Testing target variable

    Returns:
    accuracy: accuracy score of the tuned model on the test set
    precision: precision score of the tuned model on the test set
    recall: recall (sensitivity) score of the tuned model on the test set
    f1: f1-score of the tuned model on the test set
    y_pred: test-set predictions of the tuned model
    best_params: hyperparameters selected by tune_randforest_model
    """
    # Hyperparameter search, then prediction with the selected estimator
    tuned_model, tuned_params = tune_randforest_model(X_train_scaled, y_train)
    predictions = tuned_model.predict(X_test_scaled)

    # zero_division=0 makes an undefined ratio count as 0
    scores = (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions, zero_division=0),
        recall_score(y_test, predictions, zero_division=0),  # Sensitivity
        f1_score(y_test, predictions, zero_division=0),
    )

    return (*scores, predictions, tuned_params)

In [None]:
# Tune, fit and evaluate Random Forest
(rf_accuracy,
 rf_precision,
 rf_recall,
 rf_f1,
 rf_best,
 rf_best_model_params) = random_forest_model(X_train_scaled, y_train,
                                             X_test_scaled, y_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
# Test-set accuracy, precision, recall, F1, predictions and selected hyperparameters
rf_accuracy, rf_precision, rf_recall, rf_f1, rf_best, rf_best_model_params

(0.9673202614379085,
 0.9866666666666667,
 0.9801324503311258,
 0.9833887043189369,
 array([1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 {'n_estimators': 200,
  'min_samples_split': 5,
  'min_samples_leaf': 2,
  'max_features': 'log2',
  'max_depth': 20,
  'class_weight': 'balanced',
  'bootstrap': True})

### **Model 3: Gaussian Naive Bayes**

(Maybe try Bernoulli Naive Bayes as well for classification problems)

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
def tune_naivebayes_model(X_train_scaled, y_train):
    """
    Search for the best Gaussian Naive Bayes smoothing value
    using RandomizedSearchCV.

    Parameters:
    X_train_scaled: Training data (features)
    y_train: Training data (target)

    Returns:
    best_model: Gaussian Naive Bayes model with the best hyperparameters
                (the search's best_estimator_)
    best_params: Best hyperparameters found by RandomizedSearchCV
    """
    # Candidate values for the variance-smoothing term
    search_space = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]}

    # n_iter matches the number of candidate values listed above
    search = RandomizedSearchCV(
        estimator=GaussianNB(),
        param_distributions=search_space,
        n_iter=4,            # Number of parameter settings sampled
        scoring='accuracy',  # Metric for evaluation
        cv=5,                # Number of cross-validation folds
        verbose=2,           # Show process logs
        random_state=42,
        n_jobs=-1,           # Use all available cores
    )
    search.fit(X_train_scaled, y_train)

    return search.best_estimator_, search.best_params_

In [None]:
def gaussian_naive_bayes_model(X_train_scaled, y_train, X_test_scaled, y_test):
    """
    Tune a Gaussian Naive Bayes classifier on the training data and
    evaluate the selected model on the test data.

    Parameters:
    X_train_scaled: Training features dataset
    y_train: Training target variable
    X_test_scaled: Testing features dataset
    y_test: Testing target variable

    Returns:
    accuracy: accuracy score of the tuned model on the test set
    precision: precision score of the tuned model on the test set
    recall: recall (sensitivity) score of the tuned model on the test set
    f1: f1-score of the tuned model on the test set
    y_pred: test-set predictions of the tuned model
    best_params: hyperparameters selected by tune_naivebayes_model
    """
    # Hyperparameter search, then prediction with the selected estimator
    tuned_model, tuned_params = tune_naivebayes_model(X_train_scaled, y_train)
    predictions = tuned_model.predict(X_test_scaled)

    # zero_division=0 makes an undefined ratio count as 0
    scores = (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions, zero_division=0),
        recall_score(y_test, predictions, zero_division=0),  # Sensitivity
        f1_score(y_test, predictions, zero_division=0),
    )

    return (*scores, predictions, tuned_params)

In [None]:
# Tune, fit and evaluate Gaussian Naive Bayes
(nb_accuracy,
 nb_precision,
 nb_recall,
 nb_f1,
 nb_best,
 nb_best_model_params) = gaussian_naive_bayes_model(X_train_scaled, y_train,
                                                    X_test_scaled, y_test)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




In [None]:
# Test-set accuracy, precision, recall, F1, predictions and selected hyperparameters
nb_accuracy, nb_precision, nb_recall, nb_f1, nb_best, nb_best_model_params

(0.35947712418300654,
 0.9818181818181818,
 0.3576158940397351,
 0.5242718446601942,
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0.,
        1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1.,
        1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 {'var_smoothing': 1e-09})

### **Model 4: K Nearest Neighbors**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
def tune_knn_model(X_train_scaled, y_train):
    """
    Perform hyperparameter tuning for the K Nearest Neighbors model
    using RandomizedSearchCV.

    Parameters:
    X_train_scaled: Training data (features)
    y_train: Training data (target)

    Returns:
    best_model: K Nearest Neighbors model with the best hyperparameters
                (the search's best_estimator_)
    best_params: Best hyperparameters found by RandomizedSearchCV
    """
    # Candidate values sampled by the randomized search
    search_space = {
        'n_neighbors': [3, 5, 7, 9, 11,30],                       # Number of neighbors to consider
        'weights': ['uniform', 'distance'],                       # Weight function used in prediction
        'metric': ['euclidean', 'manhattan', 'minkowski'],        # Distance metric to use
        'p': [3],                                                 # Power parameter for the Minkowski metric; only p=3 is offered
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],   # Algorithm used to compute the nearest neighbors
        'leaf_size': [10, 30, 50],                                # Leaf size for BallTree or KDTree algorithms
    }

    search = RandomizedSearchCV(
        estimator=KNeighborsClassifier(),
        param_distributions=search_space,
        n_iter=10,           # Number of parameter settings sampled
        scoring='accuracy',  # Metric for evaluation
        cv=5,                # Number of cross-validation folds
        verbose=2,           # Show process logs
        random_state=42,
        n_jobs=-1,           # Use all available cores
    )
    search.fit(X_train_scaled, y_train)

    return search.best_estimator_, search.best_params_

In [None]:
def knn_model(X_train_scaled, y_train, X_test_scaled, y_test):
    """
    Tune a K Nearest Neighbors classifier on the training data and
    evaluate the selected model on the test data.

    Parameters:
    X_train_scaled: Training features dataset
    y_train: Training target variable
    X_test_scaled: Testing features dataset
    y_test: Testing target variable

    Returns:
    accuracy: accuracy score of the tuned model on the test set
    precision: precision score of the tuned model on the test set
    recall: recall (sensitivity) score of the tuned model on the test set
    f1: f1-score of the tuned model on the test set
    y_pred: test-set predictions of the tuned model
    best_params: hyperparameters selected by tune_knn_model
    """
    # Hyperparameter search, then prediction with the selected estimator
    tuned_model, tuned_params = tune_knn_model(X_train_scaled, y_train)
    predictions = tuned_model.predict(X_test_scaled)

    # zero_division=0 makes an undefined ratio count as 0
    scores = (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions, zero_division=0),
        recall_score(y_test, predictions, zero_division=0),  # Sensitivity
        f1_score(y_test, predictions, zero_division=0),
    )

    return (*scores, predictions, tuned_params)

In [None]:
# Tune, fit and evaluate K Nearest Neighbors
(knn_accuracy,
 knn_precision,
 knn_recall,
 knn_f1,
 knn_best,
 knn_best_model_params) = knn_model(X_train_scaled, y_train,
                                    X_test_scaled, y_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
# Test-set accuracy, precision, recall, F1, predictions and selected hyperparameters
knn_accuracy, knn_precision, knn_recall, knn_f1, knn_best, knn_best_model_params

(0.954248366013072,
 0.9864864864864865,
 0.9668874172185431,
 0.9765886287625418,
 array([1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 {'weights': 'uniform',
  'p': 1,
  'n_neighbors': 9,
  'metric': 'minkowski',
  'leaf_size': 30,
  'algorithm': 'ball_tree'})

### **Model 5: SVM**

In [None]:
from sklearn.svm import SVC

In [None]:
def tune_svm_model(X_train_scaled, y_train):
    """
    Perform hyperparameter tuning for the Support Vector Machine (SVC) model
    using RandomizedSearchCV.

    Parameters:
    X_train_scaled: Training data (features)
    y_train: Training data (target)

    Returns:
    best_model: SVC model with the best hyperparameters
                (the search's best_estimator_)
    best_params: Best hyperparameters found by RandomizedSearchCV
    """
    # Candidate values sampled by the randomized search
    search_space = {
        'C': [0.1, 1, 10, 100],                            # Regularization parameter
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],    # Kernel type used by the algorithm
        'degree': [2, 3, 4],                               # Degree of the polynomial kernel (used if kernel='poly')
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],   # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
        'coef0': [0, 0.1, 0.5, 1],                         # Independent term in kernel function (used with 'poly' and 'sigmoid')
        'class_weight': ['balanced', None],                # Handling imbalanced classes
    }

    search = RandomizedSearchCV(
        estimator=SVC(),
        param_distributions=search_space,
        n_iter=10,           # Number of parameter settings sampled
        scoring='accuracy',  # Metric for evaluation
        cv=5,                # Number of cross-validation folds
        verbose=2,           # Show process logs
        random_state=42,
        n_jobs=-1,           # Use all available cores
    )
    search.fit(X_train_scaled, y_train)

    return search.best_estimator_, search.best_params_

In [None]:
def svm_model(X_train_scaled, y_train, X_test_scaled, y_test):
    """
    Tune a Support Vector Machine (SVC) classifier on the training data and
    evaluate the selected model on the test data.

    Parameters:
    X_train_scaled: Training features dataset
    y_train: Training target variable
    X_test_scaled: Testing features dataset
    y_test: Testing target variable

    Returns:
    accuracy: accuracy score of the tuned model on the test set
    precision: precision score of the tuned model on the test set
    recall: recall (sensitivity) score of the tuned model on the test set
    f1: f1-score of the tuned model on the test set
    y_pred: test-set predictions of the tuned model
    best_params: hyperparameters selected by tune_svm_model
    """
    # Hyperparameter search, then prediction with the selected estimator
    tuned_model, tuned_params = tune_svm_model(X_train_scaled, y_train)
    predictions = tuned_model.predict(X_test_scaled)

    # zero_division=0 makes an undefined ratio count as 0
    scores = (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions, zero_division=0),
        recall_score(y_test, predictions, zero_division=0),  # Sensitivity
        f1_score(y_test, predictions, zero_division=0),
    )

    return (*scores, predictions, tuned_params)

In [None]:
# Tune, fit and evaluate the SVM
(svm_accuracy,
 svm_precision,
 svm_recall,
 svm_f1,
 svm_best,
 svm_best_model_params) = svm_model(X_train_scaled, y_train,
                                    X_test_scaled, y_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [None]:
# Test-set accuracy, precision, recall, F1, predictions and selected hyperparameters
svm_accuracy, svm_precision, svm_recall, svm_f1, svm_best, svm_best_model_params

(0.9215686274509803,
 0.986013986013986,
 0.9337748344370861,
 0.9591836734693877,
 array([1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
        1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0.]),
 {'kernel': 'rbf',
  'gamma': 'auto',
  'degree': 4,
  'coef0': 0.5,
  'class_weight': None,
  'C': 10})

### **Model 6: XGBoost**

In [None]:
import xgboost as xgb

In [None]:
# Wrap the (unscaled) train/test arrays in XGBoost DMatrix objects and define
# parameters for XGBoost's native training API.
# NOTE(review): the tuning function defined below builds an xgb.XGBClassifier instead;
# check whether dtrain/dtest/params are still used later in the notebook.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
params = {
    'objective': 'binary:logistic',  # for binary classification problems
    'max_depth': 3,
    'learning_rate': 0.1,
    'eval_metric': 'logloss'  # you can change this based on your problem
}

In [None]:
def tune_xgboost_model(X_train_scaled,
                       y_train,
                       n_iter=10,
                       cv=5,
                       scoring='accuracy',
                       random_state=42,
                       n_jobs=-1,
                       verbose=2):
    """
    Perform hyperparameter tuning for the XGBoost classifier
    using RandomizedSearchCV.

    Parameters:
    X_train_scaled: Training data (features)
    y_train: Training data (target)
    n_iter: Number of parameter settings sampled by the search (default 10)
    cv: Number of cross-validation folds, or any splitter object accepted
        by RandomizedSearchCV (default 5)
    scoring: Metric used to rank the sampled candidates (default 'accuracy')
    random_state: Seed for the parameter sampling, for repeatable searches (default 42)
    n_jobs: Number of parallel jobs; -1 uses all available cores (default -1)
    verbose: Verbosity of the search logs (default 2)

    Returns:
    best_model: XGBClassifier fitted with the best hyperparameters
    best_params: Best hyperparameters found by RandomizedSearchCV
    """

    # Define the search space for XGBoost
    param_distributions = {'n_estimators': [50, 100, 200],              # Number of boosting rounds (trees)
                          'max_depth': [3, 5, 7, 10],                  # Maximum depth of a tree
                          'learning_rate': [0.01, 0.1, 0.2],           # Step size shrinkage used in update to prevent overfitting
                          'subsample': [0.6, 0.8, 1.0],                # Fraction of samples used for fitting individual trees
                          'colsample_bytree': [0.6, 0.8, 1.0],         # Fraction of features used for each tree
                          'gamma': [0, 0.1, 0.5, 1],                   # Minimum loss reduction required to make a further partition
                          'min_child_weight': [1, 3, 5],               # Minimum sum of instance weight (hessian) needed in a child
                          'reg_alpha': [0, 0.01, 0.1, 1],              # L1 regularization term on weights (can increase sparsity)
                          'reg_lambda': [1, 0.01, 0.1, 10],            # L2 regularization term on weights
                          'scale_pos_weight': [1, 5, 10]}              # Relative weight given to the positive class

    # Instantiate the XGBoost classifier (scikit-learn wrapper)
    model = xgb.XGBClassifier()

    # NOTE(review): an integer cv produces K-fold style splits; if the rows are
    # time-ordered, a sklearn TimeSeriesSplit passed via `cv` may be more
    # appropriate — confirm against how the data was split.
    random_search = RandomizedSearchCV(estimator=model,
                                       param_distributions=param_distributions,
                                       n_iter=n_iter,
                                       scoring=scoring,
                                       cv=cv,
                                       verbose=verbose,
                                       random_state=random_state,
                                       n_jobs=n_jobs)

    # Fit random search model
    random_search.fit(X_train_scaled, y_train)

    # Best model and hyperparameters
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_

    return best_model, best_params

In [None]:
def xgb_model(X_train_scaled, y_train, X_test_scaled, y_test):
    """
    Tune an XGBoost classifier, predict on the test set,
    and score those predictions.

    Parameters:
    X_train_scaled: Training features dataset
    y_train: Training target variable
    X_test_scaled: Testing features dataset
    y_test: Testing target variable

    Returns:
    accuracy: accuracy score of the tuned model on the test set
    precision: precision score of the tuned model on the test set
    recall: recall (sensitivity) score of the tuned model on the test set
    f1: f1-score of the tuned model on the test set
    y_pred: test-set predictions from the tuned model
    best_params: hyperparameters selected by tune_xgboost_model
    """

    # Hyperparameter search is delegated to tune_xgboost_model.
    tuned_model, tuned_params = tune_xgboost_model(X_train_scaled, y_train)

    # Predicted labels for the held-out features.
    predictions = tuned_model.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, predictions)

    # zero_division=0 makes these metrics return 0 instead of warning when
    # their denominator is empty.
    precision, recall, f1 = (
        metric(y_test, predictions, zero_division=0)
        for metric in (precision_score, recall_score, f1_score)
    )

    return accuracy, precision, recall, f1, predictions, tuned_params

In [None]:
# Tune, fit and score XGBoost on the scaled splits.
# Despite its name, xgb_best receives the fifth return value of xgb_model,
# i.e. the predicted-label array (y_pred), not a fitted estimator.
(
    xgb_accuracy,
    xgb_precision,
    xgb_recall,
    xgb_f1,
    xgb_best,
    xgb_best_model_params,
) = xgb_model(X_train_scaled, y_train, X_test_scaled, y_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
# Display the XGBoost results as a tuple: accuracy, precision, recall, F1,
# the predicted-label array (xgb_best is y_pred, not a model), and the
# hyperparameters chosen by the search.
xgb_accuracy, xgb_precision, xgb_recall, xgb_f1, xgb_best, xgb_best_model_params

(0.9477124183006536,
 0.9863945578231292,
 0.9602649006622517,
 0.9731543624161074,
 array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 {'subsample': 0.8,
  'scale_pos_weight': 5,
  'reg_lambda': 0.1,
  'reg_alpha': 0,
  'n_estimators': 50,
  'min_child_weight': 3,
  'max_depth': 7,
  'learning_rate': 0.1,
  'gamma': 1,
  'colsample_bytree': 0.6})