In [5]:
# - Imports --------------------------------------------
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, cross_val_predict, StratifiedKFold
import sklearn.neighbors as skl_nb
import sklearn.discriminant_analysis as skl_da
# from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

import sys

sys.path.append('../src')  # lägg till src i sökvägarna

In [6]:
# - Define train/test data -----------------------------
from data_preprocessing import X, y, get_pipeline, X_holdout, y_holdout

np.random.seed(1)

In [7]:
# - Final models -----------------------------------------
from sklearn.linear_model import LogisticRegression

# --

random_forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    max_features='sqrt',
    class_weight={'high_bike_demand': 16, 'low_bike_demand': 1},
    random_state=1,
    n_jobs=-1,
    min_samples_leaf=10,
    min_samples_split=2
)

kNN = skl_nb.KNeighborsClassifier(n_neighbors=6)

# --

log_reg = LogisticRegression(
    class_weight={'high_bike_demand': 10, 'low_bike_demand': 1},
    random_state=1,
    max_iter=1000,
    C=1.0,
    solver='liblinear',
)

# --

priors = [0.61, 0.39] 
LDA = skl_da.LinearDiscriminantAnalysis(priors=priors, tol=1e-4, )
QDA = skl_da.QuadraticDiscriminantAnalysis(reg_param=0.21052631578947367, priors=priors)


In [8]:
# Create a list of models to evaluate
models = [random_forest, kNN, log_reg, LDA, QDA]

In [9]:
# Testing all models
for model in models:
    pipeline = get_pipeline(model)
    model.fit(X, y)
    y_holdout_pred = model.predict(X_holdout)
    print(model, "\n", classification_report(y_holdout, y_holdout_pred), "\n\n")


RandomForestClassifier(class_weight={'high_bike_demand': 16,
                                     'low_bike_demand': 1},
                       max_depth=20, min_samples_leaf=10, n_estimators=300,
                       n_jobs=-1, random_state=1) 
                   precision    recall  f1-score   support

high_bike_demand       0.41      0.94      0.57        50
 low_bike_demand       0.99      0.75      0.85       270

        accuracy                           0.78       320
       macro avg       0.70      0.85      0.71       320
    weighted avg       0.90      0.78      0.81       320
 


KNeighborsClassifier(n_neighbors=6) 
                   precision    recall  f1-score   support

high_bike_demand       0.45      0.58      0.50        50
 low_bike_demand       0.92      0.87      0.89       270

        accuracy                           0.82       320
       macro avg       0.68      0.72      0.70       320
    weighted avg       0.84      0.82      0.83       320
 


Logis