In [1]:
from libs.model_processor import read_params
import pandas as pd
import json

def print_params(model: str):
    """Print the parameter grid configured for ``model`` and its size.

    Looks up the entry whose ``"model"`` field equals ``model`` among the
    entries returned by ``read_params()``, prints every ``name: candidates``
    pair of that entry's ``"params"`` mapping, and finally prints the number
    of combinations the grid spans (the product of the candidate-list
    lengths, which is 1 for an empty mapping).

    Args:
        model: Value of the ``"model"`` field to look up.

    Raises:
        ValueError: If ``read_params()`` returns no entry for ``model``.
    """
    # Materialise once so the entries can be scanned again for the error text.
    models_params = list(read_params())

    # Use a separate name for the matched entry instead of rebinding the
    # ``model`` argument, and pass a default to ``next`` so that a missing
    # entry does not escape as a bare, message-less StopIteration.
    model_entry = next((obj for obj in models_params if obj["model"] == model), None)
    if model_entry is None:
        known = ", ".join(str(obj["model"]) for obj in models_params)
        raise ValueError(f"Unknown model {model!r}; available models: {known}")

    enumeration_count = 1
    for key, value in model_entry["params"].items():
        enumeration_count *= len(value)
        print(f"{key}: {value}")
    print("")
    print(f"Total number of combitations: {enumeration_count}")


In [2]:
# Show the parameter candidates configured for LogisticRegression and how many combinations they span.
print_params("LogisticRegression")

penalty: [None, 'l2']
solver: ['lbfgs', 'sag', 'saga']
C: [0.001, 0.01, 0.1, 1.0, 10, 100]
max_iter: [10000]

Total number of combitations: 36


In [3]:
# Show the parameter candidates configured for KNeighborsClassifier and how many combinations they span.
print_params("KNeighborsClassifier")

n_neighbors: [1, 2, 3, 5, 8, 13, 21]
p: [1, 2]
weights: ['uniform', 'distance']

Total number of combitations: 28


In [4]:
# Show the parameter candidates configured for GaussianNB; with no parameters listed the combination count is 1.
print_params("GaussianNB")


Total number of combitations: 1


In [5]:
# Show the parameter candidates configured for DecisionTreeClassifier and how many combinations they span.
print_params("DecisionTreeClassifier")

criterion: ['gini', 'entropy', 'log_loss']
splitter: ['best', 'random']
max_depth: [None, 10, 20, 30, 50, 100, 200]

Total number of combitations: 42


In [6]:
# Show the parameter candidates configured for RandomForestClassifier and how many combinations they span.
print_params("RandomForestClassifier")

criterion: ['gini', 'entropy', 'log_loss']
max_depth: [None, 10, 20, 30, 50, 100, 200]

Total number of combitations: 21


In [7]:
# Show the parameter candidates configured for SVC and how many combinations they span.
print_params("SVC")

C: [0.001, 0.01, 0.1, 1.0, 10, 100]
kernel: ['poly', 'rbf', 'sigmoid']

Total number of combitations: 18


## Prepare all datasets

In [8]:
from libs.scaler import min_max_scaler
from libs.scaler import black_and_white_scaler
from libs.mnist_reader import load_mnist
from sklearn.decomposition import PCA

# Fixed seed for PCA: scikit-learn only consults it when a randomized/arpack
# SVD solver is selected, in which case the projected features would
# otherwise differ from one run to the next.
PCA_RANDOM_STATE = 42

X_train_mnist, Y_train_mnist = load_mnist(path="data/mnist", kind="train")
X_train_fashion, Y_train_fashion = load_mnist(path="data/fashion", kind="train")

mnist_black_and_white_no_scaling = black_and_white_scaler(X_train_mnist)
fashion_black_and_white_no_scaling = black_and_white_scaler(X_train_fashion)


def _dataset_variants(prefix: str, original, black_and_white) -> dict:
    """Return the four feature-matrix variants of one corpus, keyed by name.

    The variants are the original features, the black-and-white features, a
    89-component PCA projection of the original features and a 233-component
    PCA projection of the black-and-white features. Keys are prefixed with
    ``prefix`` and inserted in that order.
    """
    return {
        f"{prefix}_original": original,
        f"{prefix}_black_and_white_no_scaling": black_and_white,
        f"{prefix}_original_89_attributes": PCA(
            n_components=89, random_state=PCA_RANDOM_STATE
        ).fit_transform(original),
        f"{prefix}_black_and_white_233_attributes": PCA(
            n_components=233, random_state=PCA_RANDOM_STATE
        ).fit_transform(black_and_white),
    }


# Insertion order (all mnist variants, then all fashion variants) is kept
# because the driver loop iterates this dict in order.
datasets = {
    # mnist
    **_dataset_variants("mnist", X_train_mnist, mnist_black_and_white_no_scaling),
    # fashion
    **_dataset_variants("fashion", X_train_fashion, fashion_black_and_white_no_scaling),
}

## Define a single iteration

In [13]:
#| code-fold: show
#| label: single_iteration
from sklearn.model_selection import GridSearchCV
from libs.model_processor import create_default_model
from pathlib import Path
import pandas as pd

def iterate_model(model_dict: dict, X: list, y: list, dataset_name: str):
    """Grid-search one model on one dataset and store ``cv_results_`` as CSV.

    The results are written to ``output/<dataset_name>_<model>.csv``. If that
    file already exists the search is skipped, so an interrupted batch can be
    resumed without repeating finished runs.

    Args:
        model_dict: Entry with a ``"model"`` name (passed to
            ``create_default_model``) and a ``"params"`` grid (passed to
            ``GridSearchCV`` as ``param_grid``).
        X: Feature data handed to ``GridSearchCV.fit``.
        y: Target data handed to ``GridSearchCV.fit``.
        dataset_name: Name used as the prefix of the result file.

    Returns:
        ``clf.cv_results_`` of the fitted search, or ``None`` when the result
        file already exists and the run is skipped.
    """
    Path("output").mkdir(parents=True, exist_ok=True)
    result_file = f"output/{dataset_name}_{model_dict['model']}.csv"
    if Path(result_file).is_file():
        print(f"Skipping '{result_file}' since in alredy exist")
        return

    # Build the estimator only once it is known that a search will run.
    model = create_default_model(model_dict["model"])

    # The single (train, test) pair uses slice(None) for both sides, so every
    # candidate is fitted and scored on the whole of X/y (no held-out fold).
    clf = GridSearchCV(estimator=model, param_grid=model_dict["params"], verbose=3, cv=[(slice(None), slice(None))])
    clf.fit(X, y)

    df = pd.DataFrame(clf.cv_results_)
    # The result file doubles as the "already done" marker checked above, so
    # it must never exist in a half-written state: write to a temporary
    # sibling first and move it into place only after the write succeeded.
    tmp_file = Path(f"{result_file}.tmp")
    df.to_csv(tmp_file)
    tmp_file.replace(result_file)

    return clf.cv_results_
    

In [10]:
models_params = read_params()
# for each dataset
for key in datasets:
    # Features and labels depend only on the dataset, so resolve them once
    # per dataset instead of once per (dataset, model) pair.
    X = datasets[key]
    if key.startswith("mnist"):
        y = Y_train_mnist
    elif key.startswith("fashion"):
        y = Y_train_fashion
    else:
        # Name the offending dataset so the failure is actionable.
        raise KeyError(f"No labels known for dataset '{key}'")
    # for each model in dict
    for model_dict in models_params:
        print(f"For dataset {key} and {model_dict['model']} model, see _05_modeling.py")
        # The search itself is not run from this cell (the message above
        # points to _05_modeling.py); the call is kept for reference.
        # iterate_model(model_dict, X=X, y=y, dataset_name=key)


For dataset mnist_original and LogisticRegression model, see _05_modeling.py
For dataset mnist_original and KNeighborsClassifier model, see _05_modeling.py
For dataset mnist_original and GaussianNB model, see _05_modeling.py
For dataset mnist_original and DecisionTreeClassifier model, see _05_modeling.py
For dataset mnist_original and RandomForestClassifier model, see _05_modeling.py
For dataset mnist_original and SVC model, see _05_modeling.py
For dataset mnist_black_and_white_no_scaling and LogisticRegression model, see _05_modeling.py
For dataset mnist_black_and_white_no_scaling and KNeighborsClassifier model, see _05_modeling.py
For dataset mnist_black_and_white_no_scaling and GaussianNB model, see _05_modeling.py
For dataset mnist_black_and_white_no_scaling and DecisionTreeClassifier model, see _05_modeling.py
For dataset mnist_black_and_white_no_scaling and RandomForestClassifier model, see _05_modeling.py
For dataset mnist_black_and_white_no_scaling and SVC model, see _05_modeli

In [39]:
s

# Read every result file and stack the tables into a single frame.
df = pd.concat([pd.read_csv(path) for path in filenames], ignore_index=True)

print(f"For {len(df)} tested models.")

# Column totals, truncated to whole seconds before they are added up.
total_fit_time = int(df.loc[:, "mean_fit_time"].sum())
total_score_time = int(df.loc[:, "mean_score_time"].sum())
total_time = total_fit_time + total_score_time

# timedelta renders the second counts as "D days, H:MM:SS".
print(f"Overall time to fit models: {datetime.timedelta(seconds = total_fit_time)}")
print(f"Overall time to score models: {datetime.timedelta(seconds = total_score_time)}")
print()
print(f"Total time: {datetime.timedelta(seconds = total_time)}")


For 146 tested models.
Overall time to fit models: 5 days, 22:09:13
Overall time to score models: 1 day, 23:06:06

Total time: 7 days, 21:15:19
