In [1]:
import os
import sys
import subprocess
import pkg_resources


def install(package):
    """Install ``package`` with pip, run through the current Python interpreter.

    Args:
        package: Requirement string handed to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If the pip process exits with a
            non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)


# Requirement names this notebook depends on
required_packages = [
    'ipywidgets', 'jupyter', 'matplotlib', 'numpy', 'pandas',
    'optuna', 'optuna-integration[sklearn]', 'scikit-learn',
]

# Install only the requirements that pkg_resources cannot locate
for package in required_packages:
    try:
        pkg_resources.get_distribution(package)
    except pkg_resources.DistributionNotFound:
        install(package)

# Report the conda environment named by CONDA_DEFAULT_ENV, if that variable is set
if 'CONDA_DEFAULT_ENV' not in os.environ:
    print("No specific environment is activated.")
else:
    print(f"Environment '{os.environ['CONDA_DEFAULT_ENV']}' is activated.")

Environment 'research' is activated.


In [2]:
import numpy as np
import pandas as pd

# Seed NumPy's global random state with a fixed value
np.random.seed(42)

In [3]:
# # Load the store sales dataset
# excel_file = pd.ExcelFile('data/store_sales.xlsx')
# sheet_names = excel_file.sheet_names

# # Read the data
# results_df = pd.read_excel(excel_file, sheet_name=sheet_names[2])  # 2, 9
# iri_key_counts = results_df['IRI_KEY'].value_counts()
# iri_keys = iri_key_counts[iri_key_counts > 300].index


# features = ['F', 'D', 'Unit.Price']
# target = 'Total.Volume'

# results_df = results_df[results_df['IRI_KEY'] == iri_keys[0]]
# X = results_df[features].values
# y = results_df[target].values

# sheet_names, iri_keys, X.shape, y.shape

In [4]:
# # Load the Boston dataset
# data_url = 'http://lib.stat.cmu.edu/datasets/boston'
# raw_df = pd.read_csv(data_url, sep=r'\s+', skiprows=22,  # type: ignore
#                      header=None)  # type: ignore
# X = np.hstack([raw_df.values[::2, :-1], raw_df.values[1::2, :2]])
# y = raw_df.values[1::2, 2].reshape(-1, 1).ravel()
# X.shape, y.shape

In [5]:
# Load the California housing dataset
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
# Feature matrix and target vector are taken straight from the returned object
X, y = housing.data, housing.target  # type: ignore
X.shape, y.shape, housing.feature_names  # type: ignore

((20640, 8),
 (20640,),
 ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'])

In [6]:
# # Load the soybean dataset
# raw_df = pd.read_excel("data/soybean.xlsx")
# # print(raw_df.head())
# X = raw_df.values[:-1, [5, 6, 15, 16, 17, 26,
#                         34, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52]]  # 9 check yield 12 rm band
# y = raw_df.values[:-1, 11]
# X.shape, y.shape

In [7]:
# # Define the data
# data = {
#     "Size": [850, 900, 1200, 1400, 1600, 1700, 1800, 2000, 2200, 2500],
#     "Bedrooms": [2, 3, 3, 3, 3, 4, 4, 4, 5, 5],
#     "Price": [300, 340, 400, 500, 520, 580, 600, 620, 720, 790]
# }

# df = pd.DataFrame(data)

# X = df[["Size", "Bedrooms"]].values
# y = df["Price"].values
# X.shape, y.shape

In [8]:
from sklearn.preprocessing import StandardScaler, QuantileTransformer

# Cast both the features and the target to float64
X, y = X.astype(np.float64), y.astype(np.float64)

# scaler_X = StandardScaler()
# X = scaler_X.fit_transform(X)
# scaler_y = StandardScaler()
# y = scaler_y.fit_transform(y.reshape(-1, 1)).ravel()  # type: ignore
# scaler_X = QuantileTransformer()
# X = scaler_X.fit_transform(X)
# scaler_y = QuantileTransformer()
# y = scaler_y.fit_transform(y.reshape(-1, 1)).ravel()  # type: ignore

In [9]:
from optuna.integration.sklearn import OptunaSearchCV
from optuna import distributions
from models_sklearn import Ensemble, MLP_sk, FONN1_sk, FONN2_sk, TREENN1_sk, TREENN2_sk
from models import MLP, FONN1, FONN2, TREENN1, TREENN2

# Keyword arguments forwarded to every OptunaSearchCV created by `search`
search_params = dict(
    cv=5,
    n_jobs=-1,
    n_trials=10,
    random_state=42,
    return_train_score=True,
    scoring='neg_root_mean_squared_error',
    timeout=30,
    verbose=1,
)

# Search space used with the `*_sk` estimators: log-uniform learning rate,
# iteration budget pinned to 200 through a single-choice categorical
mlp_sk_param_grid = dict(
    learning_rate_init=distributions.FloatDistribution(1e-3, 1e-1, log=True),
    max_iter=distributions.CategoricalDistribution([200]),
    # max_iter=distributions.IntDistribution(200, 1000, log=True),
)
# Same search space under the parameter names `learning_rate` / `epochs`
mlp_param_grid = dict(
    learning_rate=distributions.FloatDistribution(1e-3, 1e-1, log=True),
    epochs=distributions.CategoricalDistribution([200]),
    # epochs=distributions.IntDistribution(200, 1000, log=True),
)


def search(model, param_grid=None):
    """Wrap ``model`` in an ``OptunaSearchCV`` configured with ``search_params``.

    Args:
        model: Estimator to tune.
        param_grid: Mapping of parameter name to Optuna distribution. ``None``
            (the default) means an empty search space.

    Returns:
        The ``OptunaSearchCV`` instance built for ``model``.
    """
    if param_grid is None:
        # Build a fresh dict per call: a `{}` default would be one object
        # shared by every search created without an explicit grid.
        param_grid = {}
    return OptunaSearchCV(model, param_grid, **search_params)

  raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22,  # type: ignore


In [10]:
# Registry: display name -> OptunaSearchCV wrapper returned by `search`
models = {}

# Tree counts passed to FONN1_sk and FONN2_sk respectively
num_trees_input = 5
num_trees_hidden = 5
# Each entry is either an int or a tuple of layer sizes (see the isinstance check below)
hidden_nodes = [10]

models['Tree'] = search(Ensemble(1))
for hn in hidden_nodes:
    models[f'MLP_sk {hn}'] = search(
        MLP_sk(hn), mlp_sk_param_grid)
    models[f'MLP_sk_tanh {hn}'] = search(
        MLP_sk(hn, activation='tanh'), mlp_sk_param_grid)
    if isinstance(hn, tuple):
        # Tuple of layer sizes: widen the first (FONN1/TREENN1) or the last
        # (FONN2/TREENN2) layer by the number of trees
        models[f'Ensemble {sum(hn)}'] = search(Ensemble(sum(hn)))
        models[f'FONN1_sk {num_trees_input} {hn}'] = search(
            FONN1_sk(num_trees_input, (num_trees_input+hn[0], *hn[1:])), mlp_sk_param_grid)
        models[f'FONN2_sk {num_trees_hidden} {hn}'] = search(
            FONN2_sk(num_trees_hidden, (*hn[:-1], num_trees_hidden+hn[-1])), mlp_sk_param_grid)
        models[f'TREENN1_sk {hn}'] = search(
            TREENN1_sk((1+hn[0], *hn[1:])), mlp_sk_param_grid)
        models[f'TREENN2_sk {hn}'] = search(
            TREENN2_sk((*hn[:-1], 1+hn[-1])), mlp_sk_param_grid)
    else:
        models[f'Ensemble_sk {hn}'] = search(Ensemble(hn))
        models[f'FONN1_sk {num_trees_input} {hn}'] = search(
            FONN1_sk(num_trees_input, num_trees_input+hn), mlp_sk_param_grid)
        # The first argument is the tree count, as in the tuple branch and as
        # advertised by the dictionary key; passing `hn` here built a model
        # with `hn` trees under a label claiming `num_trees_hidden`.
        models[f'FONN2_sk {num_trees_hidden} {hn}'] = search(
            FONN2_sk(num_trees_hidden, num_trees_hidden+hn), mlp_sk_param_grid)
        # models[f'TREENN1_sk {hn}'] = search(
        #     TREENN1_sk(1+hn), mlp_sk_param_grid)
        # models[f'TREENN2_sk {hn}'] = search(
        #     TREENN2_sk(1+hn), mlp_sk_param_grid)

# Dimensions referenced only by the commented-out block below
input_dim = X.shape[1]
output_dim = 1

# for hn in hidden_nodes:
#     models[f'MLP {hn}'] = search(
#         MLP(input_dim, hn, output_dim), mlp_param_grid)
#     models[f'MLP_tanh {hn}'] = search(
#         MLP(input_dim, hn, output_dim, activation='tanh'), mlp_param_grid)
#     models[f'FONN1 {num_trees_input} {hn}'] = search(
#         FONN1(input_dim, hn, output_dim, num_trees_input), mlp_param_grid)
#     models[f'FONN2 {num_trees_hidden} {hn}'] = search(
#         FONN2(input_dim, hn, output_dim, num_trees_hidden), mlp_param_grid)
#     models[f'TREENN1 {hn}'] = search(
#         FONN1(input_dim, hn, output_dim, 1), mlp_param_grid)
#     models[f'TREENN2 {hn}'] = search(
#         FONN2(input_dim, hn, output_dim, 1), mlp_param_grid)

models

  return OptunaSearchCV(model, param_grid, **search_params)


{'Tree': OptunaSearchCV(cv=5, estimator=Ensemble(n_estimators=1), n_jobs=-1,
                param_distributions={}, random_state=42, return_train_score=True,
                scoring='neg_root_mean_squared_error', timeout=30, verbose=1),
 'MLP_sk 10': OptunaSearchCV(cv=5, estimator=MLP_sk(hidden_layer_sizes=10), n_jobs=-1,
                param_distributions={'learning_rate_init': FloatDistribution(high=0.1, log=True, low=0.001, step=None),
                                     'max_iter': CategoricalDistribution(choices=(200,))},
                random_state=42, return_train_score=True,
                scoring='neg_root_mean_squared_error', timeout=30, verbose=1),
 'MLP_sk_tanh 10': OptunaSearchCV(cv=5, estimator=MLP_sk(activation='tanh', hidden_layer_sizes=10),
                n_jobs=-1,
                param_distributions={'learning_rate_init': FloatDistribution(high=0.1, log=True, low=0.001, step=None),
                                     'max_iter': CategoricalDistribution(choices

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import sys
from warnings import simplefilter
# Mute warnings unless warning options were already supplied to the interpreter
if len(sys.warnoptions) == 0:
    # presumably exported so that spawned worker processes inherit the setting
    os.environ["PYTHONWARNINGS"] = "ignore"
    simplefilter("ignore")

# Accumulators filled while the searches run
results = []
cv_results = {}

# Columns kept in the summary table
result_columns = ['mean_fit_time', 'mean_score_time',
                  'mean_train_score', 'mean_test_score']

def fit_model(name, model, X, y):
    """Fit ``model`` on ``(X, y)`` and return its best-trial row of CV results.

    Args:
        name: Label stored under the ``'model'`` key of the returned dict.
        model: Search object exposing ``fit``, ``cv_results_`` and ``best_index_``.
        X: Training inputs forwarded to ``model.fit``.
        y: Training targets forwarded to ``model.fit``.

    Returns:
        A dict with ``'model'`` first, followed by every ``cv_results_``
        column reduced to its value at ``model.best_index_``.
    """
    model.fit(X, y)
    cv_table = model.cv_results_
    summary = {'model': name}
    for column in cv_table:
        summary[column] = cv_table[column][model.best_index_]
    return summary

# Fit every search in its own thread; each task returns its best-trial row
with ThreadPoolExecutor(max_workers=None) as executor:
    future_to_model = {
        executor.submit(fit_model, name, model, X, y): name
        for name, model in models.items()
    }
    for future in as_completed(future_to_model):
        name = future_to_model[future]
        try:
            result = future.result()
        except Exception as e:
            # Best effort: report the failure and keep collecting the others
            print(f"Exception in {name}: {e}")
        else:
            results.append(result)
            cv_results[name] = result

# `as_completed` yields in completion order, which varies between runs, so the
# table is re-ordered to follow `models`. Passing `columns=` keeps the frame
# well-formed (empty, with the expected columns) even if every fit failed.
results_df = (
    pd.DataFrame(results, columns=['model', *result_columns])
    .set_index('model')
    .reindex([name for name in models if name in cv_results])
)
results_df

[I 2024-11-14 01:59:10,192] A new study created in memory with name: no-name-5ee021a6-2403-4e19-af1b-db865a5719a1
[I 2024-11-14 01:59:10,198] A new study created in memory with name: no-name-fd0eb02b-99a9-47f1-ad2e-778859bb42c7


[I 2024-11-14 01:59:10,199] A new study created in memory with name: no-name-8d9f25aa-bf29-4a0d-975e-ae6b19a9d940
[I 2024-11-14 01:59:10,201] A new study created in memory with name: no-name-13ddb3c5-d23b-4611-b100-c97729f867ec
[I 2024-11-14 01:59:10,206] A new study created in memory with name: no-name-f07b4370-c06f-48a2-8db8-e46f11d6f560
[I 2024-11-14 01:59:10,206] A new study created in memory with name: no-name-4ed40937-7712-4041-a0a2-9b2fb2934ef6
[I 2024-11-14 01:59:12,255] Trial 6 finished with value: -0.9401539868300061 and parameters: {}. Best is trial 6 with value: -0.9401539868300061.
[I 2024-11-14 01:59:12,310] Trial 7 finished with value: -0.9205819172104915 and parameters: {}. Best is trial 7 with value: -0.9205819172104915.
[I 2024-11-14 01:59:12,331] Trial 3 finished with value: -0.9238866637197287 and parameters: {}. Best is trial 7 with value: -0.9205819172104915.
[I 2024-11-14 01:59:12,346] Trial 0 finished with value: -0.9452869825535697 and parameters: {}. Best is t

In [None]:
# Scores are negated RMSE, so descending order lists the lowest-error model first
results_df.sort_values(by='mean_test_score', ascending=False)

In [None]:
# (name, best_params_) pair for every search, in `models` order
[(name, model.best_params_) for name, model in models.items()]

In [None]:
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Draw each model's loss curve on both panels (models without one are skipped)
for model_name, model in models.items():
    best = model.best_estimator_
    if not hasattr(best, 'loss_curve_'):
        continue
    for ax in (ax1, ax2):
        ax.plot(best.loss_curve_, label=model_name)

# Left panel keeps the default y-axis; right panel shows the same curves on a log scale
ax2.set_yscale('log')
for ax in (ax1, ax2):
    ax.set_xlabel('Iterations')
    ax.set_ylabel('Loss')
    ax.set_title('All models')
    ax.legend()
    ax.grid(True)

fig.tight_layout()
fig.show()

In [None]:
import matplotlib.pyplot as plt

# One (figure, axes) pair per model family, keyed by the leading token of the name
plot_groups = {}
for model_name, model in models.items():
    estimator = model.best_estimator_
    if not hasattr(estimator, 'loss_curve_'):
        continue
    # Family name = text before the first '_' if the name has one, else before the first space
    separator = '_' if '_' in model_name else ' '
    key = model_name.partition(separator)[0]
    if key not in plot_groups:
        plot_groups[key] = plt.subplots(1, 2, figsize=(14, 6))
    fig, (ax1, ax2) = plot_groups[key]
    for ax in (ax1, ax2):
        ax.plot(estimator.loss_curve_, label=model_name)

# Decorate each family's figure: default y-axis on the left, log y-axis on the right
for group, plot in plot_groups.items():
    fig, (ax1, ax2) = plot
    ax2.set_yscale('log')
    for ax in (ax1, ax2):
        ax.set_xlabel('Iterations')
        ax.set_ylabel('Loss')
        ax.set_title(group)
        ax.legend()
        ax.grid(True)

    fig.tight_layout()
    fig.show()

In [None]:
import matplotlib.pyplot as plt


def plot_loss(model, title='Loss Curve'):
    """Plot ``model.loss_curve_`` twice, side by side.

    The left panel keeps the default y-axis; the right panel shows the same
    curve with a logarithmic y-axis.

    Args:
        model: Object exposing a ``loss_curve_`` attribute.
        title: Title applied to both panels.
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    for ax, yscale in zip(axes, (None, 'log')):
        ax.plot(model.loss_curve_)
        if yscale is not None:
            ax.set_yscale(yscale)
        ax.set_xlabel('Iterations')
        ax.set_ylabel('Loss')
        ax.set_title(title)
        ax.grid(True)

    fig.tight_layout()
    fig.show()


# One loss figure per model whose best estimator recorded a loss curve
for model_name, model in models.items():
    fitted = model.best_estimator_
    if not hasattr(fitted, 'loss_curve_'):
        continue
    plot_loss(fitted, model_name)