In [1]:
# Point the kernel at the repository root, taken to be two directories above
# the kernel's starting working directory.
# Safe to re-run: the root is resolved to an absolute path on the first run
# only, so later runs of this cell do not climb two more directories.
import sys
import os

if "PROJECT_ROOT" not in globals():
    # First run in this kernel: the working directory has not been changed
    # yet, so "../../" is resolved relative to the starting directory.
    PROJECT_ROOT = os.path.abspath("../../")

# Register the absolute path. A relative "../../" entry would be interpreted
# against the *new* working directory once the chdir below has happened.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
os.chdir(PROJECT_ROOT)

In [6]:
import pandas as pd
import numpy as np

from src.data import (
    read_data
    )
from src.utils import (
    preprocess,
    train_and_tune_model,
    run_experiments,
    save_experiment_results,
    load_experiment_results
)

from src.plot import plot_experiment_results

# Load IPython's autoreload extension. Re-running this cell in the same kernel
# only prints an "already loaded" notice.
%load_ext autoreload
# Mode 2: edits to imported modules (e.g. the src package) are picked up live,
# without restarting the kernel.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from src.plot import plot_experiment_results

In [4]:
# Separation threshold to use for each arcmin range (keys are the range labels).
separation_thresholds = dict([
    ('0-3', 1.3),  # 0-3 arcmin
    ('3-6', 1.3),  # 3-6 arcmin
    ('6+', 2.2),   # 6+ arcmin
])

# Load the positive and negative datasets for these thresholds.
results = read_data(separation_thresholds)

In [20]:
# Pull the positive / negative frames out of `results`, one pair per range.
df_pos_0_3, df_neg_0_3 = results['0-3']['df_pos'], results['0-3']['df_neg']
df_pos_3_6, df_neg_3_6 = results['3-6']['df_pos'], results['3-6']['df_neg']
df_pos_6_plus, df_neg_6_plus = results['6+']['df_pos'], results['6+']['df_neg']

---
#### Preprocessing

In [8]:
# Preprocess the 0-3 arcmin positive/negative sets (log transform disabled)
# and split them into training and test partitions.
(
    X_train, X_test,
    Y_train, Y_test,
    indices_train, indices_test,
) = preprocess(df_pos_0_3, df_neg_0_3, log_transform=False)

---
#### Baseline Models

In [9]:
# Fit the baseline model with hyperparameter tuning enabled.
# model_type accepts either 'rf' or 'lgbm'.
best_model, y_pred, best_params = train_and_tune_model(
    X_train,
    X_test,
    Y_train,
    Y_test,
    model_type='rf',
    hyperparameter_tuning=True,
)

Performing hyperparameter tuning...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time= 3.0min
[CV] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time= 3.1min
[CV] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time= 3.1min
[CV] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time= 3.4min
[CV] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time= 3.4min
[CV] END max_depth=70, min_samples_leaf=2, min_samples_split=5, n_estimators=700; total time= 4.3min
[CV] END max_depth=70, min_samples_leaf=2, min_samples_split=5, n_estimators=700; total time= 4.8min
[CV] END max_depth=70, min_samples_leaf=2, min_samples_split=5, n_estimators=700; total time= 4.8min
[CV] END max_depth=100, min_samples_leaf=4, min_samples_split=2, n_estimators=600; total time= 



[CV] END max_depth=100, min_samples_leaf=2, min_samples_split=2, n_estimators=600; total time= 3.7min
[CV] END max_depth=100, min_samples_leaf=2, min_samples_split=2, n_estimators=600; total time= 3.7min
[CV] END max_depth=100, min_samples_leaf=2, min_samples_split=2, n_estimators=600; total time= 4.0min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=800; total time= 4.7min
[CV] END max_depth=40, min_samples_leaf=4, min_samples_split=2, n_estimators=600; total time= 3.5min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=800; total time= 4.7min
[CV] END max_depth=40, min_samples_leaf=4, min_samples_split=2, n_estimators=600; total time= 3.5min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=800; total time= 5.0min
[CV] END max_depth=40, min_samples_leaf=4, min_samples_split=2, n_estimators=600; total time= 3.7min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=800; total 

In [None]:
# Run the experiments on the 0-3 arcmin sets with the 'rf' model,
# hyperparameter tuning switched off.
results_exp = run_experiments(
    df_pos_0_3,
    df_neg_0_3,
    model_type='rf',
    hyperparameter_tuning=False,
)

In [23]:
# Persist the experiment results, tagged with the model settings and seed.
# The return value is kept as `experiment_path` (presumably the output
# location that the helper prints — confirm against src.utils).
experiment_path = save_experiment_results(
    results_exp,
    model_type='rf',
    hyperparameter_tuning=False,
    random_seed=42,
)

experiment results saved in: models/rf_default_seed42_20240708_142049


In [9]:
# Reload previously saved experiment results from disk, so the expensive
# experiment cell does not have to be re-run.
results_exp = load_experiment_results(
    'models/rf_default_seed42_20240708_142049'
)

experiment results loaded from: models/rf_default_seed42_20240708_142049


In [49]:
# Plot the experiment results for the 0-3 arcmin sets. Per the recorded
# output, the two string arguments end up in the saved figure's path.
plot_experiment_results(
    results_exp,
    df_pos_0_3,
    df_neg_0_3,
    "kashyap_2024_07_orion",
    "0-3arcmin",
)

Results saved to figures/experiments/kashyap_2024_07_orion/0-3arcmin_experiment_results.pdf
