In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
# from pptoolbox import cross_group_predict

import warnings
warnings.simplefilter("ignore")

import pickle as pkl
from pathlib import Path

%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Import data

In [5]:
# Registry of dataset variants handled by this notebook. Each variant maps to
# a dict that the loading cell fills with its train/test frames.
# Add 'full' to the tuple below to process that variant as well.
datasets = {variant: {} for variant in ('nondeo',)}

In [6]:
# Read the four CSV splits of every registered dataset variant and store them
# under datasets[<variant>]. The first CSV column is used as the index.
# The X_train / X_test / y_train / y_test names are left bound to the frames of
# the last variant processed; later cells read `y_test` directly.
for dataset in datasets:
    print(f'Processing {dataset} dataset')

    save_dir = f"../data/cargill/{dataset}"  # directory the splits are read from

    X_train = pd.read_csv(f'{save_dir}/Xtrain.csv', index_col=0)
    X_test = pd.read_csv(f'{save_dir}/Xtest.csv', index_col=0)
    y_train = pd.read_csv(f'{save_dir}/ytrain.csv', index_col=0)
    y_test = pd.read_csv(f'{save_dir}/ytest.csv', index_col=0)

    # Register all four frames in one step (same keys, same insertion order).
    datasets[dataset].update(
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )

    print(f'X_train: {X_train.shape}, y_train: {y_train.shape}')
    print(f'X_test: {X_test.shape}, y_test: {y_test.shape}')
    print()

Processing nondeo dataset
X_train: (2178, 191), y_train: (2178, 18)
X_test: (533, 191), y_test: (533, 18)



In [7]:
# Show the column names of the label frame. `y_test` is the variable left bound
# by the last iteration of the dataset-loading loop above.
y_test.columns

Index(['specimen_id', 'lot_name', 'date_scanned', 'Acetic', 'Ash',
       'Astringent', 'Earthy', 'Fishy', 'Musty',
       'Oxidized (Old Butter-Stale-Cardboard)', 'Painty (Solvent-Chemical)',
       'Petroleum (Motor Oil-Rubber)', 'Putrid', 'Sensory Result', 'Smoky',
       'Sour', 'Sensory Value', 'scan_month'],
      dtype='object')

In [8]:
# Column names of the label frame as a plain Python list.
all_params = y_test.columns.tolist()
all_params

['specimen_id',
 'lot_name',
 'date_scanned',
 'Acetic',
 'Ash',
 'Astringent',
 'Earthy',
 'Fishy',
 'Musty',
 'Oxidized (Old Butter-Stale-Cardboard)',
 'Painty (Solvent-Chemical)',
 'Petroleum (Motor Oil-Rubber)',
 'Putrid',
 'Sensory Result',
 'Smoky',
 'Sour',
 'Sensory Value',
 'scan_month']

In [13]:
# Target columns to train a model for in this run; uncomment entries to add
# more targets.
# NOTE(review): some commented names ('Bitter', 'Brown Fruit',
# 'Cacao (Chocolate)', 'Carbon', 'Deodorization Level', 'Heated Fat (Oil)',
# 'Straw (Hay)') are not among the `y_test` columns displayed earlier for the
# 'nondeo' dataset - confirm they exist before enabling them.
params_to_train = [
    'Acetic',
    # 'Ash',
    # 'Astringent',
    # 'Bitter',
    # 'Brown Fruit',
    # 'Cacao (Chocolate)',
    # 'Carbon',
    # 'Deodorization Level',
    # 'Earthy',
    # 'Fishy',
    # 'Heated Fat (Oil)',
    # 'Musty',
    # 'Oxidized (Old Butter-Stale-Cardboard)',
    # 'Painty (Solvent-Chemical)',
    # 'Petroleum (Motor Oil-Rubber)',
    # 'Putrid',
    # 'Sensory Result',
    # 'Smoky',
    # 'Sour',
    # 'Straw (Hay)',
    # 'Sensory Value'
    ]

In [14]:
# Requested targets minus 'Deodorization Level' and 'Sensory Value',
# de-duplicated and sorted alphabetically.
remaining_params = sorted(
    set(params_to_train) - {'Deodorization Level', 'Sensory Value'}
)
remaining_params

['Acetic']

# Classify

In [15]:
%autoreload 2

import sys
import os
from optuna.samplers import TPESampler

sys.path.append(os.path.abspath("../src"))

from configs import (
    STORAGE_URL,
)

from trainer import BaseClassifyTrainerV4,MultiObjClassifyTrainerV4
from objective import DefaultClassifyV4, CustomObjectiveImbaClassify, MultiObjectiveImbaClassify

### Multi-objective search

In [17]:
# Multi-objective hyper-parameter search: one MLflow experiment / Optuna study
# per (dataset, target) pair, with two objectives that are both maximised
# ("minority_recall" and "balanced_accuracy").

# Study configuration - identical for every dataset/target, so set once.
MODULE_NAME = "Classify"
PROJECT_CODE = "X24-028"
SEED = 42
N_STARTUP_TRIALS = 150
N_TOTAL_TRIALS = 400

for dataset in datasets:

    print(f'Processing {dataset} dataset')
    X_train = datasets[dataset]['X_train']
    y_train = datasets[dataset]['y_train']
    X_test = datasets[dataset]['X_test']
    y_test = datasets[dataset]['y_test']

    for param in remaining_params:

        print(f'Training model for {param}')

        output_dir = Path(f"../models/imba_constrained/{dataset}/{param}")
        output_dir.mkdir(parents=True, exist_ok=True)

        # check if model exists
        # NOTE(review): nothing in this cell writes trainer.pkl, so this guard
        # only triggers if another step creates the file - confirm.
        if (output_dir / "trainer.pkl").exists():
            print("Model exists, skipping training")
            continue
        print("Model does not exist, training")

        selected_y_train = y_train[param]
        selected_y_test = y_test[param]

        # Reuse a previously saved label encoder when there is one so class
        # ids stay stable between runs; otherwise fit a new one on the
        # training labels.
        encoder_path = output_dir / "encoder.pkl"
        encoder_is_new = not encoder_path.exists()
        if encoder_is_new:
            encoder = LabelEncoder().fit(selected_y_train)
        else:
            # pickle executes code on load: only use files written by this
            # notebook. The context manager closes the handle deterministically.
            with encoder_path.open("rb") as encoder_file:
                encoder = pkl.load(encoder_file)

        selected_y_train_encoded = pd.Series(
            encoder.transform(selected_y_train),
            index=selected_y_train.index,
        )
        selected_y_test_encoded = pd.Series(
            encoder.transform(selected_y_test),
            index=selected_y_test.index,
        )

        if encoder_is_new:
            # Persist only after both splits were encoded successfully; the
            # `with` block guarantees the file is flushed and closed.
            with encoder_path.open("wb") as encoder_file:
                pkl.dump(encoder, encoder_file)

        MLFLOW_EXPERIMENT_NAME = f"{MODULE_NAME}_{PROJECT_CODE}_{dataset}_{param}_multi_imba"
        OPTUNA_STUDY_NAME = f"{MLFLOW_EXPERIMENT_NAME}_RUN-2"

        # A fresh, seeded sampler per study.
        sampler = TPESampler(seed=SEED,
                            n_startup_trials=N_STARTUP_TRIALS,
                            multivariate=True,
                            warn_independent_sampling=False)

        # NOTE(review): the objective receives the test split as well; if it
        # scores trials on it, the reported test metrics are optimistic -
        # confirm in objective.py.
        objective = MultiObjectiveImbaClassify(
            X_train = X_train,
            y_train = selected_y_train_encoded,
            X_test = X_test,
            y_test = selected_y_test_encoded,

        )
        directions = ["maximize","maximize"] # depends on your objective
        metric_names = ["minority_recall", "balanced_accuracy"]

        classify_trainer = MultiObjClassifyTrainerV4(
            mlflow_experiment_name = MLFLOW_EXPERIMENT_NAME,
            optuna_study_name = OPTUNA_STUDY_NAME,
            optuna_storage_url = STORAGE_URL,
            sampler = sampler,
            n_total_trials = N_TOTAL_TRIALS,
            objective = objective,
            direction = directions,
            metric_name = metric_names,
            seed = SEED,
            # Tags metric_1, metric_2, ... in the order of metric_names.
            additional_tags = {
                f'metric_{position}': name
                for position, name in enumerate(metric_names, start=1)
            },
            enable_checkpointing=True,  # Enable the new feature
            sampler_checkpoint_path=None,
            encoder = encoder,
        )

        classify_trainer.run(save_best_model=True)

        # pkl.dump(best_model, open(output_dir / "trainer.pkl", "wb"))
        # pkl.dump(chosen_pipeline, open(output_dir / "origin_model_prediction.pkl", "wb"))


Processing nondeo dataset
Training model for Acetic
Model does not exist, training




In [25]:
# Show the trials the study reports as best. The output below is a list of
# optuna FrozenTrial objects with two objective values each.
# NOTE(review): presumably this is the Pareto front of the multi-objective
# study - confirm against what get_study() returns in trainer.py.
classify_trainer.get_study().best_trials

[FrozenTrial(number=177, state=TrialState.COMPLETE, values=[0.8658536585365854, 0.7111876988335101], datetime_start=datetime.datetime(2025, 4, 9, 21, 12, 17, 224706), datetime_complete=datetime.datetime(2025, 4, 9, 21, 13, 6, 945042), params={'mask': 0, 'preprocessor': 'SNV-SG', 'window': 11, 'deriv': 1, 'polyorder': 3, 'model': 'SVC(RBF)', 'C': 1.8335503342186157, 'gamma': 0.9644095683465055, 'sampler_method': 'Under', 'sampler_Under': 'ClusterCentroids', 'dim_red': True, 'dim_red_method': 'umap', 'dim_red__n_components': 5, 'dim_red__n_neighbors': 19}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'mask': CategoricalDistribution(choices=(0, 1, 2)), 'preprocessor': CategoricalDistribution(choices=('SNV', 'SG', 'SNV-SG')), 'window': IntDistribution(high=29, log=False, low=5, step=2), 'deriv': IntDistribution(high=2, log=False, low=1, step=1), 'polyorder': IntDistribution(high=3, log=False, low=2, step=1), 'model': CategoricalDistribution(choices=('DecisionTree'

In [26]:
# Tabulate the objective values of the best trials, one row per trial.
# The row label is the trial's position in `best_trials` (sorting keeps the
# labels), so a row can be mapped back with best_trials[<label>].
# Columns 0 and 1 are the objective values in the order the study returns them
# (presumably minority_recall, balanced_accuracy - confirm).
best_trials = classify_trainer.get_study().best_trials
objective_values = np.array([candidate.values for candidate in best_trials])
results = pd.DataFrame(objective_values).sort_values(by=[0, 1], ascending=False)
results

Unnamed: 0,0,1
0,0.865854,0.711188
1,0.841463,0.72508


In [27]:
# Hyper-parameters of the trial at position 1 of `best_trials` (row label 1 in
# the `results` table above).
# NOTE(review): the index is hard-coded; the order/length of `best_trials` can
# change when the study gains trials, so re-check `results` before reusing it.
best_params = best_trials[1].params
best_params

{'mask': 0,
 'preprocessor': 'SNV-SG',
 'window': 9,
 'deriv': 1,
 'polyorder': 3,
 'model': 'SVC(RBF)',
 'C': 1.5040755926593568,
 'gamma': 0.035052012217756216,
 'sampler_method': 'Under',
 'sampler_Under': 'ClusterCentroids',
 'dim_red': True,
 'dim_red_method': 'umap',
 'dim_red__n_components': 10,
 'dim_red__n_neighbors': 18}

In [28]:
# Hand the manually chosen parameter set to the trainer's save_best_model().
# NOTE(review): what gets written, and where, is defined in ../src/trainer.py -
# not visible from this notebook.
classify_trainer.save_best_model(best_params)



### Single-objective search

In [None]:
# Single-objective hyper-parameter search: one MLflow experiment / Optuna study
# per (dataset, target) pair, maximising "minority_recall".

# Study configuration - identical for every dataset/target, so set once.
MODULE_NAME = "Classify"
PROJECT_CODE = "X24-028"
SEED = 42
N_STARTUP_TRIALS = 100
N_TOTAL_TRIALS = 200

for dataset in datasets:

    print(f'Processing {dataset} dataset')
    X_train = datasets[dataset]['X_train']
    y_train = datasets[dataset]['y_train']
    X_test = datasets[dataset]['X_test']
    y_test = datasets[dataset]['y_test']

    for param in remaining_params:

        print(f'Training model for {param}')

        output_dir = Path(f"../models/imba_constrained/{dataset}/{param}")
        output_dir.mkdir(parents=True, exist_ok=True)

        # check if model exists
        # NOTE(review): nothing in this cell writes trainer.pkl, so this guard
        # only triggers if another step creates the file - confirm.
        if (output_dir / "trainer.pkl").exists():
            print("Model exists, skipping training")
            continue
        print("Model does not exist, training")

        selected_y_train = y_train[param]
        selected_y_test = y_test[param]

        # Reuse a previously saved label encoder when there is one so class
        # ids stay stable between runs; otherwise fit a new one on the
        # training labels.
        encoder_path = output_dir / "encoder.pkl"
        encoder_is_new = not encoder_path.exists()
        if encoder_is_new:
            encoder = LabelEncoder().fit(selected_y_train)
        else:
            # pickle executes code on load: only use files written by this
            # notebook. The context manager closes the handle deterministically.
            with encoder_path.open("rb") as encoder_file:
                encoder = pkl.load(encoder_file)

        selected_y_train_encoded = pd.Series(
            encoder.transform(selected_y_train),
            index=selected_y_train.index,
        )
        selected_y_test_encoded = pd.Series(
            encoder.transform(selected_y_test),
            index=selected_y_test.index,
        )

        if encoder_is_new:
            # Persist only after both splits were encoded successfully; the
            # `with` block guarantees the file is flushed and closed.
            with encoder_path.open("wb") as encoder_file:
                pkl.dump(encoder, encoder_file)

        MLFLOW_EXPERIMENT_NAME = f"{MODULE_NAME}_{PROJECT_CODE}_{dataset}_{param}_imba-constrained"
        OPTUNA_STUDY_NAME = f"{MLFLOW_EXPERIMENT_NAME}_RUN-1_PR70"

        # A fresh, seeded sampler per study.
        sampler = TPESampler(seed=SEED,
                            n_startup_trials=N_STARTUP_TRIALS,
                            multivariate=True,
                            warn_independent_sampling=False)

        # NOTE(review): the objective receives the test split as well; if it
        # scores trials on it, the reported test metrics are optimistic -
        # confirm in objective.py.
        objective = CustomObjectiveImbaClassify(
            X_train = X_train,
            y_train = selected_y_train_encoded,
            X_test = X_test,
            y_test = selected_y_test_encoded,

        )
        direction = "maximize" # depends on your objective
        metric_name = ["minority_recall"]

        classify_trainer = BaseClassifyTrainerV4(
            mlflow_experiment_name = MLFLOW_EXPERIMENT_NAME,
            optuna_study_name = OPTUNA_STUDY_NAME,
            optuna_storage_url = STORAGE_URL,
            sampler = sampler,
            n_total_trials = N_TOTAL_TRIALS,
            objective = objective,
            direction = direction,
            metric_name = metric_name,
            seed = SEED,
            additional_tags = {'metric': metric_name[0]},
            enable_checkpointing=True,  # Enable the new feature
            sampler_checkpoint_path=None,
            encoder = encoder,
        )

        classify_trainer.run(save_best_model=True)

        # pkl.dump(best_model, open(output_dir / "trainer.pkl", "wb"))
        # pkl.dump(chosen_pipeline, open(output_dir / "origin_model_prediction.pkl", "wb"))


In [None]:
# Load a pickled model and list the names of its pipeline steps.
# pickle executes code on load - only open files produced by this project.
model_path = Path("../models/model.pkl")

with model_path.open("rb") as f:
    model = pkl.load(f)

model.named_steps.keys()

In [None]:
import optuna
# List the names of all studies stored in the Optuna storage at STORAGE_URL
# (STORAGE_URL is imported from the project's `configs` module).
optuna.get_all_study_names(storage=STORAGE_URL)

# download models from mlflow

In [23]:
import mlflow
import os

# Get the client used for all artifact listing/download calls in this cell.
# The artifacts of the run are listed further down, once `run_id` has been
# defined; listing them here would raise NameError on a fresh kernel.
client = mlflow.tracking.MlflowClient()

# Function to recursively download artifacts
def download_artifacts(client, run_id, artifact_path, local_dir):
    """Recursively download one artifact (file or directory) of an MLflow run.

    Args:
        client: Object exposing ``list_artifacts(run_id, path)`` and
            ``download_artifacts(run_id, path, dst_path=...)`` (an
            ``MlflowClient`` in this notebook).
        run_id: ID of the run that owns the artifact.
        artifact_path: Run-relative path of the artifact to download.
        local_dir: Local root directory handed to the client as ``dst_path``.
            It is created if missing.

    Returns:
        None. One "Downloaded ..." line is printed per downloaded leaf.
    """
    # List all sub-artifacts in the given artifact_path
    children = client.list_artifacts(run_id, artifact_path)

    if children:
        # Directory: mirror each child's parent folder locally, then recurse.
        for child in children:
            child_parent_dir = os.path.dirname(child.path)
            os.makedirs(os.path.join(local_dir, child_parent_dir), exist_ok=True)
            download_artifacts(client, run_id, child.path, local_dir)
        return

    # Leaf (nothing listed underneath it). The destination root itself must
    # exist before downloading; creating only its parent is not enough, and
    # os.makedirs("") fails outright for a bare relative directory name.
    os.makedirs(local_dir, exist_ok=True)

    # Download the file below the local root
    client.download_artifacts(run_id, artifact_path, dst_path=local_dir)
    print(f"Downloaded {artifact_path} to {local_dir}")


# Specify the experiment and run ID
experiment_id = "754667177352290429"
run_id = "1e7f970f6a8c4d07bea3eca411fb6dbc"
# Specify where you want to download the artifacts locally (this is the root directory)
param = "Acetic"
local_directory = Path (f"../models/cargill/{param}-test")

# Create the local directory if it doesn't exist
os.makedirs(local_directory, exist_ok=True)

# List all top-level artifacts in the run
artifacts = client.list_artifacts(run_id)

# Download all artifacts, including those in directories
for artifact in artifacts:
    print(artifact.path)
    download_artifacts(client, run_id, artifact.path, local_directory)

confusion_matrix_test_lot.png


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloaded confusion_matrix_test_lot.png to ..\models\cargill\Acetic-test
confusion_matrix_test_mode.png


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloaded confusion_matrix_test_mode.png to ..\models\cargill\Acetic-test
confusion_matrix_train.png


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloaded confusion_matrix_train.png to ..\models\cargill\Acetic-test
deployment_pipeline


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloaded deployment_pipeline/origin_model_prediction.pkl to ..\models\cargill\Acetic-test
model


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloaded model/MLmodel to ..\models\cargill\Acetic-test


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloaded model/conda.yaml to ..\models\cargill\Acetic-test


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloaded model/metadata/MLmodel to ..\models\cargill\Acetic-test


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloaded model/metadata/conda.yaml to ..\models\cargill\Acetic-test


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloaded model/metadata/python_env.yaml to ..\models\cargill\Acetic-test


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloaded model/metadata/requirements.txt to ..\models\cargill\Acetic-test


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloaded model/model.pkl to ..\models\cargill\Acetic-test


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloaded model/python_env.yaml to ..\models\cargill\Acetic-test


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloaded model/requirements.txt to ..\models\cargill\Acetic-test


# check deployment pipeline pkl

In [27]:
# Load the pickled deployment pipeline for inspection. The context manager
# closes the file handle instead of leaving it to the garbage collector.
# pickle executes code on load - only open files produced by this project.
deploy_pipeline_path = Path("../models/cargill/Ash/deployment_pipeline/origin_model_prediction.pkl")
with deploy_pipeline_path.open("rb") as pipeline_file:
    deploypipeline = pkl.load(pipeline_file)
deploypipeline

# MLflow checks

In [71]:
from mlflow.tracking import MlflowClient
import mlflow

# Rebinds `client` to a new MlflowClient created with default arguments.
client = MlflowClient()
# All experiments returned by the client's search (no arguments passed).
experiments = client.search_experiments()

# Print the ID and name of every experiment, one per line.
for exp in experiments:
    print(f"Experiment ID: {exp.experiment_id}, Name: {exp.name}")


Experiment ID: 620768088714454255, Name: Classify_X24-028_nondeo_Sensory Result_multi_imba
Experiment ID: 540546849373847762, Name: Classify_X24-028_nondeo_Sour_multi_imba
Experiment ID: 818116591606453438, Name: Classify_X24-028_nondeo_Smoky_multi_imba
Experiment ID: 428823008068037852, Name: Classify_X24-028_nondeo_Putrid_multi_imba
Experiment ID: 514637445091160990, Name: Classify_X24-028_nondeo_Petroleum (Motor Oil-Rubber)_multi_imba
Experiment ID: 896108498091431601, Name: Classify_X24-028_nondeo_Painty (Solvent-Chemical)_multi_imba
Experiment ID: 939845810902120851, Name: Classify_X24-028_nondeo_Oxidized (Old Butter-Stale-Cardboard)_multi_imba
Experiment ID: 914756838721971246, Name: Classify_X24-028_nondeo_Musty_multi_imba
Experiment ID: 234111042273656590, Name: Classify_X24-028_nondeo_Fishy_multi_imba
Experiment ID: 411030358396738309, Name: Classify_X24-028_nondeo_Earthy_multi_imba
Experiment ID: 877148317299425469, Name: Classify_X24-028_nondeo_Astringent_multi_imba
Experime