In [11]:
import os
import pickle
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
from tqdm import tqdm

from utils.ml_train import train_models
from utils.performance import calculate_metrics

# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Date stamp used in the result file names. The clock is read exactly once so
# that year, month and day always describe the same calendar day (three
# separate datetime.now() calls can disagree if the cell runs across midnight).
today = datetime.now().date()
year, month, day = today.year, today.month, today.day

### Select feature set

In [None]:
# Feature set handed to train_models / calculate_metrics as `feats_col`
# and embedded in the result file names.
# Either "DRFP" or "QM"
features = "DRFP"

# Fail fast on a typo here instead of deep inside the multi-run training loop.
if features not in ("DRFP", "QM"):
    raise ValueError(f'features must be "DRFP" or "QM", got {features!r}')

In [None]:
# Load the pickled dataset that every later step reads from.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
pickle_path = "data/HTE_BH_10_iter_9-3_Clusters_v2025-11.pkl"
with open(pickle_path, 'rb') as pickle_file:
    df_HTE_BH = pickle.load(pickle_file)

# Models to train and the per-run metrics recorded for each of them
model_names = ["rf_model", "gradient_boosting_model", "lr_model", "mlp_model", "knn_model", "gaussianNB_model"]
metrics_columns = ['Balanced Accuracy', 'ROC AUC', 'F1 Score', 'F0 Score',
                   'AU-PR-C', 'Precision 1', 'Recall 1']

# Every distinct data source is used as a training set in turn
train_sources = list(df_HTE_BH.Source.unique())

# One row per (training source, model), grouped by source: the rows for a given
# source are contiguous and follow the order of model_names.
model_source = [name for _ in train_sources for name in model_names]
train_data_source = [source for source in train_sources for _ in model_names]
# The label below is written verbatim to the results CSV (spelling kept as is).
test_data_source = ["Outter Clusters"] * len(model_source)

# Column layout: identifiers, raw per-run metric lists, then an Avg/Std pair per metric
summary_columns = [f"{metric} {stat}" for metric in metrics_columns for stat in ("Avg", "Std")]
df_perf = pd.DataFrame(columns=['Model', 'train_data', 'test_data', *metrics_columns, *summary_columns])

df_perf['Model'] = model_source
df_perf['train_data'] = train_data_source
df_perf['test_data'] = test_data_source

# Each metric cell starts as its own empty list that collects one value per run
for metric in metrics_columns:
    df_perf[metric] = [[] for _ in df_perf.index]

# Train and evaluate every model on every source, once per run. Each run reads
# its own "iteration_{run} cluster" column and shuffles the training rows with
# its own seed. Per-run metrics are appended to the list cells of df_perf and
# collapsed into Avg/Std columns once all runs have finished.
N_RUNS = 10
# Clusters holding more than this many training rows are excluded from evaluation
MAX_TRAIN_CLUSTER_SIZE = 30

# (train source, test label) pairs: df_perf has len(model_names) consecutive
# rows per source, so every len(model_names)-th entry yields each pair once.
source_pairs = list(zip(train_data_source[0::len(model_names)],
                        test_data_source[0::len(model_names)]))


def _perf_rows(model, train_source, test_source):
    """Return the boolean mask of df_perf rows for one (model, train, test) combination."""
    return ((df_perf["Model"] == model) &
            (df_perf["train_data"] == train_source) &
            (df_perf["test_data"] == test_source))


for run in tqdm(range(N_RUNS), desc="Total Runs"):

    print(f"Starting Run {run + 1}")

    # The run index doubles as the shuffle seed, so every run is reproducible
    random_state_key = run
    cluster_col = f"iteration_{run} cluster"

    for main_source, test_source in source_pairs:

        print(f"Training on {main_source} and testing on {test_source} for run {run + 1}")

        # Training data: all rows of the current source
        df_train = df_HTE_BH[df_HTE_BH["Source"] == main_source]

        # Evaluation data: rows (from any source) whose cluster is NOT one of the
        # clusters that hold more than MAX_TRAIN_CLUSTER_SIZE training rows
        train_cluster_counts = df_train[cluster_col].value_counts()
        large_train_clusters = list(train_cluster_counts[train_cluster_counts > MAX_TRAIN_CLUSTER_SIZE].index)
        df_eval = df_HTE_BH[~df_HTE_BH[cluster_col].isin(large_train_clusters)]

        # Ensure aryls and amines seen in training never appear in evaluation
        df_eval = df_eval[~df_eval['Aryl SMILES'].isin(list(df_train['Aryl SMILES'].unique()))]
        df_eval = df_eval[~df_eval['Amine SMILES'].isin(list(df_train['Amine SMILES'].unique()))]

        # Shuffle the training rows with this run's seed
        df_train = df_train.sample(frac=1, random_state=random_state_key)

        # Train models; the second value returned by calculate_metrics is not used here
        models, scaler = train_models(df_train, model_names, feats_col=features)
        metrics, _ = calculate_metrics(models, scaler, df_eval, feats_col=features)

        # Append this run's value to the list stored in each matching cell.
        # The cells hold list objects, so appending mutates df_perf in place.
        for model in models.keys():
            rows = _perf_rows(model, main_source, test_source)
            for metric in metrics[model].keys():
                for run_values in df_perf.loc[rows, metric]:
                    run_values.append(metrics[model][metric])

# After all runs: reduce each list of per-run values to its mean and standard
# deviation (np.std default, i.e. population std with ddof=0).
for model in model_names:
    for main_source, test_source in source_pairs:
        rows = _perf_rows(model, main_source, test_source)
        for metric in metrics_columns:
            metric_values = df_perf.loc[rows, metric].values[0]
            if isinstance(metric_values, list):
                df_perf.loc[rows, f"{metric} Avg"] = np.mean(metric_values)
                df_perf.loc[rows, f"{metric} Std"] = np.std(metric_values)

# Pick, for every training source, the model with the highest combined
# ROC AUC + F1 average, then export the full table and the winners.
sel_metrics = ['ROC AUC Avg', 'F1 Score Avg']

# df_perf was built from an empty frame, so the Avg columns may be object
# dtype; cast to float so the row sum and idxmax are numeric on every pandas
# version (idxmax rejects object dtype on some of them).
selection_score = df_perf[sel_metrics].astype(float).sum(axis=1)

# Index label of the best-scoring row within each training source.
# Grouping the score Series (instead of groupby(...).apply on the frame) does
# not depend on whether apply keeps the grouping column in its result.
best_rows = selection_score.groupby(df_perf['train_data']).idxmax()

best_columns = ['Model', 'train_data', 'test_data', 'Balanced Accuracy Avg', 'Balanced Accuracy Std', 'ROC AUC Avg',
                'ROC AUC Std', 'F1 Score Avg', 'F1 Score Std', 'F0 Score Avg',
                'F0 Score Std', 'AU-PR-C Avg', 'AU-PR-C Std', 'Precision 1 Avg',
                'Precision 1 Std', 'Recall 1 Avg', 'Recall 1 Std']

best_performance = df_perf.loc[best_rows, best_columns].reset_index(drop=True)

# Create the output directory first so the run cannot fail at the final write
os.makedirs("results", exist_ok=True)

df_perf.to_csv(f"results/Data_Source_{features}_all_{year}-{month}-{day}.csv")
best_performance.to_csv(f"results/Data_Source_{features}_best_{year}-{month}-{day}.csv")