# In [None]:
import numpy as np
import pandas as pd
import matplotlib as mlp
import matplotlib.pyplot as plt
import ast  # Import the Abstract Syntax Trees (ast) module
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification

from sklearn.ensemble import RandomForestClassifier

from skactiveml.classifier import SklearnClassifier
from skactiveml.regressor import NICKernelRegressor
from skactiveml.pool import UncertaintySampling, ExpectedModelVarianceReduction
from skactiveml.pool import RandomSampling
from skactiveml.utils import unlabeled_indices, labeled_indices, MISSING_LABEL
from skactiveml.visualization import plot_decision_boundary, plot_utilities

from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV


import warnings
mlp.rcParams["figure.facecolor"] = "white"
warnings.filterwarnings("ignore")

import numpy as np
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.datasets import make_blobs
from skactiveml.pool import UncertaintySampling
from skactiveml.utils import unlabeled_indices, MISSING_LABEL
from skactiveml.classifier import SklearnClassifier

from sklearn.model_selection import train_test_split

from tqdm import tqdm

# Hyperparameter search space handed to RandomizedSearchCV when tuning the
# random forest. Each entry lists the candidate values for one setting;
# single-element lists pin that setting to a fixed value.
param_distributions = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
    n_jobs=[10],
    random_state=[42],
)

def generate_features(df, featurespace= "mordred_descriptors"):
    """Extract the feature matrix and label vector from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding a ``'Label'`` column and the column named by
        ``featurespace``.
    featurespace : str
        Name of the column whose per-row values become the rows of ``X``.

    Returns
    -------
    tuple
        ``(X, y)`` as NumPy arrays built from the column values converted
        to Python lists.
    """
    feature_rows = df[featurespace].tolist()
    label_values = df['Label'].tolist()
    return np.asarray(feature_rows), np.asarray(label_values)

def datasplits(X, y):
    """Build a partially labelled training set and a fully labelled test set.

    Rows whose label is NaN are treated as unlabelled. Only the labelled rows
    take part in the stratified 75/25 train/test split (fixed
    ``random_state=24``); all unlabelled rows are then appended after the
    labelled training rows. The test set therefore never contains NaN labels.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, indexed along axis 0 in step with ``y``.
    y : numpy.ndarray
        Label vector; NaN marks an unlabelled row.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    has_label = ~np.isnan(y)
    lacks_label = ~has_label

    # Stratified split restricted to the rows that actually carry a label.
    X_labelled_train, X_test, y_labelled_train, y_test = train_test_split(
        X[has_label],
        y[has_label],
        test_size=0.25,
        random_state=24,
        stratify=y[has_label],
    )

    # Training set = labelled training rows followed by every unlabelled row.
    X_train = np.concatenate((X_labelled_train, X[lacks_label]), axis=0)
    y_train = np.concatenate((y_labelled_train, y[lacks_label]), axis=0)

    # Sanity output: shapes and NaN counts of both partitions.
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

    print(f"Number of NaNs in y_test: {np.isnan(y_test).sum()}")  # expected to be 0
    print(f"Number of NaNs in y_train: {np.isnan(y_train).sum()}")  # all unlabelled rows end up here

    return (X_train, y_train, X_test, y_test)


def initial_model(X_train, y_train, X_test, y_test):
    """Tune, fit and score a baseline random forest on the labelled rows only.

    The rows of ``X_train`` whose label is not NaN are used to run a
    ``RandomizedSearchCV`` (10 sampled settings from ``param_distributions``,
    5-fold CV, balanced-accuracy scoring). The best estimator is wrapped in a
    ``SklearnClassifier``, refitted on the same labelled rows and evaluated on
    the test set. Class counts, balanced accuracy and AUC are printed.

    Parameters
    ----------
    X_train, y_train : numpy.ndarray
        Training features and labels; NaN labels are ignored here.
    X_test, y_test : numpy.ndarray
        Held-out features and labels used for scoring. The unique values of
        ``y_test`` are passed as the classifier's ``classes``.

    Returns
    -------
    None
    """
    has_label = ~np.isnan(y_train)
    X_labelled = X_train[has_label]
    y_labelled = y_train[has_label]

    # Class distribution of the labelled training rows.
    class_values, class_counts = np.unique(y_labelled, return_counts=True)
    value_counts = dict(zip(class_values, class_counts))
    print("Total data value counrs : " , value_counts)

    searcher = RandomizedSearchCV(
        RandomForestClassifier(random_state=42),
        param_distributions,
        n_iter=10,
        scoring='balanced_accuracy',
        cv=5,
        n_jobs=-1,
        random_state=42,
    )
    searcher.fit(X_labelled, y_labelled)

    # Wrap the tuned forest so it exposes the scikit-activeml classifier API.
    tuned_clf = SklearnClassifier(
        searcher.best_estimator_,
        classes=np.unique(y_test),
        random_state=0,
    )
    tuned_clf.fit(X_labelled, y_labelled)

    y_pred = tuned_clf.predict(X_test)
    # Column 1 is used as the positive-class score (assumes a binary task).
    y_proba = tuned_clf.predict_proba(X_test)[:, 1]

    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_proba)

    print(f'The balanced accuracy score is {balanced_accuracy}.')
    print(f'The AUC score is {auc_score}.')

    return None

# Load the feature table; the two feature columns are stored as the string
# representation of Python literals and are parsed back with
# ast.literal_eval.
print("Reading data")
df = pd.read_csv("data/DIRIL_features_v2.csv.gz", compression="gzip")
for _feature_column in ('morgan_fingerprint', 'mordred_descriptors'):
    df[_feature_column] = df[_feature_column].apply(ast.literal_eval)

def active_learning(featurespace ="mordred_descriptors" ,n_cycles=1000):
    """Run an uncertainty-sampling loop and plot test metrics per cycle.

    Reads the module-level ``df``, builds ``X``/``y`` from the requested
    feature column, splits them with ``datasplits`` and prints a baseline via
    ``initial_model``. Each cycle then:

    1. scores the current classifier on the test set (balanced accuracy and
       ROC-AUC) and records the number of labelled training rows,
    2. queries one unlabelled training row with entropy-based
       ``UncertaintySampling``,
    3. assigns that row the label predicted by the current classifier,
    4. re-tunes a random forest with ``RandomizedSearchCV`` on all labelled
       rows and refits the wrapped best estimator on the full training set.

    NOTE(review): step 3 labels the queried row with the model's own
    prediction rather than an oracle label, i.e. this is self-training.
    Confirm this is intended.

    Two figures (balanced accuracy and AUC against labelled-sample count) are
    saved under ``./results_plots/`` and shown.

    Parameters
    ----------
    featurespace : str
        Column of ``df`` to use as features; also embedded in the output
        file names.
    n_cycles : int
        Maximum number of query cycles. The loop stops early once no
        unlabelled training rows remain.

    Returns
    -------
    None
    """
    # Function-scope import keeps this block self-contained.
    from pathlib import Path

    # Honour the requested feature space (previously hard-coded to
    # "mordred_descriptors", so every run used the same features).
    X, y = generate_features(df, featurespace=featurespace)

    X_train, y_train, X_test, y_test = datasplits(X, y)

    # Initial (untuned) classifier and the query strategy.
    clf = SklearnClassifier(
        RandomForestClassifier(random_state=42, n_jobs=10),
        classes=np.unique(y_test),
        random_state=0,
    )
    qs = UncertaintySampling(method='entropy')

    initial_model(X_train, y_train, X_test, y_test)

    accuracy_scores = []
    auc_scores = []
    sample_counts = []

    labelled_mask = ~np.isnan(y_train)
    clf.fit(X_train[labelled_mask], y_train[labelled_mask])

    for _ in tqdm(range(n_cycles), desc='Processing cycles'):

        # --- evaluate the current model on the held-out test set ---
        y_pred = clf.predict(X_test)
        # Column 1 is used as the positive-class score (assumes a binary task).
        y_proba = clf.predict_proba(X_test)[:, 1]

        accuracy_scores.append(balanced_accuracy_score(y_test, y_pred))
        auc_scores.append(roc_auc_score(y_test, y_proba))
        sample_counts.append(len(labeled_indices(y_train)))

        # Nothing left to query: stop instead of asking the strategy to pick
        # from an empty pool.
        if len(unlabeled_indices(y_train)) == 0:
            break

        # --- query one sample and label it with the model's prediction ---
        query_idx = qs.query(X=X_train, y=y_train, clf=clf, batch_size=1)
        y_train[query_idx] = clf.predict(X_train[query_idx])

        # --- re-tune hyperparameters on everything labelled so far ---
        labelled_mask = ~np.isnan(y_train)
        searcher = RandomizedSearchCV(
            RandomForestClassifier(random_state=42),
            param_distributions,
            n_iter=10,
            scoring='balanced_accuracy',
            cv=5,
            n_jobs=-1,
            random_state=42,
        )
        searcher.fit(X_train[labelled_mask], y_train[labelled_mask])

        clf = SklearnClassifier(
            searcher.best_estimator_,
            classes=np.unique(y_test),
            random_state=0,
        )
        # The full y_train (including NaN entries) is passed to the
        # scikit-activeml wrapper here, as in the original code.
        clf.fit(X_train, y_train)

    # Make sure the output directory exists so a long run is not lost to a
    # FileNotFoundError at save time.
    Path('./results_plots').mkdir(parents=True, exist_ok=True)

    plt.figure(figsize=(10, 5))
    plt.plot(sample_counts, accuracy_scores, marker='o', label='Balanced Accuracy')
    plt.xlabel('Number of Labeled Samples + Metabolites')
    plt.ylabel('Balanced Accuracy Score')
    plt.grid(True)
    plt.legend()
    plt.savefig(f'./results_plots/accuracy_vs_samples{featurespace}.png')
    plt.show()

    plt.figure(figsize=(10, 5))
    # Legend label fixed: this curve is the AUC, not balanced accuracy.
    plt.plot(sample_counts, auc_scores, marker='o', label='AUC-ROC')
    plt.xlabel('Number of Labeled Samples + Metabolites')
    plt.ylabel('AUCROC')
    plt.grid(True)
    plt.legend()
    plt.savefig(f'./results_plots/auc_vs_samples{featurespace}.png')
    plt.show()

    return None

# Run the experiment once per feature representation, 1000 cycles each.
print("Runnin Models")
for _feature_space in ("mordred_descriptors", "morgan_fingerprint"):
    active_learning(featurespace=_feature_space, n_cycles=1000)