# Model Development

In [None]:
%%capture
# mount your google drive to this notebook
from google.colab import drive
drive.flush_and_unmount()
drive.mount("mnt", force_remount=True)

In [None]:
%%capture
!pip install shap

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
import shap

In [None]:
df = pd.read_csv("mnt/MyDrive/NASA/augmented_gene_expression_data.csv")
results_df = pd.DataFrame(columns=['Model', 'Target Variable', 'MSE', 'RMSE', 'MAE', 'R2', 'R', "Selected Features", "Sorted Shap Values"])
df.drop(columns=['Unnamed: 0'], inplace=True)
target_columns = df.columns[-2:].to_list()

print("Dataframe shape:", df.shape)

# Extract environments from source_name
df['Environment'] = df['source_name'].apply(lambda x: f"{x.split('_')[0]}_{x.split('_')[-1]}" if x.count('_') == 2 else x.split('_')[0])
environments = df['Environment'].unique()
environments

Dataframe shape: (264, 12247)


array(['Earth', 'SFug', 'SF1g', 'Earth_log', 'SFug_log', 'SF1g_log',
       'Earth_sqrt', 'SFug_sqrt', 'SF1g_sqrt', 'Earth_boxcox',
       'SFug_boxcox', 'SF1g_boxcox', 'Earth_clr', 'SFug_clr', 'SF1g_clr',
       'Earth_deseq2', 'SFug_deseq2', 'SF1g_deseq2', 'Earth_zscore',
       'SFug_zscore', 'SF1g_zscore', 'Earth_quantile', 'SFug_quantile',
       'SF1g_quantile', 'Earth_minmax', 'SFug_minmax', 'SF1g_minmax',
       'Earth_robust', 'SFug_robust', 'SF1g_robust', 'Earth_yeojohnson',
       'SFug_yeojohnson', 'SF1g_yeojohnson'], dtype=object)

In [None]:
raw_environments = np.array([x.split('_')[0] for x in environments])
raw_environments = np.unique(raw_environments)
raw_environments

array(['Earth', 'SF1g', 'SFug'], dtype='<U5')

In [None]:
synth_environments = np.array([env for env in environments if env not in environments])
synth_environments

array([], dtype=float64)

In [None]:
target_columns

['th_positive_cells', 'repo_glial_cells']

In [None]:
target_variables = df[target_columns]
target_variables.describe()

Unnamed: 0,th_positive_cells,repo_glial_cells
count,264.0,264.0
mean,24.391343,93.868016
std,39.085647,192.667465
min,-5.199338,-5.199338
25%,0.082538,-0.007168
50%,1.141754,1.062169
75%,45.651284,27.588908
max,120.545144,810.810964


In [None]:
drop_columns = target_columns.copy()
drop_columns.extend(['Environment', 'source_name'])
drop_columns

['th_positive_cells', 'repo_glial_cells', 'Environment', 'source_name']

In [None]:
def ann_regr(data, target_column, n_features=50):

    # Split the data
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data["Environment"])
    val_data, train_data = train_test_split(train_data, train_size=0.1625, random_state=42, stratify=train_data['Environment'])

    # Feature selection using RFE on validation data
    X_val = val_data.drop(columns=drop_columns)
    y_val = val_data[target_column]

    selector = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=n_features, step=10)
    selector = selector.fit(X_val, y_val)

    # Select the important features
    X_val_selected = selector.transform(X_val)

    # Scale the features
    scaler = StandardScaler()
    X_val_scaled = scaler.fit_transform(X_val_selected)

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'hidden_layer_sizes': [(64, 32, 16), (128, 64, 32), (32, 16, 8)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive'],
        'max_iter': [800, 1200, 1600]
    }

    # Initialize the MLP model
    mlp = MLPRegressor(random_state=42)

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
    grid_search.fit(X_val_scaled, y_val)

    # Get the best model
    best_mlp = grid_search.best_estimator_

    # k-fold cross validation, trains in one, tests in another. eliminates biases
    for e_train in environments:
        for e_test in environments:

            if e_train == e_test:
                continue

            print("Training in raw environment:", e_train)
            print("Testing in transformed environment:", e_test)

            train_data_env = train_data[train_data['Environment'] == e_train]
            test_data_env = train_data[train_data['Environment'] == e_test]

            if train_data_env.empty or test_data_env.empty:
                continue

            X_train = train_data_env.drop(columns=drop_columns)
            y_train = train_data_env[target_column]

            X_test = test_data_env.drop(columns=drop_columns)
            y_test = test_data_env[target_column]

            X_train_selected = selector.transform(X_train)
            X_train_scaled = scaler.transform(X_train_selected)

            X_test_selected = selector.transform(X_test)
            X_test_scaled = scaler.transform(X_test_selected)

            # Train the model with the best parameters
            best_mlp.fit(X_train_scaled, y_train)

    # Final evaluation on the 20% test set
    X_test = test_data.drop(columns=drop_columns)
    y_test = test_data[target_column]

    X_test_selected = selector.transform(X_test)
    X_test_scaled = scaler.transform(X_test_selected)

    y_pred = best_mlp.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    r = np.sqrt(r2)

    selected_features = X_test.columns[selector.support_]

    # Compute SHAP values to get feature importance
    explainer = shap.KernelExplainer(best_mlp.predict, X_test_scaled)
    shap_values = explainer.shap_values(X_test_scaled)
    shap_importance = np.abs(shap_values).mean(axis=0)

    # Combine feature names with SHAP importance values
    shap_feature_importance = dict(zip(selected_features, shap_importance))

    # Sort features by SHAP importance, high to low
    sorted_shap_features = sorted(shap_feature_importance.items(), key=lambda x: x[1], reverse=True)
    sorted_shap_features_dict = dict(sorted_shap_features)

    # Store the final test results
    results_df.loc[len(results_df)] = [
        "ANN_Final_Test", target_column, mse, rmse, mae, r2, r, ','.join(selected_features),
        str(sorted_shap_features_dict)         # Store all SHAP values sorted by importance
    ]

    return results_df

In [None]:
def lr_regr(data, target_column, n_features=50):

    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data["Environment"])

    val_data, train_data = train_test_split(train_data, train_size=0.1625, random_state=42, stratify=train_data['Environment'])

    # Feature selection using RFE on validation data
    X_val = val_data.drop(columns=drop_columns)
    y_val = val_data[target_column]

    selector = RFE(estimator=LinearRegression(), n_features_to_select=n_features, step=10)
    selector = selector.fit(X_val, y_val)

    # Select the important features
    X_val_selected = selector.transform(X_val)

    # Scale the features
    scaler = StandardScaler()
    X_val_scaled = scaler.fit_transform(X_val_selected)

    # Initialize the Linear Regression model
    lr = LinearRegression()

    # No need for GridSearchCV, as LinearRegression has no hyperparameters to tune
    best_lr = lr

    # Training in raw environment and testing in transformed environment
    for e_train in environments:
        for e_test in environments:

            if e_train == e_test:
                continue

            print("Training in raw environment:", e_train)
            print("Testing in transformed environment:", e_test)

            train_data_env = train_data[train_data['Environment'] == e_train]
            test_data_env = train_data[train_data['Environment'] == e_test]

            if train_data_env.empty or test_data_env.empty:
                continue

            X_train = train_data_env.drop(columns=drop_columns)
            y_train = train_data_env[target_column]

            X_test = test_data_env.drop(columns=drop_columns)
            y_test = test_data_env[target_column]

            X_train_selected = selector.transform(X_train)
            X_train_scaled = scaler.transform(X_train_selected)

            X_test_selected = selector.transform(X_test)
            X_test_scaled = scaler.transform(X_test_selected)

            # Train the model with the selected features
            best_lr.fit(X_train_scaled, y_train)

    # Final evaluation on the 20% test set
    X_test = test_data.drop(columns=drop_columns)
    y_test = test_data[target_column]

    X_test_selected = selector.transform(X_test)
    X_test_scaled = scaler.transform(X_test_selected)

    y_pred = best_lr.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    r = np.sqrt(r2)

    selected_features = X_test.columns[selector.support_]

    # Extracting the coefficients (weights) of the linear regression model
    coefficients = best_lr.coef_
    coefficients_str = ','.join(map(str, coefficients))

    # Compute SHAP values to get feature importance
    explainer = shap.KernelExplainer(best_lr.predict, X_test_scaled)
    shap_values = explainer.shap_values(X_test_scaled)
    shap_importance = np.abs(shap_values).mean(axis=0)

    # Combine feature names with SHAP importance values
    shap_feature_importance = dict(zip(selected_features, shap_importance))

    # Sort features by SHAP importance
    sorted_shap_importance = sorted(shap_feature_importance.items(), key=lambda x: x[1], reverse=True)
    sorted_shap_importance_dict = dict(sorted_shap_importance)

    # Store the final test results
    results_df.loc[len(results_df)] = [
        "Linear_Regression_Final_Test", target_column, mse, rmse, mae, r2, r, ','.join(selected_features),
        str(sorted_shap_importance_dict)  # Store all SHAP values, sorted by importance
    ]

    return results_df


In [None]:
def rf_regr(data, target_column, n_features=50):

    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data["Environment"])

    val_data, train_data = train_test_split(train_data, train_size=0.1625, random_state=42, stratify=train_data['Environment'])

    # Feature selection using RFE on validation data
    X_val = val_data.drop(columns=drop_columns)
    y_val = val_data[target_column]

    selector = RFE(estimator=RandomForestRegressor(), n_features_to_select=n_features, step=10)
    selector = selector.fit(X_val, y_val)

    # Select the important features
    X_val_selected = selector.transform(X_val)

    # Scale the features
    scaler = StandardScaler()
    X_val_scaled = scaler.fit_transform(X_val_selected)

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }

    # Initialize the Random Forest model
    rf = RandomForestRegressor(random_state=42)

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
    grid_search.fit(X_val_scaled, y_val)

    # Get the best model
    best_rf = grid_search.best_estimator_

    # Training in raw environment and testing in transformed environment
    for e_train in environments:
        for e_test in environments:

            if e_train == e_test:
                continue

            print("Training in raw environment:", e_train)
            print("Testing in transformed environment:", e_test)

            train_data_env = train_data[train_data['Environment'] == e_train]
            test_data_env = train_data[train_data['Environment'] == e_test]

            if train_data_env.empty or test_data_env.empty:
                continue

            X_train = train_data_env.drop(columns=drop_columns)
            y_train = train_data_env[target_column]

            X_test = test_data_env.drop(columns=drop_columns)
            y_test = test_data_env[target_column]

            X_train_selected = selector.transform(X_train)
            X_train_scaled = scaler.transform(X_train_selected)

            X_test_selected = selector.transform(X_test)
            X_test_scaled = scaler.transform(X_test_selected)

            # Train the model with the best parameters
            best_rf.fit(X_train_scaled, y_train)

    # Final evaluation on the 20% test set
    X_test = test_data.drop(columns=drop_columns)
    y_test = test_data[target_column]

    X_test_selected = selector.transform(X_test)
    X_test_scaled = scaler.transform(X_test_selected)

    y_pred = best_rf.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    r = np.sqrt(r2)

    selected_features = X_test.columns[selector.support_]

    # Compute SHAP values to get feature importance
    explainer = shap.TreeExplainer(best_rf)
    shap_values = explainer.shap_values(X_test_scaled)
    shap_importance = np.abs(shap_values).mean(axis=0)

    # Combine feature names with SHAP importance values
    shap_feature_importance = dict(zip(selected_features, shap_importance))

    # Sort features by SHAP importance
    sorted_shap_importance = sorted(shap_feature_importance.items(), key=lambda x: x[1], reverse=True)
    sorted_shap_importance_dict = dict(sorted_shap_importance)

    # Store the final test results
    results_df.loc[len(results_df)] = [
        "Random_Forest_Final_Test", target_column, mse, rmse, mae, r2, r, ','.join(selected_features),
        str(sorted_shap_importance_dict)  # Store all SHAP values, sorted by importance
    ]

    return results_df

In [None]:
def ridge_regr(data, target_column, n_features=50):

    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data["Environment"])

    val_data, train_data = train_test_split(train_data, train_size=0.1625, random_state=42, stratify=train_data['Environment'])

    # Feature selection using RFE on validation data
    X_val = val_data.drop(columns=drop_columns)
    y_val = val_data[target_column]

    selector = RFE(estimator=Ridge(), n_features_to_select=n_features, step=10)
    selector = selector.fit(X_val, y_val)

    # Select the important features
    X_val_selected = selector.transform(X_val)

    # Scale the features
    scaler = StandardScaler()
    X_val_scaled = scaler.fit_transform(X_val_selected)

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'alpha': [0.01, 0.1, 1.0, 10.0],  # Regularization strength
        'max_iter': [1000, 5000, 10000]
    }

    # Initialize the Ridge model
    ridge = Ridge(random_state=42)

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
    grid_search.fit(X_val_scaled, y_val)

    # Get the best model
    best_ridge = grid_search.best_estimator_

    # Training in raw environment and testing in transformed environment
    for e_train in environments:
        for e_test in environments:

            if e_train == e_test:
                continue

            print("Training in raw environment:", e_train)
            print("Testing in transformed environment:", e_test)

            train_data_env = train_data[train_data['Environment'] == e_train]
            test_data_env = train_data[train_data['Environment'] == e_test]

            if train_data_env.empty or test_data_env.empty:
                continue

            X_train = train_data_env.drop(columns=drop_columns)
            y_train = train_data_env[target_column]

            X_test = test_data_env.drop(columns=drop_columns)
            y_test = test_data_env[target_column]

            X_train_selected = selector.transform(X_train)
            X_train_scaled = scaler.transform(X_train_selected)

            X_test_selected = selector.transform(X_test)
            X_test_scaled = scaler.transform(X_test_selected)

            # Train the model with the best parameters
            best_ridge.fit(X_train_scaled, y_train)

    # Final evaluation on the 20% test set
    X_test = test_data.drop(columns=drop_columns)
    y_test = test_data[target_column]

    X_test_selected = selector.transform(X_test)
    X_test_scaled = scaler.transform(X_test_selected)

    y_pred = best_ridge.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    r = np.sqrt(r2)

    selected_features = X_test.columns[selector.support_]

    # Compute SHAP values to get feature importance
    explainer = shap.KernelExplainer(best_ridge.predict, X_test_scaled)
    shap_values = explainer.shap_values(X_test_scaled)
    shap_importance = np.abs(shap_values).mean(axis=0)

    # Combine feature names with SHAP importance values
    shap_feature_importance = dict(zip(selected_features, shap_importance))

    # Sort features by SHAP importance (high to low)
    sorted_shap_features = sorted(shap_feature_importance.items(), key=lambda x: x[1], reverse=True)
    sorted_shap_features_dict = dict(sorted_shap_features)

    # Extracting the coefficients (weights) of the Ridge model
    coefficients = best_ridge.coef_
    coefficients_str = ','.join(map(str, coefficients))

    # Store the final test results
    results_df.loc[len(results_df)] = [
        "Ridge_Final_Test", target_column, mse, rmse, mae, r2, r,
        ','.join(sorted_shap_features_dict.keys()), str(sorted_shap_features_dict)  # Store SHAP values for all features sorted
    ]

    return results_df

In [None]:
def lasso_regr(data, target_column, n_features=50):

    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data["Environment"])

    val_data, train_data = train_test_split(train_data, train_size=0.1625, random_state=42, stratify=train_data['Environment'])

    # Feature selection using RFE on validation data
    X_val = val_data.drop(columns=drop_columns)
    y_val = val_data[target_column]

    selector = RFE(estimator=Lasso(), n_features_to_select=n_features, step=10)
    selector = selector.fit(X_val, y_val)

    # Select the important features
    X_val_selected = selector.transform(X_val)

    # Scale the features
    scaler = StandardScaler()
    X_val_scaled = scaler.fit_transform(X_val_selected)

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'alpha': [0.01, 0.1, 1.0, 10.0],  # Regularization strength
        'max_iter': [1000, 5000, 10000]
    }

    # Initialize the Lasso model
    lasso = Lasso(random_state=42)

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
    grid_search.fit(X_val_scaled, y_val)

    # Get the best model
    best_lasso = grid_search.best_estimator_

    # Training in raw environment and testing in transformed environment
    for e_train in environments:
        for e_test in environments:

            if e_train == e_test:
                continue

            print("Training in raw environment:", e_train)
            print("Testing in transformed environment:", e_test)

            train_data_env = train_data[train_data['Environment'] == e_train]
            test_data_env = train_data[train_data['Environment'] == e_test]

            if train_data_env.empty or test_data_env.empty:
                continue

            X_train = train_data_env.drop(columns=drop_columns)
            y_train = train_data_env[target_column]

            X_test = test_data_env.drop(columns=drop_columns)
            y_test = test_data_env[target_column]

            X_train_selected = selector.transform(X_train)
            X_train_scaled = scaler.transform(X_train_selected)

            X_test_selected = selector.transform(X_test)
            X_test_scaled = scaler.transform(X_test_selected)

            # Train the model with the best parameters
            best_lasso.fit(X_train_scaled, y_train)

    # Final evaluation on the 20% test set
    X_test = test_data.drop(columns=drop_columns)
    y_test = test_data[target_column]

    X_test_selected = selector.transform(X_test)
    X_test_scaled = scaler.transform(X_test_selected)

    y_pred = best_lasso.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    r = np.sqrt(r2)

    selected_features = X_test.columns[selector.support_]

    # Compute SHAP values to get feature importance
    explainer = shap.KernelExplainer(best_lasso.predict, X_test_scaled)
    shap_values = explainer.shap_values(X_test_scaled)
    shap_importance = np.abs(shap_values).mean(axis=0)

    # Combine feature names with SHAP importance values
    shap_feature_importance = dict(zip(selected_features, shap_importance))

    # Sort features by SHAP importance (high to low)
    sorted_shap_features = sorted(shap_feature_importance.items(), key=lambda x: x[1], reverse=True)

    # Extracting the coefficients (weights) of the Lasso model
    coefficients = best_lasso.coef_
    coefficients_str = ','.join(map(str, coefficients))

    # Store the final test results
    results_df.loc[len(results_df)] = [
        "Lasso_Final_Test", target_column, mse, rmse, mae, r2, r,
        ','.join(selected_features), str(sorted_shap_features)  # Store sorted SHAP values
    ]

    return results_df


In [None]:
from sklearn.svm import SVR
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
import numpy as np
import pandas as pd
import shap

def svm_regr(data, target_column, n_features=50):

    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data["Environment"])

    val_data, train_data = train_test_split(train_data, train_size=0.1625, random_state=42, stratify=train_data['Environment'])

    # Feature selection using RFE on validation data
    X_val = val_data.drop(columns=drop_columns)
    y_val = val_data[target_column]

    selector = RFE(estimator=SVR(kernel='linear'), n_features_to_select=n_features, step=10)
    selector = selector.fit(X_val, y_val)

    # Select the important features
    X_val_selected = selector.transform(X_val)

    # Scale the features
    scaler = StandardScaler()
    X_val_scaled = scaler.fit_transform(X_val_selected)

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'C': [0.01, 0.1, 1.0, 10.0],  # Regularization strength (soft margin parameter)
        'epsilon': [0.001, 0.01, 0.1],  # Insensitivity parameter
    }

    # Initialize the SVR model
    svm = SVR(kernel='linear')

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
    grid_search.fit(X_val_scaled, y_val)

    # Get the best model
    best_svm = grid_search.best_estimator_

    # Training in raw environment and testing in transformed environment
    for e_train in environments:
        for e_test in environments:

            if e_train == e_test:
                continue

            print("Training in raw environment:", e_train)
            print("Testing in transformed environment:", e_test)

            train_data_env = train_data[train_data['Environment'] == e_train]
            test_data_env = train_data[train_data['Environment'] == e_test]

            if train_data_env.empty or test_data_env.empty:
                continue

            X_train = train_data_env.drop(columns=drop_columns)
            y_train = train_data_env[target_column]

            X_test = test_data_env.drop(columns=drop_columns)
            y_test = test_data_env[target_column]

            X_train_selected = selector.transform(X_train)
            X_train_scaled = scaler.transform(X_train_selected)

            X_test_selected = selector.transform(X_test)
            X_test_scaled = scaler.transform(X_test_selected)

            # Train the model with the best parameters
            best_svm.fit(X_train_scaled, y_train)

    # Final evaluation on the 20% test set
    X_test = test_data.drop(columns=drop_columns)
    y_test = test_data[target_column]

    X_test_selected = selector.transform(X_test)
    X_test_scaled = scaler.transform(X_test_selected)

    y_pred = best_svm.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    r = np.sqrt(r2)

    selected_features = X_test.columns[selector.support_]

    # Compute SHAP values to get feature importance
    explainer = shap.KernelExplainer(best_svm.predict, X_test_scaled)
    shap_values = explainer.shap_values(X_test_scaled)
    shap_importance = np.abs(shap_values).mean(axis=0)

    # Combine feature names with SHAP importance values
    shap_feature_importance = dict(zip(selected_features, shap_importance))

    # Sort features by SHAP importance (high to low)
    shap_feature_importance_sorted = sorted(shap_feature_importance.items(), key=lambda x: x[1], reverse=True)
    shap_feature_importance_dict = dict(shap_feature_importance_sorted)

    # Store the final test results with all SHAP values
    results_df.loc[len(results_df)] = [
        "SVM_Final_Test", target_column, mse, rmse, mae, r2, r,
        ','.join(selected_features), str(shap_feature_importance_dict)  # Store all SHAP values sorted
    ]

    return results_df

In [None]:
np.seterr(invalid='ignore')

results_csv_path = 'mnt/MyDrive/NASA/repo_model_results_n1500.csv'
# Train and evaluate the models
for target in target_columns:

  print(f"*******************************************************\nTraining MLP for {target}...")
  ann_regr(df, target, n_features=1500)
  results_df.to_csv(results_csv_path, index=False)
  print(f"*******************************************************\nTraining Linear Regression for {target}...")
  lr_regr(df, target, n_features=1500)
  results_df.to_csv(results_csv_path, index=False)
  print(f"*******************************************************\nTraining Ridge Regression for {target}...")
  ridge_regr(df, target, n_features=1500)
  results_df.to_csv(results_csv_path, index=False)
  print(f"*******************************************************\nTraining Lasso Regression for {target}...")
  lasso_regr(df, target, n_features=1500)
  results_df.to_csv(results_csv_path, index=False)
  print(f"*******************************************************\nTraining Support Vector Machine for {target}...")
  svm_regr(df, "repo_glial_cells", n_features=1500)
  results_df.to_csv(results_csv_path, index=False)
  print(f"*******************************************************\nTraining Random Forest for {target}...")
  rf_regr(df, target, n_features=1500)
  results_df.to_csv(results_csv_path, index=False)


*******************************************************
Training MLP for th_positive_cells...
Fitting 3 folds for each of 216 candidates, totalling 648 fits


99 fits failed out of a total of 648.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
99 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/neural_network/_multilayer_perceptron.py", line 751, in fit
    return self._fit(X, y, incremental=False)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/neural_network/_multilayer_perceptron.py", line 495, in _fit
 

Training in raw environment: Earth
Testing in transformed environment: SFug
Training in raw environment: Earth
Testing in transformed environment: SF1g
Training in raw environment: Earth
Testing in transformed environment: Earth_log
Training in raw environment: Earth
Testing in transformed environment: SFug_log
Training in raw environment: Earth
Testing in transformed environment: SF1g_log
Training in raw environment: Earth
Testing in transformed environment: Earth_sqrt
Training in raw environment: Earth
Testing in transformed environment: SFug_sqrt
Training in raw environment: Earth
Testing in transformed environment: SF1g_sqrt
Training in raw environment: Earth
Testing in transformed environment: Earth_boxcox
Training in raw environment: Earth
Testing in transformed environment: SFug_boxcox
Training in raw environment: Earth
Testing in transformed environment: SF1g_boxcox
Training in raw environment: Earth
Testing in transformed environment: Earth_clr
Training in raw environment: Ear



Training in raw environment: SFug_zscore
Testing in transformed environment: SFug




Training in raw environment: SFug_zscore
Testing in transformed environment: SF1g




Training in raw environment: SFug_zscore
Testing in transformed environment: Earth_log




Training in raw environment: SFug_zscore
Testing in transformed environment: SFug_log




Training in raw environment: SFug_zscore
Testing in transformed environment: SF1g_log




Training in raw environment: SFug_zscore
Testing in transformed environment: Earth_sqrt




Training in raw environment: SFug_zscore
Testing in transformed environment: SFug_sqrt




Training in raw environment: SFug_zscore
Testing in transformed environment: SF1g_sqrt




Training in raw environment: SFug_zscore
Testing in transformed environment: Earth_boxcox




Training in raw environment: SFug_zscore
Testing in transformed environment: SFug_boxcox




Training in raw environment: SFug_zscore
Testing in transformed environment: SF1g_boxcox




Training in raw environment: SFug_zscore
Testing in transformed environment: Earth_clr




Training in raw environment: SFug_zscore
Testing in transformed environment: SFug_clr




Training in raw environment: SFug_zscore
Testing in transformed environment: SF1g_clr




Training in raw environment: SFug_zscore
Testing in transformed environment: Earth_deseq2




Training in raw environment: SFug_zscore
Testing in transformed environment: SFug_deseq2




Training in raw environment: SFug_zscore
Testing in transformed environment: SF1g_deseq2




Training in raw environment: SFug_zscore
Testing in transformed environment: Earth_zscore




Training in raw environment: SFug_zscore
Testing in transformed environment: SF1g_zscore




Training in raw environment: SFug_zscore
Testing in transformed environment: Earth_quantile




Training in raw environment: SFug_zscore
Testing in transformed environment: SFug_quantile




Training in raw environment: SFug_zscore
Testing in transformed environment: SF1g_quantile




Training in raw environment: SFug_zscore
Testing in transformed environment: Earth_minmax




Training in raw environment: SFug_zscore
Testing in transformed environment: SFug_minmax




Training in raw environment: SFug_zscore
Testing in transformed environment: SF1g_minmax




Training in raw environment: SFug_zscore
Testing in transformed environment: Earth_robust




Training in raw environment: SFug_zscore
Testing in transformed environment: SFug_robust




Training in raw environment: SFug_zscore
Testing in transformed environment: SF1g_robust




Training in raw environment: SFug_zscore
Testing in transformed environment: Earth_yeojohnson




Training in raw environment: SFug_zscore
Testing in transformed environment: SFug_yeojohnson




Training in raw environment: SFug_zscore
Testing in transformed environment: SF1g_yeojohnson




Training in raw environment: SF1g_zscore
Testing in transformed environment: Earth




Training in raw environment: SF1g_zscore
Testing in transformed environment: SFug




Training in raw environment: SF1g_zscore
Testing in transformed environment: SF1g




Training in raw environment: SF1g_zscore
Testing in transformed environment: Earth_log




Training in raw environment: SF1g_zscore
Testing in transformed environment: SFug_log




Training in raw environment: SF1g_zscore
Testing in transformed environment: SF1g_log




Training in raw environment: SF1g_zscore
Testing in transformed environment: Earth_sqrt




Training in raw environment: SF1g_zscore
Testing in transformed environment: SFug_sqrt




Training in raw environment: SF1g_zscore
Testing in transformed environment: SF1g_sqrt




Training in raw environment: SF1g_zscore
Testing in transformed environment: Earth_boxcox




Training in raw environment: SF1g_zscore
Testing in transformed environment: SFug_boxcox




Training in raw environment: SF1g_zscore
Testing in transformed environment: SF1g_boxcox




Training in raw environment: SF1g_zscore
Testing in transformed environment: Earth_clr




Training in raw environment: SF1g_zscore
Testing in transformed environment: SFug_clr




Training in raw environment: SF1g_zscore
Testing in transformed environment: SF1g_clr




Training in raw environment: SF1g_zscore
Testing in transformed environment: Earth_deseq2




Training in raw environment: SF1g_zscore
Testing in transformed environment: SFug_deseq2




Training in raw environment: SF1g_zscore
Testing in transformed environment: SF1g_deseq2




Training in raw environment: SF1g_zscore
Testing in transformed environment: Earth_zscore




Training in raw environment: SF1g_zscore
Testing in transformed environment: SFug_zscore




Training in raw environment: SF1g_zscore
Testing in transformed environment: Earth_quantile




Training in raw environment: SF1g_zscore
Testing in transformed environment: SFug_quantile




Training in raw environment: SF1g_zscore
Testing in transformed environment: SF1g_quantile




Training in raw environment: SF1g_zscore
Testing in transformed environment: Earth_minmax




Training in raw environment: SF1g_zscore
Testing in transformed environment: SFug_minmax




Training in raw environment: SF1g_zscore
Testing in transformed environment: SF1g_minmax




Training in raw environment: SF1g_zscore
Testing in transformed environment: Earth_robust




Training in raw environment: SF1g_zscore
Testing in transformed environment: SFug_robust




Training in raw environment: SF1g_zscore
Testing in transformed environment: SF1g_robust




Training in raw environment: SF1g_zscore
Testing in transformed environment: Earth_yeojohnson




Training in raw environment: SF1g_zscore
Testing in transformed environment: SFug_yeojohnson




Training in raw environment: SF1g_zscore
Testing in transformed environment: SF1g_yeojohnson




Training in raw environment: Earth_quantile
Testing in transformed environment: Earth




Training in raw environment: Earth_quantile
Testing in transformed environment: SFug




Training in raw environment: Earth_quantile
Testing in transformed environment: SF1g




Training in raw environment: Earth_quantile
Testing in transformed environment: Earth_log




Training in raw environment: Earth_quantile
Testing in transformed environment: SFug_log




Training in raw environment: Earth_quantile
Testing in transformed environment: SF1g_log




Training in raw environment: Earth_quantile
Testing in transformed environment: Earth_sqrt




Training in raw environment: Earth_quantile
Testing in transformed environment: SFug_sqrt




Training in raw environment: Earth_quantile
Testing in transformed environment: SF1g_sqrt




Training in raw environment: Earth_quantile
Testing in transformed environment: Earth_boxcox




Training in raw environment: Earth_quantile
Testing in transformed environment: SFug_boxcox




Training in raw environment: Earth_quantile
Testing in transformed environment: SF1g_boxcox




Training in raw environment: Earth_quantile
Testing in transformed environment: Earth_clr




Training in raw environment: Earth_quantile
Testing in transformed environment: SFug_clr




Training in raw environment: Earth_quantile
Testing in transformed environment: SF1g_clr




Training in raw environment: Earth_quantile
Testing in transformed environment: Earth_deseq2




Training in raw environment: Earth_quantile
Testing in transformed environment: SFug_deseq2




Training in raw environment: Earth_quantile
Testing in transformed environment: SF1g_deseq2




Training in raw environment: Earth_quantile
Testing in transformed environment: Earth_zscore




Training in raw environment: Earth_quantile
Testing in transformed environment: SFug_zscore




Training in raw environment: Earth_quantile
Testing in transformed environment: SF1g_zscore




Training in raw environment: Earth_quantile
Testing in transformed environment: SFug_quantile




Training in raw environment: Earth_quantile
Testing in transformed environment: SF1g_quantile




Training in raw environment: Earth_quantile
Testing in transformed environment: Earth_minmax




Training in raw environment: Earth_quantile
Testing in transformed environment: SFug_minmax




Training in raw environment: Earth_quantile
Testing in transformed environment: SF1g_minmax




Training in raw environment: Earth_quantile
Testing in transformed environment: Earth_robust




Training in raw environment: Earth_quantile
Testing in transformed environment: SFug_robust




Training in raw environment: Earth_quantile
Testing in transformed environment: SF1g_robust




Training in raw environment: Earth_quantile
Testing in transformed environment: Earth_yeojohnson




Training in raw environment: Earth_quantile
Testing in transformed environment: SFug_yeojohnson




Training in raw environment: Earth_quantile
Testing in transformed environment: SF1g_yeojohnson




Training in raw environment: SFug_quantile
Testing in transformed environment: Earth




Training in raw environment: SFug_quantile
Testing in transformed environment: SFug




Training in raw environment: SFug_quantile
Testing in transformed environment: SF1g




Training in raw environment: SFug_quantile
Testing in transformed environment: Earth_log




Training in raw environment: SFug_quantile
Testing in transformed environment: SFug_log




Training in raw environment: SFug_quantile
Testing in transformed environment: SF1g_log




Training in raw environment: SFug_quantile
Testing in transformed environment: Earth_sqrt




Training in raw environment: SFug_quantile
Testing in transformed environment: SFug_sqrt




Training in raw environment: SFug_quantile
Testing in transformed environment: SF1g_sqrt




Training in raw environment: SFug_quantile
Testing in transformed environment: Earth_boxcox




Training in raw environment: SFug_quantile
Testing in transformed environment: SFug_boxcox




Training in raw environment: SFug_quantile
Testing in transformed environment: SF1g_boxcox




Training in raw environment: SFug_quantile
Testing in transformed environment: Earth_clr




Training in raw environment: SFug_quantile
Testing in transformed environment: SFug_clr




Training in raw environment: SFug_quantile
Testing in transformed environment: SF1g_clr




Training in raw environment: SFug_quantile
Testing in transformed environment: Earth_deseq2




Training in raw environment: SFug_quantile
Testing in transformed environment: SFug_deseq2




Training in raw environment: SFug_quantile
Testing in transformed environment: SF1g_deseq2




Training in raw environment: SFug_quantile
Testing in transformed environment: Earth_zscore




Training in raw environment: SFug_quantile
Testing in transformed environment: SFug_zscore




Training in raw environment: SFug_quantile
Testing in transformed environment: SF1g_zscore




Training in raw environment: SFug_quantile
Testing in transformed environment: Earth_quantile




Training in raw environment: SFug_quantile
Testing in transformed environment: SF1g_quantile




Training in raw environment: SFug_quantile
Testing in transformed environment: Earth_minmax




Training in raw environment: SFug_quantile
Testing in transformed environment: SFug_minmax




Training in raw environment: SFug_quantile
Testing in transformed environment: SF1g_minmax




Training in raw environment: SFug_quantile
Testing in transformed environment: Earth_robust




Training in raw environment: SFug_quantile
Testing in transformed environment: SFug_robust




Training in raw environment: SFug_quantile
Testing in transformed environment: SF1g_robust




Training in raw environment: SFug_quantile
Testing in transformed environment: Earth_yeojohnson




Training in raw environment: SFug_quantile
Testing in transformed environment: SFug_yeojohnson




Training in raw environment: SFug_quantile
Testing in transformed environment: SF1g_yeojohnson




Training in raw environment: SF1g_quantile
Testing in transformed environment: Earth




Training in raw environment: SF1g_quantile
Testing in transformed environment: SFug




Training in raw environment: SF1g_quantile
Testing in transformed environment: SF1g




Training in raw environment: SF1g_quantile
Testing in transformed environment: Earth_log




Training in raw environment: SF1g_quantile
Testing in transformed environment: SFug_log




Training in raw environment: SF1g_quantile
Testing in transformed environment: SF1g_log




Training in raw environment: SF1g_quantile
Testing in transformed environment: Earth_sqrt




Training in raw environment: SF1g_quantile
Testing in transformed environment: SFug_sqrt




Training in raw environment: SF1g_quantile
Testing in transformed environment: SF1g_sqrt




Training in raw environment: SF1g_quantile
Testing in transformed environment: Earth_boxcox




Training in raw environment: SF1g_quantile
Testing in transformed environment: SFug_boxcox




Training in raw environment: SF1g_quantile
Testing in transformed environment: SF1g_boxcox




Training in raw environment: SF1g_quantile
Testing in transformed environment: Earth_clr




Training in raw environment: SF1g_quantile
Testing in transformed environment: SFug_clr




Training in raw environment: SF1g_quantile
Testing in transformed environment: SF1g_clr




Training in raw environment: SF1g_quantile
Testing in transformed environment: Earth_deseq2




Training in raw environment: SF1g_quantile
Testing in transformed environment: SFug_deseq2




Training in raw environment: SF1g_quantile
Testing in transformed environment: SF1g_deseq2




Training in raw environment: SF1g_quantile
Testing in transformed environment: Earth_zscore




Training in raw environment: SF1g_quantile
Testing in transformed environment: SFug_zscore




Training in raw environment: SF1g_quantile
Testing in transformed environment: SF1g_zscore




Training in raw environment: SF1g_quantile
Testing in transformed environment: Earth_quantile




Training in raw environment: SF1g_quantile
Testing in transformed environment: SFug_quantile




Training in raw environment: SF1g_quantile
Testing in transformed environment: Earth_minmax




Training in raw environment: SF1g_quantile
Testing in transformed environment: SFug_minmax




Training in raw environment: SF1g_quantile
Testing in transformed environment: SF1g_minmax




Training in raw environment: SF1g_quantile
Testing in transformed environment: Earth_robust




Training in raw environment: SF1g_quantile
Testing in transformed environment: SFug_robust




Training in raw environment: SF1g_quantile
Testing in transformed environment: SF1g_robust




Training in raw environment: SF1g_quantile
Testing in transformed environment: Earth_yeojohnson




Training in raw environment: SF1g_quantile
Testing in transformed environment: SFug_yeojohnson




Training in raw environment: SF1g_quantile
Testing in transformed environment: SF1g_yeojohnson




Training in raw environment: Earth_minmax
Testing in transformed environment: Earth
Training in raw environment: Earth_minmax
Testing in transformed environment: SFug
Training in raw environment: Earth_minmax
Testing in transformed environment: SF1g
Training in raw environment: Earth_minmax
Testing in transformed environment: Earth_log
Training in raw environment: Earth_minmax
Testing in transformed environment: SFug_log
Training in raw environment: Earth_minmax
Testing in transformed environment: SF1g_log
Training in raw environment: Earth_minmax
Testing in transformed environment: Earth_sqrt
Training in raw environment: Earth_minmax
Testing in transformed environment: SFug_sqrt
Training in raw environment: Earth_minmax
Testing in transformed environment: SF1g_sqrt
Training in raw environment: Earth_minmax
Testing in transformed environment: Earth_boxcox
Training in raw environment: Earth_minmax
Testing in transformed environment: SFug_boxcox
Training in raw environment: Earth_minmax




Training in raw environment: SFug_robust
Testing in transformed environment: SFug




Training in raw environment: SFug_robust
Testing in transformed environment: SF1g




Training in raw environment: SFug_robust
Testing in transformed environment: Earth_log




Training in raw environment: SFug_robust
Testing in transformed environment: SFug_log




Training in raw environment: SFug_robust
Testing in transformed environment: SF1g_log




Training in raw environment: SFug_robust
Testing in transformed environment: Earth_sqrt




Training in raw environment: SFug_robust
Testing in transformed environment: SFug_sqrt




Training in raw environment: SFug_robust
Testing in transformed environment: SF1g_sqrt




Training in raw environment: SFug_robust
Testing in transformed environment: Earth_boxcox




Training in raw environment: SFug_robust
Testing in transformed environment: SFug_boxcox




Training in raw environment: SFug_robust
Testing in transformed environment: SF1g_boxcox




Training in raw environment: SFug_robust
Testing in transformed environment: Earth_clr




Training in raw environment: SFug_robust
Testing in transformed environment: SFug_clr




Training in raw environment: SFug_robust
Testing in transformed environment: SF1g_clr




Training in raw environment: SFug_robust
Testing in transformed environment: Earth_deseq2




Training in raw environment: SFug_robust
Testing in transformed environment: SFug_deseq2




Training in raw environment: SFug_robust
Testing in transformed environment: SF1g_deseq2




Training in raw environment: SFug_robust
Testing in transformed environment: Earth_zscore




Training in raw environment: SFug_robust
Testing in transformed environment: SFug_zscore




Training in raw environment: SFug_robust
Testing in transformed environment: SF1g_zscore




Training in raw environment: SFug_robust
Testing in transformed environment: Earth_quantile




Training in raw environment: SFug_robust
Testing in transformed environment: SFug_quantile




Training in raw environment: SFug_robust
Testing in transformed environment: SF1g_quantile




Training in raw environment: SFug_robust
Testing in transformed environment: Earth_minmax




Training in raw environment: SFug_robust
Testing in transformed environment: SFug_minmax




Training in raw environment: SFug_robust
Testing in transformed environment: SF1g_minmax




Training in raw environment: SFug_robust
Testing in transformed environment: Earth_robust




Training in raw environment: SFug_robust
Testing in transformed environment: SF1g_robust




Training in raw environment: SFug_robust
Testing in transformed environment: Earth_yeojohnson




Training in raw environment: SFug_robust
Testing in transformed environment: SFug_yeojohnson




Training in raw environment: SFug_robust
Testing in transformed environment: SF1g_yeojohnson




Training in raw environment: SF1g_robust
Testing in transformed environment: Earth
Training in raw environment: SF1g_robust
Testing in transformed environment: SFug
Training in raw environment: SF1g_robust
Testing in transformed environment: SF1g
Training in raw environment: SF1g_robust
Testing in transformed environment: Earth_log
Training in raw environment: SF1g_robust
Testing in transformed environment: SFug_log
Training in raw environment: SF1g_robust
Testing in transformed environment: SF1g_log
Training in raw environment: SF1g_robust
Testing in transformed environment: Earth_sqrt
Training in raw environment: SF1g_robust
Testing in transformed environment: SFug_sqrt
Training in raw environment: SF1g_robust
Testing in transformed environment: SF1g_sqrt
Training in raw environment: SF1g_robust
Testing in transformed environment: Earth_boxcox
Training in raw environment: SF1g_robust
Testing in transformed environment: SFug_boxcox
Training in raw environment: SF1g_robust
Testing in t



Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SFug




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SF1g




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: Earth_log




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SFug_log




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SF1g_log




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: Earth_sqrt




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SFug_sqrt




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SF1g_sqrt




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: Earth_boxcox




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SFug_boxcox




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SF1g_boxcox




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: Earth_clr




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SFug_clr




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SF1g_clr




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: Earth_deseq2




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SFug_deseq2




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SF1g_deseq2




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: Earth_zscore




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SFug_zscore




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SF1g_zscore




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: Earth_quantile




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SFug_quantile




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SF1g_quantile




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: Earth_minmax




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SFug_minmax




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SF1g_minmax




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: Earth_robust




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SFug_robust




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SF1g_robust




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: Earth_yeojohnson




Training in raw environment: SFug_yeojohnson
Testing in transformed environment: SF1g_yeojohnson




Training in raw environment: SF1g_yeojohnson
Testing in transformed environment: Earth
Training in raw environment: SF1g_yeojohnson
Testing in transformed environment: SFug
Training in raw environment: SF1g_yeojohnson
Testing in transformed environment: SF1g
Training in raw environment: SF1g_yeojohnson
Testing in transformed environment: Earth_log
Training in raw environment: SF1g_yeojohnson
Testing in transformed environment: SFug_log
Training in raw environment: SF1g_yeojohnson
Testing in transformed environment: SF1g_log
Training in raw environment: SF1g_yeojohnson
Testing in transformed environment: Earth_sqrt
Training in raw environment: SF1g_yeojohnson
Testing in transformed environment: SFug_sqrt
Training in raw environment: SF1g_yeojohnson
Testing in transformed environment: SF1g_sqrt
Training in raw environment: SF1g_yeojohnson
Testing in transformed environment: Earth_boxcox
Training in raw environment: SF1g_yeojohnson
Testing in transformed environment: SFug_boxcox
Training 

  0%|          | 0/53 [00:00<?, ?it/s]

*******************************************************
Training Linear Regression for th_positive_cells...
Training in raw environment: Earth
Testing in transformed environment: SFug
Training in raw environment: Earth
Testing in transformed environment: SF1g
Training in raw environment: Earth
Testing in transformed environment: Earth_log
Training in raw environment: Earth
Testing in transformed environment: SFug_log
Training in raw environment: Earth
Testing in transformed environment: SF1g_log
Training in raw environment: Earth
Testing in transformed environment: Earth_sqrt
Training in raw environment: Earth
Testing in transformed environment: SFug_sqrt
Training in raw environment: Earth
Testing in transformed environment: SF1g_sqrt
Training in raw environment: Earth
Testing in transformed environment: Earth_boxcox
Training in raw environment: Earth
Testing in transformed environment: SFug_boxcox
Training in raw environment: Earth
Testing in transformed environment: SF1g_boxcox
Train

  0%|          | 0/53 [00:00<?, ?it/s]

*******************************************************
Training Ridge Regression for th_positive_cells...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Training in raw environment: Earth
Testing in transformed environment: SFug
Training in raw environment: Earth
Testing in transformed environment: SF1g
Training in raw environment: Earth
Testing in transformed environment: Earth_log
Training in raw environment: Earth
Testing in transformed environment: SFug_log
Training in raw environment: Earth
Testing in transformed environment: SF1g_log
Training in raw environment: Earth
Testing in transformed environment: Earth_sqrt
Training in raw environment: Earth
Testing in transformed environment: SFug_sqrt
Training in raw environment: Earth
Testing in transformed environment: SF1g_sqrt
Training in raw environment: Earth
Testing in transformed environment: Earth_boxcox
Training in raw environment: Earth
Testing in transformed environment: SFug_boxcox
Training in raw environment:

  0%|          | 0/53 [00:00<?, ?it/s]

*******************************************************
Training Lasso Regression for th_positive_cells...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Training in raw environment: Earth
Testing in transformed environment: SFug
Training in raw environment: Earth
Testing in transformed environment: SF1g
Training in raw environment: Earth
Testing in transformed environment: Earth_log
Training in raw environment: Earth
Testing in transformed environment: SFug_log
Training in raw environment: Earth
Testing in transformed environment: SF1g_log
Training in raw environment: Earth
Testing in transformed environment: Earth_sqrt
Training in raw environment: Earth
Testing in transformed environment: SFug_sqrt
Training in raw environment: Earth
Testing in transformed environment: SF1g_sqrt
Training in raw environment: Earth
Testing in transformed environment: Earth_boxcox
Training in raw environment: Earth
Testing in transformed environment: SFug_boxcox
Training in raw environment: Earth
Testing in transformed environment: SF1g_boxcox
Training in raw environment: Earth
Testing in transf

  0%|          | 0/53 [00:00<?, ?it/s]

*******************************************************
Training Support Vector Machine for th_positive_cells...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Training in raw environment: Earth
Testing in transformed environment: SFug
Training in raw environment: Earth
Testing in transformed environment: SF1g
Training in raw environment: Earth
Testing in transformed environment: Earth_log
Training in raw environment: Earth
Testing in transformed environment: SFug_log
Training in raw environment: Earth
Testing in transformed environment: SF1g_log
Training in raw environment: Earth
Testing in transformed environment: Earth_sqrt
Training in raw environment: Earth
Testing in transformed environment: SFug_sqrt
Training in raw environment: Earth
Testing in transformed environment: SF1g_sqrt
Training in raw environment: Earth
Testing in transformed environment: Earth_boxcox
Training in raw environment: Earth
Testing in transformed environment: SFug_boxcox
Training in raw enviro

  0%|          | 0/53 [00:00<?, ?it/s]

*******************************************************
Training MLP for repo_glial_cells...
Fitting 3 folds for each of 216 candidates, totalling 648 fits


144 fits failed out of a total of 648.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
144 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/neural_network/_multilayer_perceptron.py", line 751, in fit
    return self._fit(X, y, incremental=False)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/neural_network/_multilayer_perceptron.py", line 495, in _fit

Training in raw environment: Earth
Testing in transformed environment: SFug
Training in raw environment: Earth
Testing in transformed environment: SF1g
Training in raw environment: Earth
Testing in transformed environment: Earth_log
Training in raw environment: Earth
Testing in transformed environment: SFug_log
Training in raw environment: Earth
Testing in transformed environment: SF1g_log
Training in raw environment: Earth
Testing in transformed environment: Earth_sqrt
Training in raw environment: Earth
Testing in transformed environment: SFug_sqrt
Training in raw environment: Earth
Testing in transformed environment: SF1g_sqrt
Training in raw environment: Earth
Testing in transformed environment: Earth_boxcox
Training in raw environment: Earth
Testing in transformed environment: SFug_boxcox
Training in raw environment: Earth
Testing in transformed environment: SF1g_boxcox
Training in raw environment: Earth
Testing in transformed environment: Earth_clr
Training in raw environment: Ear

  0%|          | 0/53 [00:00<?, ?it/s]

*******************************************************
Training Linear Regression for repo_glial_cells...
Training in raw environment: Earth
Testing in transformed environment: SFug
Training in raw environment: Earth
Testing in transformed environment: SF1g
Training in raw environment: Earth
Testing in transformed environment: Earth_log
Training in raw environment: Earth
Testing in transformed environment: SFug_log
Training in raw environment: Earth
Testing in transformed environment: SF1g_log
Training in raw environment: Earth
Testing in transformed environment: Earth_sqrt
Training in raw environment: Earth
Testing in transformed environment: SFug_sqrt
Training in raw environment: Earth
Testing in transformed environment: SF1g_sqrt
Training in raw environment: Earth
Testing in transformed environment: Earth_boxcox
Training in raw environment: Earth
Testing in transformed environment: SFug_boxcox
Training in raw environment: Earth
Testing in transformed environment: SF1g_boxcox
Traini

  0%|          | 0/53 [00:00<?, ?it/s]

*******************************************************
Training Ridge Regression for repo_glial_cells...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Training in raw environment: Earth
Testing in transformed environment: SFug
Training in raw environment: Earth
Testing in transformed environment: SF1g
Training in raw environment: Earth
Testing in transformed environment: Earth_log
Training in raw environment: Earth
Testing in transformed environment: SFug_log
Training in raw environment: Earth
Testing in transformed environment: SF1g_log
Training in raw environment: Earth
Testing in transformed environment: Earth_sqrt
Training in raw environment: Earth
Testing in transformed environment: SFug_sqrt
Training in raw environment: Earth
Testing in transformed environment: SF1g_sqrt
Training in raw environment: Earth
Testing in transformed environment: Earth_boxcox
Training in raw environment: Earth
Testing in transformed environment: SFug_boxcox
Training in raw environment: 

  0%|          | 0/53 [00:00<?, ?it/s]

*******************************************************
Training Lasso Regression for repo_glial_cells...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Training in raw environment: Earth
Testing in transformed environment: SFug
Training in raw environment: Earth
Testing in transformed environment: SF1g
Training in raw environment: Earth
Testing in transformed environment: Earth_log
Training in raw environment: Earth
Testing in transformed environment: SFug_log
Training in raw environment: Earth
Testing in transformed environment: SF1g_log
Training in raw environment: Earth
Testing in transformed environment: Earth_sqrt
Training in raw environment: Earth
Testing in transformed environment: SFug_sqrt
Training in raw environment: Earth
Testing in transformed environment: SF1g_sqrt
Training in raw environment: Earth
Testing in transformed environment: Earth_boxcox
Training in raw environment: Earth
Testing in transformed environment: SFug_boxcox
Training in raw environment: 

  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SF1g


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: Earth_log


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SFug_log


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SF1g_log


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: Earth_sqrt


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SFug_sqrt


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SF1g_sqrt


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: Earth_boxcox


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SFug_boxcox


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SF1g_boxcox


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: Earth_clr


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SFug_clr


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SF1g_clr


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: Earth_deseq2


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SFug_deseq2


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SF1g_deseq2


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: Earth_zscore


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SFug_zscore


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SF1g_zscore


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: Earth_quantile


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SFug_quantile


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SF1g_quantile


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: Earth_minmax


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SFug_minmax


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SF1g_minmax


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: Earth_robust


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SFug_robust


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SF1g_robust


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: Earth_yeojohnson


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SFug_yeojohnson


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SFug
Testing in transformed environment: SF1g_yeojohnson


  model = cd_fast.enet_coordinate_descent(


Training in raw environment: SF1g
Testing in transformed environment: Earth
Training in raw environment: SF1g
Testing in transformed environment: SFug
Training in raw environment: SF1g
Testing in transformed environment: Earth_log
Training in raw environment: SF1g
Testing in transformed environment: SFug_log
Training in raw environment: SF1g
Testing in transformed environment: SF1g_log
Training in raw environment: SF1g
Testing in transformed environment: Earth_sqrt
Training in raw environment: SF1g
Testing in transformed environment: SFug_sqrt
Training in raw environment: SF1g
Testing in transformed environment: SF1g_sqrt
Training in raw environment: SF1g
Testing in transformed environment: Earth_boxcox
Training in raw environment: SF1g
Testing in transformed environment: SFug_boxcox
Training in raw environment: SF1g
Testing in transformed environment: SF1g_boxcox
Training in raw environment: SF1g
Testing in transformed environment: Earth_clr
Training in raw environment: SF1g
Testing i

  0%|          | 0/53 [00:00<?, ?it/s]

*******************************************************
Training Support Vector Machine for repo_glial_cells...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Training in raw environment: Earth
Testing in transformed environment: SFug
Training in raw environment: Earth
Testing in transformed environment: SF1g
Training in raw environment: Earth
Testing in transformed environment: Earth_log
Training in raw environment: Earth
Testing in transformed environment: SFug_log
Training in raw environment: Earth
Testing in transformed environment: SF1g_log
Training in raw environment: Earth
Testing in transformed environment: Earth_sqrt
Training in raw environment: Earth
Testing in transformed environment: SFug_sqrt
Training in raw environment: Earth
Testing in transformed environment: SF1g_sqrt
Training in raw environment: Earth
Testing in transformed environment: Earth_boxcox
Training in raw environment: Earth
Testing in transformed environment: SFug_boxcox
Training in raw environ

  0%|          | 0/53 [00:00<?, ?it/s]

In [None]:
results_df = pd.read_csv("mnt/MyDrive/NASA/all_model_results_n1500.csv")

pd.set_option('display.float_format', lambda x: '%.3f' % x)
results_df

Unnamed: 0,Model,Target Variable,MSE,RMSE,MAE,R2,R,Selected Features,Top 20 SHAP Values
0,ANN_Final_Test,th_positive_cells,2360.669,48.587,26.888,-0.611,,"FBgn0015774,FBgn0015776,FBgn0015777,FBgn001577...","{'FBgn0023496': 0.04475578278790186, 'FBti0063..."
1,Linear_Regression_Final_Test,th_positive_cells,1977.322,44.467,24.633,-0.349,,"FBgn0000455,FBgn0000459,FBgn0000463,FBgn000055...","{'FBgn0036833': 0.19138259979406097, 'FBgn0264..."
2,Ridge_Final_Test,th_positive_cells,1992.186,44.634,24.705,-0.359,,"FBgn0000032,FBgn0000061,FBgn0000075,FBgn000011...","{'FBgn0036833': 0.058198007816529104, 'FBgn026..."
3,Lasso_Final_Test,th_positive_cells,2025.62,45.007,25.038,-0.382,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001...","{'FBgn0267766': 0.5214049144977432, 'FBgn02679..."
4,SVM_Final_Test,th_positive_cells,2002.409,44.748,24.775,-0.366,,"FBgn0000032,FBgn0000075,FBgn0000116,FBgn000012...","{'FBgn0036833': 0.05154506209405348, 'FBgn0264..."
5,ANN_Final_Test,repo_glial_cells,50968.477,225.762,104.358,-0.281,,"FBgn0015737,FBgn0015754,FBgn0015756,FBgn001576...","{'FBgn0022700': 0.07558726538854034, 'FBti0062..."


In [None]:
# rf_regr(df, "repo_glial_cells", n_features=1500)
# results_df.to_csv("mnt/MyDrive/NASA/rf_results_n15000.csv", index=False)

# rf_regr(df, "th_positive_cells", n_features=1500)
# results_df.to_csv("mnt/MyDrive/NASA/rf_results_n15000.csv", index=False)

Fitting 3 folds for each of 162 candidates, totalling 486 fits
Training in raw environment: Earth
Testing in transformed environment: SFug
Training in raw environment: Earth
Testing in transformed environment: SF1g
Training in raw environment: Earth
Testing in transformed environment: Earth_log
Training in raw environment: Earth
Testing in transformed environment: SFug_log
Training in raw environment: Earth
Testing in transformed environment: SF1g_log
Training in raw environment: Earth
Testing in transformed environment: Earth_sqrt
Training in raw environment: Earth
Testing in transformed environment: SFug_sqrt
Training in raw environment: Earth
Testing in transformed environment: SF1g_sqrt
Training in raw environment: Earth
Testing in transformed environment: Earth_boxcox
Training in raw environment: Earth
Testing in transformed environment: SFug_boxcox
Training in raw environment: Earth
Testing in transformed environment: SF1g_boxcox
Training in raw environment: Earth
Testing in tran

  r = np.sqrt(r2)


KeyboardInterrupt: 

In [None]:
rf_regr(df, "th_positive_cells", n_features=1500)
results_df.to_csv("mnt/MyDrive/NASA/rf_results_n15000.csv", index=False)