# Model Development

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB

In [22]:
# Empty accumulator for the evaluation metrics; the training functions below
# append one row per (model, target, training env, test env) combination.
results_df = pd.DataFrame(columns=[
    'Model', 'Target Variable', 'Training Environment', 'Test Environment',
    'MSE', 'RMSE', 'MAE', 'R2', 'R', 'Selected Features',
])

# Load the augmented dataset without its 'Unnamed: 0' column
# (presumably a stale index written by an earlier to_csv -- confirm).
df = pd.read_csv("data/datasets/augmented_data.csv").drop(columns=['Unnamed: 0'])

# The two right-most columns are used as regression targets.
target_columns = list(df.columns[-2:])

# Extract environments from source_name: a name made of exactly three
# '_'-separated parts becomes "<first>_<last>", any other name keeps only
# its first part.
df['Environment'] = [
    f"{parts[0]}_{parts[-1]}" if len(parts) == 3 else parts[0]
    for parts in df['source_name'].str.split('_')
]
environments = df['Environment'].unique()

In [23]:
# Display the names of the regression targets (the last two columns of df).
target_columns

['th_positive_cells', 'repo_glial_cells']

In [24]:
# Summary statistics (count, mean, std, quartiles, extremes) of the targets.
target_variables = df.loc[:, target_columns]
target_variables.describe()

Unnamed: 0,th_positive_cells,repo_glial_cells
count,144.0,144.0
mean,44.634,171.997
std,43.599,233.981
min,-0.579,-0.52
25%,4.562,6.12
50%,10.909,23.731
75%,89.516,398.162
max,120.545,810.811


In [25]:
# Columns removed from the frame to obtain the predictor matrix: both targets
# plus the environment label and the source_name it was derived from.
drop_columns = [*target_columns, 'Environment', 'source_name']
drop_columns

['th_positive_cells', 'repo_glial_cells', 'Environment', 'source_name']

In [26]:
def train_MLP_and_store_results(data, target_column, n_features=50):
    """Train an MLP regressor on each environment and score it on all others.

    For every entry of the notebook-level ``environments`` array, the matching
    rows of ``data`` are used to fit an RFE feature selector (decision-tree
    estimator), a ``StandardScaler`` and an ``MLPRegressor``. The fitted
    pipeline is then evaluated on the rows of every other environment, and one
    row per (training, test) pair is appended to the notebook-level
    ``results_df`` under the model name ``"ANN"``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``'Environment'`` column, ``target_column`` and every
        column listed in the notebook-level ``drop_columns``.
    target_column : str
        Name of the column to predict.
    n_features : int, default 50
        Passed to ``RFE`` as ``n_features_to_select``.

    Returns
    -------
    None
        Results are stored as a side effect in ``results_df``.
    """
    for e_train in environments:

        print("Training environment:", e_train)

        train_data = data[data['Environment'] == e_train]
        if train_data.empty:
            # Nothing to fit on for this environment in the supplied frame.
            continue

        X_train = train_data.drop(columns=drop_columns)
        y_train = train_data[target_column]

        # Feature selection using RFE. The tree is seeded because scikit-learn
        # randomly permutes the candidate features at every split; without a
        # fixed random_state the selected subset can change between runs.
        selector = RFE(
            estimator=DecisionTreeRegressor(random_state=42),
            n_features_to_select=n_features,
            step=10,
        )
        selector.fit(X_train, y_train)

        # The selected feature names only depend on the fitted selector, so
        # they are computed once per training environment.
        selected_features = ','.join(X_train.columns[selector.support_])

        # Scale the selected features; the scaler is fitted on training rows only.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(selector.transform(X_train))

        # Build and train the MLP model
        mlp = MLPRegressor(hidden_layer_sizes=(64, 32, 16), max_iter=400, random_state=42)
        mlp.fit(X_train_scaled, y_train)

        # Evaluate the model on other environments
        for e_test in environments:
            if e_test == e_train:
                continue

            test_data = data[data['Environment'] == e_test]
            if test_data.empty:
                continue

            # Report the environment actually being tested, and only once it
            # is known that it will not be skipped.
            print("Testing environment:", e_test)

            X_test = test_data.drop(columns=drop_columns)
            y_test = test_data[target_column]
            X_test_scaled = scaler.transform(selector.transform(X_test))

            y_pred = mlp.predict(X_test_scaled)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            # R2 can be negative; np.sqrt would then emit a RuntimeWarning and
            # yield NaN, so NaN is stored directly instead.
            r = np.sqrt(r2) if r2 >= 0 else np.nan

            # Store the results in the DataFrame
            results_df.loc[len(results_df)] = ["ANN", target_column, e_train, e_test, mse, rmse, mae, r2, r, selected_features]


In [27]:
def train_LR_and_store_results(data, target_column, n_features=50):
    """Train a linear regression on each environment and score it on all others.

    For every entry of the notebook-level ``environments`` array, the matching
    rows of ``data`` are used to fit an RFE feature selector (decision-tree
    estimator), a ``StandardScaler`` and a ``LinearRegression``. The fitted
    pipeline is evaluated on the rows of every other environment, and one row
    per (training, test) pair is appended to the notebook-level ``results_df``
    under the model name ``"Linear Regression"``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``'Environment'`` column, ``target_column`` and every
        column listed in the notebook-level ``drop_columns``.
    target_column : str
        Name of the column to predict.
    n_features : int, default 50
        Passed to ``RFE`` as ``n_features_to_select``.

    Returns
    -------
    None
        Results are stored as a side effect in ``results_df``.
    """
    for e_train in environments:
        train_data = data[data['Environment'] == e_train]
        if train_data.empty:
            # Nothing to fit on for this environment in the supplied frame.
            continue

        X_train = train_data.drop(columns=drop_columns)
        y_train = train_data[target_column]

        # Feature selection using RFE. The tree is seeded because scikit-learn
        # randomly permutes the candidate features at every split; without a
        # fixed random_state the selected subset can change between runs.
        selector = RFE(
            estimator=DecisionTreeRegressor(random_state=42),
            n_features_to_select=n_features,
            step=10,
        )
        selector.fit(X_train, y_train)

        # The selected feature names only depend on the fitted selector, so
        # they are computed once per training environment.
        selected_features = ','.join(X_train.columns[selector.support_])

        # Scale the selected features; the scaler is fitted on training rows only.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(selector.transform(X_train))

        # Build and train the Linear Regression model
        lr = LinearRegression()
        lr.fit(X_train_scaled, y_train)

        # Evaluate the model on other environments
        for e_test in environments:
            if e_test == e_train:
                continue

            test_data = data[data['Environment'] == e_test]
            if test_data.empty:
                # Metrics are undefined without test rows.
                continue

            X_test = test_data.drop(columns=drop_columns)
            y_test = test_data[target_column]
            X_test_scaled = scaler.transform(selector.transform(X_test))

            y_pred = lr.predict(X_test_scaled)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            # R2 can be negative; np.sqrt would then emit a RuntimeWarning and
            # yield NaN, so NaN is stored directly instead.
            r = np.sqrt(r2) if r2 >= 0 else np.nan

            # Store the results in the DataFrame
            results_df.loc[len(results_df)] = ["Linear Regression", target_column, e_train, e_test, mse, rmse, mae, r2, r, selected_features]



In [28]:
def train_RF_and_store_results(data, target_column, n_features=50):
    """Train a random forest on each environment and score it on all others.

    For every entry of the notebook-level ``environments`` array, the matching
    rows of ``data`` are used to fit an RFE feature selector (decision-tree
    estimator), a ``StandardScaler`` and a ``RandomForestRegressor`` with 100
    trees. The fitted pipeline is evaluated on the rows of every other
    environment, and one row per (training, test) pair is appended to the
    notebook-level ``results_df`` under the model name ``"Random Forest"``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``'Environment'`` column, ``target_column`` and every
        column listed in the notebook-level ``drop_columns``.
    target_column : str
        Name of the column to predict.
    n_features : int, default 50
        Passed to ``RFE`` as ``n_features_to_select``.

    Returns
    -------
    None
        Results are stored as a side effect in ``results_df``.
    """
    for e_train in environments:
        train_data = data[data['Environment'] == e_train]
        if train_data.empty:
            # Nothing to fit on for this environment in the supplied frame.
            continue

        X_train = train_data.drop(columns=drop_columns)
        y_train = train_data[target_column]

        # Feature selection using RFE. The tree is seeded because scikit-learn
        # randomly permutes the candidate features at every split; without a
        # fixed random_state the selected subset can change between runs.
        selector = RFE(
            estimator=DecisionTreeRegressor(random_state=42),
            n_features_to_select=n_features,
            step=10,
        )
        selector.fit(X_train, y_train)

        # The selected feature names only depend on the fitted selector, so
        # they are computed once per training environment.
        selected_features = ','.join(X_train.columns[selector.support_])

        # Scale the selected features; the scaler is fitted on training rows only.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(selector.transform(X_train))

        # Build and train the Random Forest model
        rf = RandomForestRegressor(n_estimators=100, random_state=42)
        rf.fit(X_train_scaled, y_train)

        # Evaluate the model on other environments
        for e_test in environments:
            if e_test == e_train:
                continue

            test_data = data[data['Environment'] == e_test]
            if test_data.empty:
                # Metrics are undefined without test rows.
                continue

            X_test = test_data.drop(columns=drop_columns)
            y_test = test_data[target_column]
            X_test_scaled = scaler.transform(selector.transform(X_test))

            y_pred = rf.predict(X_test_scaled)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            # R2 can be negative; np.sqrt would then emit a RuntimeWarning and
            # yield NaN, so NaN is stored directly instead.
            r = np.sqrt(r2) if r2 >= 0 else np.nan

            # Store the results in the DataFrame
            results_df.loc[len(results_df)] = ["Random Forest", target_column, e_train, e_test, mse, rmse, mae, r2, r, selected_features]

In [29]:
def train_ridge_and_store_results(data, target_column, n_features=50):
    """Train a ridge regression on each environment and score it on all others.

    For every entry of the notebook-level ``environments`` array, the matching
    rows of ``data`` are used to fit an RFE feature selector (decision-tree
    estimator), a ``StandardScaler`` and a ``Ridge`` model with default
    regularisation. The fitted pipeline is evaluated on the rows of every
    other environment, and one row per (training, test) pair is appended to
    the notebook-level ``results_df`` under the model name
    ``"Ridge Regression"``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``'Environment'`` column, ``target_column`` and every
        column listed in the notebook-level ``drop_columns``.
    target_column : str
        Name of the column to predict.
    n_features : int, default 50
        Passed to ``RFE`` as ``n_features_to_select``.

    Returns
    -------
    None
        Results are stored as a side effect in ``results_df``.
    """
    for e_train in environments:
        train_data = data[data['Environment'] == e_train]
        if train_data.empty:
            # Nothing to fit on for this environment in the supplied frame.
            continue

        X_train = train_data.drop(columns=drop_columns)
        y_train = train_data[target_column]

        # Feature selection using RFE. The tree is seeded because scikit-learn
        # randomly permutes the candidate features at every split; without a
        # fixed random_state the selected subset can change between runs.
        selector = RFE(
            estimator=DecisionTreeRegressor(random_state=42),
            n_features_to_select=n_features,
            step=10,
        )
        selector.fit(X_train, y_train)

        # The selected feature names only depend on the fitted selector, so
        # they are computed once per training environment.
        selected_features = ','.join(X_train.columns[selector.support_])

        # Scale the selected features; the scaler is fitted on training rows only.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(selector.transform(X_train))

        # Build and train the Ridge Regression model
        model = Ridge(random_state=42)
        model.fit(X_train_scaled, y_train)

        # Evaluate the model on other environments
        for e_test in environments:
            if e_test == e_train:
                continue

            test_data = data[data['Environment'] == e_test]
            if test_data.empty:
                # Metrics are undefined without test rows.
                continue

            X_test = test_data.drop(columns=drop_columns)
            y_test = test_data[target_column]
            X_test_scaled = scaler.transform(selector.transform(X_test))

            y_pred = model.predict(X_test_scaled)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            # R2 can be negative; np.sqrt would then emit a RuntimeWarning and
            # yield NaN, so NaN is stored directly instead.
            r = np.sqrt(r2) if r2 >= 0 else np.nan

            # Store the results in the DataFrame
            results_df.loc[len(results_df)] = ["Ridge Regression", target_column, e_train, e_test, mse, rmse, mae, r2, r, selected_features]


In [30]:
def train_lasso_and_store_results(data, target_column, n_features=50):
    """Train a lasso regression on each environment and score it on all others.

    For every entry of the notebook-level ``environments`` array, the matching
    rows of ``data`` are used to fit an RFE feature selector (decision-tree
    estimator), a ``StandardScaler`` and a ``Lasso`` model with default
    regularisation. The fitted pipeline is evaluated on the rows of every
    other environment, and one row per (training, test) pair is appended to
    the notebook-level ``results_df`` under the model name
    ``"Lasso Regression"``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``'Environment'`` column, ``target_column`` and every
        column listed in the notebook-level ``drop_columns``.
    target_column : str
        Name of the column to predict.
    n_features : int, default 50
        Passed to ``RFE`` as ``n_features_to_select``.

    Returns
    -------
    None
        Results are stored as a side effect in ``results_df``.
    """
    for e_train in environments:
        train_data = data[data['Environment'] == e_train]
        if train_data.empty:
            # Nothing to fit on for this environment in the supplied frame.
            continue

        X_train = train_data.drop(columns=drop_columns)
        y_train = train_data[target_column]

        # Feature selection using RFE. The tree is seeded because scikit-learn
        # randomly permutes the candidate features at every split; without a
        # fixed random_state the selected subset can change between runs.
        selector = RFE(
            estimator=DecisionTreeRegressor(random_state=42),
            n_features_to_select=n_features,
            step=10,
        )
        selector.fit(X_train, y_train)

        # The selected feature names only depend on the fitted selector, so
        # they are computed once per training environment.
        selected_features = ','.join(X_train.columns[selector.support_])

        # Scale the selected features; the scaler is fitted on training rows only.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(selector.transform(X_train))

        # Build and train the Lasso Regression model
        model = Lasso(random_state=42)
        model.fit(X_train_scaled, y_train)

        # Evaluate the model on other environments
        for e_test in environments:
            if e_test == e_train:
                continue

            test_data = data[data['Environment'] == e_test]
            if test_data.empty:
                # Metrics are undefined without test rows.
                continue

            X_test = test_data.drop(columns=drop_columns)
            y_test = test_data[target_column]
            X_test_scaled = scaler.transform(selector.transform(X_test))

            y_pred = model.predict(X_test_scaled)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            # R2 can be negative; np.sqrt would then emit a RuntimeWarning and
            # yield NaN, so NaN is stored directly instead.
            r = np.sqrt(r2) if r2 >= 0 else np.nan

            # Store the results in the DataFrame
            results_df.loc[len(results_df)] = ["Lasso Regression", target_column, e_train, e_test, mse, rmse, mae, r2, r, selected_features]



In [31]:
def train_svm_and_store_results(data, target_column, n_features=50):
    """Train a support vector regressor on each environment and score it on all others.

    For every entry of the notebook-level ``environments`` array, the matching
    rows of ``data`` are used to fit an RFE feature selector (decision-tree
    estimator), a ``StandardScaler`` and an ``SVR`` with default settings. The
    fitted pipeline is evaluated on the rows of every other environment, and
    one row per (training, test) pair is appended to the notebook-level
    ``results_df`` under the model name ``"Support Vector Machine"``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``'Environment'`` column, ``target_column`` and every
        column listed in the notebook-level ``drop_columns``.
    target_column : str
        Name of the column to predict.
    n_features : int, default 50
        Passed to ``RFE`` as ``n_features_to_select``.

    Returns
    -------
    None
        Results are stored as a side effect in ``results_df``.
    """
    for e_train in environments:
        train_data = data[data['Environment'] == e_train]
        if train_data.empty:
            # Nothing to fit on for this environment in the supplied frame.
            continue

        X_train = train_data.drop(columns=drop_columns)
        y_train = train_data[target_column]

        # Feature selection using RFE. The tree is seeded because scikit-learn
        # randomly permutes the candidate features at every split; without a
        # fixed random_state the selected subset can change between runs.
        selector = RFE(
            estimator=DecisionTreeRegressor(random_state=42),
            n_features_to_select=n_features,
            step=10,
        )
        selector.fit(X_train, y_train)

        # The selected feature names only depend on the fitted selector, so
        # they are computed once per training environment.
        selected_features = ','.join(X_train.columns[selector.support_])

        # Scale the selected features; the scaler is fitted on training rows only.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(selector.transform(X_train))

        # Build and train the Support Vector Machine model
        model = SVR()
        model.fit(X_train_scaled, y_train)

        # Evaluate the model on other environments
        for e_test in environments:
            if e_test == e_train:
                continue

            test_data = data[data['Environment'] == e_test]
            if test_data.empty:
                # Metrics are undefined without test rows.
                continue

            X_test = test_data.drop(columns=drop_columns)
            y_test = test_data[target_column]
            X_test_scaled = scaler.transform(selector.transform(X_test))

            y_pred = model.predict(X_test_scaled)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            # R2 can be negative; np.sqrt would then emit a RuntimeWarning and
            # yield NaN, so NaN is stored directly instead.
            r = np.sqrt(r2) if r2 >= 0 else np.nan

            # Store the results in the DataFrame
            results_df.loc[len(results_df)] = ["Support Vector Machine", target_column, e_train, e_test, mse, rmse, mae, r2, r, selected_features]


In [None]:
# Train and evaluate the models

# Number of features RFE keeps for every model. The "n1000" in the output
# file name below appears to encode this value -- keep the two in sync.
N_FEATURES = 1000

# Start from an empty table (same columns) so that re-running this cell does
# not append a second copy of every row to results_df.
results_df = results_df.iloc[0:0].copy()

# (label used in the progress message, training function) for each model family.
trainers = [
    ("MLP", train_MLP_and_store_results),
    ("Linear Regression", train_LR_and_store_results),
    ("Random Forest", train_RF_and_store_results),
    ("Ridge Regression", train_ridge_and_store_results),
    ("Lasso Regression", train_lasso_and_store_results),
    ("Support Vector Machine", train_svm_and_store_results),
]

for target in target_columns:
    for label, trainer in trainers:
        print(f"Training {label} for {target}...")
        trainer(df, target, n_features=N_FEATURES)

# Persist the collected metrics; the row index is not written.
results_csv_path = 'data/results/inv_n1000_all_model_results.csv'
results_df.to_csv(results_csv_path, index=False)

In [4]:
# Render floats with three decimals in every DataFrame shown from here on.
pd.set_option('display.float_format', lambda value: '%.3f' % value)

# Reload the persisted metrics so the analysis below can run without retraining.
results_df = pd.read_csv("data/results/inv_n1000_all_model_results.csv")
results_df

Unnamed: 0,Model,Target Variable,Training Environment,Test Environment,MSE,RMSE,MAE,R2,R,Selected Features
0,ANN,th_positive_cells,Earth,SFug,43405.443,208.340,123.715,-193.004,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
1,ANN,th_positive_cells,Earth,SF1g,21393.919,146.267,109.138,-282.392,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
2,ANN,th_positive_cells,Earth,Earth_log,68128.757,261.015,260.996,-6633598.611,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
3,ANN,th_positive_cells,Earth,SFug_log,66474.059,257.826,257.817,-1663140.420,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
4,ANN,th_positive_cells,Earth,SF1g_log,67241.356,259.309,259.292,-10149335.082,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
...,...,...,...,...,...,...,...,...,...,...
4585,Support Vector Machine,repo_glial_cells,SF1g_deseq2,Earth_clr,275335.172,524.724,524.724,-15095185.715,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
4586,Support Vector Machine,repo_glial_cells,SF1g_deseq2,SFug_clr,275204.787,524.600,524.600,-4199119.415,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
4587,Support Vector Machine,repo_glial_cells,SF1g_deseq2,SF1g_clr,275039.602,524.442,524.442,-4797961.367,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
4588,Support Vector Machine,repo_glial_cells,SF1g_deseq2,Earth_deseq2,14388.573,119.952,105.977,-3.370,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."


In [None]:
# Rank every (model, target, train env, test env) evaluation by R2, best
# first, and show the top 20. sort_values returns a new frame, so results_df
# itself is left untouched.
sorted_results = results_df.sort_values(by='R2', ascending=False)
sorted_results.head(20)

Unnamed: 0,Model,Target Variable,Training Environment,Test Environment,MSE,RMSE,MAE,R2,R,Selected Features
2397,Linear Regression,repo_glial_cells,Earth_deseq2,Earth,5.379,2.319,1.895,0.998,0.999,"Unnamed: 0,FBgn0000003,FBgn0000008,FBgn0000014..."
3009,Ridge Regression,repo_glial_cells,Earth_deseq2,Earth,12.162,3.487,3.131,0.996,0.998,"Unnamed: 0,FBgn0000003,FBgn0000008,FBgn0000014..."
3027,Ridge Regression,repo_glial_cells,SFug_deseq2,SFug,187.804,13.704,11.786,0.985,0.993,"Unnamed: 0,FBgn0000003,FBgn0000008,FBgn0000014..."
2415,Linear Regression,repo_glial_cells,SFug_deseq2,SFug,204.212,14.29,12.897,0.984,0.992,"Unnamed: 0,FBgn0000003,FBgn0000008,FBgn0000014..."
3333,Lasso Regression,repo_glial_cells,SFug_deseq2,SFug,293.907,17.144,9.976,0.977,0.989,"Unnamed: 0,FBgn0000003,FBgn0000008,FBgn0000014..."
3315,Lasso Regression,repo_glial_cells,Earth_deseq2,Earth,102.674,10.133,10.076,0.967,0.984,"Unnamed: 0,FBgn0000003,FBgn0000008,FBgn0000014..."
579,Linear Regression,th_positive_cells,SFug_deseq2,SFug,8.592,2.931,2.781,0.962,0.981,"Unnamed: 0,FBgn0000003,FBgn0000008,FBgn0000014..."
1191,Ridge Regression,th_positive_cells,SFug_deseq2,SFug,8.636,2.939,2.72,0.961,0.981,"Unnamed: 0,FBgn0000003,FBgn0000008,FBgn0000014..."
3074,Lasso Regression,repo_glial_cells,Earth,Earth_deseq2,129.283,11.37,10.982,0.961,0.98,"Unnamed: 0,FBgn0000003,FBgn0000008,FBgn0000014..."
3092,Lasso Regression,repo_glial_cells,SFug,SFug_deseq2,580.654,24.097,21.546,0.957,0.978,"Unnamed: 0,FBgn0000003,FBgn0000008,FBgn0000014..."


In [7]:
# For each (Model, Target Variable) pair, locate the row with the highest R2.
best_row_labels = results_df.groupby(['Model', 'Target Variable'])['R2'].idxmax()

# Keep only the columns of interest for those rows (the 'R' column is left out).
summary_columns = [
    'Model', 'Target Variable', 'Training Environment', 'Test Environment',
    'RMSE', 'MSE', 'MAE', 'R2', 'Selected Features',
]
best_models = results_df.loc[best_row_labels, summary_columns]

# Display the best performing instance for each model and target variable.
best_models


Unnamed: 0,Model,Target Variable,Training Environment,Test Environment,RMSE,MSE,MAE,R2,Selected Features
3045,ANN,repo_glial_cells,SF1g_deseq2,SF1g,111.563,12446.326,86.97,0.348,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
1080,ANN,th_positive_cells,Earth_boxcox,SFug_boxcox,26.151,683.863,25.278,-0.081,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
4251,Lasso Regression,repo_glial_cells,SFug_deseq2,SFug,6.924,47.942,6.596,0.996,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
2397,Lasso Regression,th_positive_cells,Earth_deseq2,Earth,0.681,0.464,0.568,0.994,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
3333,Linear Regression,repo_glial_cells,SFug_deseq2,SFug,8.434,71.139,6.964,0.994,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
1497,Linear Regression,th_positive_cells,SFug_deseq2,SFug,3.217,10.35,2.873,0.954,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
3398,Random Forest,repo_glial_cells,SFug,SFug_deseq2,64.067,4104.568,48.348,0.697,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
1562,Random Forest,th_positive_cells,SFug,SFug_deseq2,7.822,61.182,6.273,0.71,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
3945,Ridge Regression,repo_glial_cells,SFug_deseq2,SFug,9.795,95.948,8.693,0.993,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
2109,Ridge Regression,th_positive_cells,SFug_deseq2,SFug,3.325,11.053,3.01,0.951,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."


In [None]:
# Persist the per-model best rows. The index (row labels carried over from
# results_df) is written as the first CSV column because index=False is not passed.
best_models.to_csv("data/results/best_models.csv")