# Model Development

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB

In [14]:
df = pd.read_csv("data/datasets/augmented_data.csv")
results_df = pd.DataFrame(columns=['Model', 'Target Variable', 'Training Environment', 'Test Environment', 'MSE', 'RMSE', 'MAE', 'R2', 'R', 'Selected Features'])
df.drop(columns=df.columns[0], inplace=True)
target_columns = df.columns[-2:].to_list()

# Extract environments from source_name
df['Environment'] = df['source_name'].apply(lambda x: x.split('_')[0])
environments = df['Environment'].unique()

In [15]:
target_variables = df[target_columns]
target_variables.describe()

Unnamed: 0,th_positive_cells,repo_glial_cells
count,96.0,96.0
mean,42.536048,212.528303
std,44.903146,252.787051
min,-1.857618,-1.384246
25%,4.624973,6.227505
50%,10.049876,22.278411
75%,92.5,454.0
max,120.0,792.0


In [16]:
def train_MLP_and_store_results(data, target_column, n_features=50):
    for e_train in environments:
        
        print("Training environment:", e_train)
        
        train_data = data[data['Environment'] == e_train]
        
        X_train = train_data.drop(columns=[target_column, 'Environment', 'source_name'])
        y_train = train_data[target_column]
        
        # Feature selection using RFE
        selector = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=n_features, step=10)
        selector = selector.fit(X_train, y_train)
        
        # Select the important features
        X_train_selected = selector.transform(X_train)
        
        # Scale the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_selected)
        
        # Build and train the MLP model
        mlp = MLPRegressor(hidden_layer_sizes=(64, 32, 16), max_iter=400, random_state=42)
        mlp.fit(X_train_scaled, y_train)
        
        # Evaluate the model on other environments
        for e_test in environments:
            
            print("Testing environment:", e_train)
            if (e_test == e_train) or (len(data[data['Environment'] == e_test]) == 0):
                continue
            
            test_data = data[data['Environment'] == e_test]
            X_test = test_data.drop(columns=[target_column, 'Environment', 'source_name'])
            y_test = test_data[target_column]
            
            X_test_selected = selector.transform(X_test)
            X_test_scaled = scaler.transform(X_test_selected)
            
            y_pred = mlp.predict(X_test_scaled)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            r = np.sqrt(r2)
            
            # Get the selected feature names
            selected_features = X_train.columns[selector.support_]
            
            # Store the results in the DataFrame
            results_df.loc[len(results_df)] = ["ANN", target_column, e_train, e_test, mse, rmse, mae, r2, r, ','.join(selected_features)]


In [17]:
def train_LR_and_store_results(data, target_column, n_features=50):
    for e_train in environments:
        train_data = data[data['Environment'] == e_train]
        
        X_train = train_data.drop(columns=[target_column, 'Environment', 'source_name'])
        y_train = train_data[target_column]
        
        # Feature selection using RFE
        selector = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=n_features, step=10)
        selector = selector.fit(X_train, y_train)
        
        # Select the important features
        X_train_selected = selector.transform(X_train)
        
        # Scale the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_selected)
        
        # Build and train the Linear Regression model
        lr = LinearRegression()
        lr.fit(X_train_scaled, y_train)
        
        # Evaluate the model on other environments
        for e_test in environments:
            if e_test == e_train:
                continue
            
            test_data = data[data['Environment'] == e_test]
            X_test = test_data.drop(columns=[target_column, 'Environment', 'source_name'])
            y_test = test_data[target_column]
            
            X_test_selected = selector.transform(X_test)
            X_test_scaled = scaler.transform(X_test_selected)
            
            y_pred = lr.predict(X_test_scaled)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            r = np.sqrt(r2)
            
            # Get the selected feature names
            selected_features = X_train.columns[selector.support_]
            
            # Store the results in the DataFrame
            results_df.loc[len(results_df)] = ["Linear Regression", target_column, e_train, e_test, mse, rmse, mae, r2, r, ','.join(selected_features)]



In [18]:
def train_RF_and_store_results(data, target_column, n_features=50):
    for e_train in environments:
        train_data = data[data['Environment'] == e_train]
        
        X_train = train_data.drop(columns=[target_column, 'Environment', 'source_name'])
        y_train = train_data[target_column]
        
        # Feature selection using RFE
        selector = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=n_features, step=10)
        selector = selector.fit(X_train, y_train)
        
        # Select the important features
        X_train_selected = selector.transform(X_train)
        
        # Scale the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_selected)
        
        # Build and train the Random Forest model
        rf = RandomForestRegressor(n_estimators=100, random_state=42)
        rf.fit(X_train_scaled, y_train)
        
        # Evaluate the model on other environments
        for e_test in environments:
            if e_test == e_train:
                continue
            
            test_data = data[data['Environment'] == e_test]
            X_test = test_data.drop(columns=[target_column, 'Environment', 'source_name'])
            y_test = test_data[target_column]
            
            X_test_selected = selector.transform(X_test)
            X_test_scaled = scaler.transform(X_test_selected)
            
            y_pred = rf.predict(X_test_scaled)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            r = np.sqrt(r2)
            
            # Get the selected feature names
            selected_features = X_train.columns[selector.support_]
            
            # Store the results in the DataFrame
            results_df.loc[len(results_df)] = ["Random Forest", target_column, e_train, e_test, mse, rmse, mae, r2, r, ','.join(selected_features)]

In [19]:
def train_logistic_and_store_results(data, target_column, n_features=50):
    for e_train in environments:
        train_data = data[data['Environment'] == e_train]
        
        X_train = train_data.drop(columns=[target_column, 'Environment', 'source_name'])
        y_train = train_data[target_column]
        
        # Feature selection using RFE
        selector = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=n_features, step=10)
        selector = selector.fit(X_train, y_train)
        
        # Select the important features
        X_train_selected = selector.transform(X_train)
        
        # Scale the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_selected)
        
        # Build and train the Logistic Regression model
        model = LogisticRegression(max_iter=200, random_state=42)
        model.fit(X_train_scaled, y_train)
        
        # Evaluate the model on other environments
        for e_test in environments:
            if e_test == e_train:
                continue
            
            test_data = data[data['Environment'] == e_test]
            X_test = test_data.drop(columns=[target_column, 'Environment', 'source_name'])
            y_test = test_data[target_column]
            
            X_test_selected = selector.transform(X_test)
            X_test_scaled = scaler.transform(X_test_selected)
            
            y_pred = model.predict(X_test_scaled)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            r = np.sqrt(r2)
            
            # Get the selected feature names
            selected_features = X_train.columns[selector.support_]
            
            # Store the results in the DataFrame
            results_df.loc[len(results_df)] = ["Logistic Regression", target_column, e_train, e_test, mse, rmse, mae, r2, r, ','.join(selected_features)]


In [20]:
def train_ridge_and_store_results(data, target_column, n_features=50):
    for e_train in environments:
        train_data = data[data['Environment'] == e_train]
        
        X_train = train_data.drop(columns=[target_column, 'Environment', 'source_name'])
        y_train = train_data[target_column]
        
        # Feature selection using RFE
        selector = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=n_features, step=10)
        selector = selector.fit(X_train, y_train)
        
        # Select the important features
        X_train_selected = selector.transform(X_train)
        
        # Scale the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_selected)
        
        # Build and train the Ridge Regression model
        model = Ridge(random_state=42)
        model.fit(X_train_scaled, y_train)
        
        # Evaluate the model on other environments
        for e_test in environments:
            if e_test == e_train:
                continue
            
            test_data = data[data['Environment'] == e_test]
            X_test = test_data.drop(columns=[target_column, 'Environment', 'source_name'])
            y_test = test_data[target_column]
            
            X_test_selected = selector.transform(X_test)
            X_test_scaled = scaler.transform(X_test_selected)
            
            y_pred = model.predict(X_test_scaled)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            r = np.sqrt(r2)
            
            # Get the selected feature names
            selected_features = X_train.columns[selector.support_]
            
            # Store the results in the DataFrame
            results_df.loc[len(results_df)] = ["Ridge Regression", target_column, e_train, e_test, mse, rmse, mae, r2, r, ','.join(selected_features)]


In [21]:
def train_lasso_and_store_results(data, target_column, n_features=50):
    for e_train in environments:
        train_data = data[data['Environment'] == e_train]
        
        X_train = train_data.drop(columns=[target_column, 'Environment', 'source_name'])
        y_train = train_data[target_column]
        
        # Feature selection using RFE
        selector = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=n_features, step=10)
        selector = selector.fit(X_train, y_train)
        
        # Select the important features
        X_train_selected = selector.transform(X_train)
        
        # Scale the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_selected)
        
        # Build and train the Lasso Regression model
        model = Lasso(random_state=42)
        model.fit(X_train_scaled, y_train)
        
        # Evaluate the model on other environments
        for e_test in environments:
            if e_test == e_train:
                continue
            
            test_data = data[data['Environment'] == e_test]
            X_test = test_data.drop(columns=[target_column, 'Environment', 'source_name'])
            y_test = test_data[target_column]
            
            X_test_selected = selector.transform(X_test)
            X_test_scaled = scaler.transform(X_test_selected)
            
            y_pred = model.predict(X_test_scaled)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            r = np.sqrt(r2)
            
            # Get the selected feature names
            selected_features = X_train.columns[selector.support_]
            
            # Store the results in the DataFrame
            results_df.loc[len(results_df)] = ["Lasso Regression", target_column, e_train, e_test, mse, rmse, mae, r2, r, ','.join(selected_features)]



In [22]:
def train_svm_and_store_results(data, target_column, n_features=50):
    for e_train in environments:
        train_data = data[data['Environment'] == e_train]
        
        X_train = train_data.drop(columns=[target_column, 'Environment', 'source_name'])
        y_train = train_data[target_column]
        
        # Feature selection using RFE
        selector = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=n_features, step=10)
        selector = selector.fit(X_train, y_train)
        
        # Select the important features
        X_train_selected = selector.transform(X_train)
        
        # Scale the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_selected)
        
        # Build and train the Support Vector Machine model
        model = SVR()
        model.fit(X_train_scaled, y_train)
        
        # Evaluate the model on other environments
        for e_test in environments:
            if e_test == e_train:
                continue
            
            test_data = data[data['Environment'] == e_test]
            X_test = test_data.drop(columns=[target_column, 'Environment', 'source_name'])
            y_test = test_data[target_column]
            
            X_test_selected = selector.transform(X_test)
            X_test_scaled = scaler.transform(X_test_selected)
            
            y_pred = model.predict(X_test_scaled)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            r = np.sqrt(r2)
            
            # Get the selected feature names
            selected_features = X_train.columns[selector.support_]
            
            # Store the results in the DataFrame
            results_df.loc[len(results_df)] = ["Support Vector Machine", target_column, e_train, e_test, mse, rmse, mae, r2, r, ','.join(selected_features)]


In [23]:
def train_naive_bayes_and_store_results(data, target_column, n_features=50):
    for e_train in environments:
        train_data = data[data['Environment'] == e_train]
        
        X_train = train_data.drop(columns=[target_column, 'Environment', 'source_name'])
        y_train = train_data[target_column]
        
        # Feature selection using RFE
        selector = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=n_features, step=10)
        selector = selector.fit(X_train, y_train)
        
        # Select the important features
        X_train_selected = selector.transform(X_train)
        
        # Scale the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_selected)
        
        # Build and train the Naive Bayes model
        model = GaussianNB()
        model.fit(X_train_scaled, y_train)
        
        # Evaluate the model on other environments
        for e_test in environments:
            if e_test == e_train:
                continue
            
            test_data = data[data['Environment'] == e_test]
            X_test = test_data.drop(columns=[target_column, 'Environment', 'source_name'])
            y_test = test_data[target_column]
            
            X_test_selected = selector.transform(X_test)
            X_test_scaled = scaler.transform(X_test_selected)
            
            y_pred = model.predict(X_test_scaled)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            r = np.sqrt(r2)
            
            # Get the selected feature names
            selected_features = X_train.columns[selector.support_]
            
            # Store the results in the DataFrame
            results_df.loc[len(results_df)] = ["Naive Bayes", target_column, e_train, e_test, mse, rmse, mae, r2, r, ','.join(selected_features)]


In [31]:
# Train and evaluate the models
for target in target_columns:
    print(f"Training MLP for {target}...")
    train_MLP_and_store_results(df, target, n_features=2000)
    print(f"Training Linear Regression for {target}...")
    train_LR_and_store_results(df, target, n_features=2000)
    print(f"Training Random Forest for {target}...")
    train_RF_and_store_results(df, target, n_features=2000)
    print(f"Training Ridge Regression for {target}...")
    train_ridge_and_store_results(df, target, n_features=2000)
    print(f"Training Lasso Regression for {target}...")
    train_lasso_and_store_results(df, target, n_features=2000)
    print(f"Training Support Vector Machine for {target}...")
    train_svm_and_store_results(df, target, n_features=2000)

results_csv_path = 'data/results/inv_n2000_all_model_results.csv'
results_df.to_csv(results_csv_path, index=False)

Training MLP for th_positive_cells...
Training environment: Earth
Testing environment: Earth
Testing environment: Earth
Testing environment: Earth


  r = np.sqrt(r2)
  r = np.sqrt(r2)


Training environment: SFug


  r = np.sqrt(r2)


Testing environment: SFug
Testing environment: SFug
Testing environment: SFug
Training environment: SF1g


  r = np.sqrt(r2)
  r = np.sqrt(r2)


Testing environment: SF1g
Testing environment: SF1g
Testing environment: SF1g
Training Linear Regression for th_positive_cells...


  r = np.sqrt(r2)
  r = np.sqrt(r2)
  r = np.sqrt(r2)
  r = np.sqrt(r2)
  r = np.sqrt(r2)
  r = np.sqrt(r2)


Training Random Forest for th_positive_cells...
Training Ridge Regression for th_positive_cells...
Training Lasso Regression for th_positive_cells...
Training Support Vector Machine for th_positive_cells...


  r = np.sqrt(r2)
  r = np.sqrt(r2)
  r = np.sqrt(r2)
  r = np.sqrt(r2)
  r = np.sqrt(r2)
  r = np.sqrt(r2)


Training MLP for repo_glial_cells...
Training environment: Earth


  r = np.sqrt(r2)


Testing environment: Earth
Testing environment: Earth
Testing environment: Earth


  r = np.sqrt(r2)


Training environment: SFug




Testing environment: SFug
Testing environment: SFug
Testing environment: SFug


  r = np.sqrt(r2)


Training environment: SF1g




Testing environment: SF1g
Testing environment: SF1g


  r = np.sqrt(r2)


Testing environment: SF1g
Training Linear Regression for repo_glial_cells...


  r = np.sqrt(r2)
  r = np.sqrt(r2)
  r = np.sqrt(r2)
  r = np.sqrt(r2)
  r = np.sqrt(r2)
  r = np.sqrt(r2)


Training Random Forest for repo_glial_cells...
Training Ridge Regression for repo_glial_cells...
Training Lasso Regression for repo_glial_cells...
Training Support Vector Machine for repo_glial_cells...


  r = np.sqrt(r2)
  r = np.sqrt(r2)
  r = np.sqrt(r2)
  r = np.sqrt(r2)
  r = np.sqrt(r2)
  r = np.sqrt(r2)


In [32]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
results_df

Unnamed: 0,Model,Target Variable,Training Environment,Test Environment,MSE,RMSE,MAE,R2,R,Selected Features
0,ANN,th_positive_cells,Earth,SFug,754070.086,868.372,560.952,-520.191,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
1,ANN,th_positive_cells,Earth,SF1g,3347253.478,1829.550,938.194,-1294.885,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
2,ANN,th_positive_cells,SFug,Earth,950.893,30.837,22.065,0.349,0.591,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
3,ANN,th_positive_cells,SFug,SF1g,840163.587,916.604,351.837,-324.268,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
4,ANN,th_positive_cells,SF1g,Earth,2926.902,54.101,40.794,-1.002,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
...,...,...,...,...,...,...,...,...,...,...
157,Support Vector Machine,repo_glial_cells,Earth,SF1g,156515.832,395.621,275.312,-0.901,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
158,Support Vector Machine,repo_glial_cells,SFug,Earth,32723.069,180.895,171.268,-0.057,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
159,Support Vector Machine,repo_glial_cells,SFug,SF1g,100230.258,316.592,269.745,-0.217,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
160,Support Vector Machine,repo_glial_cells,SF1g,Earth,40794.009,201.975,199.883,-0.317,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."


In [34]:
sorted_results = results_df.copy().sort_values(by='MAE', ascending=True)
sorted_results

Unnamed: 0,Model,Target Variable,Training Environment,Test Environment,MSE,RMSE,MAE,R2,R,Selected Features
14,Random Forest,th_positive_cells,SFug,Earth,56.499,7.517,4.440,0.961,0.980,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
68,Random Forest,th_positive_cells,SFug,Earth,57.921,7.611,4.473,0.960,0.980,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
50,Random Forest,th_positive_cells,SFug,Earth,58.743,7.664,4.516,0.960,0.980,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
104,Random Forest,th_positive_cells,SFug,Earth,59.051,7.684,4.521,0.960,0.980,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
70,Random Forest,th_positive_cells,SF1g,Earth,89.698,9.471,5.510,0.939,0.969,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
...,...,...,...,...,...,...,...,...,...,...
133,Linear Regression,repo_glial_cells,Earth,SF1g,186723880669913097339910775373824.000,13664694678986176.000,5828179724546284.000,-2267510278425396538997997568.000,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
135,Linear Regression,repo_glial_cells,SFug,SF1g,79796522345756902761268619771904.000,8932889921282860.000,6952820317595849.000,-969021391117437641521364992.000,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
136,Linear Regression,repo_glial_cells,SF1g,Earth,66553810200900265479266608611328.000,8158051863092087.000,7350043296616467.000,-2149084290610453008156196864.000,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
137,Linear Regression,repo_glial_cells,SF1g,SFug,274875405350486338449009274781696.000,16579366856140386.000,9790804230335650.000,-4621563404993952945488789504.000,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
