# Model Development

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

In [36]:
df = pd.read_csv("filtered_data.csv")
results_df = pd.DataFrame(columns=['Model', 'Target Variable', 'MSE', 'RMSE', 'MAE', 'R2', 'R', 'Selected Features'])
df.drop(columns=df.columns[0], inplace=True)
target_columns = df.columns[-2:].to_list()
target_columns

['th_positive_cells', 'repo_glial_cells']

In [37]:
target_variables = df[target_columns]
target_variables.describe()

Unnamed: 0,th_positive_cells,repo_glial_cells
count,24.0,24.0
mean,93.33,479.387
std,16.667,126.027
min,51.0,276.0
25%,83.75,397.75
50%,95.5,455.0
75%,103.5,537.25
max,120.0,792.0


In [28]:
def train_MLP_and_store_results(data, target_column, n_features=50):
    # Prepare the dataset
    X = data.drop(columns=[target_column])
    y = data[target_column]
    
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Feature selection using RFE
    selector = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=n_features, step=10)
    selector = selector.fit(X_train, y_train)
    
    # Select the important features
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)
    
    # Build and train the MLP model
    mlp = MLPRegressor(hidden_layer_sizes=(64, 32, 16), max_iter=200, random_state=42)
    mlp.fit(X_train_scaled, y_train)
    
    # Evaluate the model
    y_pred = mlp.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    r = np.sqrt(r2)
    
    # Get the selected feature names
    selected_features = X.columns[selector.support_]
    
    # Store the results in the DataFrame
    results_df.loc[len(results_df)] = ["ANN", target_column, mse, rmse, mae, r2, r, ','.join(selected_features)]

In [29]:
def train_LR_and_store_results(data, target_column, n_features=50):
    # Prepare the dataset
    X = data.drop(columns=[target_column])
    y = data[target_column]
    
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Feature selection using RFE
    selector = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=n_features, step=10)
    selector = selector.fit(X_train, y_train)
    
    # Select the important features
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)
    
    # Build and train the Linear Regression model
    lr = LinearRegression()
    lr.fit(X_train_scaled, y_train)
    
    # Evaluate the model
    y_pred = lr.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    r = np.sqrt(r2)
    
    # Get the selected feature names
    selected_features = X.columns[selector.support_]
    
    # Store the results in the DataFrame
    results_df.loc[len(results_df)] = ["Linear Regression", target_column, mse, rmse, mae, r2, r, ','.join(selected_features)]


In [30]:
# Function to train and evaluate a Random Forest model
def train_RF_and_store_results(data, target_column, n_features=50):
    # Prepare the dataset
    X = data.drop(columns=[target_column])
    y = data[target_column]
    
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Feature selection using RFE
    selector = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=n_features, step=10)
    selector = selector.fit(X_train, y_train)
    
    # Select the important features
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)
    
    # Build and train the Random Forest model
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train_scaled, y_train)
    
    # Evaluate the model
    y_pred = rf.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    r = np.sqrt(r2)
    
    # Get the selected feature names
    selected_features = X.columns[selector.support_]
    
    # Store the results in the DataFrame
    results_df.loc[len(results_df)] = ["Random Forest", target_column, mse, rmse, mae, r2, r, ','.join(selected_features)]

In [38]:
# Train and evaluate the models
for target in target_columns:
    print(f"Training MLP for {target}...")
    train_MLP_and_store_results(df, target)
    print(f"Training Linear Regression for {target}...")
    train_LR_and_store_results(df, target)
    print(f"Training Random Forest for {target}...")
    train_RF_and_store_results(df, target)
    
results_csv_path = 'model_results.csv'
results_df.to_csv(results_csv_path, index=False)

Training MLP for th_positive_cells...


  r = np.sqrt(r2)


Training Linear Regression for th_positive_cells...


  r = np.sqrt(r2)


Training Random Forest for th_positive_cells...


  r = np.sqrt(r2)


Training MLP for repo_glial_cells...


  r = np.sqrt(r2)


Training Linear Regression for repo_glial_cells...


  r = np.sqrt(r2)


Training Random Forest for repo_glial_cells...


  r = np.sqrt(r2)


In [39]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
results_df

Unnamed: 0,Model,Target Variable,MSE,RMSE,MAE,R2,R,Selected Features
0,ANN,th_positive_cells,73849.705,271.753,139.293,-619.377,,"FBgn0000008,FBgn0000015,FBgn0000032,FBgn000003..."
1,Linear Regression,th_positive_cells,54133.266,232.666,115.032,-453.749,,"FBgn0000008,FBgn0000015,FBgn0000018,FBgn000002..."
2,Random Forest,th_positive_cells,315.897,17.773,12.939,-1.654,,"FBgn0000017,FBgn0000028,FBgn0000032,FBgn000003..."
3,ANN,repo_glial_cells,204312.137,452.009,330.045,-19.804,,"FBgn0000014,FBgn0000036,FBgn0000037,FBgn000004..."
4,Linear Regression,repo_glial_cells,111753.708,334.296,305.097,-10.379,,"FBgn0000003,FBgn0000024,FBgn0000032,FBgn000003..."
5,Random Forest,repo_glial_cells,20581.472,143.462,126.95,-1.096,,"FBgn0000017,FBgn0000042,FBgn0000044,FBgn000004..."
