# Model Development

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

In [5]:
df = pd.read_csv("filtered_data.csv")
results_df = pd.DataFrame(columns=['Model', 'Target Variable', 'MSE', 'RMSE', 'MAE', 'R2', 'R', 'Selected Features'])
df.drop(columns=df.columns[0], inplace=True)
target_columns = df.columns[-2:].to_list()
target_columns

['th_positive_cells', 'repo_glial_cells']

In [6]:
target_variables = df[target_columns]
target_variables.describe()

Unnamed: 0,th_positive_cells,repo_glial_cells
count,24.0,24.0
mean,93.330128,479.387401
std,16.666919,126.026996
min,51.0,276.0
25%,83.75,397.75
50%,95.5,455.0
75%,103.5,537.25
max,120.0,792.0


In [7]:
def train_MLP_and_store_results(data, target_column, n_features=50):
    # Prepare the dataset
    X = data.drop(columns=[target_column])
    y = data[target_column]
    
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Feature selection using RFE
    selector = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=n_features, step=10)
    selector = selector.fit(X_train, y_train)
    
    # Select the important features
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)
    
    # Build and train the MLP model
    mlp = MLPRegressor(hidden_layer_sizes=(64, 32, 16), max_iter=200, random_state=42)
    mlp.fit(X_train_scaled, y_train)
    
    # Evaluate the model
    y_pred = mlp.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    r = np.sqrt(r2)
    
    # Get the selected feature names
    selected_features = X.columns[selector.support_]
    
    # Store the results in the DataFrame
    results_df.loc[len(results_df)] = ["ANN", target_column, mse, rmse, mae, r2, r, ','.join(selected_features)]

In [8]:
def train_LR_and_store_results(data, target_column, n_features=50):
    # Prepare the dataset
    X = data.drop(columns=[target_column])
    y = data[target_column]
    
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Feature selection using RFE
    selector = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=n_features, step=10)
    selector = selector.fit(X_train, y_train)
    
    # Select the important features
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)
    
    # Build and train the Linear Regression model
    lr = LinearRegression()
    lr.fit(X_train_scaled, y_train)
    
    # Evaluate the model
    y_pred = lr.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    r = np.sqrt(r2)
    
    # Get the selected feature names
    selected_features = X.columns[selector.support_]
    
    # Store the results in the DataFrame
    results_df.loc[len(results_df)] = ["Linear Regression", target_column, mse, rmse, mae, r2, r, ','.join(selected_features)]


In [9]:
# Function to train and evaluate a Random Forest model
def train_RF_and_store_results(data, target_column, n_features=50):
    # Prepare the dataset
    X = data.drop(columns=[target_column])
    y = data[target_column]
    
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Feature selection using RFE
    selector = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=n_features, step=10)
    selector = selector.fit(X_train, y_train)
    
    # Select the important features
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)
    X_test_scaled = scaler.transform(X_test_selected)
    
    # Build and train the Random Forest model
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train_scaled, y_train)
    
    # Evaluate the model
    y_pred = rf.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    r = np.sqrt(r2)
    
    # Get the selected feature names
    selected_features = X.columns[selector.support_]
    
    # Store the results in the DataFrame
    results_df.loc[len(results_df)] = ["Random Forest", target_column, mse, rmse, mae, r2, r, ','.join(selected_features)]

In [12]:
# Train and evaluate the models
for target in target_columns:
    print(f"Training MLP for {target}...")
    train_MLP_and_store_results(df, target, n_features=5000)
    print(f"Training Linear Regression for {target}...")
    train_LR_and_store_results(df, target, n_features=5000)
    print(f"Training Random Forest for {target}...")
    train_RF_and_store_results(df, target, n_features=5000)
    
results_csv_path = 'n5000_model_results.csv'
results_df.to_csv(results_csv_path, index=False)

Training MLP for th_positive_cells...


  r = np.sqrt(r2)


Training Linear Regression for th_positive_cells...


  r = np.sqrt(r2)


Training Random Forest for th_positive_cells...
Training MLP for repo_glial_cells...


  r = np.sqrt(r2)


Training Linear Regression for repo_glial_cells...


  r = np.sqrt(r2)


Training Random Forest for repo_glial_cells...


  r = np.sqrt(r2)


In [11]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
results_df

Unnamed: 0,Model,Target Variable,MSE,RMSE,MAE,R2,R,Selected Features
0,ANN,th_positive_cells,1062.492,32.596,30.189,-7.926,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
1,Linear Regression,th_positive_cells,5557.021,74.545,39.357,-45.682,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
2,Random Forest,th_positive_cells,144.416,12.017,9.704,-0.213,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
3,ANN,repo_glial_cells,32987.003,181.623,145.48,-2.359,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
4,Linear Regression,repo_glial_cells,231656.042,481.307,353.874,-22.588,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
5,Random Forest,repo_glial_cells,18640.642,136.531,112.6,-0.898,,"FBgn0000003,FBgn0000008,FBgn0000014,FBgn000001..."
