# Working Memory Demand and Architecture Model

This is the model created by the Montbretia Cabinet team during the 2024-2025 Neuromatch Impact Scholars Program.

## Setup and dependencies

In [1]:
!pip install nilearn --quiet
!pip install graphviz --quiet
!pip install visualkeras --quiet

import os
import re
import tarfile
import requests
import visualkeras
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt

from glob import glob
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.losses import BinaryCrossentropy

from tensorflow.keras.metrics import BinaryAccuracy

from tensorflow.keras.optimizers import Adam

from tensorflow.keras.regularizers import l2

## Parameters and Data Download
The data used for the preliminary model was shared by Neuromatch in the [Project Booklets](https://compneuro.neuromatch.io/projects/fMRI/README.html#:~:text=5%2D23%2C%202021-,HCP%20task%20datasets,-%23) and most of our data preparation is similar to what they have shared in the ```load_hcp_task_with_behaviour.ipynb``` in the [HCP 2021 + behavior](https://compneuro.neuromatch.io/projects/fMRI/README.html#:~:text=View-,HCP%202021%20%2B%20behavior,-HCP%202021) section.

Our target experiments (```TargetExperiments``` variable below) are Working Memory, Emotion, and Language tasks.

In [3]:
# --- Dataset layout ---------------------------------------------------------
N_SUBJECTS = 100
N_PARCELS  = 360   # Data aggregated into ROIs from Glasser parcellation
TR         = 0.72  # Time resolution, in seconds
HEMIS      = ["Right", "Left"]
RUNS       = ["LR", "RL"]
N_RUNS     = len(RUNS)

# --- Conditions available in each task experiment ---------------------------
EXPERIMENTS = {
    "MOTOR"      : {"cond" : ["lf", "rf", "lh", "rh", "t", "cue"]},
    "WM"         : {"cond" : [
        "0bk_body", "0bk_faces", "0bk_places", "0bk_tools",
        "2bk_body", "2bk_faces", "2bk_places", "2bk_tools",
    ]},
    "SOCIAL"     : {"cond" : ["ment", "rnd"]},
    "GAMBLING"   : {"cond" : ["loss", "win"]},
    "EMOTION"    : {"cond" : ["fear", "neut"]},
    "LANGUAGE"   : {"cond" : ["math", "story"]},
    "RELATIONAL" : {"cond" : ["match", "relation"]},
}

# --- Experiments used by this model -----------------------------------------
TargetExperiments = ["WM", "EMOTION", "LANGUAGE"]

# Every condition of the target experiments, in experiment order
TargetConditions = [
    condition
    for experiment in TargetExperiments
    for condition in EXPERIMENTS[experiment]["cond"]
]

In [4]:
fname   = "hcp_task.tgz"
url     = "https://osf.io/2y3fw/download"
HCP_DIR = "./hcp_task"

# Download the archive only when it is not already present locally.
if not os.path.isfile(fname):
  try:
    # The timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(url, timeout=60)
  except (requests.ConnectionError, requests.Timeout):
    print("Download FAILED: Connection Error!")
  else:
    if r.status_code != requests.codes.ok:
      print("Download FAILED!")
    else:
      with open(fname, "wb") as fid:
        fid.write(r.content)

# Fail with an explicit message when the download above did not produce the
# archive, instead of letting tarfile.open() raise on the missing file.
if not os.path.isfile(fname):
  raise FileNotFoundError(f"Archive not found, cannot extract: {fname}")

# Extract the archive once into the working directory.
with tarfile.open(fname) as tfile:
  tfile.extractall('.')


In [5]:
# Read the subject IDs as strings from the list shipped in the HCP folder
SubjectIDs = list(
    np.loadtxt(os.path.join(HCP_DIR, 'subjects_list.txt'), dtype='str')
)

### ```regions.npy``` file, parcels, and subnetworks
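The ```regions.npy``` file stores per-parcel metadata: once transposed, its first row holds the parcel names and its second row the network each parcel is assigned to. (TODO: expand this description.)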

In [6]:
# Transpose so that each row of `regions` holds one kind of information
regions = np.load(f"{HCP_DIR}/regions.npy").T

# Parcel metadata: the first half of the parcels is labelled with HEMIS[0],
# the second half with HEMIS[1].
region_info = {
    "name"    : regions[0].tolist(),
    "network" : regions[1],
    "hemi"    : [hemi for hemi in HEMIS for _ in range(N_PARCELS // 2)],
}

In [7]:
# Parcel indices grouped by subnetwork. Each list below holds the integer
# parcel indices assigned to one subnetwork; the values range from 0 to 359.
# NOTE(review): presumably these are 0-based indices into the N_PARCELS
# parcels - confirm against regions.npy.
# NOTE(review): several indices appear in more than one list (e.g. 111 is in
# both orbital_affective_parcels and limbic_parcels, and 192 is in both
# visual_parcels and posterior_multimodal_parcels) - confirm whether this
# overlapping membership is intended.
ventral_attention_parcels    = [121, 134]

orbital_affective_parcels    = [111, 165, 289, 291, 345]

dorsal_attention_parcels     = [26, 139, 140, 206, 207, 320]

limbic_parcels               = [109, 111, 165, 289, 291, 345]

auditory_parcels             = [23, 102, 103, 123, 172, 173, 
                                174, 282, 286, 287, 288, 303, 352, 353, 354]

default_mode_parcels         = [11, 24, 25, 27, 
                                73, 74, 78, 80, 122, 124, 127, 128, 138, 171, 
                                191, 205, 254, 258, 302, 304, 308, 318, 351]

language_parcels             = [10, 45, 49, 94, 
                                95, 115, 126, 135, 136, 142, 145, 225, 229, 
                                274, 275, 295, 296, 306, 315, 316, 322, 325]

frontoparietal_parcels       = [13, 14, 
                                28, 62, 72, 76, 79, 81, 82, 84, 96, 97, 110, 
                                132, 143, 144, 148, 169, 170, 208, 242, 252,
                                256, 259, 260, 262, 264, 276, 277, 290, 298]

somatomotor_parcels          = [7, 8, 35, 38, 39, 40, 41, 46, 
                                50, 51, 52, 53, 54, 55, 99, 100, 101, 167, 
                                187, 188, 215, 218, 219, 220, 221, 226, 230, 
                                231, 232, 233, 234, 235, 279, 280, 281, 347]

cingulo_opercular_parcels    = [9, 36, 37, 42, 43, 44, 56, 57, 58, 59, 98, 
                                104, 105, 107, 112, 113, 114, 116, 178, 179, 
                                189, 190, 204, 216, 217, 222, 223, 224, 236, 
                                237, 238, 239, 257, 261, 263, 265, 275, 277, 
                                285, 292, 293, 346, 348, 357, 358, 359]

visual_parcels               = [0, 1, 2, 3, 4, 5, 6,
                                12, 15, 16, 17, 18, 19, 20, 21, 22, 47, 48,
                                137, 141, 151, 152, 153, 155, 156, 157, 158, 
                                159, 162, 186, 192, 195, 196, 197, 198, 199, 
                                200, 201, 202, 227, 228, 317, 321, 331, 332, 
                                333, 335, 336, 337, 338, 339, 342]

posterior_multimodal_parcels = [29, 30, 31, 32, 33, 34, 60, 61, 63, 64, 65, 
                                66, 67, 68, 69, 70, 71, 75, 86, 87, 88, 89, 
                                117, 118, 119, 130, 131, 133, 154, 160, 161, 
                                163, 164, 175, 176, 177, 180, 181, 182, 183, 
                                184, 185, 192, 193, 194, 195, 196, 197, 198, 
                                199, 200, 201, 202, 214, 215, 216, 217, 218, 
                                219, 220, 221, 230, 231, 232, 233, 234, 235, 
                                246, 247, 248, 249, 250, 251, 252, 253, 255, 
                                266, 267, 269, 270, 271, 272, 273, 276, 277, 
                                278, 279, 280, 281, 283, 284, 297, 299, 300, 
                                301, 305, 307, 309, 310, 311, 312, 313, 314, 
                                319, 323, 324, 326, 327, 328, 329, 330, 341, 
                                344, 349, 350]

# Dictionary of subnetworks. Each key is "<network>_nw_<number of parcels>",
# with the count taken from the length of the list; each value is the
# corresponding list of parcels defined above.
subnetworks = {
    f"visual_nw_{len(visual_parcels)}"                             : visual_parcels             ,
    f"limbic_nw_{len(limbic_parcels)}"                             : limbic_parcels             ,
    f"auditory_nw_{len(auditory_parcels)}"                         : auditory_parcels           ,
    f"language_nw_{len(language_parcels)}"                         : language_parcels           ,
    f"somatomotor_nw_{len(somatomotor_parcels)}"                   : somatomotor_parcels        ,
    f"default_mode_nw_{len(default_mode_parcels)}"                 : default_mode_parcels       ,
    f"frontoparietal_nw_{len(frontoparietal_parcels)}"             : frontoparietal_parcels     ,
    f"dorsal_attention_nw_{len(dorsal_attention_parcels)}"         : dorsal_attention_parcels   ,
    f"cingulo_opercular_nw_{len(cingulo_opercular_parcels)}"       : cingulo_opercular_parcels  ,
    f"orbital_affective_nw_{len(orbital_affective_parcels)}"       : orbital_affective_parcels  ,
    f"ventral_attention_nw_{len(ventral_attention_parcels)}"       : ventral_attention_parcels  ,
    f"posterior_multimodal_nw_{len(posterior_multimodal_parcels)}" : posterior_multimodal_parcels
}

## Preparing data for the model
Here, we create dataframes that contain the data relative to subjects and ROIs (parcels).

In the preliminary model, the datapoints were the average BOLD signals for each parcel.

In this model, the datapoints are timeseries of BOLD signals that will be stored in an array for the model to use.

### Helper function related to creating the dataframes

In [8]:
def load_single_timeseries(subject, experiment, run, remove_mean=True):
    """Read the BOLD timeseries of one subject for one run of an experiment.

    Arguments:
        subject (str):      subject ID to load
        experiment (str):   Name of experiment
        run (int):          index into RUNS (0 or 1)
        remove_mean (bool): If True, subtract from every row its mean
                            over the second axis (parcel-wise mean)

    Returns:
        ts (n_parcel x n_timepoint array): Array of BOLD data values

    Raises:
        FileNotFoundError: if the timeseries file does not exist
    
    """
    ts_path = (
        f"{HCP_DIR}/subjects/{subject}/{experiment}"
        f"/tfMRI_{experiment}_{RUNS[run]}/data.npy"
    )

    # Guard clause: report the missing path explicitly
    if not os.path.exists(ts_path):
        raise FileNotFoundError(f"Timeseries file not found: {ts_path}")

    ts = np.load(ts_path)
    return ts - ts.mean(axis=1, keepdims=True) if remove_mean else ts

In [9]:
def load_evs(subject, experiment, run): # This function isn't used in this model.
    """Collect, for every condition, the frames covered by each trial.

    Arguments:
        subject (str): subject ID to load
        experiment (str): Name of experiment
        run (int): 0 or 1

    Returns:
        evs (list of lists): one entry per condition of the experiment;
            each entry lists, per trial, the array of frame indices
            associated with that trial
    
    """
    run_dir = f"{HCP_DIR}/subjects/{subject}/{experiment}/tfMRI_{experiment}_{RUNS[run]}"
    fields  = ("onset", "duration", "amplitude")

    def _trial_frames(cond):
        # One array per field, as given by np.loadtxt(..., unpack=True)
        ev = dict(zip(fields, np.loadtxt(f"{run_dir}/EVs/{cond}.txt", ndmin=2, unpack=True)))
        # First frame of each trial, rounded down
        first_frames = np.floor(ev["onset"] / TR).astype(int)
        # Number of frames of each trial, rounded up
        n_frames = np.ceil(ev["duration"] / TR).astype(int)
        return [first + np.arange(0, count) for first, count in zip(first_frames, n_frames)]

    return [_trial_frames(cond) for cond in EXPERIMENTS[experiment]["cond"]]


def load_evs_as_dict(subject, experiment, run):
    """Read the EV (explanatory variable) files of one run, keyed by condition.

    Arguments:
        subject (str): subject ID to load
        experiment (str): Name of experiment
        run (int): 0 or 1

    Returns:
        evs (dict): maps each condition of the experiment to a dictionary
            with the keys "onset", "duration" and "amplitude"

    Raises:
        FileNotFoundError: if the EV file of a condition does not exist
    
    """
    ev_dir = f"{HCP_DIR}/subjects/{subject}/{experiment}/tfMRI_{experiment}_{RUNS[run]}/EVs"
    fields = ["onset", "duration", "amplitude"]

    evs = {}
    for cond in EXPERIMENTS[experiment]["cond"]:
        ev_file = f"{ev_dir}/{cond}.txt"
        if os.path.exists(ev_file):
            evs[cond] = dict(zip(fields, np.loadtxt(ev_file, ndmin=2, unpack=True)))
        else:
            raise FileNotFoundError(f"EV file not found: {ev_file}")
    return evs

In [10]:
def create_dataframe(subject, experiment):
    """
    Creates a dataframe that contains the parcel-based 
    BOLD signals from a subject for each condition.

    Every row describes one timepoint of one trial: the identifying
    columns (subject, experiment, run, condition, timepoint and the EV
    onset/duration/amplitude of the trial) followed by one column per
    parcel ("parcel_1", "parcel_2", ...) holding the BOLD value of that
    parcel at that timepoint.

    Runs whose timeseries or EV files are missing are reported with
    print() and skipped.

    Arguments:
        subject (str): subject ID to load
        experiment (str): Name of experiment

    Returns:
        A dataframe of parcel-based BOLD data
        for one subject and one experiment
        (empty when no run could be loaded)
        
    """
    all_data = []

    # Iterate over the configured runs rather than a hard-coded count
    for run, run_label in enumerate(RUNS):
        try:
            ts  = load_single_timeseries(subject, experiment, run)
            evs = load_evs_as_dict(subject, experiment, run)
        except FileNotFoundError as e:
            print(e)
            continue

        n_parcels, n_timepoints = ts.shape
        # Column names are the same for every row of this run: build them once
        parcel_columns = [f"parcel_{i + 1}" for i in range(n_parcels)]

        for condition, ev_data in evs.items():
            trials = zip(ev_data["onset"], ev_data["duration"], ev_data["amplitude"])

            for onset, duration, amplitude in trials:
                start_frame = int(onset / TR)
                # Clamp the end so that the frames stay within the timeseries
                end_frame = min(start_frame + int(duration / TR), n_timepoints)

                for time_point in range(start_frame, end_frame):
                    row = {
                        "subject"      : subject   ,
                        "experiment"   : experiment,
                        "run"          : run_label ,
                        "condition"    : condition ,
                        "timepoint"    : time_point,
                        "EV_onset"     : onset     ,
                        "EV_duration"  : duration  ,
                        "EV_amplitude" : amplitude
                    }
                    # Add BOLD signal data for all parcels
                    row.update(zip(parcel_columns, ts[:, time_point]))
                    all_data.append(row)

    return pd.DataFrame(all_data)

In [11]:
def save_to_csv(df, output_folder, filename):
    """Write a dataframe, without its index, to a CSV file.

    The file is written to `filename` inside `output_folder`.

    Arguments:
        df      (dataframe): data to write
        output_folder (str): folder that receives the file
        filename      (str): name of the CSV file
    """
    df.to_csv(os.path.join(output_folder, filename), index=False)

In [12]:
def process_subject(subject, experiments, output_folder):
    """
    Builds the dataframes of one subject for the given experiments
    and saves them, concatenated row-wise, as a single CSV file.

    Works with create_dataframe() and save_to_csv() functions.
    Experiments that yield an empty dataframe are reported and skipped.
    
    Arguments:
        subject (str): subject ID to load
        experiments (list of str): names of the experiments to include
        output_folder (str): folder that receives "<subject>_data.csv"

    Returns:
        None. The result is written to disk.
    """
    all_dfs = []

    for experiment in experiments:
        df = create_dataframe(subject, experiment)
        if not df.empty:
            all_dfs.append(df)
        else:
            print(f"No data to save for subject {subject}, experiment {experiment}.")

    # Concatenate and save once, after every experiment has been collected
    # (doing this inside the loop rewrote the CSV on each iteration).
    if all_dfs:
        final_df = pd.concat(all_dfs, axis = 0)
        save_to_csv(final_df, output_folder, f"{subject}_data.csv")
    else:
        print(f"No data to save for subject {subject}.")

### Load the timeseries and isolate trials for WM, Emotion, and Language

#### Create dataframes of all trials for each subject

In [13]:
# Folder that receives one "<subject>_data.csv" file per subject
output_folder = "./output_csv_files"
os.makedirs(output_folder, exist_ok=True)

# Build and save the dataframe of every subject in turn
for subject in SubjectIDs:
    process_subject(
        subject,
        experiments=TargetExperiments,
        output_folder=output_folder,
    )

#### Create dataframe of all trials and all subjects for WM, Emotion, and Language

In [21]:
# os.listdir() returns entries in arbitrary order; sorting makes the row
# order of the merged dataframe reproducible between runs.
output_files = sorted(os.listdir(output_folder))
output_CSVs  = [file for file in output_files if file.endswith(".csv")]

# Fail with a clear message instead of the generic pd.concat error
if not output_CSVs:
    raise FileNotFoundError(f"No CSV files found in {output_folder}")

# Read every per-subject CSV, then concatenate them in one step
all_trials_df = [pd.read_csv(os.path.join(output_folder, file)) for file in output_CSVs]

all_trials_merged_df = pd.concat(all_trials_df, ignore_index = True)

all_trials_merged_df.to_csv("all_trials_merged_df.csv", index = False)

#### Tweak dataframe before creating the Numpy array

Here's the identifier column naming template for the dataframe that will be turned into the Numpy array:
- Identifier column (24 possibilities): ```experiment_run_condition```
    - 2 runs of the 8 subtasks of Working Memory
    - 2 runs of the 2 subtasks of Emotion
    - 2 runs of the 2 subtasks of Language
- Subject (100 subjects)
- Parcels (360 parcels)
- Timeseries

##### Merge experiment, run, and condition in ```experiment_run_condition``` column

In [27]:
# Join the three identifier columns element-wise with "_" as separator
all_trials_merged_df["experiment_run_condition"] = all_trials_merged_df["experiment"].str.cat(
    [all_trials_merged_df["run"], all_trials_merged_df["condition"]],
    sep="_",
)

### Create array of ```Subtask``` x ```Subject``` x ```Parcel``` x ```Timeseries``` for each experiment

Array dimensions:

**Working Memory Numpy array → 16 x 100 x 360 x ???**

*16 (2 runs of 8 subtasks) x 100 (subjects) x 360 (parcels) x ??? (timeseries entries)*

**Emotion Numpy array → 4 x 100 x 360 x ???**

*4 (2 runs of 2 subtasks) x 100 (subjects) x 360 (parcels) x ??? (timeseries entries)*

**Language Numpy array → 4 x 100 x 360 x ???**

*4 (2 runs of 2 subtasks) x 100 (subjects) x 360 (parcels) x ??? (timeseries entries)*

In [None]:
# To find the number of timeseries entries,
# we will look at the shape of the dataframes
# created per subject, not of the merged dataframe.

#### Working Memory Numpy Array

#### Emotion Numpy Array

#### Language Numpy Array

### Calculate AUCs (above x-axis and below x-axis)

### Update array with AUC numbers