In [1]:
import pandas as pd
import os
import glob
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
import random

In [3]:
def load_dat_file(filepath: str, delimiter: str = '\t') -> np.ndarray:
    """
    Load a .dat file into a NumPy array.

    Args:
        filepath (str): Path to the .dat file.
        delimiter (str, optional): The delimiter used in the .dat file. 
                                   Default is tab ('\\t').

    Returns:
        np.ndarray: The data from the .dat file as a NumPy array.
    """
    # loadtxt will automatically infer rows/columns based on the file
    data = np.loadtxt(filepath, delimiter=delimiter)
    return data

In [4]:
def compute_fisher_z(signals: np.ndarray) -> np.ndarray:
    """
    Given time series data of shape (n_timepoints, n_regions),
    compute the pairwise Pearson correlation among the columns (i.e., regions),
    then apply the Fisher Z-transform to those correlation values.

    Args:
        signals (np.ndarray): fMRI time-series data of shape (n_timepoints, n_regions),
                              where each column is a region, and each row is a timepoint.

    Returns:
        fisher_z_mat (np.ndarray): (n_regions, n_regions) matrix of
                                   Fisher Z-transformed connectivity.
    """
    # Step 1: Compute Pearson correlation among columns (regions)
    # rowvar=False => treat each column as a variable
    corr_mat = np.corrcoef(signals, rowvar=False)  # shape: (n_regions, n_regions)

    # Step 2: Apply Fisher Z-transform
    # Z = arctanh(r), i.e. 0.5 * ln((1+r)/(1-r))
    # We clamp r to avoid infinity at r=±1
    epsilon = 1e-8
    corr_mat = np.clip(corr_mat, -1 + epsilon, 1 - epsilon)
    fisher_z_mat = np.arctanh(corr_mat)

    return fisher_z_mat

In [5]:
def approximate_tolerance_interval(
    hc_values: np.ndarray,
    coverage: float = 0.90
) -> tuple:
    """
    Approximate a two-sided (100 * coverage)% tolerance interval
    by taking the central (coverage)% of the healthy-control values.
    Here we simply compute the lower and upper percentiles around the median.
    
    For example, coverage=0.90 yields the 5th and 95th percentiles.

    Args:
        hc_values (np.ndarray): 1D array of FC scores for HCs.
        coverage (float): Desired coverage proportion (e.g., 0.90 for 90%).

    Returns:
        (lower_bound, upper_bound) as floats.
    """
    # Sort the healthy control FC values
    sorted_vals = np.sort(hc_values)

    # We define the "central coverage" interval, e.g. 90% => cut 5% on each tail.
    lower_percentile = (1.0 - coverage)/2.0 * 100.0  # 5% if coverage=90%
    upper_percentile = (1.0 - (1.0 - coverage)/2.0) * 100.0  # 95% if coverage=90%

    lower_bound = np.percentile(sorted_vals, lower_percentile)
    upper_bound = np.percentile(sorted_vals, upper_percentile)
    return (lower_bound, upper_bound)

In [6]:
def compute_tolerance_intervals_for_matrices(
    corr_matrices: list[np.ndarray],
    coverage: float = 0.90
) -> list[list[tuple[float, float]]]:
    """
    Given a list of correlation matrices (each shape (n_regions, n_regions)),
    compute an approximate tolerance interval for each cell (i, j) across all matrices.

    Steps:
      1) Validate that the list is not empty, and all matrices have the same shape.
      2) For each (i, j) cell, gather the correlation across all matrices.
      3) Call approximate_tolerance_interval(...) => (lo, hi).
      4) Return a 2D list of shape (n_regions x n_regions),
         where each element is the tuple (low_bound, high_bound).

    Args:
        corr_matrices (List[np.ndarray]): List of NxN correlation matrices.
        coverage (float): coverage proportion, e.g. 0.9.

    Returns:
        A 2D python list of shape (n_regions x n_regions),
        where each element is the tuple (low_bound, high_bound).
    """
    if len(corr_matrices) == 0:
        raise ValueError("corr_matrices list is empty.")

    # Assume all correlation matrices share the same shape
    sample_shape = corr_matrices[0].shape
    if any(mat.shape != sample_shape for mat in corr_matrices):
        raise ValueError("Not all matrices in corr_matrices have the same shape.")

    n_regions = sample_shape[0]
    intervals = [[(0.0, 0.0) for _ in range(n_regions)] for __ in range(n_regions)]

    for i in range(n_regions):
        for j in range(n_regions):
            # gather cell (i,j) across all matrices
            cell_values = [mat[i, j] for mat in corr_matrices]
            cell_values_np = np.array(cell_values)
            lo, hi = approximate_tolerance_interval(cell_values_np, coverage=coverage)
            intervals[i][j] = (lo, hi)

    return np.array(intervals)

In [7]:

def flag_outside_range(
    subject_corr: np.ndarray, 
    range_mat: np.ndarray
) -> np.ndarray:
    """
    Given:
      - subject_corr: shape (166, 166), the subject's correlation matrix
      - range_mat: shape (166, 166, 2), where range_mat[i, j, 0] = low, range_mat[i, j, 1] = high

    Returns:
      A boolean mask of shape (166, 166). True where subject_corr value 
      is outside the [low, high] range. False otherwise.
    """
    lower_bound = range_mat[:, :, 0]  # shape (166, 166)
    upper_bound = range_mat[:, :, 1]  # shape (166, 166)

    # Create a boolean mask:
    # True means correlation is outside [low, high]
    # That is subject_corr < low OR subject_corr > high
    outside_mask = (subject_corr < lower_bound) | (subject_corr > upper_bound)

    return outside_mask

In [8]:
def repeated_enrichment_analysis(
    mdd_masks,  # list of NxN boolean arrays from diseased group
    hc_masks,   # list of NxN boolean arrays from healthy controls
    coverage=0.9,       # fraction of diseased subjects to subsample each iteration
    reps=100,            # number of repeated subsamples
    consistency_thr=0.95 # fraction of times an edge must be "selected" to be labeled "extreme"
):
    """
    Demonstration code:
      - mdd_masks: list of NxN boolean matrices (one per MDD subject) indicating 'extreme' edges
      - hc_masks:  list of NxN boolean matrices (one per HC subject)  indicating 'extreme' edges
      - coverage:  fraction of MDD subjects to sample each iteration (e.g. 0.9 => 90% subset)
      - reps:      how many random subsamples
      - consistency_thr: e.g. 0.95 => we call an edge 'extreme' if it is 'selected' in at least 95% of reps

    Returns:
      extremes_mask: NxN boolean array telling which edges are 'extreme'
      consistency_rate: NxN float array telling the fraction of runs that selected each edge
    """
    n_mdd = len(mdd_masks)
    if n_mdd == 0:
        raise ValueError("No MDD masks provided.")
    n_hc  = len(hc_masks)
    if n_hc == 0:
        raise ValueError("No HC masks provided.")

    # Check the shape of the first matrix for reference
    n, _ = mdd_masks[0].shape
    # Validate all shapes match
    for mat in mdd_masks + hc_masks:
        if mat.shape != (n, n):
            raise ValueError("All matrices must be NxN with the same N.")

    # Summation of edges in all HC => how frequently each edge is 'extreme' in HC
    # We'll do a naive check for demonstration
    hc_sum = np.zeros((n, n), dtype=int)
    for mask in hc_masks:
        hc_sum += mask.astype(int)
    # fraction in HC
    fraction_hc = hc_sum / n_hc

    # We'll keep track how many times each edge is 'selected' across the runs
    selected_counts = np.zeros((n, n), dtype=int)

    sub_size = int(round(coverage * n_mdd))  # e.g. 90% of MDD

    for _ in range(reps):
        # Randomly choose coverage% MDD
        subset_idx = random.sample(range(n_mdd), sub_size)

        # Summation of that subset
        subset_sum = np.zeros((n, n), dtype=int)
        for idx in subset_idx:
            subset_sum += mdd_masks[idx].astype(int)
        # fraction of that subset that is 'extreme' for each edge
        fraction_mdd = subset_sum / sub_size

        # Example "enrichment" rule:
        # "Select" an edge if fraction_mdd is strictly greater than fraction_hc
        # Real pipeline might do a hypergeom test or p-value threshold
        selected_this_run = (fraction_mdd > fraction_hc)

        selected_counts += selected_this_run

    consistency_rate = selected_counts / reps
    extremes_mask = (consistency_rate >= consistency_thr)

    return extremes_mask, consistency_rate

In [9]:
abide_df = pd.read_csv("./Phenotypic_V1_0b.csv", index_col=0)

In [10]:
abide_df.head()

Unnamed: 0,SITE_ID,SUB_ID,FILE_ID,DX_GROUP,DSM_IV_TR,AGE_AT_SCAN,SEX,HANDEDNESS_CATEGORY,HANDEDNESS_SCORES,FIQ,...,WISC_IV_BLK_DSN_SCALED,WISC_IV_PIC_CON_SCALED,WISC_IV_MATRIX_SCALED,WISC_IV_DIGIT_SPAN_SCALED,WISC_IV_LET_NUM_SCALED,WISC_IV_CODING_SCALED,WISC_IV_SYM_SCALED,EYE_STATUS_AT_SCAN,AGE_AT_MPRAGE,BMI
0,CALTECH,51456,Caltech_0051456,1,4,55.4,1,R,,126.0,...,,,,,,,,2,,
1,CALTECH,51457,Caltech_0051457,1,4,22.9,1,Ambi,,107.0,...,,,,,,,,2,,
2,CALTECH,51458,Caltech_0051458,1,1,39.2,1,R,,93.0,...,,,,,,,,2,,
3,CALTECH,51459,Caltech_0051459,1,1,22.8,1,R,,106.0,...,,,,,,,,2,,
4,CALTECH,51460,Caltech_0051460,1,1,34.6,2,Ambi,,133.0,...,,,,,,,,2,,


In [11]:
all_dat_file = glob.glob("/blue/ruogu.fang/ryoi360/projects/fmri_vlm/data/ABIDE_parcelled_yeo17/*")

In [12]:
subj_to_fmri_arr = {os.path.basename(path).replace("_MNI_2mm.dat", ""): load_dat_file(path) for path in all_dat_file}

In [13]:
abide_df["fmri_arr"] = abide_df["FILE_ID"].map(subj_to_fmri_arr)

In [14]:
abide_df["fmri_arr"].iloc[0]

array([[-1.01231891e-02, -3.46075718e+00, -3.49284756e+01, ...,
         2.19633155e+00, -4.73039578e+00,  4.28631490e+00],
       [-1.74925499e-02, -2.22421529e+00, -4.03677604e+01, ...,
         6.64559158e+00, -3.10425970e+00,  9.73879194e+00],
       [-2.09682972e-02, -5.14201400e-01, -3.27705525e+01, ...,
         5.81919052e+00, -6.05617235e-01, -1.81072118e+00],
       ...,
       [ 2.53874102e-03,  2.18767997e+00,  5.57620185e+01, ...,
         2.64041631e+01,  3.95058809e+00,  1.97457896e+01],
       [-1.27511507e-02,  1.17814194e-01,  4.82443158e+01, ...,
         2.54738583e+01,  6.02349804e+00, -1.14388751e+01],
       [-1.74437400e-02, -1.22939262e+00,  2.75592267e+01, ...,
         1.62567343e+01,  4.70006554e+00, -2.61908430e+01]],
      shape=(146, 17))

In [15]:
abide_df = abide_df.dropna(subset=["fmri_arr"])

In [16]:
abide_df["corr_matrix"] = abide_df["fmri_arr"].apply(lambda x: compute_fisher_z(x))

  c /= stddev[:, None]
  c /= stddev[None, :]


In [17]:
abide_df["corr_matrix"].iloc[0].shape

(17, 17)

In [18]:
control_df = abide_df[abide_df["DX_GROUP"] == 2].copy()

In [19]:
# Split data, allocating 30% to the test set
X_train, X_test = train_test_split(control_df, test_size=0.3, random_state=42)

In [20]:
all_tolerance_intervals = compute_tolerance_intervals_for_matrices(X_train["corr_matrix"].to_list())

In [24]:
all_tolerance_intervals

array([[[ 9.55691396e+00,  9.55691396e+00],
        [ 3.19137994e-01,  1.30347764e+00],
        [ 1.48646114e-01,  1.13626907e+00],
        [-4.26211075e-02,  9.21887195e-01],
        [ 1.11325873e-02,  1.00778669e+00],
        [-1.02369954e-01,  7.68863726e-01],
        [-1.82066776e-01,  5.59207708e-01],
        [-1.60046486e-01,  6.59321914e-01],
        [-1.36635505e-01,  5.56322148e-01],
        [-9.22196447e-02,  6.79279743e-01],
        [ 4.26453080e-03,  8.91466787e-01],
        [-4.68835066e-02,  7.67614630e-01],
        [-5.81180889e-02,  8.03281709e-01],
        [-1.52558735e-01,  6.51783899e-01],
        [-1.55492875e-01,  6.51822490e-01],
        [-1.76932845e-01,  5.48545036e-01],
        [-1.69850321e-01,  6.23972136e-01]],

       [[ 3.19137994e-01,  1.30347764e+00],
        [ 9.55691396e+00,  9.55691396e+00],
        [ 3.28636132e-01,  1.18805107e+00],
        [ 3.47057279e-01,  1.17655514e+00],
        [ 2.96090615e-01,  1.29596223e+00],
        [ 6.98673045e-03,  8.6

In [21]:
raise ValueErrors()

NameError: name 'ValueErrors' is not defined

In [None]:
abide_df["extreme_corr_mask"] = abide_df["corr_matrix"].apply(lambda x: flag_outside_range(x, all_tolerance_intervals))

In [None]:
hc_corr_mask = abide_df.loc[X_test.index]["extreme_corr_mask"].to_list()
ASD_corr_mask = abide_df[abide_df["DX_GROUP"] == 1]["extreme_corr_mask"].to_list()

In [None]:
abide_df["extreme_corr_mask"].iloc[0]

In [None]:
abide_df

In [None]:
def plot_functional_matrix(matrix, title="Functional Connectivity Matrix", colormap="coolwarm", vmin=None, vmax=None):
    """
    Visualizes a functional connectivity matrix using a heatmap with adjustable color range.

    Parameters:
    - matrix (np.ndarray): 2D NumPy array representing the connectivity matrix.
    - title (str): Title of the plot.
    - colormap (str): Matplotlib colormap for visualization.
    - vmin (float, optional): Minimum value for color scale (default is matrix min).
    - vmax (float, optional): Maximum value for color scale (default is matrix max).
    """
    plt.figure(figsize=(10, 8))
    plt.imshow(matrix, cmap=colormap, interpolation="nearest", aspect="auto", vmin=vmin, vmax=vmax)
    plt.colorbar(label="Connectivity Strength")
    plt.title(title)
    plt.xlabel("Brain Region Index")
    plt.ylabel("Brain Region Index")
    plt.xticks(range(matrix.shape[0]))
    plt.yticks(range(matrix.shape[1]))
    plt.grid(False)
    plt.show()

In [None]:
np.median(abide_df["corr_matrix"].iloc[0])

In [None]:
plot_functional_matrix(abide_df[abide_df["DX_GROUP"] == 2]["corr_matrix"].iloc[1], vmin=0, vmax=2)

In [None]:
plot_functional_matrix(abide_df[abide_df["DX_GROUP"] == 1]["corr_matrix"].iloc[1], vmin=0, vmax=2)

In [None]:
extremes_mask, consistency_rate = repeated_enrichment_analysis(ASD_corr_mask, hc_corr_mask)

In [None]:
sum(extremes_mask)

In [None]:
raise ValueError()

In [None]:
import numpy as np
from nilearn import datasets
from nilearn.connectome import ConnectivityMeasure

# Load fMRI data (replace with your actual data)
rest_dataset = datasets.fetch_atlas_harvard_oxford("cort-maxprob-thr0-1mm")
time_series = np.random.rand(100, 48) # Example: 100 time points, 48 regions

# Define the connectivity measure
connectivity_measure = ConnectivityMeasure(kind='correlation')

# Compute the connectivity matrix
correlation_matrix = connectivity_measure.fit_transform([time_series])[0]

# Print or visualize the matrix
print(correlation_matrix.shape)
print(correlation_matrix)

In [None]:
np.array(all_tolerance_intervals)

In [None]:
control_df

In [None]:
len(df["SUB_ID"].unique())

In [None]:
df["SITE_ID"].unique()

In [None]:
df["DX_GROUP"].unique()

In [None]:
df.columns

In [None]:
preprocessed_metadata_df = pd.read_csv("/blue/ruogu.fang/ryoi360/projects/fmri_vlm/results/2025_02_25_ABIDE_processing/Phenotypic_V1_0b_preprocessed1.csv")

In [None]:
for col in preprocessed_metadata_df.columns:
    print(col)

In [None]:
motion_filtered_df = preprocessed_metadata_df[(preprocessed_metadata_df['func_mean_fd'] <= 0.2) & (preprocessed_metadata_df['func_num_fd'] < 20)]

print(f"Subjects after stricter filtering: {len(motion_filtered_df)}")

In [None]:
len(preprocessed_metadata_df.columns)

In [None]:
preprocessed_metadata_df["func_mean_fd"]

In [None]:
preprocessed_metadata_df["func_fwhm"].max()

In [None]:
import glob

In [None]:
abide_files = glob.glob("/orange/ruogu.fang/ryoi360/ABIDE/*")

In [None]:
len(abide_files)

In [None]:
import numpy as np
import nibabel as nib
from nilearn import plotting
import matplotlib.pyplot as plt

In [None]:
fmri_img = nib.load(abide_files[0])
fmri_data = fmri_img.get_fdata()

In [None]:
fmri_data.shape

In [None]:
# Get the voxel size from the affine transformation matrix
voxel_size = np.sqrt(np.sum(fmri_img.affine[:3, :3] ** 2, axis=0))
print("Original Voxel Size (mm):", voxel_size)

In [None]:
1. Install Required Packages
If you haven't installed Nipype, Nibabel, and NiLearn, do so using:

bash
Copy
Edit
pip install nipype nibabel nilearn numpy scipy


In [None]:
2. Preprocessing Steps in Python
Step 1: Rigid Body Motion Correction
Use SPM's Realign function via Nipype.
Alternatively, use FSL's MCFLIRT.
SPM12 (via Nipype)
python
Copy
Edit
from nipype.interfaces.spm import Realign

realign = Realign()
realign.inputs.in_files = 'subject_func.nii'  # Replace with your file path
realign.inputs.register_to_mean = True
realign.run()
FSL Alternative
python
Copy
Edit
from nipype.interfaces.fsl import MCFLIRT

mcflirt = MCFLIRT()
mcflirt.inputs.in_file = 'subject_func.nii'
mcflirt.inputs.out_file = 'motion_corrected.nii'
mcflirt.run()
Step 2: Slice Timing Correction
Adjusts for differences in slice acquisition time.
Requires TR (repetition time) and slice order.
python
Copy
Edit
from nipype.interfaces.spm import SliceTiming

slice_timing = SliceTiming()
slice_timing.inputs.in_files = 'motion_corrected.nii'
slice_timing.inputs.time_repetition = 2.0  # Set the correct TR
slice_timing.run()
Step 3: Normalization to MNI Space
Warp the functional data into MNI152 template.
Use SPM's Normalize or FSL's FLIRT/FNIRT.
SPM Normalization
python
Copy
Edit
from nipype.interfaces.spm import Normalize12

normalize = Normalize12()
normalize.inputs.image_to_align = 'slice_time_corrected.nii'
normalize.inputs.apply_to_files = ['slice_time_corrected.nii']
normalize.inputs.jobtype = 'estwrite'  # Estimate and apply transformation
normalize.run()
FSL FLIRT Alternative
python
Copy
Edit
from nipype.interfaces.fsl import FLIRT

flirt = FLIRT()
flirt.inputs.in_file = 'slice_time_corrected.nii'
flirt.inputs.reference = '/usr/local/fsl/data/standard/MNI152_T1_2mm_brain.nii.gz'
flirt.inputs.out_file = 'normalized.nii'
flirt.run()
Step 4: Resampling to 3×3×3 mm³
Use NiLearn for resampling.
python
Copy
Edit
from nilearn.image import resample_img
import nibabel as nib

img = nib.load("normalized.nii")

resampled_img = resample_img(img, target_affine=np.diag([3, 3, 3, 1]))
nib.save(resampled_img, "resampled_3mm.nii")
Step 5: Spatial Smoothing (FWHM = 6 mm)
Apply Gaussian smoothing using NiLearn.
python
Copy
Edit
from nilearn.image import smooth_img

smoothed_img = smooth_img("resampled_3mm.nii", fwhm=6)
smoothed_img.to_filename("smoothed.nii")
