From d155d71ec705afd9f743a803a922ca1f7d49ade5 Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Thu, 26 Jan 2023 16:13:52 -0500 Subject: [PATCH 01/22] Added pca script --- scripts/scil_compute_pca.py | 281 ++++++++++++++++++++++++++++++++++++ 1 file changed, 281 insertions(+) create mode 100644 scripts/scil_compute_pca.py diff --git a/scripts/scil_compute_pca.py b/scripts/scil_compute_pca.py new file mode 100644 index 000000000..b42bd74a1 --- /dev/null +++ b/scripts/scil_compute_pca.py @@ -0,0 +1,281 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Script to compute PCA analysis on diffusion metrics. Output returned is all significant principal components +(e.g. presenting eigenvalues > 1) in a connectivity matrix format. This script can take into account all +edges from every subject in a population or only non-zero edges across all subjects. + +Designed to work best with a Connectoflow output regardless of the atlas used for segmentation. If the data +isn't a connectoflow output, please create an input structure like this : + ${input}/${subid}/Compute_Connectivity/$metric[1] + /$metric[2] + /... + /${subid}/Compute_Connectivity/$metric[1] + /$metric[2] + /... + /... + +Output connectivity matrix will be saved next to the other metrics in the input folder. The graphs and tables +will be outputted in the designed folder in the argument. + +EXAMPLE USAGE: +dimension_reduction.py input_folder/ output_folder/ --metrics ad fa md rd afd_fixel afd_total nufo + --ids list_ids.txt --common true +""" + +# Import required libraries. +import argparse +import logging +import numpy as np +import pandas as pd +from sklearn.preprocessing import StandardScaler +from sklearn.decomposition import PCA +import matplotlib.pyplot as plt +from scilpy.io.utils import (load_matrix_in_any_format, + save_matrix_in_any_format, + add_verbose_arg, + add_overwrite_arg, + assert_inputs_exist, + assert_output_dirs_exist_and_empty) + + +EPILOG = """ +[1] Chamberland M, Raven EP, Genc S, Duffy K, Descoteaux M, Parker GD, Tax CMW, Jones DK. Dimensionality + reduction of diffusion MRI measures for improved tractometry of the human brain. Neuroimage. 2019 Oct + 15;200:89-100. doi: 10.1016/j.neuroimage.2019.06.020. Epub 2019 Jun 20. PMID: 31228638; PMCID: PMC6711466. + """ + + +# Build argument parser. +def _build_arg_parser(): + p = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawTextHelpFormatter, + epilog=EPILOG) + + p.add_argument('in_folder', + help='Path to the Connectoflow output folder.') + p.add_argument('output', + help='Path to the output folder to export graphs and tables. \n' + '*** Please note, PC connectivity matrix will be outputted in the original input folder' + 'next to all other metrics ***') + p.add_argument('--metrics', nargs='+', + help='List of all metrics to include in PCA analysis.') + p.add_argument('--ids', + help="Txt file containing all subject's id.") + p.add_argument('--common', choices=['true', 'false'], default='true', type=str, + help='If true, will include only connections found in all subjects of the population (Recommended) ' + '[True].') + + add_verbose_arg(p) + add_overwrite_arg(p) + + return p + + +def creating_pca_input(d): + """ + Script to create PCA input from matrix. + :param d: Dictionary containing all matrix for all metrics. + :return: Numpy array. + """ + for a in d.keys(): + mat = np.rollaxis(np.array(d[f'{a}']), axis=1, start=3) + matrix_shape = mat.shape[1:3] + n_group = mat.shape[0] + mat = mat.reshape((np.prod(matrix_shape) * n_group, 1)) + d[f'{a}'] = mat + + out = np.hstack([d[f'{i}'] for i in d.keys()]) + + return out + + +def restore_zero_values(orig, new): + """ + Script to restore 0 values in a numpy array. + :param orig: Original numpy array containing 0 values to restore. + :param new: Array in which 0 values need to be restored. + :return: Numpy array with data from the new array but zeros from the original array. + """ + mask = np.copy(orig) + mask[mask != 0] = 1 + + return np.multiply(new, mask) + + +def autolabel(rects, axs): + """ + Attach a text label above each bar displaying its height (or bar value). + :param rects: Graphical object. + :param axs: Axe number. + :return: + """ + for rect in rects: + height = rect.get_height() + axs.text(rect.get_x() + rect.get_width()/2., height*1.05, + '%.3f' % float(height), ha='center', va='bottom') + + +def extracting_common_cnx(d, ind): + """ + Function to create a binary mask representing common connections across the population + containing non-zero values. + :param d: Dictionary containing all matrices for all metrics. + :param ind: Indice of the key to use to generate the binary mask from the dictionary. + :return: Binary mask. + """ + keys = list(d.keys()) + met = np.copy(d[keys[ind]]) + + # Replacing all non-zero values by 1. + for i in range(0, len(met)): + met[i][met[i] != 0] = 1 + + # Adding all matrices together. + orig = np.copy(met[0]) + mask = [orig] + for i in range(1, len(met)): + orig = np.add(orig, met[i]) + mask.append(orig) + + # Converting resulting values to 0 and 1. + mask_f = mask[(len(met) - 1)] + mask_f[mask_f != len(met)] = 0 + mask_f[mask_f == len(met)] = 1 + + return mask_f + + +def apply_binary_mask(d, mask): + """ + Function to apply a binary mask to all matrices contained in a dictionary. + :param d: Dictionary of all matrices from all metrics. + :param mask: Binary mask. + :return: Dictionary with the same shape as input. + """ + for a in d.keys(): + for i in range(0, len(d[f'{a}'])): + d[f'{a}'][i] = np.multiply(d[f'{a}'][i], mask) + + return d + + +def main(): + parser = _build_arg_parser() + args = parser.parse_args() + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + + assert_inputs_exist(parser, args.input) + assert_output_dirs_exist_and_empty(parser, args, args.output) + + subjects = open(args.ids).read().split() + + # Loading all matrix. + logging.debug('Loading all matrices...') + d = {f'{m}': [load_matrix_in_any_format(f'{args.in_folder}/{a}/Compute_Connectivity/{m}.npy') for a in subjects] + for m in args.metrics} + + # Setting individual matrix shape. + mat_shape = d[f'{args.metrics[0]}'][0].shape + + if args.common == 'true': + m1 = extracting_common_cnx(d, 0) + m2 = extracting_common_cnx(d, 1) + + if m1.shape != mat_shape: + parser.error("Extracted binary mask doesn't match the shape of individual input matrix.") + + if np.sum(m1) != np.sum(m2): + parser.error("Different binary mask have been generated from 2 different metrics, \n " + "please validate input data.") + else: + logging.debug('Data shows {} common connections across the population.', format(len(m1))) + + d = apply_binary_mask(d, m1) + + # Creating input structure. + logging.debug('Creating PCA input structure...') + df = creating_pca_input(d) + df[df == 0] = 'nan' + + # Standardized the data. + logging.debug('Standardizing data...') + x = StandardScaler().fit_transform(df) + df_pca = x[~np.isnan(x).any(axis=1)] + x = np.nan_to_num(x, nan=0.) + + # Perform the PCA. + logging.debug('Performing PCA...') + pca = PCA(n_components=len(args.metrics)) + principalComponents = pca.fit_transform(df_pca) + principalDf = pd.DataFrame(data=principalComponents, columns=[f'PC{i}' for i in range(1, len(args.metrics) + 1)]) + + # Plot the eigenvalues. + logging.debug('Plotting results...') + eigenvalues = pca.explained_variance_ + pos = list(range(1, len(args.metrics)+1)) + plt.clf() + plt.cla() + fig = plt.figure() + ax = fig.add_subplot(1, 1, 1) + bar_eig = ax.bar(pos, eigenvalues, align='center', tick_label=principalDf.columns) + ax.set_xlabel('Principal Components', fontsize=10) + ax.set_ylabel('Eigenvalues', fontsize=10) + ax.set_title('Eigenvalues for each principal components.', fontsize=10) + autolabel(bar_eig, ax) + plt.tight_layout() + plt.savefig(f'{args.output}/eigenvalues.pdf') + + # Plot the explained variance. + explained_var = pca.explained_variance_ratio_ + plt.clf() + plt.cla() + fig = plt.figure() + ax = fig.add_subplot(1, 1, 1) + bar_var = ax.bar(pos, explained_var, align='center', tick_label=principalDf.columns) + ax.set_xlabel('Principal Components', fontsize=10) + ax.set_ylabel('Explained variance', fontsize=10) + ax.set_title('Amount of explained variance for all principal components.', fontsize=10) + autolabel(bar_var, ax) + plt.tight_layout() + plt.savefig(f'{args.output}/explained_variance.pdf') + + # Plot the contribution of each measures to principal component. + component = pca.components_ + output_component = pd.DataFrame(component, index=principalDf.columns, columns=args.metrics) + output_component.to_excel(f'{args.output}/loadings.xlsx', index=True, header=True) + plt.clf() + plt.cla() + fig, axs = plt.subplots(2) + fig.suptitle('Graph of the contribution of each measures to the first and second principal component.', fontsize=10) + bar_pc1 = axs[0].bar(pos, component[0], align='center', tick_label=args.metrics) + bar_pc2 = axs[1].bar(pos, component[1], align='center', tick_label=args.metrics) + autolabel(bar_pc1, axs[0]) + autolabel(bar_pc2, axs[1]) + for ax in axs.flat: + ax.set(xlabel='Diffusion measures', ylabel='Loadings') + for ax in axs.flat: + ax.label_outer() + plt.tight_layout() + plt.savefig(f'{args.output}/contribution.pdf') + + # Extract the derived newly computed measures from the PCA analysis. + logging.debug('Saving matrices for PC with eigenvalues > 1...') + out = pca.transform(x) + out = restore_zero_values(x, out) + out = out.swapaxes(0, 1).reshape(len(args.metrics), len(subjects), mat_shape[0], mat_shape[1]) + + # Save matrix for significant components + nb_pc = eigenvalues[eigenvalues >= 1] + for i in range(0, len(nb_pc)): + for s in range(0, len(subjects)): + save_matrix_in_any_format(f'{args.in_folder}/{subjects[s]}/Compute_Connectivity/PC{i+1}.npy', + out[i, s, :, :]) + + +if __name__ == "__main__": + main() From 12407bce1ec1fa7cf85c3b90b3852f5165e4692e Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Thu, 2 Feb 2023 14:47:23 -0500 Subject: [PATCH 02/22] Added alternative input structure support --- scripts/scil_compute_pca.py | 98 +++++++++++++++++++++++-------------- 1 file changed, 60 insertions(+), 38 deletions(-) diff --git a/scripts/scil_compute_pca.py b/scripts/scil_compute_pca.py index b42bd74a1..3472bcea7 100644 --- a/scripts/scil_compute_pca.py +++ b/scripts/scil_compute_pca.py @@ -6,27 +6,25 @@ (e.g. presenting eigenvalues > 1) in a connectivity matrix format. This script can take into account all edges from every subject in a population or only non-zero edges across all subjects. -Designed to work best with a Connectoflow output regardless of the atlas used for segmentation. If the data -isn't a connectoflow output, please create an input structure like this : - ${input}/${subid}/Compute_Connectivity/$metric[1] - /$metric[2] - /... - /${subid}/Compute_Connectivity/$metric[1] - /$metric[2] - /... - /... +The script can take directly as input a connectoflow output folder. Simply use the --connectoflow flag. For +other type of folder input, the script expect a single folder containing all matrices for all subjects. Example: + $input_folder/sub-01_ad.npy + /sub-01_md.npy + /sub-02_ad.npy + /sub-03_md.npy + /... Output connectivity matrix will be saved next to the other metrics in the input folder. The graphs and tables will be outputted in the designed folder in the argument. EXAMPLE USAGE: -dimension_reduction.py input_folder/ output_folder/ --metrics ad fa md rd afd_fixel afd_total nufo - --ids list_ids.txt --common true +dimension_reduction.py input_folder/ output_folder/ --metrics ad fa md rd [...] --list_ids list_ids.txt --common true """ # Import required libraries. import argparse import logging +import os import numpy as np import pandas as pd from sklearn.preprocessing import StandardScaler @@ -54,19 +52,21 @@ def _build_arg_parser(): formatter_class=argparse.RawTextHelpFormatter, epilog=EPILOG) - p.add_argument('in_folder', - help='Path to the Connectoflow output folder.') + p.add_argument('input', + help='Path to the input folder.') p.add_argument('output', help='Path to the output folder to export graphs and tables. \n' '*** Please note, PC connectivity matrix will be outputted in the original input folder' 'next to all other metrics ***') - p.add_argument('--metrics', nargs='+', + p.add_argument('--metrics', nargs='+', required=True, help='List of all metrics to include in PCA analysis.') - p.add_argument('--ids', - help="Txt file containing all subject's id.") - p.add_argument('--common', choices=['true', 'false'], default='true', type=str, + p.add_argument('--list_ids', + help='List containing all ids to use in PCA computation.') + p.add_argument('--common', choices=['true', 'false'], default='true', help='If true, will include only connections found in all subjects of the population (Recommended) ' '[True].') + p.add_argument('--connectoflow', action='store_true', + help='If true, script will assume the input folder is a Connectoflow output.') add_verbose_arg(p) add_overwrite_arg(p) @@ -76,7 +76,7 @@ def _build_arg_parser(): def creating_pca_input(d): """ - Script to create PCA input from matrix. + Function to create PCA input from matrix. :param d: Dictionary containing all matrix for all metrics. :return: Numpy array. """ @@ -94,7 +94,7 @@ def creating_pca_input(d): def restore_zero_values(orig, new): """ - Script to restore 0 values in a numpy array. + Function to restore 0 values in a numpy array. :param orig: Original numpy array containing 0 values to restore. :param new: Array in which 0 values need to be restored. :return: Numpy array with data from the new array but zeros from the original array. @@ -162,22 +162,43 @@ def apply_binary_mask(d, mask): return d +def grab_files_by_end(end_pattern, dir): + """ + Function to grab files following an ending pattern. + :param end_pattern: Ending pattern to match. + :param dir: Directory. + :return: List of filenames matching the end pattern. + """ + files = [f for f in os.listdir(dir) if f.endswith(end_pattern)] + + return sorted(files) + + def main(): parser = _build_arg_parser() args = parser.parse_args() if args.verbose: - logging.basicConfig(level=logging.DEBUG) + logging.getLogger().setLevel(logging.INFO) - assert_inputs_exist(parser, args.input) assert_output_dirs_exist_and_empty(parser, args, args.output) - subjects = open(args.ids).read().split() - - # Loading all matrix. - logging.debug('Loading all matrices...') - d = {f'{m}': [load_matrix_in_any_format(f'{args.in_folder}/{a}/Compute_Connectivity/{m}.npy') for a in subjects] - for m in args.metrics} + subjects = open(args.list_ids).read().split() + + if args.connectoflow: + # Loading all matrix. + logging.info('Loading all matrices from a Connectoflow output...') + d = {f'{m}': [load_matrix_in_any_format(f'{args.input}/{a}/Compute_Connectivity/{m}.npy') for a in subjects] + for m in args.metrics} + else: + logging.info('Loading all matrices...') + d = {f'{m}': [load_matrix_in_any_format(f'{args.input}/{a}') for a in grab_files_by_end(f'{m}.npy', f'{args.input}')] + for m in args.metrics} + # Assert that all metrics have the same number of subjects. + nb_sub = [len(d[f'{m}']) for m in args.metrics] + assert all(x == len(subjects) for x in nb_sub), "Error, the number of matrices for each metric doesn't match" \ + "the number of subject in the id list." \ + "Please validate input folder." # Setting individual matrix shape. mat_shape = d[f'{args.metrics[0]}'][0].shape @@ -193,29 +214,29 @@ def main(): parser.error("Different binary mask have been generated from 2 different metrics, \n " "please validate input data.") else: - logging.debug('Data shows {} common connections across the population.', format(len(m1))) + logging.info('Data shows {} common connections across the population.'.format(len(m1))) d = apply_binary_mask(d, m1) # Creating input structure. - logging.debug('Creating PCA input structure...') + logging.info('Creating PCA input structure...') df = creating_pca_input(d) df[df == 0] = 'nan' # Standardized the data. - logging.debug('Standardizing data...') + logging.info('Standardizing data...') x = StandardScaler().fit_transform(df) df_pca = x[~np.isnan(x).any(axis=1)] x = np.nan_to_num(x, nan=0.) # Perform the PCA. - logging.debug('Performing PCA...') + logging.info('Performing PCA...') pca = PCA(n_components=len(args.metrics)) principalComponents = pca.fit_transform(df_pca) principalDf = pd.DataFrame(data=principalComponents, columns=[f'PC{i}' for i in range(1, len(args.metrics) + 1)]) # Plot the eigenvalues. - logging.debug('Plotting results...') + logging.info('Plotting results...') eigenvalues = pca.explained_variance_ pos = list(range(1, len(args.metrics)+1)) plt.clf() @@ -227,7 +248,6 @@ def main(): ax.set_ylabel('Eigenvalues', fontsize=10) ax.set_title('Eigenvalues for each principal components.', fontsize=10) autolabel(bar_eig, ax) - plt.tight_layout() plt.savefig(f'{args.output}/eigenvalues.pdf') # Plot the explained variance. @@ -241,7 +261,6 @@ def main(): ax.set_ylabel('Explained variance', fontsize=10) ax.set_title('Amount of explained variance for all principal components.', fontsize=10) autolabel(bar_var, ax) - plt.tight_layout() plt.savefig(f'{args.output}/explained_variance.pdf') # Plot the contribution of each measures to principal component. @@ -260,11 +279,10 @@ def main(): ax.set(xlabel='Diffusion measures', ylabel='Loadings') for ax in axs.flat: ax.label_outer() - plt.tight_layout() plt.savefig(f'{args.output}/contribution.pdf') # Extract the derived newly computed measures from the PCA analysis. - logging.debug('Saving matrices for PC with eigenvalues > 1...') + logging.info('Saving matrices for PC with eigenvalues > 1...') out = pca.transform(x) out = restore_zero_values(x, out) out = out.swapaxes(0, 1).reshape(len(args.metrics), len(subjects), mat_shape[0], mat_shape[1]) @@ -273,8 +291,12 @@ def main(): nb_pc = eigenvalues[eigenvalues >= 1] for i in range(0, len(nb_pc)): for s in range(0, len(subjects)): - save_matrix_in_any_format(f'{args.in_folder}/{subjects[s]}/Compute_Connectivity/PC{i+1}.npy', - out[i, s, :, :]) + if args.connectoflow: + save_matrix_in_any_format(f'{args.input}/{subjects[s]}/Compute_Connectivity/PC{i+1}.npy', + out[i, s, :, :]) + else: + save_matrix_in_any_format(f'{args.input}/{subjects[s]}_PC{i+1}.npy', + out[i, s, :, :]) if __name__ == "__main__": From 21eccf44ce8d38b2c13b7f366a0cfa3a7095f162 Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Thu, 2 Feb 2023 15:02:53 -0500 Subject: [PATCH 03/22] Fix typo in script description --- scripts/scil_compute_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/scil_compute_pca.py b/scripts/scil_compute_pca.py index 3472bcea7..63414b5fb 100644 --- a/scripts/scil_compute_pca.py +++ b/scripts/scil_compute_pca.py @@ -11,7 +11,7 @@ $input_folder/sub-01_ad.npy /sub-01_md.npy /sub-02_ad.npy - /sub-03_md.npy + /sub-02_md.npy /... Output connectivity matrix will be saved next to the other metrics in the input folder. The graphs and tables From de01fe703b3c1dcad248a48bdfff5833af6c3086 Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Thu, 2 Feb 2023 15:03:56 -0500 Subject: [PATCH 04/22] Added required = true to --list_ids --- scripts/scil_compute_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/scil_compute_pca.py b/scripts/scil_compute_pca.py index 63414b5fb..f600ca014 100644 --- a/scripts/scil_compute_pca.py +++ b/scripts/scil_compute_pca.py @@ -60,7 +60,7 @@ def _build_arg_parser(): 'next to all other metrics ***') p.add_argument('--metrics', nargs='+', required=True, help='List of all metrics to include in PCA analysis.') - p.add_argument('--list_ids', + p.add_argument('--list_ids', required=True, help='List containing all ids to use in PCA computation.') p.add_argument('--common', choices=['true', 'false'], default='true', help='If true, will include only connections found in all subjects of the population (Recommended) ' From 53f2cf44743740686c3c6e592bb662207ddfed38 Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Thu, 2 Feb 2023 15:09:28 -0500 Subject: [PATCH 05/22] Fixed PEP8 errors --- scripts/scil_compute_pca.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/scripts/scil_compute_pca.py b/scripts/scil_compute_pca.py index f600ca014..42206a490 100644 --- a/scripts/scil_compute_pca.py +++ b/scripts/scil_compute_pca.py @@ -34,7 +34,6 @@ save_matrix_in_any_format, add_verbose_arg, add_overwrite_arg, - assert_inputs_exist, assert_output_dirs_exist_and_empty) @@ -61,7 +60,7 @@ def _build_arg_parser(): p.add_argument('--metrics', nargs='+', required=True, help='List of all metrics to include in PCA analysis.') p.add_argument('--list_ids', required=True, - help='List containing all ids to use in PCA computation.') + help='List containing all ids to use in PCA computation.') p.add_argument('--common', choices=['true', 'false'], default='true', help='If true, will include only connections found in all subjects of the population (Recommended) ' '[True].') @@ -115,7 +114,7 @@ def autolabel(rects, axs): for rect in rects: height = rect.get_height() axs.text(rect.get_x() + rect.get_width()/2., height*1.05, - '%.3f' % float(height), ha='center', va='bottom') + '%.3f' % float(height), ha='center', va='bottom') def extracting_common_cnx(d, ind): @@ -162,14 +161,14 @@ def apply_binary_mask(d, mask): return d -def grab_files_by_end(end_pattern, dir): +def grab_files_by_end(end_pattern, folder): """ Function to grab files following an ending pattern. :param end_pattern: Ending pattern to match. - :param dir: Directory. + :param folder: Directory. :return: List of filenames matching the end pattern. """ - files = [f for f in os.listdir(dir) if f.endswith(end_pattern)] + files = [f for f in os.listdir(folder) if f.endswith(end_pattern)] return sorted(files) @@ -192,7 +191,8 @@ def main(): for m in args.metrics} else: logging.info('Loading all matrices...') - d = {f'{m}': [load_matrix_in_any_format(f'{args.input}/{a}') for a in grab_files_by_end(f'{m}.npy', f'{args.input}')] + d = {f'{m}': [load_matrix_in_any_format(f'{args.input}/{a}') + for a in grab_files_by_end(f'{m}.npy', f'{args.input}')] for m in args.metrics} # Assert that all metrics have the same number of subjects. nb_sub = [len(d[f'{m}']) for m in args.metrics] @@ -232,8 +232,8 @@ def main(): # Perform the PCA. logging.info('Performing PCA...') pca = PCA(n_components=len(args.metrics)) - principalComponents = pca.fit_transform(df_pca) - principalDf = pd.DataFrame(data=principalComponents, columns=[f'PC{i}' for i in range(1, len(args.metrics) + 1)]) + principalcomponents = pca.fit_transform(df_pca) + principaldf = pd.DataFrame(data=principalcomponents, columns=[f'PC{i}' for i in range(1, len(args.metrics) + 1)]) # Plot the eigenvalues. logging.info('Plotting results...') @@ -243,7 +243,7 @@ def main(): plt.cla() fig = plt.figure() ax = fig.add_subplot(1, 1, 1) - bar_eig = ax.bar(pos, eigenvalues, align='center', tick_label=principalDf.columns) + bar_eig = ax.bar(pos, eigenvalues, align='center', tick_label=principaldf.columns) ax.set_xlabel('Principal Components', fontsize=10) ax.set_ylabel('Eigenvalues', fontsize=10) ax.set_title('Eigenvalues for each principal components.', fontsize=10) @@ -256,7 +256,7 @@ def main(): plt.cla() fig = plt.figure() ax = fig.add_subplot(1, 1, 1) - bar_var = ax.bar(pos, explained_var, align='center', tick_label=principalDf.columns) + bar_var = ax.bar(pos, explained_var, align='center', tick_label=principaldf.columns) ax.set_xlabel('Principal Components', fontsize=10) ax.set_ylabel('Explained variance', fontsize=10) ax.set_title('Amount of explained variance for all principal components.', fontsize=10) @@ -265,7 +265,7 @@ def main(): # Plot the contribution of each measures to principal component. component = pca.components_ - output_component = pd.DataFrame(component, index=principalDf.columns, columns=args.metrics) + output_component = pd.DataFrame(component, index=principaldf.columns, columns=args.metrics) output_component.to_excel(f'{args.output}/loadings.xlsx', index=True, header=True) plt.clf() plt.cla() From a03ccdec6fae0646cf74a455c7ce40d4b5ba0703 Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Thu, 2 Feb 2023 15:10:48 -0500 Subject: [PATCH 06/22] Fixed PEP8 errors --- scripts/tests/test_compute_pca.py | 35 +++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 scripts/tests/test_compute_pca.py diff --git a/scripts/tests/test_compute_pca.py b/scripts/tests/test_compute_pca.py new file mode 100644 index 000000000..1dafd1300 --- /dev/null +++ b/scripts/tests/test_compute_pca.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import tempfile + +from scilpy.io.fetcher import get_testing_files_dict, fetch_data, get_home + + +# If they already exist, this only takes 5 seconds (check md5sum) +fetch_data(get_testing_files_dict(), keys=['connectivity.zip']) +tmp_dir = tempfile.TemporaryDirectory() + + +def test_help_option(script_runner): + ret = script_runner.run('scil_compute_pca.py', + '--help') + assert ret.success + + +def test_execution_pca(script_runner): + os.chdir(os.path.expanduser(tmp_dir.name)) + afd_max = os.path.join(get_home(), 'connectivity', + 'afd_max.npy') + length = os.path.join(get_home(), 'connectivity', + 'len.npy') + sc = os.path.join(get_home(), 'connectivity', + 'sc.npy') + vol = os.path.join(get_home(), 'connectivity', + 'vol.npy') + sim = os.path.join(get_home(), 'connectivity', + 'sim.npy') + ret = script_runner.run('scil_compute_pca.py', './', './', '--metrics', afd_max, + length, sc, vol, sim) + assert ret.success From 6fc6944a2e22e126af00fb64d1e604e996c1657a Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Mon, 20 Feb 2023 15:23:33 -0500 Subject: [PATCH 07/22] Fixed typo --- scripts/scil_compute_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/scil_compute_pca.py b/scripts/scil_compute_pca.py index 42206a490..6eb205138 100644 --- a/scripts/scil_compute_pca.py +++ b/scripts/scil_compute_pca.py @@ -15,7 +15,7 @@ /... Output connectivity matrix will be saved next to the other metrics in the input folder. The graphs and tables -will be outputted in the designed folder in the argument. +will be outputted in the designated folder in the argument. EXAMPLE USAGE: dimension_reduction.py input_folder/ output_folder/ --metrics ad fa md rd [...] --list_ids list_ids.txt --common true From c4b8da3a8efefb761462ecb88808a6c2d352eef6 Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Wed, 1 Mar 2023 13:57:12 -0500 Subject: [PATCH 08/22] Added new test data and fixed test functions --- scilpy/io/fetcher.py | 2 +- scripts/tests/test_compute_pca.py | 21 ++++++++------------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/scilpy/io/fetcher.py b/scilpy/io/fetcher.py index 2f91ca298..917fc151e 100644 --- a/scilpy/io/fetcher.py +++ b/scilpy/io/fetcher.py @@ -99,7 +99,7 @@ def get_testing_files_dict(): '3e27625a1e7f2484b7fa5028c95324cc'], 'stats.zip': ['1vsM7xuU0jF5fL5PIgN6stAH7oO683tw0', - 'bcc21835cf0bf7210bdc99ba5d8df44b'], + 'fdad170bcc9d19a2e53b0f29fd793ad4'], 'anatomical_filtering.zip': ['1Li8DdySnMnO9Gich4pilhXisjkjz1-Dy', '6f0eff5154ff0973a3dc26db00e383ea'], diff --git a/scripts/tests/test_compute_pca.py b/scripts/tests/test_compute_pca.py index 1dafd1300..b2eb472c5 100644 --- a/scripts/tests/test_compute_pca.py +++ b/scripts/tests/test_compute_pca.py @@ -8,7 +8,7 @@ # If they already exist, this only takes 5 seconds (check md5sum) -fetch_data(get_testing_files_dict(), keys=['connectivity.zip']) +fetch_data(get_testing_files_dict(), keys=['stats.zip']) tmp_dir = tempfile.TemporaryDirectory() @@ -20,16 +20,11 @@ def test_help_option(script_runner): def test_execution_pca(script_runner): os.chdir(os.path.expanduser(tmp_dir.name)) - afd_max = os.path.join(get_home(), 'connectivity', - 'afd_max.npy') - length = os.path.join(get_home(), 'connectivity', - 'len.npy') - sc = os.path.join(get_home(), 'connectivity', - 'sc.npy') - vol = os.path.join(get_home(), 'connectivity', - 'vol.npy') - sim = os.path.join(get_home(), 'connectivity', - 'sim.npy') - ret = script_runner.run('scil_compute_pca.py', './', './', '--metrics', afd_max, - length, sc, vol, sim) + input_folder = os.path.join(get_home(), 'pca') + output_folder = os.path.join(get_home(), 'pca_out') + ids = os.path.join(get_home(), 'pca', + 'list_id.txt') + ret = script_runner.run('scil_compute_pca.py', input_folder, output_folder, '--metrics', 'ad', + 'fa', 'md', 'rd', 'nufo', 'afd_total', 'afd_fixel', '--list_ids', + ids, '--common', 'true') assert ret.success From 73b6bdc4da101289534da783d763bca20f9ada5c Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Wed, 1 Mar 2023 14:07:48 -0500 Subject: [PATCH 09/22] Modified handling of subjects files and removed unused function --- scripts/scil_compute_pca.py | 40 ++++++++++++------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/scripts/scil_compute_pca.py b/scripts/scil_compute_pca.py index 6eb205138..791f1a1c6 100644 --- a/scripts/scil_compute_pca.py +++ b/scripts/scil_compute_pca.py @@ -7,15 +7,15 @@ edges from every subject in a population or only non-zero edges across all subjects. The script can take directly as input a connectoflow output folder. Simply use the --connectoflow flag. For -other type of folder input, the script expect a single folder containing all matrices for all subjects. Example: - $input_folder/sub-01_ad.npy - /sub-01_md.npy - /sub-02_ad.npy - /sub-02_md.npy - /... +other type of folder input, the script expects a single folder containing all matrices for all subjects. Example: + input_folder/sub-01_ad.npy + /sub-01_md.npy + /sub-02_ad.npy + /sub-02_md.npy + /... -Output connectivity matrix will be saved next to the other metrics in the input folder. The graphs and tables -will be outputted in the designated folder in the argument. +Output connectivity matrix will be saved next to the other metrics in the input folder. The plots and tables +will be outputted in the designated folder from the argument. EXAMPLE USAGE: dimension_reduction.py input_folder/ output_folder/ --metrics ad fa md rd [...] --list_ids list_ids.txt --common true @@ -24,7 +24,6 @@ # Import required libraries. import argparse import logging -import os import numpy as np import pandas as pd from sklearn.preprocessing import StandardScaler @@ -73,7 +72,7 @@ def _build_arg_parser(): return p -def creating_pca_input(d): +def generate_pca_input(d): """ Function to create PCA input from matrix. :param d: Dictionary containing all matrix for all metrics. @@ -161,18 +160,6 @@ def apply_binary_mask(d, mask): return d -def grab_files_by_end(end_pattern, folder): - """ - Function to grab files following an ending pattern. - :param end_pattern: Ending pattern to match. - :param folder: Directory. - :return: List of filenames matching the end pattern. - """ - files = [f for f in os.listdir(folder) if f.endswith(end_pattern)] - - return sorted(files) - - def main(): parser = _build_arg_parser() args = parser.parse_args() @@ -180,7 +167,7 @@ def main(): if args.verbose: logging.getLogger().setLevel(logging.INFO) - assert_output_dirs_exist_and_empty(parser, args, args.output) + assert_output_dirs_exist_and_empty(parser, args, args.output, create_dir=True) subjects = open(args.list_ids).read().split() @@ -191,8 +178,7 @@ def main(): for m in args.metrics} else: logging.info('Loading all matrices...') - d = {f'{m}': [load_matrix_in_any_format(f'{args.input}/{a}') - for a in grab_files_by_end(f'{m}.npy', f'{args.input}')] + d = {f'{m}': [load_matrix_in_any_format(f'{args.input}/{a}_{m}.npy') for a in subjects] for m in args.metrics} # Assert that all metrics have the same number of subjects. nb_sub = [len(d[f'{m}']) for m in args.metrics] @@ -214,13 +200,13 @@ def main(): parser.error("Different binary mask have been generated from 2 different metrics, \n " "please validate input data.") else: - logging.info('Data shows {} common connections across the population.'.format(len(m1))) + logging.info('Data shows {} common connections across the population.'.format(np.sum(m1))) d = apply_binary_mask(d, m1) # Creating input structure. logging.info('Creating PCA input structure...') - df = creating_pca_input(d) + df = generate_pca_input(d) df[df == 0] = 'nan' # Standardized the data. From 2ae9bf666da1c936ab0dd8d0c25b58f0187813ed Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Thu, 2 Mar 2023 13:37:24 -0500 Subject: [PATCH 10/22] modified according to the comments --- scripts/scil_compute_pca.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/scripts/scil_compute_pca.py b/scripts/scil_compute_pca.py index 791f1a1c6..de5e00af0 100644 --- a/scripts/scil_compute_pca.py +++ b/scripts/scil_compute_pca.py @@ -18,7 +18,7 @@ will be outputted in the designated folder from the argument. EXAMPLE USAGE: -dimension_reduction.py input_folder/ output_folder/ --metrics ad fa md rd [...] --list_ids list_ids.txt --common true +scil_compute_pca.py input_folder/ output_folder/ --metrics ad fa md rd [...] --list_ids list_ids.txt --common true """ # Import required libraries. @@ -50,9 +50,9 @@ def _build_arg_parser(): formatter_class=argparse.RawTextHelpFormatter, epilog=EPILOG) - p.add_argument('input', + p.add_argument('in_folder', help='Path to the input folder.') - p.add_argument('output', + p.add_argument('out_folder', help='Path to the output folder to export graphs and tables. \n' '*** Please note, PC connectivity matrix will be outputted in the original input folder' 'next to all other metrics ***') @@ -167,18 +167,18 @@ def main(): if args.verbose: logging.getLogger().setLevel(logging.INFO) - assert_output_dirs_exist_and_empty(parser, args, args.output, create_dir=True) + assert_output_dirs_exist_and_empty(parser, args, args.out_folder, create_dir=True) subjects = open(args.list_ids).read().split() if args.connectoflow: # Loading all matrix. logging.info('Loading all matrices from a Connectoflow output...') - d = {f'{m}': [load_matrix_in_any_format(f'{args.input}/{a}/Compute_Connectivity/{m}.npy') for a in subjects] + d = {f'{m}': [load_matrix_in_any_format(f'{args.in_folder}/{a}/Compute_Connectivity/{m}.npy') for a in subjects] for m in args.metrics} else: logging.info('Loading all matrices...') - d = {f'{m}': [load_matrix_in_any_format(f'{args.input}/{a}_{m}.npy') for a in subjects] + d = {f'{m}': [load_matrix_in_any_format(f'{args.in_folder}/{a}_{m}.npy') for a in subjects] for m in args.metrics} # Assert that all metrics have the same number of subjects. nb_sub = [len(d[f'{m}']) for m in args.metrics] @@ -234,7 +234,7 @@ def main(): ax.set_ylabel('Eigenvalues', fontsize=10) ax.set_title('Eigenvalues for each principal components.', fontsize=10) autolabel(bar_eig, ax) - plt.savefig(f'{args.output}/eigenvalues.pdf') + plt.savefig(f'{args.out_folder}/eigenvalues.pdf') # Plot the explained variance. explained_var = pca.explained_variance_ratio_ @@ -247,12 +247,12 @@ def main(): ax.set_ylabel('Explained variance', fontsize=10) ax.set_title('Amount of explained variance for all principal components.', fontsize=10) autolabel(bar_var, ax) - plt.savefig(f'{args.output}/explained_variance.pdf') + plt.savefig(f'{args.out_folder}/explained_variance.pdf') # Plot the contribution of each measures to principal component. component = pca.components_ output_component = pd.DataFrame(component, index=principaldf.columns, columns=args.metrics) - output_component.to_excel(f'{args.output}/loadings.xlsx', index=True, header=True) + output_component.to_excel(f'{args.out_folder}/loadings.xlsx', index=True, header=True) plt.clf() plt.cla() fig, axs = plt.subplots(2) @@ -265,7 +265,7 @@ def main(): ax.set(xlabel='Diffusion measures', ylabel='Loadings') for ax in axs.flat: ax.label_outer() - plt.savefig(f'{args.output}/contribution.pdf') + plt.savefig(f'{args.out_folder}/contribution.pdf') # Extract the derived newly computed measures from the PCA analysis. logging.info('Saving matrices for PC with eigenvalues > 1...') @@ -278,10 +278,10 @@ def main(): for i in range(0, len(nb_pc)): for s in range(0, len(subjects)): if args.connectoflow: - save_matrix_in_any_format(f'{args.input}/{subjects[s]}/Compute_Connectivity/PC{i+1}.npy', + save_matrix_in_any_format(f'{args.in_folder}/{subjects[s]}/Compute_Connectivity/PC{i+1}.npy', out[i, s, :, :]) else: - save_matrix_in_any_format(f'{args.input}/{subjects[s]}_PC{i+1}.npy', + save_matrix_in_any_format(f'{args.in_folder}/{subjects[s]}_PC{i+1}.npy', out[i, s, :, :]) From 04e3ada386a66edf077d60a23adfc4db66d04940 Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Mon, 6 Mar 2023 10:35:27 -0500 Subject: [PATCH 11/22] added overwrite option in test. --- scripts/tests/test_compute_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/tests/test_compute_pca.py b/scripts/tests/test_compute_pca.py index b2eb472c5..131f3589d 100644 --- a/scripts/tests/test_compute_pca.py +++ b/scripts/tests/test_compute_pca.py @@ -26,5 +26,5 @@ def test_execution_pca(script_runner): 'list_id.txt') ret = script_runner.run('scil_compute_pca.py', input_folder, output_folder, '--metrics', 'ad', 'fa', 'md', 'rd', 'nufo', 'afd_total', 'afd_fixel', '--list_ids', - ids, '--common', 'true') + ids, '--common', 'true', '-f') assert ret.success From 0e201041a14d98795eb336858709496ef6d2957e Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Tue, 7 Mar 2023 09:48:32 -0500 Subject: [PATCH 12/22] Updated example input structure --- scripts/scil_compute_pca.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/scripts/scil_compute_pca.py b/scripts/scil_compute_pca.py index de5e00af0..e78583448 100644 --- a/scripts/scil_compute_pca.py +++ b/scripts/scil_compute_pca.py @@ -7,12 +7,14 @@ edges from every subject in a population or only non-zero edges across all subjects. The script can take directly as input a connectoflow output folder. Simply use the --connectoflow flag. For -other type of folder input, the script expects a single folder containing all matrices for all subjects. Example: - input_folder/sub-01_ad.npy - /sub-01_md.npy - /sub-02_ad.npy - /sub-02_md.npy - /... +other type of folder input, the script expects a single folder containing all matrices for all subjects. +Example: + |--- in_folder + | |--- sub-01_ad.npy + | |--- sub-01_md.npy + | |--- sub-02_ad.npy + | |--- sub-02_md.npy + | |--- ... Output connectivity matrix will be saved next to the other metrics in the input folder. The plots and tables will be outputted in the designated folder from the argument. From d4968d3fc484b0c9d4eba006c6ff948bbfb07a36 Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Tue, 7 Mar 2023 09:49:37 -0500 Subject: [PATCH 13/22] Updated example input structure --- scripts/scil_compute_pca.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/scil_compute_pca.py b/scripts/scil_compute_pca.py index e78583448..610b15459 100644 --- a/scripts/scil_compute_pca.py +++ b/scripts/scil_compute_pca.py @@ -9,12 +9,12 @@ The script can take directly as input a connectoflow output folder. Simply use the --connectoflow flag. For other type of folder input, the script expects a single folder containing all matrices for all subjects. Example: - |--- in_folder - | |--- sub-01_ad.npy - | |--- sub-01_md.npy - | |--- sub-02_ad.npy - | |--- sub-02_md.npy - | |--- ... + [in_folder] + |--- sub-01_ad.npy + |--- sub-01_md.npy + |--- sub-02_ad.npy + |--- sub-02_md.npy + |--- ... Output connectivity matrix will be saved next to the other metrics in the input folder. The plots and tables will be outputted in the designated folder from the argument. From 121d64cca6ed9323ca2132f02ce93c11e04044fb Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Tue, 7 Mar 2023 10:13:09 -0500 Subject: [PATCH 14/22] Added interpretation example --- scripts/scil_compute_pca.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scripts/scil_compute_pca.py b/scripts/scil_compute_pca.py index 610b15459..9edba7155 100644 --- a/scripts/scil_compute_pca.py +++ b/scripts/scil_compute_pca.py @@ -19,6 +19,12 @@ Output connectivity matrix will be saved next to the other metrics in the input folder. The plots and tables will be outputted in the designated folder from the argument. +Interpretation of resulting principal components can be done by evaluating the loadings values for each metrics. +A value near 0 means that this metric doesn't contribute to this specific component whereas high positive or +negative values mean a larger contribution. Components can then be labeled based on which metric contributes +the highest. For example, a principal component showing a high loading for afd_fixel and near 0 loading for all +other metrics can be interpreted as axonal density (see Gagnon et al. 2022 for this specific example). + EXAMPLE USAGE: scil_compute_pca.py input_folder/ output_folder/ --metrics ad fa md rd [...] --list_ids list_ids.txt --common true """ @@ -42,6 +48,9 @@ [1] Chamberland M, Raven EP, Genc S, Duffy K, Descoteaux M, Parker GD, Tax CMW, Jones DK. Dimensionality reduction of diffusion MRI measures for improved tractometry of the human brain. Neuroimage. 2019 Oct 15;200:89-100. doi: 10.1016/j.neuroimage.2019.06.020. Epub 2019 Jun 20. PMID: 31228638; PMCID: PMC6711466. +[2] Gagnon A., Grenier G., Bocti C., Gillet V., Lepage J.-F., Baccarelli A. A., Posner J., Descoteaux M., + & Takser L. (2022). White matter microstructural variability linked to differential attentional skills + and impulsive behavior in a pediatric population. Cerebral Cortex. https://doi.org/10.1093/cercor/bhac180 """ From 96f225b5b67da657fa691ae40a9e32ea8b689b5c Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Tue, 7 Mar 2023 10:15:58 -0500 Subject: [PATCH 15/22] Fix import order --- scripts/scil_compute_pca.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/scil_compute_pca.py b/scripts/scil_compute_pca.py index 9edba7155..dbeeab841 100644 --- a/scripts/scil_compute_pca.py +++ b/scripts/scil_compute_pca.py @@ -32,11 +32,13 @@ # Import required libraries. import argparse import logging + +import matplotlib.pyplot as plt import numpy as np import pandas as pd from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA -import matplotlib.pyplot as plt + from scilpy.io.utils import (load_matrix_in_any_format, save_matrix_in_any_format, add_verbose_arg, From a5425ebea579a18debfa2811d3732e0919c045d3 Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Tue, 7 Mar 2023 10:34:25 -0500 Subject: [PATCH 16/22] Change saving directory for output folder --- scripts/scil_compute_pca.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/scripts/scil_compute_pca.py b/scripts/scil_compute_pca.py index dbeeab841..4eee5ff5a 100644 --- a/scripts/scil_compute_pca.py +++ b/scripts/scil_compute_pca.py @@ -16,14 +16,17 @@ |--- sub-02_md.npy |--- ... -Output connectivity matrix will be saved next to the other metrics in the input folder. The plots and tables -will be outputted in the designated folder from the argument. +The plots, tables and principal components matrices will be outputted in the designated folder from the + argument. If you want to move back your principal components matrices in your connectoflow +output, you can use a similar bash command for all principal components: +for sub in `cat list_id.txt`; do cp out_folder/${sub}_PC1.npy connectoflow_output/$sub/Compute_Connectivity/; done Interpretation of resulting principal components can be done by evaluating the loadings values for each metrics. A value near 0 means that this metric doesn't contribute to this specific component whereas high positive or negative values mean a larger contribution. Components can then be labeled based on which metric contributes the highest. For example, a principal component showing a high loading for afd_fixel and near 0 loading for all -other metrics can be interpreted as axonal density (see Gagnon et al. 2022 for this specific example). +other metrics can be interpreted as axonal density (see Gagnon et al. 2022 for this specific example or +ref [3] for an introduction to PCA). EXAMPLE USAGE: scil_compute_pca.py input_folder/ output_folder/ --metrics ad fa md rd [...] --list_ids list_ids.txt --common true @@ -53,6 +56,7 @@ [2] Gagnon A., Grenier G., Bocti C., Gillet V., Lepage J.-F., Baccarelli A. A., Posner J., Descoteaux M., & Takser L. (2022). White matter microstructural variability linked to differential attentional skills and impulsive behavior in a pediatric population. Cerebral Cortex. https://doi.org/10.1093/cercor/bhac180 +[3] https://towardsdatascience.com/what-are-pca-loadings-and-biplots-9a7897f2e559 """ @@ -66,9 +70,7 @@ def _build_arg_parser(): p.add_argument('in_folder', help='Path to the input folder.') p.add_argument('out_folder', - help='Path to the output folder to export graphs and tables. \n' - '*** Please note, PC connectivity matrix will be outputted in the original input folder' - 'next to all other metrics ***') + help='Path to the output folder to export graphs, tables and principal components matrices.') p.add_argument('--metrics', nargs='+', required=True, help='List of all metrics to include in PCA analysis.') p.add_argument('--list_ids', required=True, @@ -290,12 +292,8 @@ def main(): nb_pc = eigenvalues[eigenvalues >= 1] for i in range(0, len(nb_pc)): for s in range(0, len(subjects)): - if args.connectoflow: - save_matrix_in_any_format(f'{args.in_folder}/{subjects[s]}/Compute_Connectivity/PC{i+1}.npy', - out[i, s, :, :]) - else: - save_matrix_in_any_format(f'{args.in_folder}/{subjects[s]}_PC{i+1}.npy', - out[i, s, :, :]) + save_matrix_in_any_format(f'{args.out_folder}/{subjects[s]}_PC{i+1}.npy', + out[i, s, :, :]) if __name__ == "__main__": From 6c784566ce9b249f592de16ef460c6b6142ee9e9 Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Tue, 7 Mar 2023 10:44:52 -0500 Subject: [PATCH 17/22] Specifying help for --metrics and --list_ids --- scripts/scil_compute_pca.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/scil_compute_pca.py b/scripts/scil_compute_pca.py index 4eee5ff5a..f63cc1ea6 100644 --- a/scripts/scil_compute_pca.py +++ b/scripts/scil_compute_pca.py @@ -72,10 +72,11 @@ def _build_arg_parser(): p.add_argument('out_folder', help='Path to the output folder to export graphs, tables and principal components matrices.') p.add_argument('--metrics', nargs='+', required=True, - help='List of all metrics to include in PCA analysis.') - p.add_argument('--list_ids', required=True, - help='List containing all ids to use in PCA computation.') - p.add_argument('--common', choices=['true', 'false'], default='true', + help='Suffixes of all metrics to include in PCA analysis (ex: ad md fa rd). They must be ' + 'immediately followed by the .npy extension.') + p.add_argument('--list_ids', required=True, metavar='FILE', + help='Path to a .txt file containing a list of all ids.') + p.add_argument('--only_common', action='store_true', help='If true, will include only connections found in all subjects of the population (Recommended) ' '[True].') p.add_argument('--connectoflow', action='store_true', From d908c1391820d6d9e1d8de6eaa584ab779be2294 Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Tue, 7 Mar 2023 10:48:24 -0500 Subject: [PATCH 18/22] Changing --common args to --not_only_common --- scripts/scil_compute_pca.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/scripts/scil_compute_pca.py b/scripts/scil_compute_pca.py index f63cc1ea6..acd763040 100644 --- a/scripts/scil_compute_pca.py +++ b/scripts/scil_compute_pca.py @@ -76,9 +76,8 @@ def _build_arg_parser(): 'immediately followed by the .npy extension.') p.add_argument('--list_ids', required=True, metavar='FILE', help='Path to a .txt file containing a list of all ids.') - p.add_argument('--only_common', action='store_true', - help='If true, will include only connections found in all subjects of the population (Recommended) ' - '[True].') + p.add_argument('--not_only_common', action='store_true', + help='If true, will include all edges from all subjects and not only common edges (Not recommended)') p.add_argument('--connectoflow', action='store_true', help='If true, script will assume the input folder is a Connectoflow output.') @@ -205,7 +204,11 @@ def main(): # Setting individual matrix shape. mat_shape = d[f'{args.metrics[0]}'][0].shape - if args.common == 'true': + if args.not_only_common: + # Creating input structure using all edges from all subjects. + logging.info('Creating PCA input structure with all edges...') + df = generate_pca_input(d) + else: m1 = extracting_common_cnx(d, 0) m2 = extracting_common_cnx(d, 1) @@ -220,9 +223,11 @@ def main(): d = apply_binary_mask(d, m1) - # Creating input structure. - logging.info('Creating PCA input structure...') - df = generate_pca_input(d) + # Creating input structure. + logging.info('Creating PCA input structure with common edges...') + df = generate_pca_input(d) + + # Setting 0 values to nan. df[df == 0] = 'nan' # Standardized the data. From 2094aa453873a35d5661eae8c1f2916edc00340d Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Tue, 7 Mar 2023 11:12:20 -0500 Subject: [PATCH 19/22] added specifications in function and removed the f'{}' when unnecessary' --- scripts/scil_compute_pca.py | 67 ++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/scripts/scil_compute_pca.py b/scripts/scil_compute_pca.py index acd763040..62ac144d2 100644 --- a/scripts/scil_compute_pca.py +++ b/scripts/scil_compute_pca.py @@ -87,22 +87,20 @@ def _build_arg_parser(): return p -def generate_pca_input(d): +def generate_pca_input(dictionary): """ Function to create PCA input from matrix. - :param d: Dictionary containing all matrix for all metrics. - :return: Numpy array. + :param dictionary: Dictionary with metrics as keys containing list of matrices sorted by ids. + :return: Numpy array. """ - for a in d.keys(): - mat = np.rollaxis(np.array(d[f'{a}']), axis=1, start=3) + for metric in dictionary.keys(): + mat = np.rollaxis(np.array(dictionary[metric]), axis=1, start=3) matrix_shape = mat.shape[1:3] n_group = mat.shape[0] mat = mat.reshape((np.prod(matrix_shape) * n_group, 1)) - d[f'{a}'] = mat + dictionary[metric] = mat - out = np.hstack([d[f'{i}'] for i in d.keys()]) - - return out + return np.hstack([dictionary[i] for i in dictionary.keys()]) def restore_zero_values(orig, new): @@ -131,16 +129,16 @@ def autolabel(rects, axs): '%.3f' % float(height), ha='center', va='bottom') -def extracting_common_cnx(d, ind): +def extracting_common_cnx(dictionary, ind): """ Function to create a binary mask representing common connections across the population containing non-zero values. - :param d: Dictionary containing all matrices for all metrics. - :param ind: Indice of the key to use to generate the binary mask from the dictionary. - :return: Binary mask. + :param dictionary: Dictionary with metrics as keys containing list of matrices sorted by ids. + :param ind: Indice of the key to use to generate the binary mask from the dictionary. + :return: Binary mask. """ - keys = list(d.keys()) - met = np.copy(d[keys[ind]]) + keys = list(dictionary.keys()) + met = np.copy(dictionary[keys[ind]]) # Replacing all non-zero values by 1. for i in range(0, len(met)): @@ -161,18 +159,18 @@ def extracting_common_cnx(d, ind): return mask_f -def apply_binary_mask(d, mask): +def apply_binary_mask(dictionary, mask): """ Function to apply a binary mask to all matrices contained in a dictionary. - :param d: Dictionary of all matrices from all metrics. - :param mask: Binary mask. - :return: Dictionary with the same shape as input. + :param dictionary: Dictionary with metrics as keys containing list of matrices sorted by ids. + :param mask: Binary mask. + :return: Dictionary with the same shape as input. """ - for a in d.keys(): - for i in range(0, len(d[f'{a}'])): - d[f'{a}'][i] = np.multiply(d[f'{a}'][i], mask) + for a in dictionary.keys(): + for i in range(0, len(dictionary[a])): + dictionary[a][i] = np.multiply(dictionary[a][i], mask) - return d + return dictionary def main(): @@ -189,28 +187,29 @@ def main(): if args.connectoflow: # Loading all matrix. logging.info('Loading all matrices from a Connectoflow output...') - d = {f'{m}': [load_matrix_in_any_format(f'{args.in_folder}/{a}/Compute_Connectivity/{m}.npy') for a in subjects] - for m in args.metrics} + dictionary = {m: [load_matrix_in_any_format(f'{args.in_folder}/{a}/Compute_Connectivity/{m}.npy') + for a in subjects] + for m in args.metrics} else: logging.info('Loading all matrices...') - d = {f'{m}': [load_matrix_in_any_format(f'{args.in_folder}/{a}_{m}.npy') for a in subjects] - for m in args.metrics} + dictionary = {m: [load_matrix_in_any_format(f'{args.in_folder}/{a}_{m}.npy') for a in subjects] + for m in args.metrics} # Assert that all metrics have the same number of subjects. - nb_sub = [len(d[f'{m}']) for m in args.metrics] + nb_sub = [len(dictionary[m]) for m in args.metrics] assert all(x == len(subjects) for x in nb_sub), "Error, the number of matrices for each metric doesn't match" \ "the number of subject in the id list." \ "Please validate input folder." # Setting individual matrix shape. - mat_shape = d[f'{args.metrics[0]}'][0].shape + mat_shape = dictionary[args.metrics[0]][0].shape if args.not_only_common: # Creating input structure using all edges from all subjects. logging.info('Creating PCA input structure with all edges...') - df = generate_pca_input(d) + df = generate_pca_input(dictionary) else: - m1 = extracting_common_cnx(d, 0) - m2 = extracting_common_cnx(d, 1) + m1 = extracting_common_cnx(dictionary, 0) + m2 = extracting_common_cnx(dictionary, 1) if m1.shape != mat_shape: parser.error("Extracted binary mask doesn't match the shape of individual input matrix.") @@ -221,11 +220,11 @@ def main(): else: logging.info('Data shows {} common connections across the population.'.format(np.sum(m1))) - d = apply_binary_mask(d, m1) + dictionary = apply_binary_mask(dictionary, m1) # Creating input structure. logging.info('Creating PCA input structure with common edges...') - df = generate_pca_input(d) + df = generate_pca_input(dictionary) # Setting 0 values to nan. df[df == 0] = 'nan' From bd090a42dc45fe56fc23619492f55a6e2c36c37b Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Tue, 7 Mar 2023 11:28:00 -0500 Subject: [PATCH 20/22] Fix matplotlib plot display --- scripts/scil_compute_pca.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scripts/scil_compute_pca.py b/scripts/scil_compute_pca.py index 62ac144d2..c89f58b08 100644 --- a/scripts/scil_compute_pca.py +++ b/scripts/scil_compute_pca.py @@ -125,8 +125,12 @@ def autolabel(rects, axs): """ for rect in rects: height = rect.get_height() - axs.text(rect.get_x() + rect.get_width()/2., height*1.05, - '%.3f' % float(height), ha='center', va='bottom') + if height > 0: + axs.text(rect.get_x() + rect.get_width() / 2., (height * 1.05), + '%.3f' % float(height), ha='center', va='bottom') + else: + axs.text(rect.get_x() + rect.get_width()/2., (height*1.05) - 0.15, + '%.3f' % float(height), ha='center', va='bottom') def extracting_common_cnx(dictionary, ind): @@ -253,6 +257,7 @@ def main(): ax.set_xlabel('Principal Components', fontsize=10) ax.set_ylabel('Eigenvalues', fontsize=10) ax.set_title('Eigenvalues for each principal components.', fontsize=10) + ax.margins(0.1) autolabel(bar_eig, ax) plt.savefig(f'{args.out_folder}/eigenvalues.pdf') @@ -266,6 +271,7 @@ def main(): ax.set_xlabel('Principal Components', fontsize=10) ax.set_ylabel('Explained variance', fontsize=10) ax.set_title('Amount of explained variance for all principal components.', fontsize=10) + ax.margins(0.1) autolabel(bar_var, ax) plt.savefig(f'{args.out_folder}/explained_variance.pdf') @@ -279,6 +285,8 @@ def main(): fig.suptitle('Graph of the contribution of each measures to the first and second principal component.', fontsize=10) bar_pc1 = axs[0].bar(pos, component[0], align='center', tick_label=args.metrics) bar_pc2 = axs[1].bar(pos, component[1], align='center', tick_label=args.metrics) + axs[0].margins(0.2) + axs[1].margins(0.2) autolabel(bar_pc1, axs[0]) autolabel(bar_pc2, axs[1]) for ax in axs.flat: From d51bba3dab5abb0200d0826063e8eabb7b47723b Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Tue, 7 Mar 2023 11:30:57 -0500 Subject: [PATCH 21/22] Change --connectoflow flag to --input_connectoflow --- scripts/scil_compute_pca.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/scil_compute_pca.py b/scripts/scil_compute_pca.py index c89f58b08..32313749a 100644 --- a/scripts/scil_compute_pca.py +++ b/scripts/scil_compute_pca.py @@ -6,8 +6,8 @@ (e.g. presenting eigenvalues > 1) in a connectivity matrix format. This script can take into account all edges from every subject in a population or only non-zero edges across all subjects. -The script can take directly as input a connectoflow output folder. Simply use the --connectoflow flag. For -other type of folder input, the script expects a single folder containing all matrices for all subjects. +The script can take directly as input a connectoflow output folder. Simply use the --input_connectoflow flag. +For other type of folder input, the script expects a single folder containing all matrices for all subjects. Example: [in_folder] |--- sub-01_ad.npy @@ -29,7 +29,7 @@ ref [3] for an introduction to PCA). EXAMPLE USAGE: -scil_compute_pca.py input_folder/ output_folder/ --metrics ad fa md rd [...] --list_ids list_ids.txt --common true +scil_compute_pca.py input_folder/ output_folder/ --metrics ad fa md rd [...] --list_ids list_ids.txt """ # Import required libraries. @@ -78,7 +78,7 @@ def _build_arg_parser(): help='Path to a .txt file containing a list of all ids.') p.add_argument('--not_only_common', action='store_true', help='If true, will include all edges from all subjects and not only common edges (Not recommended)') - p.add_argument('--connectoflow', action='store_true', + p.add_argument('--input_connectoflow', action='store_true', help='If true, script will assume the input folder is a Connectoflow output.') add_verbose_arg(p) @@ -188,7 +188,7 @@ def main(): subjects = open(args.list_ids).read().split() - if args.connectoflow: + if args.input_connectoflow: # Loading all matrix. logging.info('Loading all matrices from a Connectoflow output...') dictionary = {m: [load_matrix_in_any_format(f'{args.in_folder}/{a}/Compute_Connectivity/{m}.npy') From 46cd79a4024a5163cb99a7291873a6af4b5b7dbd Mon Sep 17 00:00:00 2001 From: gagnonanthony <79757265+gagnonanthony@users.noreply.github.com> Date: Tue, 7 Mar 2023 11:31:34 -0500 Subject: [PATCH 22/22] Fix test to fit the args change --- scripts/tests/test_compute_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/tests/test_compute_pca.py b/scripts/tests/test_compute_pca.py index 131f3589d..cb5a0ec67 100644 --- a/scripts/tests/test_compute_pca.py +++ b/scripts/tests/test_compute_pca.py @@ -26,5 +26,5 @@ def test_execution_pca(script_runner): 'list_id.txt') ret = script_runner.run('scil_compute_pca.py', input_folder, output_folder, '--metrics', 'ad', 'fa', 'md', 'rd', 'nufo', 'afd_total', 'afd_fixel', '--list_ids', - ids, '--common', 'true', '-f') + ids, '-f') assert ret.success