In [1]:

import pandas as pd
import os
import shutil

def reorganize_data(origin_path: str, sample_sheet_path: str, destination_path: str) -> None:
    """Move downloaded files into a ``<case>/<modality>/`` directory tree.

    The sample sheet is read as a tab-separated file and must provide the
    columns ``Case ID``, ``Data Type``, ``File ID`` and ``File Name``. Each row
    describes one file located at ``origin_path/<File ID>/<File Name>``. The
    file is moved to ``destination_path/<case>/<modality>/``, where ``<case>``
    is the first comma-separated entry of ``Case ID`` and ``<modality>`` is the
    folder mapped to the row's ``Data Type`` (``other`` for unmapped types).

    Files are *moved*, not copied. Failures for individual files or folders are
    reported on stdout and do not stop the run.

    Args:
        origin_path: Directory containing one sub-directory per ``File ID``.
        sample_sheet_path: Path to the tab-separated sample sheet.
        destination_path: Root of the reorganized tree; created if missing.
    """
    # specify the subdirectory name for each modality of interest
    type_extension = {
        "Gene Expression Quantification": "gene_expression",
        "Gene Level Copy Number": "cnv",
        "Slide Image": "images",
        "Methylation Beta Value": "dna_methylation"
    }
    ## Create a directory at the destination
    if not os.path.isdir(destination_path):
        print("Generating new data path.")
        # makedirs rather than mkdir: also creates missing parent directories.
        os.makedirs(destination_path)
    else:
        print("New data path already exists.")
    ## In the new directory, add folders for each case
    # Read cases
    sample_data = pd.read_csv(sample_sheet_path, sep="\t")
    modalities = list(type_extension.values())
    cases = get_cases(sample_data)
    print("Adding case folders.")
    for case in cases:
        path = os.path.join(destination_path, case)
        print("Creating path", path)
        try:
            # exist_ok: a case folder left over from an earlier run is not an error.
            os.makedirs(path, exist_ok=True)
            mk_modalities_folders(path, modalities)
        except OSError as err:
            print("Creation of the directory %s failed" % path)
            print("Reason:", err)
        else:
            print("Successfully created directory %s " % path)
    ## Move every file listed in the sample sheet into its case/modality folder
    count = 0
    for index, row in sample_data.iterrows():
        raw_case = row['Case ID']
        if pd.isna(raw_case):
            # Without a case there is no target folder; report and carry on.
            print("Skipping row", index, "because it has no Case ID")
            continue
        # Only the first comma-separated entry of 'Case ID' names the folder.
        assoc_case = str(raw_case).split(",")[0].strip()
        assoc_type = row['Data Type']
        assoc_file = row['File ID']
        assoc_file_name = row['File Name']
        source = os.path.join(origin_path, assoc_file, assoc_file_name)
        dest = os.path.join(destination_path, assoc_case, type_extension.get(assoc_type, "other"))
        print("Moving", source, "to", dest)
        try:
            # shutil.move only moves *into* dest when dest is an existing
            # directory; otherwise it would rename the file to `dest`. Make
            # sure the folder is there even if its creation failed above.
            os.makedirs(dest, exist_ok=True)
            shutil.move(source, dest)
            count += 1
        except OSError as err:
            # shutil.Error derives from OSError, so this covers missing sources,
            # already-present targets and permission problems, while letting
            # KeyboardInterrupt and programming errors propagate.
            print("Error in moving data from original data to new data")
            print("Filename: " + assoc_case + " - " + assoc_file_name)
            print("Reason:", err)
    print(f"Moved {count} files")

def get_cases(sample_data):
    """Return the unique case identifiers found in a sample sheet.

    Each ``Case ID`` entry may hold several comma-separated values; only the
    first one is used, with surrounding whitespace removed. Missing entries
    are skipped.

    Args:
        sample_data: DataFrame with a ``Case ID`` column.

    Returns:
        Sorted list of unique case identifiers (strings). Sorting makes the
        result, and anything driven by it, reproducible between runs.
    """
    # dropna: a missing Case ID is a float NaN and has no .split().
    raw_case_ids = sample_data['Case ID'].dropna().tolist()
    return sorted({str(case).split(",")[0].strip() for case in raw_case_ids})

def mk_modalities_folders(path, modalities):
    """Create one sub-folder per modality, plus an ``other`` folder, under ``path``.

    The call is idempotent: folders that already exist are left untouched, so
    it can be repeated to complete a partially created tree, and duplicate
    names in ``modalities`` are harmless. Missing parent directories of
    ``path`` are created as well.

    Args:
        path: Directory that will contain the modality folders.
        modalities: Iterable of folder names to create.

    Raises:
        OSError: If a folder cannot be created, e.g. a regular file with the
            same name is in the way (``FileExistsError``) or permissions are
            insufficient.
    """
    # "other" is always created in addition to the requested modalities.
    for folder in [*modalities, "other"]:
        os.makedirs(os.path.join(path, folder), exist_ok=True)


In [2]:
# Paths for this run. The leading "..." is presumably a placeholder for the
# location of the local download -- replace it before executing this cell.
# origin_path: directory searched for <File ID>/<File Name> entries.
origin_path = ".../TCGA/"
# sample_sheet_path: tab-separated sample sheet read by reorganize_data.
sample_sheet_path = ".../TCGA/gdc_sample_sheet.DATE.tsv" # DATE is a placeholder for the download date, so the file name differs for every download
# destination_path: root of the per-case tree; created if it does not exist.
destination_path = ".../TCGA/data_by_cases"
# NOTE: this call moves (does not copy) the listed files out of origin_path.
reorganize_data(origin_path = origin_path, sample_sheet_path = sample_sheet_path, destination_path = destination_path)