In [1]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = None
from openslide import open_slide
import glob

In [2]:
def filter_files(x, slides):
    """Tell whether a sample-sheet file name should be kept for a slide type.

    Args:
        x: File name to inspect.
        slides: ``'ffpe'`` keeps names containing ``'-DX'``; ``'ff'`` keeps
            names containing ``'-TS'``. Names containing ``'rna_seq'`` are
            kept for both.

    Returns:
        ``True`` or ``False`` for the two recognised slide types, ``None``
        for any other value of ``slides``.
    """
    if slides == 'ffpe':
        slide_marker = '-DX'
    elif slides == 'ff':
        slide_marker = '-TS'
    else:
        # Unrecognised slide type: no decision is made about the file.
        return None

    return (slide_marker in x) or ('rna_seq' in x)


def has_magnif(image_path):
    """Check whether a slide reports an ``aperio.AppMag`` of "40" or "20".

    Args:
        image_path: Path of the slide file, passed to ``open_slide``.

    Returns:
        ``True`` when the slide's ``aperio.AppMag`` property is exactly the
        string ``"40"`` or ``"20"``; ``False`` otherwise, including when the
        property is missing.
    """
    img = open_slide(image_path)
    try:
        # NOTE(review): the comparison is on exact strings, so a value such as
        # "40.0" would not match; confirm the format the slides actually use.
        magnification = img.properties.get("aperio.AppMag", None)
    finally:
        # This function is called once per slide; close each slide explicitly
        # rather than leaving every opened slide to the garbage collector.
        img.close()
    return magnification in ["40", "20"]

def get_slide_orientation(x):
    """Label a file name by the first known marker it contains.

    Args:
        x: File name to inspect.

    Returns:
        ``'TS'``, ``'BS'``, ``'RNA'`` or ``'DX'`` depending on which marker
        is found, or ``np.nan`` when none of them is present.
    """
    # Markers are tested in this order; the first one present wins.
    labelled_markers = (
        ('-TS', 'TS'),
        ('-BS', 'BS'),
        ('rna_seq', 'RNA'),
        ('-DX', 'DX'),
    )
    for marker, label in labelled_markers:
        if marker in x:
            return label
    return np.nan

In [3]:
# Which slide files to pair with the RNA-seq files: with 'ffpe', filter_files
# keeps file names containing '-DX'; with 'ff', it keeps those containing '-TS'.
slide_type = 'ffpe' # or 'ff'
# Base name of the sample sheet; it is read from data/<sample_sheet>.tsv.
sample_sheet = 'gdc_sample_sheet.2024-10-29'

In [4]:
# Read the tab-separated sample sheet, then keep only the rows whose file name
# filter_files accepts for the configured slide_type.
df = pd.read_csv(f'data/{sample_sheet}.tsv', sep='\t')
df = df[df['File Name'].apply(filter_files, slides=slide_type)]
df

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type
2,466888ef-7b80-471e-822c-bf355ef474b8,c61e31b4-5c74-4d86-97cb-16be49b8a5b1.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-B0-4815,TCGA-B0-4815-01A,Primary Tumor
5,9011fb8f-0cc0-443e-bac4-38bf21425dc9,TCGA-B0-5088-01Z-00-DX1.69bb79f8-33cc-4c9c-be6...,Biospecimen,Slide Image,TCGA-KIRC,TCGA-B0-5088,TCGA-B0-5088-01Z,Primary Tumor
9,3f6b7888-c7c3-4e6a-8d54-364ad608e8d0,TCGA-B0-4712-01Z-00-DX1.b584d650-4bdd-452c-992...,Biospecimen,Slide Image,TCGA-KIRC,TCGA-B0-4712,TCGA-B0-4712-01Z,Primary Tumor
10,7f16d20f-07e2-4901-ae00-4da70125e8a3,0cf6e899-af6e-44ac-a6c7-5f3fa68ea255.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-B0-4712,TCGA-B0-4712-01A,Primary Tumor
11,0498e4e5-f3cc-4fa5-9686-099ab9e435f4,59fe9111-5941-4230-9795-7df8064015a5.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-B0-4712,TCGA-B0-4712-11A,Solid Tissue Normal
...,...,...,...,...,...,...,...,...
2779,f98a1953-c008-4ab9-a17e-be80b5fd43fe,e3b90eb7-f2a5-4b9e-a15e-ccfd7e6431d2.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-B0-5104,TCGA-B0-5104-01A,Primary Tumor
2780,46495ef2-6d1c-4f0a-834a-0d7bb26a26aa,TCGA-BP-4176-01Z-00-DX1.e6cbc6db-6340-4bca-b5c...,Biospecimen,Slide Image,TCGA-KIRC,TCGA-BP-4176,TCGA-BP-4176-01Z,Primary Tumor
2781,7a3be000-2dbb-49da-9293-802a720ca1dc,TCGA-BP-4790-01Z-00-DX1.3b9a2273-c24f-4a25-b74...,Biospecimen,Slide Image,TCGA-KIRC,TCGA-BP-4790,TCGA-BP-4790-01Z,Primary Tumor
2784,8e95a2b4-3110-4dd6-b3c9-b3e83c563282,c6dca0c8-bd9d-4299-9d18-a1054fe15aeb.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-CW-6087,TCGA-CW-6087-01A,Primary Tumor


In [5]:
# For each ID, count how many distinct 'Data Type' values it appears with:
# the first value_counts() collapses duplicate (ID, Data Type) rows, the second
# counts the remaining rows per ID.
if slide_type == 'ffpe':
    paired_samples = (
        df[['Case ID', 'Data Type']]
        .value_counts()
        .reset_index()
        .value_counts('Case ID')
        .reset_index()
    )
elif slide_type == 'ff':
    paired_samples = (
        df[['Sample ID', 'Data Type']]
        .value_counts()
        .reset_index()
        .value_counts('Sample ID')
        .reset_index()
    )

# Keep only the IDs that appear with more than one data type.
paired_samples = paired_samples[paired_samples['count'].gt(1)]
paired_samples

Unnamed: 0,Case ID,count
0,TCGA-T7-A92I,2
1,TCGA-3Z-A93Z,2
2,TCGA-6D-AA2E,2
3,TCGA-A3-3306,2
4,TCGA-A3-3307,2
...,...,...
505,TCGA-A3-3349,2
506,TCGA-A3-3351,2
507,TCGA-A3-3352,2
508,TCGA-A3-3357,2


In [6]:
# Keep only the sample-sheet rows whose ID was retained in paired_samples.
if slide_type == 'ffpe':
    df = df[df['Case ID'].isin(paired_samples['Case ID'].to_numpy())]
elif slide_type == 'ff':
    df = df[df['Sample ID'].isin(paired_samples['Sample ID'].to_numpy())]
df

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type
2,466888ef-7b80-471e-822c-bf355ef474b8,c61e31b4-5c74-4d86-97cb-16be49b8a5b1.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-B0-4815,TCGA-B0-4815-01A,Primary Tumor
5,9011fb8f-0cc0-443e-bac4-38bf21425dc9,TCGA-B0-5088-01Z-00-DX1.69bb79f8-33cc-4c9c-be6...,Biospecimen,Slide Image,TCGA-KIRC,TCGA-B0-5088,TCGA-B0-5088-01Z,Primary Tumor
9,3f6b7888-c7c3-4e6a-8d54-364ad608e8d0,TCGA-B0-4712-01Z-00-DX1.b584d650-4bdd-452c-992...,Biospecimen,Slide Image,TCGA-KIRC,TCGA-B0-4712,TCGA-B0-4712-01Z,Primary Tumor
10,7f16d20f-07e2-4901-ae00-4da70125e8a3,0cf6e899-af6e-44ac-a6c7-5f3fa68ea255.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-B0-4712,TCGA-B0-4712-01A,Primary Tumor
11,0498e4e5-f3cc-4fa5-9686-099ab9e435f4,59fe9111-5941-4230-9795-7df8064015a5.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-B0-4712,TCGA-B0-4712-11A,Solid Tissue Normal
...,...,...,...,...,...,...,...,...
2774,73d97d28-fb60-4380-8ba0-34e79da74af2,fb312dd9-da8b-4190-999e-37cad1d683d5.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-CZ-5988,TCGA-CZ-5988-11A,Solid Tissue Normal
2776,d13023f5-3316-406f-bd25-346f932d324c,TCGA-CJ-4889-01Z-00-DX1.A584930A-586C-4CD9-980...,Biospecimen,Slide Image,TCGA-KIRC,TCGA-CJ-4889,TCGA-CJ-4889-01Z,Primary Tumor
2779,f98a1953-c008-4ab9-a17e-be80b5fd43fe,e3b90eb7-f2a5-4b9e-a15e-ccfd7e6431d2.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-B0-5104,TCGA-B0-5104-01A,Primary Tumor
2780,46495ef2-6d1c-4f0a-834a-0d7bb26a26aa,TCGA-BP-4176-01Z-00-DX1.e6cbc6db-6340-4bca-b5c...,Biospecimen,Slide Image,TCGA-KIRC,TCGA-BP-4176,TCGA-BP-4176-01Z,Primary Tumor


In [7]:
# Build one metadata row per (slide image, RNA-seq file) pair that shares the
# same pairing ID and the same 'Sample Type'.
if slide_type == 'ffpe':
    iterate_over = 'Case ID'
elif slide_type == 'ff':
    iterate_over = 'Sample ID'
else:
    # Fail fast instead of silently reusing a stale `iterate_over` left in the
    # kernel by a previous run.
    raise ValueError(f"Unsupported slide_type {slide_type!r}; expected 'ffpe' or 'ff'")

# Accumulate plain rows under their own name so `metadata` only ever refers to
# the final DataFrame.
pair_rows = []

# groupby(sort=False) yields the IDs in order of first appearance and keeps the
# original row order inside each group, so the pairs come out in the same order
# as filtering `df` once per unique ID, without rescanning the whole frame for
# every ID.
for iterate_over_id, tab in df.groupby(iterate_over, sort=False):
    tab_rna = tab[tab['Data Type'] == 'Gene Expression Quantification']
    tab_slide = tab[tab['Data Type'] == 'Slide Image']

    # Cross every RNA-seq row with every slide row of the same ID, keeping
    # only the combinations whose sample types agree.
    for _, row_rna in tab_rna.iterrows():
        for _, row_slide in tab_slide.iterrows():
            if row_rna['Sample Type'] == row_slide['Sample Type']:
                pair_rows.append([row_slide['File Name'], row_rna['File Name'], row_rna['Case ID'],
                                  row_slide['Sample ID'], row_rna['Sample ID'], row_slide['Sample Type']])

metadata = pd.DataFrame(pair_rows, columns=['image_path', 'rna_path', 'case_id', 'sample_slide_id', 'sample_rna_id', 'sample_type'])
metadata

Unnamed: 0,image_path,rna_path,case_id,sample_slide_id,sample_rna_id,sample_type
0,TCGA-B0-4815-01Z-00-DX1.dd230cfa-5952-4fe7-b73...,c61e31b4-5c74-4d86-97cb-16be49b8a5b1.rna_seq.a...,TCGA-B0-4815,TCGA-B0-4815-01Z,TCGA-B0-4815-01A,Primary Tumor
1,TCGA-B0-5088-01Z-00-DX1.69bb79f8-33cc-4c9c-be6...,4ab7d102-f5eb-4922-be66-9b516bebaf0a.rna_seq.a...,TCGA-B0-5088,TCGA-B0-5088-01Z,TCGA-B0-5088-01A,Primary Tumor
2,TCGA-B0-4712-01Z-00-DX1.b584d650-4bdd-452c-992...,0cf6e899-af6e-44ac-a6c7-5f3fa68ea255.rna_seq.a...,TCGA-B0-4712,TCGA-B0-4712-01Z,TCGA-B0-4712-01A,Primary Tumor
3,TCGA-BP-4334-01Z-00-DX1.68cf009b-04da-4173-98d...,98967392-eeb4-4c7f-8578-1faf0e5dc3d6.rna_seq.a...,TCGA-BP-4334,TCGA-BP-4334-01Z,TCGA-BP-4334-01A,Primary Tumor
4,TCGA-CJ-6030-01Z-00-DX1.A762AB76-62E5-4680-991...,21273777-9444-4ea4-9e68-9ff6303e9997.rna_seq.a...,TCGA-CJ-6030,TCGA-CJ-6030-01Z,TCGA-CJ-6030-01A,Primary Tumor
...,...,...,...,...,...,...
519,TCGA-BP-4165-01Z-00-DX1.1f8b54d9-1285-4226-a8f...,e328dc67-2ba2-408a-a137-75792cc02e52.rna_seq.a...,TCGA-BP-4165,TCGA-BP-4165-01Z,TCGA-BP-4165-01A,Primary Tumor
520,TCGA-B0-5081-01Z-00-DX1.2fed8120-8e9c-4a23-bd1...,4b04816b-80fa-410b-95d7-9b1f815b59e7.rna_seq.a...,TCGA-B0-5081,TCGA-B0-5081-01Z,TCGA-B0-5081-01A,Primary Tumor
521,TCGA-A3-3367-01Z-00-DX1.9fca6a91-bb71-4632-8b2...,acffe97d-bcaf-4c3c-877f-a48d9f5e81db.rna_seq.a...,TCGA-A3-3367,TCGA-A3-3367-01Z,TCGA-A3-3367-01A,Primary Tumor
522,TCGA-A3-3317-01Z-00-DX1.ca503755-bfa6-462b-b37...,0416ba1c-a9d5-4e25-a95b-7cb73bfeeb65.rna_seq.a...,TCGA-A3-3317,TCGA-A3-3317-01Z,TCGA-A3-3317-01A,Primary Tumor


In [8]:
# Label each pair by the marker found in its slide file name, then show how
# many pairs carry each label.
metadata['data_type_info'] = metadata['image_path'].apply(get_slide_orientation)
metadata['data_type_info'].value_counts()

data_type_info
DX    524
Name: count, dtype: int64

In [9]:
# Drop pairs whose files cannot be found two directory levels below data/:
# first require the RNA-seq file, then the slide image. The trailing '*' also
# accepts names that merely start with the expected file name.
for path_field in ('rna_path', 'image_path'):
    metadata = metadata[
        metadata.apply(
            lambda row: bool(glob.glob(f"data/*/*/{getattr(row, path_field)}*")),
            axis=1,
        )
    ]
metadata

Unnamed: 0,image_path,rna_path,case_id,sample_slide_id,sample_rna_id,sample_type,data_type_info
0,TCGA-B0-4815-01Z-00-DX1.dd230cfa-5952-4fe7-b73...,c61e31b4-5c74-4d86-97cb-16be49b8a5b1.rna_seq.a...,TCGA-B0-4815,TCGA-B0-4815-01Z,TCGA-B0-4815-01A,Primary Tumor,DX
1,TCGA-B0-5088-01Z-00-DX1.69bb79f8-33cc-4c9c-be6...,4ab7d102-f5eb-4922-be66-9b516bebaf0a.rna_seq.a...,TCGA-B0-5088,TCGA-B0-5088-01Z,TCGA-B0-5088-01A,Primary Tumor,DX
2,TCGA-B0-4712-01Z-00-DX1.b584d650-4bdd-452c-992...,0cf6e899-af6e-44ac-a6c7-5f3fa68ea255.rna_seq.a...,TCGA-B0-4712,TCGA-B0-4712-01Z,TCGA-B0-4712-01A,Primary Tumor,DX
3,TCGA-BP-4334-01Z-00-DX1.68cf009b-04da-4173-98d...,98967392-eeb4-4c7f-8578-1faf0e5dc3d6.rna_seq.a...,TCGA-BP-4334,TCGA-BP-4334-01Z,TCGA-BP-4334-01A,Primary Tumor,DX
4,TCGA-CJ-6030-01Z-00-DX1.A762AB76-62E5-4680-991...,21273777-9444-4ea4-9e68-9ff6303e9997.rna_seq.a...,TCGA-CJ-6030,TCGA-CJ-6030-01Z,TCGA-CJ-6030-01A,Primary Tumor,DX
...,...,...,...,...,...,...,...
519,TCGA-BP-4165-01Z-00-DX1.1f8b54d9-1285-4226-a8f...,e328dc67-2ba2-408a-a137-75792cc02e52.rna_seq.a...,TCGA-BP-4165,TCGA-BP-4165-01Z,TCGA-BP-4165-01A,Primary Tumor,DX
520,TCGA-B0-5081-01Z-00-DX1.2fed8120-8e9c-4a23-bd1...,4b04816b-80fa-410b-95d7-9b1f815b59e7.rna_seq.a...,TCGA-B0-5081,TCGA-B0-5081-01Z,TCGA-B0-5081-01A,Primary Tumor,DX
521,TCGA-A3-3367-01Z-00-DX1.9fca6a91-bb71-4632-8b2...,acffe97d-bcaf-4c3c-877f-a48d9f5e81db.rna_seq.a...,TCGA-A3-3367,TCGA-A3-3367-01Z,TCGA-A3-3367-01A,Primary Tumor,DX
522,TCGA-A3-3317-01Z-00-DX1.ca503755-bfa6-462b-b37...,0416ba1c-a9d5-4e25-a95b-7cb73bfeeb65.rna_seq.a...,TCGA-A3-3317,TCGA-A3-3317-01Z,TCGA-A3-3317-01A,Primary Tumor,DX


In [10]:
# Run has_magnif on the first on-disk match of every slide image, then count
# how many slides fail the check.
is_magnif = metadata.apply(
    lambda row: has_magnif(glob.glob(f"data/*/*/{row.image_path}*")[0]),
    axis=1,
)
(~is_magnif).sum()

0

In [11]:
# Keep only the pairs whose slide passed the has_magnif check stored in is_magnif.
metadata = metadata[is_magnif]
metadata

Unnamed: 0,image_path,rna_path,case_id,sample_slide_id,sample_rna_id,sample_type,data_type_info
0,TCGA-B0-4815-01Z-00-DX1.dd230cfa-5952-4fe7-b73...,c61e31b4-5c74-4d86-97cb-16be49b8a5b1.rna_seq.a...,TCGA-B0-4815,TCGA-B0-4815-01Z,TCGA-B0-4815-01A,Primary Tumor,DX
1,TCGA-B0-5088-01Z-00-DX1.69bb79f8-33cc-4c9c-be6...,4ab7d102-f5eb-4922-be66-9b516bebaf0a.rna_seq.a...,TCGA-B0-5088,TCGA-B0-5088-01Z,TCGA-B0-5088-01A,Primary Tumor,DX
2,TCGA-B0-4712-01Z-00-DX1.b584d650-4bdd-452c-992...,0cf6e899-af6e-44ac-a6c7-5f3fa68ea255.rna_seq.a...,TCGA-B0-4712,TCGA-B0-4712-01Z,TCGA-B0-4712-01A,Primary Tumor,DX
3,TCGA-BP-4334-01Z-00-DX1.68cf009b-04da-4173-98d...,98967392-eeb4-4c7f-8578-1faf0e5dc3d6.rna_seq.a...,TCGA-BP-4334,TCGA-BP-4334-01Z,TCGA-BP-4334-01A,Primary Tumor,DX
4,TCGA-CJ-6030-01Z-00-DX1.A762AB76-62E5-4680-991...,21273777-9444-4ea4-9e68-9ff6303e9997.rna_seq.a...,TCGA-CJ-6030,TCGA-CJ-6030-01Z,TCGA-CJ-6030-01A,Primary Tumor,DX
...,...,...,...,...,...,...,...
519,TCGA-BP-4165-01Z-00-DX1.1f8b54d9-1285-4226-a8f...,e328dc67-2ba2-408a-a137-75792cc02e52.rna_seq.a...,TCGA-BP-4165,TCGA-BP-4165-01Z,TCGA-BP-4165-01A,Primary Tumor,DX
520,TCGA-B0-5081-01Z-00-DX1.2fed8120-8e9c-4a23-bd1...,4b04816b-80fa-410b-95d7-9b1f815b59e7.rna_seq.a...,TCGA-B0-5081,TCGA-B0-5081-01Z,TCGA-B0-5081-01A,Primary Tumor,DX
521,TCGA-A3-3367-01Z-00-DX1.9fca6a91-bb71-4632-8b2...,acffe97d-bcaf-4c3c-877f-a48d9f5e81db.rna_seq.a...,TCGA-A3-3367,TCGA-A3-3367-01Z,TCGA-A3-3367-01A,Primary Tumor,DX
522,TCGA-A3-3317-01Z-00-DX1.ca503755-bfa6-462b-b37...,0416ba1c-a9d5-4e25-a95b-7cb73bfeeb65.rna_seq.a...,TCGA-A3-3317,TCGA-A3-3317-01Z,TCGA-A3-3317-01A,Primary Tumor,DX


In [12]:
# Number the remaining pairs 0..n-1. `metadata` may be the result of boolean
# filtering, and adding a column to such a frame in place can raise
# SettingWithCopyWarning; assign() returns a new frame instead.
metadata = metadata.assign(id_pair=np.arange(len(metadata)))

In [13]:
# Write the pairing table to data/metadata_<slide_type>.csv, leaving out the
# DataFrame index.
metadata.to_csv(f'data/metadata_{slide_type}.csv', index=False)