In [1]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = None
from openslide import open_slide
import glob

In [2]:
def filter_files(x, slides):
    """Decide whether a file name belongs to the requested slide selection.

    Args:
        x: File name to test (substring checks are performed on it).
        slides: Selection mode. 'ffpe' accepts names containing '-DX',
            'ff' accepts names containing '-TS'. In both modes any name
            containing 'rna_seq' is accepted as well.

    Returns:
        True or False for the two known modes, and None when ``slides`` is
        neither 'ffpe' nor 'ff'.
    """
    if slides == 'ffpe':
        slide_marker = '-DX'
    elif slides == 'ff':
        slide_marker = '-TS'
    else:
        # Unknown mode: no decision is made.
        return None

    # RNA-seq files are kept regardless of the slide mode.
    return slide_marker in x or 'rna_seq' in x


def has_magnif(image_path):
    """Report whether a slide declares a 40x or 20x Aperio magnification.

    Args:
        image_path: Path of the slide file handed to ``open_slide``.

    Returns:
        True when the slide's "aperio.AppMag" property is exactly the string
        "40" or "20"; False otherwise, including when the property is absent.
    """
    # Use the slide as a context manager so its handle is released after the
    # property lookup; this function is applied to every slide in the table.
    with open_slide(image_path) as img:
        magnification = img.properties.get("aperio.AppMag", None)
    return magnification in ["40", "20"]

def get_slide_orientation(x):
    """Map a file name to a short label based on the marker it contains.

    Markers are tested in a fixed order and the first one found wins:
    '-TS' -> 'TS', '-BS' -> 'BS', 'rna_seq' -> 'RNA', '-DX' -> 'DX'.

    Args:
        x: File name to inspect.

    Returns:
        The matching label, or ``np.nan`` when no marker occurs in ``x``.
    """
    marker_labels = (
        ('-TS', 'TS'),
        ('-BS', 'BS'),
        ('rna_seq', 'RNA'),
        ('-DX', 'DX'),
    )
    for marker, label in marker_labels:
        if marker in x:
            return label
    return np.nan

In [3]:
# Slide selection mode used throughout the notebook: 'ffpe' keeps slide file
# names containing '-DX'; the alternative value 'ff' keeps names containing '-TS'.
slide_type = 'ffpe' # ff
# Base name (without the .tsv extension) of the sample sheet read from data/.
sample_sheet = 'gdc_sample_sheet.2024-08-01'

In [4]:
# Read the tab-separated sample sheet and keep only the rows whose file name
# passes filter_files for the configured slide_type.
df = pd.read_csv(f'data/{sample_sheet}.tsv', sep='\t').loc[
    lambda sheet: sheet['File Name'].apply(filter_files, slides=slide_type)
]
df

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type
0,080d3a20-1685-4a9a-904a-749a9d92da0c,88c46445-0117-4051-829d-202a13d08fe5.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-SKCM,TCGA-ER-A199,TCGA-ER-A199-06A,Metastatic
1,c8e9936f-4ff9-4c80-86de-f0f5f10cfc45,TCGA-EE-A3J5-06Z-00-DX1.47202780-2B18-4661-AD4...,Biospecimen,Slide Image,TCGA-SKCM,TCGA-EE-A3J5,TCGA-EE-A3J5-06Z,Metastatic
3,d076714e-3a5a-479e-81d5-2a377d3e37ea,05d365ce-b1d0-46af-b74a-b254f2c7de8e.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-SKCM,TCGA-EE-A3J5,TCGA-EE-A3J5-06A,Metastatic
5,94f73ca7-d3e5-412f-ad44-00256eb80065,6dfa748c-af43-469f-a98b-8472a4132739.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-SKCM,TCGA-EE-A2ME,TCGA-EE-A2ME-06A,Metastatic
7,8ffd5617-149a-479e-a7d3-81c49c067fee,5467227e-26db-4241-9b21-0dd8ee39fcee.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-SKCM,TCGA-BF-A5ES,TCGA-BF-A5ES-01A,Primary Tumor
...,...,...,...,...,...,...,...,...
1416,32c1be5e-e833-43f5-a515-75b6e54dbefb,22e9b9d1-ac71-43f5-8606-f9f53c6ecc08.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-SKCM,TCGA-ER-A2NB,TCGA-ER-A2NB-01A,Primary Tumor
1417,930d620c-24ae-4f3b-9dd5-01cb08735763,TCGA-ER-A2NH-01Z-00-DX1.8E2DEC74-D875-4420-B57...,Biospecimen,Slide Image,TCGA-SKCM,TCGA-ER-A2NH,TCGA-ER-A2NH-01Z,Primary Tumor
1418,98b164c8-e171-4344-9822-04088aed7705,fe4a0710-a6db-4735-9d1a-113df310def6.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-SKCM,TCGA-FS-A4F0,TCGA-FS-A4F0-06A,Metastatic
1419,969cf1c6-52d1-4353-b712-68c1c5e0d051,TCGA-GN-A262-01Z-00-DX1.64FB584F-24C0-4311-A75...,Biospecimen,Slide Image,TCGA-SKCM,TCGA-GN-A262,TCGA-GN-A262-01Z,Primary Tumor


In [5]:
# Count, per grouping id, how many distinct 'Data Type' values it has:
# the first value_counts collapses duplicates of (id, data type), the second
# counts the remaining rows per id. Grouping is by case for 'ffpe' and by
# sample for 'ff'.
if slide_type in ('ffpe', 'ff'):
    pair_key = 'Case ID' if slide_type == 'ffpe' else 'Sample ID'
    paired_samples = (
        df[[pair_key, 'Data Type']]
        .value_counts()
        .reset_index()
        .value_counts(pair_key)
        .reset_index()
    )
# Keep only ids that appear with more than one data type.
paired_samples = paired_samples[paired_samples['count'] > 1]
paired_samples

Unnamed: 0,Case ID,count
0,TCGA-Z2-AA3V,2
1,TCGA-3N-A9WB,2
2,TCGA-3N-A9WC,2
3,TCGA-3N-A9WD,2
4,TCGA-BF-A1PU,2
...,...,...
427,TCGA-BF-AAP1,2
428,TCGA-BF-AAP2,2
429,TCGA-BF-AAP4,2
430,TCGA-BF-AAP6,2


In [6]:
# Restrict the sample sheet to rows whose grouping id (case for 'ffpe',
# sample for 'ff') is listed in paired_samples. Any other slide_type leaves
# df unchanged.
if slide_type in ('ffpe', 'ff'):
    pair_key = 'Case ID' if slide_type == 'ffpe' else 'Sample ID'
    df = df[df[pair_key].isin(paired_samples[pair_key].values)]
df

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type
1,c8e9936f-4ff9-4c80-86de-f0f5f10cfc45,TCGA-EE-A3J5-06Z-00-DX1.47202780-2B18-4661-AD4...,Biospecimen,Slide Image,TCGA-SKCM,TCGA-EE-A3J5,TCGA-EE-A3J5-06Z,Metastatic
3,d076714e-3a5a-479e-81d5-2a377d3e37ea,05d365ce-b1d0-46af-b74a-b254f2c7de8e.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-SKCM,TCGA-EE-A3J5,TCGA-EE-A3J5-06A,Metastatic
5,94f73ca7-d3e5-412f-ad44-00256eb80065,6dfa748c-af43-469f-a98b-8472a4132739.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-SKCM,TCGA-EE-A2ME,TCGA-EE-A2ME-06A,Metastatic
8,90cb69cb-502b-429e-be32-6378cb5d1bee,102d912c-5882-4ae4-9eaf-cbd2f55f66af.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-SKCM,TCGA-D3-A8GE,TCGA-D3-A8GE-06A,Metastatic
9,3e0e36e7-2388-4076-9f0d-cc4462a1cec2,TCGA-DA-A1I5-01Z-00-DX1.ECAD6ABE-503B-4C8F-A96...,Biospecimen,Slide Image,TCGA-SKCM,TCGA-DA-A1I5,TCGA-DA-A1I5-01Z,Primary Tumor
...,...,...,...,...,...,...,...,...
1415,95e466b9-f09c-42e1-b23c-844aa3e644c7,8ed1ae0f-90db-445c-9805-f8ae9b55da80.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-SKCM,TCGA-ER-A2NG,TCGA-ER-A2NG-06A,Metastatic
1416,32c1be5e-e833-43f5-a515-75b6e54dbefb,22e9b9d1-ac71-43f5-8606-f9f53c6ecc08.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-SKCM,TCGA-ER-A2NB,TCGA-ER-A2NB-01A,Primary Tumor
1417,930d620c-24ae-4f3b-9dd5-01cb08735763,TCGA-ER-A2NH-01Z-00-DX1.8E2DEC74-D875-4420-B57...,Biospecimen,Slide Image,TCGA-SKCM,TCGA-ER-A2NH,TCGA-ER-A2NH-01Z,Primary Tumor
1419,969cf1c6-52d1-4353-b712-68c1c5e0d051,TCGA-GN-A262-01Z-00-DX1.64FB584F-24C0-4311-A75...,Biospecimen,Slide Image,TCGA-SKCM,TCGA-GN-A262,TCGA-GN-A262-01Z,Primary Tumor


In [7]:
# Build the table of (slide image, RNA-seq file) pairs.
# Rows are grouped by case for 'ffpe' and by sample for 'ff'; within a group
# every RNA-seq row is paired with every slide row that has the same
# 'Sample Type'.
if slide_type == 'ffpe':
    iterate_over = 'Case ID'
elif slide_type == 'ff':
    iterate_over = 'Sample ID'
else:
    # Fail loudly instead of reusing a stale iterate_over from an earlier run.
    raise ValueError(f"Unsupported slide_type: {slide_type!r} (expected 'ffpe' or 'ff')")

pair_columns = ['image_path', 'rna_path', 'case_id', 'sample_slide_id', 'sample_rna_id', 'sample_type']
pair_rows = []

# A single groupby pass replaces re-filtering the whole frame once per id.
# sort=False keeps groups in order of first appearance, i.e. the same order
# as iterating over df[iterate_over].unique().
for _, tab in df.groupby(iterate_over, sort=False):
    tab_rna = tab[tab['Data Type'] == 'Gene Expression Quantification']
    tab_slide = tab[tab['Data Type'] == 'Slide Image']

    for _, row_rna in tab_rna.iterrows():
        for _, row_slide in tab_slide.iterrows():
            if row_rna['Sample Type'] == row_slide['Sample Type']:
                pair_rows.append([row_slide['File Name'], row_rna['File Name'], row_rna['Case ID'],
                                  row_slide['Sample ID'], row_rna['Sample ID'], row_slide['Sample Type']])

metadata = pd.DataFrame(pair_rows, columns=pair_columns)
metadata

Unnamed: 0,image_path,rna_path,case_id,sample_slide_id,sample_rna_id,sample_type
0,TCGA-EE-A3J5-06Z-00-DX1.47202780-2B18-4661-AD4...,05d365ce-b1d0-46af-b74a-b254f2c7de8e.rna_seq.a...,TCGA-EE-A3J5,TCGA-EE-A3J5-06Z,TCGA-EE-A3J5-06A,Metastatic
1,TCGA-D3-A8GE-06Z-00-DX1.757AE9F7-823E-4167-B12...,102d912c-5882-4ae4-9eaf-cbd2f55f66af.rna_seq.a...,TCGA-D3-A8GE,TCGA-D3-A8GE-06Z,TCGA-D3-A8GE-06A,Metastatic
2,TCGA-D3-A1Q4-06Z-00-DX1.6AD7BBBD-BB47-4D71-B46...,ee3e7969-ff2a-4b27-8f15-6cb46eed7c61.rna_seq.a...,TCGA-D3-A1Q4,TCGA-D3-A1Q4-06Z,TCGA-D3-A1Q4-06A,Metastatic
3,TCGA-D3-A2JF-06Z-00-DX1.1AD134CC-6844-45CC-BEC...,39927c92-75d7-424c-9bcd-63def0075a1d.rna_seq.a...,TCGA-D3-A2JF,TCGA-D3-A2JF-06Z,TCGA-D3-A2JF-06A,Metastatic
4,TCGA-EB-A3XF-01Z-00-DX1.D381AAB4-242B-45E1-B6A...,393a470f-8641-493a-98fe-e0ec38c8d7f9.rna_seq.a...,TCGA-EB-A3XF,TCGA-EB-A3XF-01Z,TCGA-EB-A3XF-01A,Primary Tumor
...,...,...,...,...,...,...
271,TCGA-XV-AAZW-01Z-00-DX1.26C215F6-0EFA-42D9-A3E...,0e8bcafe-b201-4c66-8a0b-d89a4fd7c7ae.rna_seq.a...,TCGA-XV-AAZW,TCGA-XV-AAZW-01Z,TCGA-XV-AAZW-01A,Primary Tumor
272,TCGA-EB-A3XD-01Z-00-DX1.B0C8A3FE-21A5-4807-934...,7645a3b1-639d-4677-8bcf-c16df24fcc2b.rna_seq.a...,TCGA-EB-A3XD,TCGA-EB-A3XD-01Z,TCGA-EB-A3XD-01A,Primary Tumor
273,TCGA-ER-A2NB-01Z-00-DX1.323F02C6-D07A-41B6-BEE...,22e9b9d1-ac71-43f5-8606-f9f53c6ecc08.rna_seq.a...,TCGA-ER-A2NB,TCGA-ER-A2NB-01Z,TCGA-ER-A2NB-01A,Primary Tumor
274,TCGA-D3-A3C8-06Z-00-DX1.FE6A00E4-C1B4-42D8-9B0...,4ff81b7f-2dbc-453f-a68b-fe40315b2ef7.rna_seq.a...,TCGA-D3-A3C8,TCGA-D3-A3C8-06Z,TCGA-D3-A3C8-06A,Metastatic


In [8]:
# Label each pair with the marker found in its slide file name, then tally
# how many pairs carry each label.
metadata['data_type_info'] = metadata['image_path'].apply(get_slide_orientation)
metadata['data_type_info'].value_counts()

data_type_info
DX    276
Name: count, dtype: int64

In [None]:
# Keep only pairs whose files can be found on disk: first require a match for
# the RNA file name, then for the slide image name, two directories below data/.
for path_column in ('rna_path', 'image_path'):
    found_on_disk = metadata.apply(
        lambda row: bool(glob.glob(f"data/*/*/{row[path_column]}*")), axis=1
    )
    metadata = metadata[found_on_disk]
metadata

In [None]:
# For every pair, run has_magnif on the first file matching the slide name,
# then count the slides that fail the check.
is_magnif = metadata.apply(
    lambda pair: has_magnif(glob.glob(f"data/*/*/{pair.image_path}*")[0]),
    axis=1,
)
(~is_magnif).sum()

In [None]:
# Keep only the pairs flagged in is_magnif (slides that passed has_magnif).
metadata = metadata[is_magnif]
metadata

In [None]:
# Give every remaining pair a sequential integer id (0 .. len-1).
# assign() returns a new frame rather than writing a column into the result of
# an earlier boolean filter, which avoids pandas' SettingWithCopyWarning.
metadata = metadata.assign(id_pair=np.arange(len(metadata)))

In [None]:
# Write the final pair table to data/, named after the slide selection mode;
# the DataFrame index is not written.
metadata_csv_path = f'data/metadata_{slide_type}.csv'
metadata.to_csv(metadata_csv_path, index=False)

In [14]:
# Leftover scratch expression: evaluates to 1 and does not affect the pipeline.
1

1