In [1]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = None
from openslide import open_slide
import glob

In [2]:
def filter_files(x, slides):
    if slides == 'ffpe':
        if '-DX' in x or 'rna_seq' in x:
            return True
        else:
            return False
    elif slides == 'ff':
        if '-TS' in x or 'rna_seq' in x:
            return True
        else:
            return False
            
    return None


def has_magnif(image_path):
    img = open_slide(image_path)
    magnification = img.properties.get("aperio.AppMag", None)
    if magnification in ["40", "20"]:
        return True
    else:
        return False

def get_slide_orientation(x):
    if '-TS' in x:
        return 'TS'
    elif '-BS' in x:
        return 'BS'
    elif 'rna_seq' in x:
        return 'RNA'
    elif '-DX' in x:
        return 'DX'
    else:
        return np.nan 

In [3]:
slide_type = 'ff' # ff
sample_sheet = 'gdc_sample_sheet.2024-10-29'

In [4]:
df = pd.read_csv(f'data/{sample_sheet}.tsv', sep='\t')
df = df[df['File Name'].apply(lambda x: filter_files(x, slides=slide_type))]
df

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type
0,9a536afe-e38d-4270-bc1b-c0ea642cc4d8,TCGA-BP-4775-01A-01-TS1.e9eb861b-be79-49fd-a90...,Biospecimen,Slide Image,TCGA-KIRC,TCGA-BP-4775,TCGA-BP-4775-01A,Primary Tumor
2,466888ef-7b80-471e-822c-bf355ef474b8,c61e31b4-5c74-4d86-97cb-16be49b8a5b1.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-B0-4815,TCGA-B0-4815-01A,Primary Tumor
4,a5150292-3027-43db-aaab-c26d8fed9cc6,TCGA-BP-4342-11A-01-TS1.803c3f46-b96d-4288-ba1...,Biospecimen,Slide Image,TCGA-KIRC,TCGA-BP-4342,TCGA-BP-4342-11A,Solid Tissue Normal
8,ebce6da8-e675-4212-8e5d-804e998fdc07,TCGA-B0-5088-01A-01-TS1.d6eb9c0d-c866-4473-862...,Biospecimen,Slide Image,TCGA-KIRC,TCGA-B0-5088,TCGA-B0-5088-01A,Primary Tumor
10,7f16d20f-07e2-4901-ae00-4da70125e8a3,0cf6e899-af6e-44ac-a6c7-5f3fa68ea255.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-B0-4712,TCGA-B0-4712-01A,Primary Tumor
...,...,...,...,...,...,...,...,...
2774,73d97d28-fb60-4380-8ba0-34e79da74af2,fb312dd9-da8b-4190-999e-37cad1d683d5.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-CZ-5988,TCGA-CZ-5988-11A,Solid Tissue Normal
2779,f98a1953-c008-4ab9-a17e-be80b5fd43fe,e3b90eb7-f2a5-4b9e-a15e-ccfd7e6431d2.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-B0-5104,TCGA-B0-5104-01A,Primary Tumor
2784,8e95a2b4-3110-4dd6-b3c9-b3e83c563282,c6dca0c8-bd9d-4299-9d18-a1054fe15aeb.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-CW-6087,TCGA-CW-6087-01A,Primary Tumor
2785,7c90bf9b-74c9-459c-9cd7-944ea4535a6d,9124c3d4-eed6-44d5-b38a-ce8646537907.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-CW-6087,TCGA-CW-6087-11A,Solid Tissue Normal


In [5]:
if slide_type == 'ffpe':
    paired_samples = df[['Case ID', 'Data Type']].value_counts().reset_index().value_counts('Case ID').reset_index()
elif slide_type == 'ff':
    paired_samples = df[['Sample ID', 'Data Type']].value_counts().reset_index().value_counts('Sample ID').reset_index()
paired_samples = paired_samples[paired_samples['count'] > 1]
paired_samples

Unnamed: 0,Sample ID,count
0,TCGA-B8-4620-01A,2
1,TCGA-B8-4620-11A,2
2,TCGA-B8-4621-01A,2
3,TCGA-CJ-5675-01A,2
4,TCGA-B8-4622-11A,2
...,...,...
531,TCGA-CZ-4863-01A,2
532,TCGA-B0-4824-01A,2
533,TCGA-CZ-4862-01A,2
534,TCGA-CZ-4863-11A,2


In [6]:
if slide_type == 'ffpe':
    df = df[df['Case ID'].isin(paired_samples['Case ID'].values)]
elif slide_type == 'ff':
    df = df[df['Sample ID'].isin(paired_samples['Sample ID'].values)]
df

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type
0,9a536afe-e38d-4270-bc1b-c0ea642cc4d8,TCGA-BP-4775-01A-01-TS1.e9eb861b-be79-49fd-a90...,Biospecimen,Slide Image,TCGA-KIRC,TCGA-BP-4775,TCGA-BP-4775-01A,Primary Tumor
2,466888ef-7b80-471e-822c-bf355ef474b8,c61e31b4-5c74-4d86-97cb-16be49b8a5b1.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-B0-4815,TCGA-B0-4815-01A,Primary Tumor
8,ebce6da8-e675-4212-8e5d-804e998fdc07,TCGA-B0-5088-01A-01-TS1.d6eb9c0d-c866-4473-862...,Biospecimen,Slide Image,TCGA-KIRC,TCGA-B0-5088,TCGA-B0-5088-01A,Primary Tumor
10,7f16d20f-07e2-4901-ae00-4da70125e8a3,0cf6e899-af6e-44ac-a6c7-5f3fa68ea255.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-B0-4712,TCGA-B0-4712-01A,Primary Tumor
15,0eef74fe-7d47-4abc-afb8-d98fea7e21b5,TCGA-B0-4712-01A-01-TS1.c7ac6556-e3c1-4bb2-9c2...,Biospecimen,Slide Image,TCGA-KIRC,TCGA-B0-4712,TCGA-B0-4712-01A,Primary Tumor
...,...,...,...,...,...,...,...,...
2774,73d97d28-fb60-4380-8ba0-34e79da74af2,fb312dd9-da8b-4190-999e-37cad1d683d5.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-CZ-5988,TCGA-CZ-5988-11A,Solid Tissue Normal
2779,f98a1953-c008-4ab9-a17e-be80b5fd43fe,e3b90eb7-f2a5-4b9e-a15e-ccfd7e6431d2.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-B0-5104,TCGA-B0-5104-01A,Primary Tumor
2784,8e95a2b4-3110-4dd6-b3c9-b3e83c563282,c6dca0c8-bd9d-4299-9d18-a1054fe15aeb.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-CW-6087,TCGA-CW-6087-01A,Primary Tumor
2785,7c90bf9b-74c9-459c-9cd7-944ea4535a6d,9124c3d4-eed6-44d5-b38a-ce8646537907.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,TCGA-KIRC,TCGA-CW-6087,TCGA-CW-6087-11A,Solid Tissue Normal


In [7]:
metadata = []
if slide_type == 'ffpe':
    iterate_over = 'Case ID'
elif slide_type == 'ff':
    iterate_over = 'Sample ID'

for iterate_over_id in df[iterate_over].unique():
    tab = df[df[iterate_over] == iterate_over_id]

    tab_rna = tab[tab['Data Type'] == 'Gene Expression Quantification']
    tab_slide = tab[tab['Data Type'] == 'Slide Image']
    

    for _, row_rna in tab_rna.iterrows():
        for _, row_slide in tab_slide.iterrows():
            if row_rna['Sample Type'] == row_slide['Sample Type']:
                metadata.append([row_slide['File Name'], row_rna['File Name'], row_rna['Case ID'], 
                                 row_slide['Sample ID'], row_rna['Sample ID'], row_slide['Sample Type']])
                
metadata = pd.DataFrame(metadata, columns=['image_path', 'rna_path', 'case_id', 'sample_slide_id', 'sample_rna_id', 'sample_type'])
metadata

Unnamed: 0,image_path,rna_path,case_id,sample_slide_id,sample_rna_id,sample_type
0,TCGA-BP-4775-01A-01-TS1.e9eb861b-be79-49fd-a90...,b87fe250-4800-4a9f-945e-7544057449d9.rna_seq.a...,TCGA-BP-4775,TCGA-BP-4775-01A,TCGA-BP-4775-01A,Primary Tumor
1,TCGA-B0-4815-01A-01-TS1.24541590-fdf9-4f7d-b6d...,c61e31b4-5c74-4d86-97cb-16be49b8a5b1.rna_seq.a...,TCGA-B0-4815,TCGA-B0-4815-01A,TCGA-B0-4815-01A,Primary Tumor
2,TCGA-B0-5088-01A-01-TS1.d6eb9c0d-c866-4473-862...,4ab7d102-f5eb-4922-be66-9b516bebaf0a.rna_seq.a...,TCGA-B0-5088,TCGA-B0-5088-01A,TCGA-B0-5088-01A,Primary Tumor
3,TCGA-B0-4712-01A-01-TS1.c7ac6556-e3c1-4bb2-9c2...,0cf6e899-af6e-44ac-a6c7-5f3fa68ea255.rna_seq.a...,TCGA-B0-4712,TCGA-B0-4712-01A,TCGA-B0-4712-01A,Primary Tumor
4,TCGA-CJ-6030-11A-01-TS1.9be20b43-0879-4510-877...,be992b6a-3878-46dd-8e18-9fd0ee7e7b00.rna_seq.a...,TCGA-CJ-6030,TCGA-CJ-6030-11A,TCGA-CJ-6030-11A,Solid Tissue Normal
...,...,...,...,...,...,...
535,TCGA-B0-5706-01A-01-TS1.89d9608f-f7ce-4e04-a82...,d278e333-b5e0-4ce8-9511-6c31d29c8957.rna_seq.a...,TCGA-B0-5706,TCGA-B0-5706-01A,TCGA-B0-5706-01A,Primary Tumor
536,TCGA-B8-5553-01A-01-TS1.3ec5146b-9b14-4e53-b94...,297e8576-6af1-4f8d-b709-a14355728c0c.rna_seq.a...,TCGA-B8-5553,TCGA-B8-5553-01A,TCGA-B8-5553-01A,Primary Tumor
537,TCGA-CJ-5684-01A-01-TS1.33ab47f6-eb89-4b4b-a6d...,628efe45-6269-4c4b-8f50-15be89877383.rna_seq.a...,TCGA-CJ-5684,TCGA-CJ-5684-01A,TCGA-CJ-5684-01A,Primary Tumor
538,TCGA-CZ-5988-11A-01-TS1.365becdb-a9e2-4538-9e1...,fb312dd9-da8b-4190-999e-37cad1d683d5.rna_seq.a...,TCGA-CZ-5988,TCGA-CZ-5988-11A,TCGA-CZ-5988-11A,Solid Tissue Normal


In [8]:
metadata['data_type_info'] = metadata['image_path'].apply(lambda x: get_slide_orientation(x))
metadata['data_type_info'].value_counts()

data_type_info
TS    540
Name: count, dtype: int64

In [9]:
metadata = metadata[metadata.apply(lambda x: len(glob.glob(f"data/*/*/{x.rna_path}*")) > 0, axis=1)]
metadata = metadata[metadata.apply(lambda x: len(glob.glob(f"data/*/*/{x.image_path}*")) > 0, axis=1)]
metadata

Unnamed: 0,image_path,rna_path,case_id,sample_slide_id,sample_rna_id,sample_type,data_type_info
0,TCGA-BP-4775-01A-01-TS1.e9eb861b-be79-49fd-a90...,b87fe250-4800-4a9f-945e-7544057449d9.rna_seq.a...,TCGA-BP-4775,TCGA-BP-4775-01A,TCGA-BP-4775-01A,Primary Tumor,TS
1,TCGA-B0-4815-01A-01-TS1.24541590-fdf9-4f7d-b6d...,c61e31b4-5c74-4d86-97cb-16be49b8a5b1.rna_seq.a...,TCGA-B0-4815,TCGA-B0-4815-01A,TCGA-B0-4815-01A,Primary Tumor,TS
2,TCGA-B0-5088-01A-01-TS1.d6eb9c0d-c866-4473-862...,4ab7d102-f5eb-4922-be66-9b516bebaf0a.rna_seq.a...,TCGA-B0-5088,TCGA-B0-5088-01A,TCGA-B0-5088-01A,Primary Tumor,TS
3,TCGA-B0-4712-01A-01-TS1.c7ac6556-e3c1-4bb2-9c2...,0cf6e899-af6e-44ac-a6c7-5f3fa68ea255.rna_seq.a...,TCGA-B0-4712,TCGA-B0-4712-01A,TCGA-B0-4712-01A,Primary Tumor,TS
4,TCGA-CJ-6030-11A-01-TS1.9be20b43-0879-4510-877...,be992b6a-3878-46dd-8e18-9fd0ee7e7b00.rna_seq.a...,TCGA-CJ-6030,TCGA-CJ-6030-11A,TCGA-CJ-6030-11A,Solid Tissue Normal,TS
...,...,...,...,...,...,...,...
535,TCGA-B0-5706-01A-01-TS1.89d9608f-f7ce-4e04-a82...,d278e333-b5e0-4ce8-9511-6c31d29c8957.rna_seq.a...,TCGA-B0-5706,TCGA-B0-5706-01A,TCGA-B0-5706-01A,Primary Tumor,TS
536,TCGA-B8-5553-01A-01-TS1.3ec5146b-9b14-4e53-b94...,297e8576-6af1-4f8d-b709-a14355728c0c.rna_seq.a...,TCGA-B8-5553,TCGA-B8-5553-01A,TCGA-B8-5553-01A,Primary Tumor,TS
537,TCGA-CJ-5684-01A-01-TS1.33ab47f6-eb89-4b4b-a6d...,628efe45-6269-4c4b-8f50-15be89877383.rna_seq.a...,TCGA-CJ-5684,TCGA-CJ-5684-01A,TCGA-CJ-5684-01A,Primary Tumor,TS
538,TCGA-CZ-5988-11A-01-TS1.365becdb-a9e2-4538-9e1...,fb312dd9-da8b-4190-999e-37cad1d683d5.rna_seq.a...,TCGA-CZ-5988,TCGA-CZ-5988-11A,TCGA-CZ-5988-11A,Solid Tissue Normal,TS


In [10]:
is_magnif = metadata.apply(lambda x: has_magnif(glob.glob(f"data/*/*/{x.image_path}*")[0]), axis=1)
(~is_magnif).sum()

7

In [11]:
metadata = metadata[is_magnif]
metadata

Unnamed: 0,image_path,rna_path,case_id,sample_slide_id,sample_rna_id,sample_type,data_type_info
0,TCGA-BP-4775-01A-01-TS1.e9eb861b-be79-49fd-a90...,b87fe250-4800-4a9f-945e-7544057449d9.rna_seq.a...,TCGA-BP-4775,TCGA-BP-4775-01A,TCGA-BP-4775-01A,Primary Tumor,TS
1,TCGA-B0-4815-01A-01-TS1.24541590-fdf9-4f7d-b6d...,c61e31b4-5c74-4d86-97cb-16be49b8a5b1.rna_seq.a...,TCGA-B0-4815,TCGA-B0-4815-01A,TCGA-B0-4815-01A,Primary Tumor,TS
2,TCGA-B0-5088-01A-01-TS1.d6eb9c0d-c866-4473-862...,4ab7d102-f5eb-4922-be66-9b516bebaf0a.rna_seq.a...,TCGA-B0-5088,TCGA-B0-5088-01A,TCGA-B0-5088-01A,Primary Tumor,TS
3,TCGA-B0-4712-01A-01-TS1.c7ac6556-e3c1-4bb2-9c2...,0cf6e899-af6e-44ac-a6c7-5f3fa68ea255.rna_seq.a...,TCGA-B0-4712,TCGA-B0-4712-01A,TCGA-B0-4712-01A,Primary Tumor,TS
4,TCGA-CJ-6030-11A-01-TS1.9be20b43-0879-4510-877...,be992b6a-3878-46dd-8e18-9fd0ee7e7b00.rna_seq.a...,TCGA-CJ-6030,TCGA-CJ-6030-11A,TCGA-CJ-6030-11A,Solid Tissue Normal,TS
...,...,...,...,...,...,...,...
535,TCGA-B0-5706-01A-01-TS1.89d9608f-f7ce-4e04-a82...,d278e333-b5e0-4ce8-9511-6c31d29c8957.rna_seq.a...,TCGA-B0-5706,TCGA-B0-5706-01A,TCGA-B0-5706-01A,Primary Tumor,TS
536,TCGA-B8-5553-01A-01-TS1.3ec5146b-9b14-4e53-b94...,297e8576-6af1-4f8d-b709-a14355728c0c.rna_seq.a...,TCGA-B8-5553,TCGA-B8-5553-01A,TCGA-B8-5553-01A,Primary Tumor,TS
537,TCGA-CJ-5684-01A-01-TS1.33ab47f6-eb89-4b4b-a6d...,628efe45-6269-4c4b-8f50-15be89877383.rna_seq.a...,TCGA-CJ-5684,TCGA-CJ-5684-01A,TCGA-CJ-5684-01A,Primary Tumor,TS
538,TCGA-CZ-5988-11A-01-TS1.365becdb-a9e2-4538-9e1...,fb312dd9-da8b-4190-999e-37cad1d683d5.rna_seq.a...,TCGA-CZ-5988,TCGA-CZ-5988-11A,TCGA-CZ-5988-11A,Solid Tissue Normal,TS


In [12]:
metadata['id_pair'] = np.arange(len(metadata))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata['id_pair'] = np.arange(len(metadata))


In [13]:
metadata.to_csv(f'data/metadata_{slide_type}.csv', index=False)