# Load the clinical data file and keep only the necessary columns

In [9]:
import pandas as pd
import os

# Folder holding the downloaded data, and the clinical patient table inside it
dataPath = '../Data'
dataFile = os.path.join(
    dataPath,
    '8162d394-8b64-4da2-9f5b-d164c54b9608',
    'nationwidechildrens.org_clinical_patient_brca.txt',
)

# Note: The clinical file has a three-line header; the second and third lines
# (row indices 1 and 2) are not needed for our purposes and are skipped.
df = pd.read_csv(dataFile, sep='\t', skiprows=[1, 2])

# Preview the patient identifier and the three receptor status columns.
# This selection is only displayed; `df` itself still holds every column.
df.loc[:, [
    'bcr_patient_uuid',
    'er_status_by_ihc',
    'pr_status_by_ihc',
    'her2_status_by_ihc',
]]  # Expected output: 1097 rows

Unnamed: 0,bcr_patient_uuid,er_status_by_ihc,pr_status_by_ihc,her2_status_by_ihc
0,6E7D5EC6-A469-467C-B748-237353C23416,Positive,Positive,Negative
1,55262FCB-1B01-4480-B322-36570430C917,Positive,Positive,Positive
2,427D0648-3F77-4FFC-B52C-89855426D647,Positive,Positive,Indeterminate
3,C31900A4-5DCD-4022-97AC-638E86E889E4,Positive,Positive,Positive
4,6623FC5E-00BE-4476-967A-CBD55F676EA6,Positive,Positive,Equivocal
...,...,...,...,...
1092,5CD79093-1571-4F71-8136-0D84CCABDCAC,Positive,Positive,Negative
1093,F89588E9-CA73-4465-A7FB-7246EDB45E3A,Positive,Positive,Negative
1094,CA20249F-B7EA-4FD9-9ECB-34F74755AE35,Positive,Positive,Negative
1095,23F438BD-1DBB-4D46-972F-1E8E74DDBD37,Positive,Positive,Negative


## Determine Triple Negative status and drop indeterminable cases

In [10]:
# Receptor status columns that together decide the triple-negative label
receptor_cols = ['er_status_by_ihc', 'pr_status_by_ihc', 'her2_status_by_ihc']

# True where all three receptors are 'Negative'
triple_negative = (df[receptor_cols] == 'Negative').all(axis=1)
# True where at least one receptor is 'Positive'
any_positive = (df[receptor_cols] == 'Positive').any(axis=1)

# Start every case as undetermined, then fill in the cases that can be decided.
# The two masks cannot overlap: a row with all three 'Negative' has no 'Positive'.
df['tnbc'] = pd.NA
df.loc[triple_negative, 'tnbc'] = True
df.loc[any_positive, 'tnbc'] = False

# Drop indeterminable cases, i.e. rows whose label is still missing.
# `subset` restricts the check to the 'tnbc' column so that a missing value in
# an unrelated clinical column can never remove a case with a known label.
# To inspect the rows about to be dropped: df[df['tnbc'].isna()]
df.dropna(subset=['tnbc'], inplace=True)
df['tnbc'].value_counts()  # Expected tnbc values: False 863, True 116

tnbc
False    863
True     116
Name: count, dtype: int64

## Link all cases to the corresponding RNA sequencing data file using the metadata

### 2 cases have no RNA files: 1 TNBC case and 1 non-TNBC case

In [13]:
import json

# Read the metadata file; JSON is decoded as UTF-8 explicitly so the result
# does not depend on the platform's default text encoding.
with open(os.path.join(dataPath, 'metadata.cart.2025-04-01.json'), 'r', encoding='utf-8') as f:
    data = json.load(f)

# Map each case id (uppercased) to the relative path of its RNA-Seq file.
# case_id in uppercase equals df['bcr_patient_uuid']; the file is found in the
# folder named after its 'file_id', under its 'file_name'.
# Records without an 'experimental_strategy' of 'RNA-Seq', or without any
# associated entities, contribute nothing.
# NOTE(review): if one case has several RNA-Seq files, the last record in the
# metadata silently wins -- confirm this is acceptable.
rna_files = {
    entity['case_id'].upper(): os.path.join(file['file_id'], file['file_name'])
    for file in data
    if file.get('experimental_strategy') == 'RNA-Seq'
    for entity in (file.get('associated_entities') or [])
}

# Attach the relative RNA file path to every case (NaN when no file is mapped)
df['file'] = df['bcr_patient_uuid'].str.upper().map(rna_files)

# A case is usable only if a file is mapped AND that file is present on disk
df['exists'] = df['file'].apply(
    lambda file: pd.notna(file) and os.path.exists(os.path.join(dataPath, file))
).astype(bool)

# To inspect cases with missing RNA files (expected: 2 rows): df[~df['exists']]
# `.copy()` makes the filtered frame independent of the original, so later
# column assignments do not raise SettingWithCopyWarning.
df = df[df['exists']].copy()
df.to_csv(os.path.join(dataPath, 'clinical.csv'), index=False)
print(df['tnbc'].value_counts())
df[['bcr_patient_uuid', 'tnbc', 'file']]  # Expected output: 977 rows

tnbc
False    862
True     115
Name: count, dtype: int64


Unnamed: 0,bcr_patient_uuid,tnbc,file
0,6E7D5EC6-A469-467C-B748-237353C23416,False,84225715-14a6-423c-a6d6-15558e151f56\253aa5dc-...
1,55262FCB-1B01-4480-B322-36570430C917,False,927604f9-a38e-4c3f-b50f-3e0db4daf5ec\1be6a56c-...
2,427D0648-3F77-4FFC-B52C-89855426D647,False,931442ba-af81-4b68-beca-7285fc44b1df\f2dda955-...
3,C31900A4-5DCD-4022-97AC-638E86E889E4,False,7b4d770a-2b8c-4ca5-bf51-c4745c5de39a\ae8996bd-...
4,6623FC5E-00BE-4476-967A-CBD55F676EA6,False,307261f2-f88f-4658-b6d1-98ef946148e2\75d91076-...
...,...,...,...
1092,5CD79093-1571-4F71-8136-0D84CCABDCAC,False,0a7dc8b9-4196-41d3-ada1-a50cb36bfd2b\3c9c665e-...
1093,F89588E9-CA73-4465-A7FB-7246EDB45E3A,False,e0cb738d-854a-4033-b370-79fa28d7cef8\69118aea-...
1094,CA20249F-B7EA-4FD9-9ECB-34F74755AE35,False,e25bb12b-f88b-41c5-951e-a36d4b94008f\73e13f2d-...
1095,23F438BD-1DBB-4D46-972F-1E8E74DDBD37,False,ee447251-5c90-426f-a0c7-bd2041189761\06a03e45-...
