# Random Patient Selection

Because of the large data size, a smaller, balanced sample of glioblastoma and astrocytoma patients was randomly selected.

This notebook creates `UCSF-PDGM-metadata_updated.csv`, a copy of the metadata file with an added boolean `used` column indicating whether each patient was randomly selected.

In [1]:
import pandas as pd

In [3]:
# Relative path of the source metadata table.
metadata_file = r"../UCSF-PDGM-metadata_v2.csv"

# Read the table into a DataFrame; later cells filter and annotate this frame.
metadata_df = pd.read_csv(filepath_or_buffer=metadata_file)

In [4]:
# Run this code for new data
#
# The statements below are disabled: they sit inside a triple-quoted string, so
# this cell only evaluates (and echoes) the string and leaves metadata_df as is.
# To apply the filter, remove the surrounding triple quotes.
#
# When enabled, the code:
#   * strips the "UCSF-PDGM-" prefix from the "ID" column and drops every row
#     whose remaining ID is one of the six listed numbers,
#   * asserts that exactly 495 rows remain afterwards.
#
# NOTE(review): `ids_to_ignore` is built but never used by the disabled code.
"""
id_nums_to_ignore = ["315", "278", "175", "138", "181", "289"]
ids_to_ignore = ["UCSF-PDGM-" + _id for _id in id_nums_to_ignore]
metadata_df = metadata_df[~metadata_df["ID"].str.replace("UCSF-PDGM-", "").isin(id_nums_to_ignore)]

assert len(metadata_df) == 495

"""

'\nid_nums_to_ignore = ["315", "278", "175", "138", "181", "289"]\nids_to_ignore = ["UCSF-PDGM-" + _id for _id in id_nums_to_ignore]\nmetadata_df = metadata_df[~metadata_df["ID"].str.replace("UCSF-PDGM-", "").isin(id_nums_to_ignore)]\n\nassert len(metadata_df) == 495\n\n'

In [5]:
# Split the cohort by final diagnosis: one frame per tumour type of interest.
diagnosis = metadata_df["Final pathologic diagnosis (WHO 2021)"]

is_glioblastoma = diagnosis == "Glioblastoma, IDH-wildtype"
is_astrocytoma = diagnosis == "Astrocytoma, IDH-mutant"

gbma = metadata_df.loc[is_glioblastoma]
astro = metadata_df.loc[is_astrocytoma]


## Patient sampling


In [6]:
# Patients to draw per diagnosis; both groups share one size so the sample
# stays balanced.
gbma_samples = astro_samples = 30

In [7]:
# Draw both groups with a fixed seed so the selection is reproducible.
# One spare glioblastoma patient is drawn because ID 289 is removed after
# sampling (289 also appears in the ignore list of the disabled filtering cell).
sampled_gbma = gbma.sample(n=gbma_samples + 1, random_state=1)
sampled_astro = astro.sample(n=astro_samples, random_state=1)

# Drop ID 289 if it was drawn. When it was not drawn, all gbma_samples + 1 rows
# survive the filter, so .head() trims the spare row; without it the group
# would silently end up one patient too large. Sampling order is preserved, so
# the selection is unchanged whenever 289 is part of the draw.
sampled_gbma = sampled_gbma[~sampled_gbma["ID"].str.contains("289")].head(gbma_samples)

# Guard the balance between the two groups.
assert len(sampled_gbma) == gbma_samples, (
    f"expected {gbma_samples} glioblastoma patients, got {len(sampled_gbma)}"
)
assert len(sampled_astro) == astro_samples, (
    f"expected {astro_samples} astrocytoma patients, got {len(sampled_astro)}"
)


In [13]:
# Add a boolean "used" column: True for every sampled patient, False otherwise.
metadata_df["used"] = False
metadata_df.loc[sampled_gbma.index, "used"] = True
metadata_df.loc[sampled_astro.index, "used"] = True

# Each sampled row must be flagged exactly once; a mismatch would mean the two
# samples overlap or the index labels are not unique.
assert metadata_df["used"].sum() == len(sampled_gbma) + len(sampled_astro), (
    "number of flagged rows does not match the number of sampled patients"
)

# Show the flagged astrocytoma rows for a visual check.
print(metadata_df.loc[sampled_astro.index])

# Write the annotated table without the row index (index expects a bool).
metadata_df.to_csv("../UCSF-PDGM-metadata_updated.csv", index=False)

                ID Sex  Age at MRI  WHO CNS Grade  \
291  UCSF-PDGM-331   M          46              3   
435  UCSF-PDGM-476   F          22              2   
238  UCSF-PDGM-272   M          35              3   
269  UCSF-PDGM-308   M          45              2   
488  UCSF-PDGM-529   M          34              4   
428  UCSF-PDGM-469   F          43              3   
367  UCSF-PDGM-407   M          42              4   
265  UCSF-PDGM-304   M          52              3   
251  UCSF-PDGM-285   F          60              3   
228  UCSF-PDGM-261   F          29              2   
229  UCSF-PDGM-262   M          44              3   
217  UCSF-PDGM-250   M          44              2   
403  UCSF-PDGM-444   M          32              2   
209  UCSF-PDGM-242   M          37              2   
226  UCSF-PDGM-259   M          28              2   
400  UCSF-PDGM-441   F          26              3   
114  UCSF-PDGM-132   M          29              4   
38   UCSF-PDGM-044   M          38            

In [14]:
# Selected glioblastoma patients as bare ID numbers: order by ID, strip the
# dataset prefix, and keep each value once.
gbma_sorted = sampled_gbma.sort_values("ID")
gbma_numbers = gbma_sorted["ID"].str.replace("UCSF-PDGM-", "")
gbma_ids = gbma_numbers.unique()
gbma_ids

array(['009', '010', '011', '079', '083', '085', '113', '136', '163',
       '185', '198', '207', '212', '287', '318', '341', '344', '383',
       '398', '422', '426', '461', '466', '492', '497', '503', '511',
       '524', '538', '539'], dtype=object)

In [15]:
# Selected astrocytoma patients as bare ID numbers, ordered by ID, with the
# dataset prefix removed and duplicates collapsed.
astro_ids = (
    sampled_astro
    .sort_values("ID")["ID"]
    .str.replace("UCSF-PDGM-", "")
    .unique()
)
astro_ids

array(['044', '132', '232', '242', '250', '251', '252', '256', '259',
       '261', '262', '272', '282', '285', '304', '305', '307', '308',
       '331', '407', '427', '439', '441', '444', '445', '469', '476',
       '499', '529', '531'], dtype=object)

In [16]:
# Check that segmentation directories for all selected patients are present
import os

all_files = os.listdir("../data/raw")

# Reduce each matching entry name to an ID string: remove the "UCSF-PDGM-"
# prefix and the "_nifti" suffix, then drop the first remaining character.
# NOTE(review): the [1:] presumably strips an extra leading zero because the
# directory numbers are one digit longer than the metadata IDs - confirm.
ids = [x.replace("UCSF-PDGM-", "").replace("_nifti", "")[1:] for x in all_files if "UCSF-PDGM-" in x]

# Set lookup keeps the membership test cheap; `ids` itself stays a list.
available_ids = set(ids)
missing_ids = [x for x in list(astro_ids) + list(gbma_ids) if x not in available_ids]

# Fail loudly instead of relying on someone reading the cell output.
assert not missing_ids, f"no raw data directory found for selected patients: {missing_ids}"

missing_ids

# should return an empty array

[]