## Split 'Bassez-cohort1' by patients

In [20]:
import scanpy as sc
import pandas as pd

dir_path = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets/breast_cancer/Bassez/Bassez_cohort1/export/BC_counts"
# Load the data into a Scanpy AnnData object
adata = sc.read(f'{dir_path}/matrix.mtx', cache=True)  # transpose the data

adata = adata.transpose()

# Correctly setting variable names (genes) and observation names (cells)
adata.var_names = open(f'{dir_path}/features.tsv').read().splitlines()
adata.obs_names = open(f'{dir_path}/barcodes.tsv').read().splitlines()



In [21]:
metadata_path = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets/breast_cancer/Bassez/1872-BIOKEY_metaData_cohort1_web.csv"
metadata_df = pd.read_csv(metadata_path)

# Replace 'YourIdentifierColumn' with the actual column name in metadata_df that matches adata.obs_names
metadata_df.set_index('Cell', inplace=True)

# Assuming 'adata' is your already loaded AnnData object
# Merge the metadata
adata.obs = adata.obs.join(metadata_df)

In [22]:
adata.var_names

Index(['A1BG', 'A1BG-AS1', 'A2M', 'A2M-AS1', 'A4GALT', 'AAAS', 'AACS', 'AADAC',
       'AADACL2-AS1', 'AADAT',
       ...
       'AL161752.1', 'AL355075.1', 'AL390038.1', 'AL606970.4', 'AL845331.2',
       'AVP', 'BPIFA1', 'OR14A16', 'PROKR1', 'SEMG2'],
      dtype='object', length=25288)

In [23]:
adata.obs_names

Index(['BIOKEY_13_Pre_AAACCTGCAACAACCT-1', 'BIOKEY_13_Pre_AAACCTGCAAGAAGAG-1',
       'BIOKEY_13_Pre_AAACCTGGTCTCCACT-1', 'BIOKEY_13_Pre_AAACCTGTCAACGAAA-1',
       'BIOKEY_13_Pre_AAACGGGAGAGTAAGG-1', 'BIOKEY_13_Pre_AAACGGGCACAGAGGT-1',
       'BIOKEY_13_Pre_AAACGGGCATGGGACA-1', 'BIOKEY_13_Pre_AAACGGGGTTCATGGT-1',
       'BIOKEY_13_Pre_AAACGGGTCAACGAAA-1', 'BIOKEY_13_Pre_AAACGGGTCACCAGGC-1',
       ...
       'BIOKEY_24_On_TTTGGTTGTATTACCG-1', 'BIOKEY_24_On_TTTGGTTGTCCGAATT-1',
       'BIOKEY_24_On_TTTGTCAAGCCACCTG-1', 'BIOKEY_24_On_TTTGTCAAGCGTAGTG-1',
       'BIOKEY_24_On_TTTGTCAAGGAATCGC-1', 'BIOKEY_24_On_TTTGTCAAGTTCGATC-1',
       'BIOKEY_24_On_TTTGTCACACGACTCG-1', 'BIOKEY_24_On_TTTGTCACACTCAGGC-1',
       'BIOKEY_24_On_TTTGTCAGTCGCGAAA-1', 'BIOKEY_24_On_TTTGTCATCGGTTAAC-1'],
      dtype='object', length=175942)

In [24]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,patient_id,timepoint,expansion,BC_type,cellType,cohort
BIOKEY_13_Pre_AAACCTGCAACAACCT-1,684,430,BIOKEY_13,Pre,,HER2+,Myeloid_cell,treatment_naive
BIOKEY_13_Pre_AAACCTGCAAGAAGAG-1,1252,700,BIOKEY_13,Pre,,HER2+,T_cell,treatment_naive
BIOKEY_13_Pre_AAACCTGGTCTCCACT-1,522,330,BIOKEY_13,Pre,,HER2+,pDC,treatment_naive
BIOKEY_13_Pre_AAACCTGTCAACGAAA-1,8454,2637,BIOKEY_13,Pre,,HER2+,Myeloid_cell,treatment_naive
BIOKEY_13_Pre_AAACGGGAGAGTAAGG-1,1612,874,BIOKEY_13,Pre,,HER2+,Fibroblast,treatment_naive
...,...,...,...,...,...,...,...,...
BIOKEY_24_On_TTTGTCAAGTTCGATC-1,474,312,BIOKEY_24,On,NE,ER+,Cancer_cell,treatment_naive
BIOKEY_24_On_TTTGTCACACGACTCG-1,666,334,BIOKEY_24,On,NE,ER+,Myeloid_cell,treatment_naive
BIOKEY_24_On_TTTGTCACACTCAGGC-1,9142,3332,BIOKEY_24,On,NE,ER+,Cancer_cell,treatment_naive
BIOKEY_24_On_TTTGTCAGTCGCGAAA-1,6228,1810,BIOKEY_24,On,NE,ER+,Fibroblast,treatment_naive


In [28]:
import os

# Assuming 'adata' is your AnnData object that already has metadata merged in it

# Unique patient numbers
unique_patient_number = adata.obs['patient_id'].unique()

# Directory path where you want to save the filtered AnnData objects
dir_path = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets/breast_cancer/Bassez/Bassez_cohort1/patient_specific_data"

# Check if the directory exists, if not, create it
if not os.path.exists(dir_path):
    os.makedirs(dir_path)

# Loop through each unique patient number
for patient_number in unique_patient_number:
    # Filter the AnnData object for the current patient
    patient_specific_adata = adata[adata.obs['patient_id'] == patient_number, :]
    
    # Convert .obs and .var to string to avoid the TypeError during saving
    patient_specific_adata.obs = patient_specific_adata.obs.astype(str)
    patient_specific_adata.var = patient_specific_adata.var.astype(str)
    
    # Construct the file name based on the patient number
    file_name = f"patient_{patient_number}_adata.h5ad"
    file_path = os.path.join(dir_path, file_name)
    
    # Write the patient-specific AnnData object to a file
    patient_specific_adata.write(file_path)

    print(f"Saved {file_name}")

Saved patient_BIOKEY_13_adata.h5ad
Saved patient_BIOKEY_10_adata.h5ad
Saved patient_BIOKEY_16_adata.h5ad
Saved patient_BIOKEY_14_adata.h5ad
Saved patient_BIOKEY_19_adata.h5ad
Saved patient_BIOKEY_23_adata.h5ad
Saved patient_BIOKEY_26_adata.h5ad
Saved patient_BIOKEY_28_adata.h5ad
Saved patient_BIOKEY_3_adata.h5ad
Saved patient_BIOKEY_15_adata.h5ad
Saved patient_BIOKEY_8_adata.h5ad
Saved patient_BIOKEY_5_adata.h5ad
Saved patient_BIOKEY_30_adata.h5ad
Saved patient_BIOKEY_12_adata.h5ad
Saved patient_BIOKEY_1_adata.h5ad
Saved patient_BIOKEY_31_adata.h5ad
Saved patient_BIOKEY_20_adata.h5ad
Saved patient_BIOKEY_22_adata.h5ad
Saved patient_BIOKEY_25_adata.h5ad
Saved patient_BIOKEY_21_adata.h5ad
Saved patient_BIOKEY_29_adata.h5ad
Saved patient_BIOKEY_4_adata.h5ad
Saved patient_BIOKEY_9_adata.h5ad
Saved patient_BIOKEY_18_adata.h5ad
Saved patient_BIOKEY_11_adata.h5ad
Saved patient_BIOKEY_7_adata.h5ad
Saved patient_BIOKEY_2_adata.h5ad
Saved patient_BIOKEY_6_adata.h5ad
Saved patient_BIOKEY_17_adata

## Split 'Bassez-cohort2' by patients

In [29]:
import scanpy as sc
import pandas as pd

dir_path = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets/breast_cancer/Bassez/Bassez_cohort2/export/BC_counts"
# Load the data into a Scanpy AnnData object
adata = sc.read(f'{dir_path}/matrix.mtx', cache=True)  # transpose the data

adata = adata.transpose()

# Correctly setting variable names (genes) and observation names (cells)
adata.var_names = open(f'{dir_path}/features.tsv').read().splitlines()
adata.obs_names = open(f'{dir_path}/barcodes.tsv').read().splitlines()

In [30]:
metadata_path = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets/breast_cancer/Bassez/1871-BIOKEY_metaData_cohort2_web.csv"
metadata_df = pd.read_csv(metadata_path)

# Replace 'YourIdentifierColumn' with the actual column name in metadata_df that matches adata.obs_names
metadata_df.set_index('Cell', inplace=True)

# Assuming 'adata' is your already loaded AnnData object
# Merge the metadata
adata.obs = adata.obs.join(metadata_df)

In [31]:
adata.var_names

Index(['A1BG', 'A1BG-AS1', 'A2M', 'A2M-AS1', 'A2ML1', 'A4GALT', 'AAAS', 'AACS',
       'AAED1', 'AAGAB',
       ...
       'IQCJ', 'KRT4', 'LGI1', 'LINC01511', 'LINC02224', 'LINC02261', 'LRRC31',
       'TBC1D3D', 'TMC3', 'WFDC8'],
      dtype='object', length=22889)

In [32]:
adata.obs_names

Index(['BIOKEY_33_Pre_AAACCTGAGAGACTTA-1', 'BIOKEY_33_Pre_AAACCTGAGTAGCGGT-1',
       'BIOKEY_33_Pre_AAACCTGCATGGTAGG-1', 'BIOKEY_33_Pre_AAACCTGGTATAGGGC-1',
       'BIOKEY_33_Pre_AAACCTGGTCAGGACA-1', 'BIOKEY_33_Pre_AAACCTGGTGAGGGAG-1',
       'BIOKEY_33_Pre_AAACCTGTCCGTCATC-1', 'BIOKEY_33_Pre_AAACCTGTCTCGCATC-1',
       'BIOKEY_33_Pre_AAACCTGTCTTGTATC-1', 'BIOKEY_33_Pre_AAACGGGGTCCTCCAT-1',
       ...
       'BIOKEY_42_On_TTTCCTCCAGTAGAGC-1', 'BIOKEY_42_On_TTTGCGCAGTACGCGA-1',
       'BIOKEY_42_On_TTTGGTTAGAGCAATT-1', 'BIOKEY_42_On_TTTGGTTGTAATAGCA-1',
       'BIOKEY_42_On_TTTGGTTGTGGCCCTA-1', 'BIOKEY_42_On_TTTGGTTTCAAACCAC-1',
       'BIOKEY_42_On_TTTGGTTTCATTGCGA-1', 'BIOKEY_42_On_TTTGTCACAAACTGTC-1',
       'BIOKEY_42_On_TTTGTCACAAAGCGGT-1', 'BIOKEY_42_On_TTTGTCAGTAGTGAAT-1'],
      dtype='object', length=50693)

In [33]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,patient_id,timepoint,expansion,BC_type,cellType,cohort
BIOKEY_33_Pre_AAACCTGAGAGACTTA-1,3911,1665,BIOKEY_33,Pre,E,TNBC,T_cell,neoadjuvant_chemo
BIOKEY_33_Pre_AAACCTGAGTAGCGGT-1,605,491,BIOKEY_33,Pre,E,TNBC,Fibroblast,neoadjuvant_chemo
BIOKEY_33_Pre_AAACCTGCATGGTAGG-1,596,461,BIOKEY_33,Pre,E,TNBC,T_cell,neoadjuvant_chemo
BIOKEY_33_Pre_AAACCTGGTATAGGGC-1,2983,1615,BIOKEY_33,Pre,E,TNBC,Fibroblast,neoadjuvant_chemo
BIOKEY_33_Pre_AAACCTGGTCAGGACA-1,4098,1657,BIOKEY_33,Pre,E,TNBC,T_cell,neoadjuvant_chemo
...,...,...,...,...,...,...,...,...
BIOKEY_42_On_TTTGGTTTCAAACCAC-1,515,246,BIOKEY_42,On,NE,ER+,Fibroblast,neoadjuvant_chemo
BIOKEY_42_On_TTTGGTTTCATTGCGA-1,3477,1447,BIOKEY_42,On,NE,ER+,Fibroblast,neoadjuvant_chemo
BIOKEY_42_On_TTTGTCACAAACTGTC-1,5604,2131,BIOKEY_42,On,NE,ER+,Endothelial_cell,neoadjuvant_chemo
BIOKEY_42_On_TTTGTCACAAAGCGGT-1,8375,3148,BIOKEY_42,On,NE,ER+,Cancer_cell,neoadjuvant_chemo


In [34]:
import os

# Assuming 'adata' is your AnnData object that already has metadata merged in it

# Unique patient numbers
unique_patient_number = adata.obs['patient_id'].unique()

# Directory path where you want to save the filtered AnnData objects
dir_path = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets/breast_cancer/Bassez/Bassez_cohort2/patient_specific_data"

# Check if the directory exists, if not, create it
if not os.path.exists(dir_path):
    os.makedirs(dir_path)

# Loop through each unique patient number
for patient_number in unique_patient_number:
    # Filter the AnnData object for the current patient
    patient_specific_adata = adata[adata.obs['patient_id'] == patient_number, :]
    
    # Convert .obs and .var to string to avoid the TypeError during saving
    patient_specific_adata.obs = patient_specific_adata.obs.astype(str)
    patient_specific_adata.var = patient_specific_adata.var.astype(str)
    
    # Construct the file name based on the patient number
    file_name = f"patient_{patient_number}_adata.h5ad"
    file_path = os.path.join(dir_path, file_name)
    
    # Write the patient-specific AnnData object to a file
    patient_specific_adata.write(file_path)

    print(f"Saved {file_name}")

Saved patient_BIOKEY_33_adata.h5ad
Saved patient_BIOKEY_38_adata.h5ad
Saved patient_BIOKEY_35_adata.h5ad
Saved patient_BIOKEY_36_adata.h5ad
Saved patient_BIOKEY_41_adata.h5ad
Saved patient_BIOKEY_32_adata.h5ad
Saved patient_BIOKEY_40_adata.h5ad
Saved patient_BIOKEY_37_adata.h5ad
Saved patient_BIOKEY_34_adata.h5ad
Saved patient_BIOKEY_39_adata.h5ad
Saved patient_BIOKEY_42_adata.h5ad


## Split 'Qian' by patients

In [1]:
import scanpy as sc
import pandas as pd

dir_path = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets/breast_cancer/qian/export/BC_counts"

# Load the data into a Scanpy AnnData object
adata = sc.read(f'{dir_path}/matrix.mtx', cache=True)  # transpose the data

adata = adata.transpose()

adata.var_names = open(f'{dir_path}/genes.tsv').read().splitlines()
adata.obs_names = open(f'{dir_path}/barcodes.tsv').read().splitlines()



In [2]:
metadata_path = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets/breast_cancer/qian/2103-Breastcancer_metadata.csv"
metadata_df = pd.read_csv(metadata_path)

# Replace 'YourIdentifierColumn' with the actual column name in metadata_df that matches adata.obs_names
metadata_df.set_index('Cell', inplace=True)

# Assuming 'adata' is your already loaded AnnData object
# Merge the metadata
adata.obs = adata.obs.join(metadata_df)

In [3]:
adata.var_names

Index(['RP11-34P13.3\tRP11-34P13.3', 'FAM138A\tFAM138A', 'OR4F5\tOR4F5',
       'RP11-34P13.7\tRP11-34P13.7', 'RP11-34P13.8\tRP11-34P13.8',
       'RP11-34P13.14\tRP11-34P13.14', 'RP11-34P13.9\tRP11-34P13.9',
       'FO538757.3\tFO538757.3', 'FO538757.2\tFO538757.2',
       'AP006222.2\tAP006222.2',
       ...
       'AC007325.2\tAC007325.2', 'BX072566.1\tBX072566.1',
       'AL354822.1\tAL354822.1', 'AC023491.2\tAC023491.2',
       'AC004556.1\tAC004556.1', 'AC233755.2\tAC233755.2',
       'AC233755.1\tAC233755.1', 'AC240274.1\tAC240274.1',
       'AC213203.1\tAC213203.1', 'FAM231B\tFAM231B'],
      dtype='object', length=33694)

In [4]:
adata.obs_names

Index(['sc5rJUQ024_AAACCTGCAACAACCT', 'sc5rJUQ024_AAACCTGCAAGAAGAG',
       'sc5rJUQ024_AAACCTGGTCTCCACT', 'sc5rJUQ024_AAACCTGTCAACGAAA',
       'sc5rJUQ024_AAACGGGAGAGTAAGG', 'sc5rJUQ024_AAACGGGCACAGAGGT',
       'sc5rJUQ024_AAACGGGCATGGGACA', 'sc5rJUQ024_AAACGGGGTTCATGGT',
       'sc5rJUQ024_AAACGGGTCAACGAAA', 'sc5rJUQ024_AAACGGGTCACCAGGC',
       ...
       'sc5rJUQ064_TTTGGTTTCATCATTC', 'sc5rJUQ064_TTTGGTTTCCTGCCAT',
       'sc5rJUQ064_TTTGGTTTCTCAAACG', 'sc5rJUQ064_TTTGGTTTCTCTTATG',
       'sc5rJUQ064_TTTGGTTTCTGCCCTA', 'sc5rJUQ064_TTTGTCAAGCACCGCT',
       'sc5rJUQ064_TTTGTCAAGCCAGAAC', 'sc5rJUQ064_TTTGTCAAGGACGAAA',
       'sc5rJUQ064_TTTGTCAGTCTTGTCC', 'sc5rJUQ064_TTTGTCATCCGGGTGT'],
      dtype='object', length=44024)

In [5]:
adata.obs

Unnamed: 0,nGene,nUMI,CellFromTumor,PatientNumber,TumorType,TumorSite,CellType
sc5rJUQ024_AAACCTGCAACAACCT,894,1624,True,41,BC,Biopsy,DC
sc5rJUQ024_AAACCTGCAAGAAGAG,1401,2983,True,41,BC,Biopsy,T_cell
sc5rJUQ024_AAACCTGGTCTCCACT,585,1141,True,41,BC,Biopsy,Cancer
sc5rJUQ024_AAACCTGTCAACGAAA,4233,18953,True,41,BC,Biopsy,Myeloid
sc5rJUQ024_AAACGGGAGAGTAAGG,1669,3794,True,41,BC,Biopsy,Fibroblast
...,...,...,...,...,...,...,...
sc5rJUQ064_TTTGTCAAGCACCGCT,287,454,True,54,BC,Biopsy,Cancer
sc5rJUQ064_TTTGTCAAGCCAGAAC,3037,11414,True,54,BC,Biopsy,Cancer
sc5rJUQ064_TTTGTCAAGGACGAAA,1858,4750,True,54,BC,Biopsy,Cancer
sc5rJUQ064_TTTGTCAGTCTTGTCC,2537,10251,True,54,BC,Biopsy,Cancer


In [7]:
import os

# Assuming 'adata' is your AnnData object that already has metadata merged in it

# Unique patient numbers
unique_patient_numbers = adata.obs['PatientNumber'].unique()

# Directory path where you want to save the filtered AnnData objects
dir_path = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets/breast_cancer/qian/patient_specific_data"

# Check if the directory exists, if not, create it
if not os.path.exists(dir_path):
    os.makedirs(dir_path)

# Loop through each unique patient number
for patient_number in unique_patient_numbers:
    # Filter the AnnData object for the current patient
    patient_specific_adata = adata[adata.obs['PatientNumber'] == patient_number, :]
    
    # Construct the file name based on the patient number
    file_name = f"patient_{patient_number}_adata.h5ad"
    file_path = os.path.join(dir_path, file_name)
    
    # Write the patient-specific AnnData object to a file
    patient_specific_adata.write(file_path)

    print(f"Saved {file_name}")


Saved patient_41_adata.h5ad
Saved patient_42_adata.h5ad
Saved patient_43_adata.h5ad
Saved patient_44_adata.h5ad
Saved patient_45_adata.h5ad
Saved patient_46_adata.h5ad
Saved patient_47_adata.h5ad
Saved patient_48_adata.h5ad
Saved patient_49_adata.h5ad
Saved patient_50_adata.h5ad
Saved patient_51_adata.h5ad
Saved patient_52_adata.h5ad
Saved patient_53_adata.h5ad
Saved patient_54_adata.h5ad


In [1]:
import glob
import scanpy as sc

datasets = []
for dataset_file in glob.glob("/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets/breast_cancer/qian/patient_specific_data/*.h5ad"):
    datasets.append(dataset_file)


adatas = []

datasets = sorted(datasets)
for data in datasets:
    adata = sc.read_h5ad(data)
    adatas.append(adata)

In [3]:
for adata, dataset in zip(adatas, datasets):
    n_cells, n_genes = adata.shape
    print()
    print("#########################################################################")
    print("#########################################################################")
#     print(f"Dataset: {dataset}")
#     print("Number of genes:", n_genes)
#     print("Number of cells:", n_cells)
#     print(adata.obs.head())
    print()
    print("#####################################################################################")
    print(f"#########################################{dataset.split('/')[-1]}###############################################")
    print()
    print(adata.obs['CellType'].value_counts().head(50))
    print()  


#########################################################################
#########################################################################

#####################################################################################
#########################################patient_41_adata.h5ad###############################################

CellType
T_cell        1975
Fibroblast     958
Cancer         361
EC             278
B_cell         214
DC              61
Myeloid         59
Mast            15
Name: count, dtype: int64


#########################################################################
#########################################################################

#####################################################################################
#########################################patient_42_adata.h5ad###############################################

CellType
Cancer        2488
T_cell        1486
Fibroblast     579
B_cell         122
Myeloid         97
EC            