In [3]:
import scanpy as sc
from scipy.sparse import csr_matrix
import scipy
import anndata
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

## First search

In [1]:
import os
import requests

# List of URLs to download.
# Every URL shares the same Zenodo record prefix and differs only in the file
# name, so the list is assembled from the prefix plus the ordered file names.
zenodo_files_url = "https://zenodo.org/records/7041849/files/"
zenodo_file_names = [
    "SchiebingerLander2019_GSE115943.h5ad",
    "NormanWeissman2019_filtered.h5ad",
    "McFarlandTsherniak2020.h5ad",
    "GehringPachter2019.h5ad",
    "PapalexiSatija2021_eccite_arrayed_protein.h5ad",
    "PapalexiSatija2021_eccite_arrayed_RNA.h5ad",
    "PapalexiSatija2021_eccite_protein.h5ad",
    "GasperiniShendure2019_lowMOI.h5ad",
    "GasperiniShendure2019_highMOI.h5ad",
    "GasperiniShendure2019_atscale.h5ad",
    "PapalexiSatija2021_eccite_RNA.h5ad",
    "FrangiehIzar2021_RNA.h5ad",
    "FrangiehIzar2021_protein.h5ad",
    "ReplogleWeissman2022_K562_essential.h5ad",
    "DixitRegev2016.h5ad",
    "DatlingerBock2021.h5ad",
    "ReplogleWeissman2022_K562_gwps.h5ad",
    "ReplogleWeissman2022_rpe1.h5ad",
    "SchiebingerLander2019_GSE106340.h5ad",
    "TianKampmann2021_CRISPRa.h5ad",
    "DatlingerBock2017.h5ad",
    "SchraivogelSteinmetz2020_TAP_SCREEN__chromosome_11_screen.h5ad",
    "SchraivogelSteinmetz2020_TAP_SCREEN__chromosome_8_screen.h5ad",
    "ShifrutMarson2018.h5ad",
    "ChangYe2021.h5ad",
    "SrivatsanTrapnell2020_sciplex2.h5ad",
    "AissaBenevolenskaya2021.h5ad",
    "AdamsonWeissman2016_GSM2406681_10X010.h5ad",
    "AdamsonWeissman2016_GSM2406677_10X005.h5ad",
    "SrivatsanTrapnell2020_sciplex3.h5ad",
    "SrivatsanTrapnell2020_sciplex4.h5ad",
    "TianKampmann2019_day7neuron.h5ad",
    "ZhaoSims2021.h5ad",
    "XieHon2017.h5ad",
    "WeinrebKlein2020.h5ad",
    "TianKampmann2021_CRISPRi.h5ad",
    "AdamsonWeissman2016_GSM2406675_10X001.h5ad",
    "TianKampmann2019_iPSC.h5ad",
]
input_adata_file = [zenodo_files_url + name for name in zenodo_file_names]

# Download every URL in `input_adata_file` into `output_dir`.
#
# Each file is streamed to disk in chunks instead of being buffered in memory
# (`response.content` holds the whole body in RAM, which is not viable for
# multi-gigabyte .h5ad files). Data is first written to a "<name>.part" file and
# renamed only after the transfer completes, so an interrupted download never
# leaves a truncated file under the final name.

# The directory where you want to save the files
output_dir = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets"

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# (connect timeout, read timeout) in seconds; without a timeout a stalled
# connection would block the cell indefinitely.
request_timeout = (10, 300)
# Amount of data written to disk per iteration while streaming.
chunk_size_bytes = 1024 * 1024

# Names of the files that could not be downloaded, reported at the end.
failed_downloads = []

# Loop over the list of URLs
for url in input_adata_file:
    # Extract the file name from the URL
    file_name = url.split('/')[-1]
    # Final destination and temporary in-progress path
    output_path = os.path.join(output_dir, file_name)
    partial_path = output_path + ".part"

    print(f"Downloading {file_name}...")

    try:
        with requests.get(url, stream=True, timeout=request_timeout) as response:
            # Check if the request was successful
            if response.status_code != 200:
                print(f"Failed to download {file_name}. Status code: {response.status_code}")
                failed_downloads.append(file_name)
                continue
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size_bytes):
                    f.write(chunk)
        # Publish the file under its final name only once it is complete.
        os.replace(partial_path, output_path)
    except (requests.RequestException, OSError) as exc:
        # Drop the incomplete file so it cannot be mistaken for a valid dataset.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Failed to download {file_name}: {exc}")
        failed_downloads.append(file_name)
        continue

    print(f"Downloaded {file_name} successfully.")

# Only claim success when every download actually succeeded.
if failed_downloads:
    print(f"{len(failed_downloads)} of {len(input_adata_file)} downloads failed: {', '.join(failed_downloads)}")
else:
    print("All files have been downloaded.")


Downloading SchiebingerLander2019_GSE115943.h5ad...
Downloaded SchiebingerLander2019_GSE115943.h5ad successfully.
Downloading NormanWeissman2019_filtered.h5ad...
Downloaded NormanWeissman2019_filtered.h5ad successfully.
Downloading McFarlandTsherniak2020.h5ad...
Downloaded McFarlandTsherniak2020.h5ad successfully.
Downloading GehringPachter2019.h5ad...
Downloaded GehringPachter2019.h5ad successfully.
Downloading PapalexiSatija2021_eccite_arrayed_protein.h5ad...
Downloaded PapalexiSatija2021_eccite_arrayed_protein.h5ad successfully.
Downloading PapalexiSatija2021_eccite_arrayed_RNA.h5ad...
Downloaded PapalexiSatija2021_eccite_arrayed_RNA.h5ad successfully.
Downloading PapalexiSatija2021_eccite_protein.h5ad...
Downloaded PapalexiSatija2021_eccite_protein.h5ad successfully.
Downloading GasperiniShendure2019_lowMOI.h5ad...
Downloaded GasperiniShendure2019_lowMOI.h5ad successfully.
Downloading GasperiniShendure2019_highMOI.h5ad...
Downloaded GasperiniShendure2019_highMOI.h5ad successfully.


In [8]:
# NOTE(review): `adata` is not assigned in any cell above this one, so this line
# relies on a value left in the kernel by a cell that was run out of order.
# Confirm which dataset it refers to, or load it explicitly before inspecting.
# Displays the layers stored on the currently loaded AnnData object.
adata.layers

Layers with keys: 

In [2]:
import os
import pandas as pd
import scanpy as sc

# Build a per-dataset summary (gene count, cell count, whether X looks like raw
# counts) for every .h5ad file in `output_dir` and save it as an Excel sheet.
import numpy as np
from scipy import sparse

# Define the directory containing your .h5ad files
output_dir = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets"

# Number of leading cells whose values are inspected to decide whether X holds
# raw counts; a small slice keeps the check cheap on large matrices.
n_cells_to_check = 100

summary_columns = ["Dataset", "Number of Genes", "Number of Cells", "Has Raw Counts"]

# Initialize a list to hold the data for each file
data_summary = []

# Loop through each file in the output directory (sorted so the sheet has a
# stable row order; os.listdir returns entries in arbitrary order)
for file in sorted(os.listdir(output_dir)):
    if not file.endswith(".h5ad"):
        continue

    # Construct the full path to the file
    file_path = os.path.join(output_dir, file)

    # Load the .h5ad file
    adata = sc.read_h5ad(file_path)

    # Extract the required information
    num_genes = adata.n_vars
    num_cells = adata.n_obs

    # Raw counts are integer-valued, so inspect the expression values
    # themselves. The number of columns in adata.var says nothing about what is
    # stored in X and must not be used as a proxy.
    if adata.X is None or num_cells == 0:
        # No expression values to inspect.
        has_raw_counts = "No"
    else:
        sample = adata.X[:n_cells_to_check]
        # For sparse matrices only the stored entries need checking: implicit
        # zeros are integers already.
        values = sample.tocsr().data if sparse.issparse(sample) else np.asarray(sample)
        has_raw_counts = "No" if np.any(values % 1 != 0) else "Yes"

    # Append the information to the list
    data_summary.append({
        "Dataset": file,
        "Number of Genes": num_genes,
        "Number of Cells": num_cells,
        "Has Raw Counts": has_raw_counts
    })

# Convert the list to a DataFrame; explicit columns keep the header present
# even when no .h5ad file was found.
summary_df = pd.DataFrame(data_summary, columns=summary_columns)

# Define the path to save the Excel file
excel_path = os.path.join(output_dir, "datasets_summary.xlsx")

# Save the DataFrame to an Excel file
summary_df.to_excel(excel_path, index=False)

print("Excel file has been created:", excel_path)


Excel file has been created: /Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets/datasets_summary.xlsx


## Second search

In [1]:
import os
import requests

# Redownload "ReplogleWeissman" dataset from source paper link: https://plus.figshare.com/articles/dataset/_Mapping_information-rich_genotype-phenotype_landscapes_with_genome-scale_Perturb-seq_Replogle_et_al_2022_processed_Perturb-seq_datasets/20029387
# The three download URLs share one prefix and differ only in the numeric file id.
figshare_download_url = "https://plus.figshare.com/ndownloader/files/"
figshare_file_ids = ("35773219", "35775507", "35775606")
input_adata_file = [figshare_download_url + file_id for file_id in figshare_file_ids]

# Download every URL in `input_adata_file` into `output_dir`.
#
# Each file is streamed to disk in chunks instead of being buffered in memory
# (`response.content` holds the whole body in RAM, which is not viable for
# multi-gigabyte downloads). Data is first written to a "<name>.part" file and
# renamed only after the transfer completes, so an interrupted download never
# leaves a truncated file under the final name.
#
# NOTE(review): the saved name is the last URL segment, which for these links is
# a bare numeric id with no ".h5ad" extension; code that globs for "*.h5ad" will
# not pick these files up until they are renamed.

# The directory where you want to save the files
output_dir = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets_from_source"

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# (connect timeout, read timeout) in seconds; without a timeout a stalled
# connection would block the cell indefinitely.
request_timeout = (10, 300)
# Amount of data written to disk per iteration while streaming.
chunk_size_bytes = 1024 * 1024

# Names of the files that could not be downloaded, reported at the end.
failed_downloads = []

# Loop over the list of URLs
for url in input_adata_file:
    # Extract the file name from the URL
    file_name = url.split('/')[-1]
    # Final destination and temporary in-progress path
    output_path = os.path.join(output_dir, file_name)
    partial_path = output_path + ".part"

    print(f"Downloading {file_name}...")

    try:
        with requests.get(url, stream=True, timeout=request_timeout) as response:
            # Check if the request was successful
            if response.status_code != 200:
                print(f"Failed to download {file_name}. Status code: {response.status_code}")
                failed_downloads.append(file_name)
                continue
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size_bytes):
                    f.write(chunk)
        # Publish the file under its final name only once it is complete.
        os.replace(partial_path, output_path)
    except (requests.RequestException, OSError) as exc:
        # Drop the incomplete file so it cannot be mistaken for a valid dataset.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Failed to download {file_name}: {exc}")
        failed_downloads.append(file_name)
        continue

    print(f"Downloaded {file_name} successfully.")

# Only claim success when every download actually succeeded.
if failed_downloads:
    print(f"{len(failed_downloads)} of {len(input_adata_file)} downloads failed: {', '.join(failed_downloads)}")
else:
    print("All files have been downloaded.")


Downloading 35773219...
Downloaded 35773219 successfully.
Downloading 35775507...
Downloaded 35775507 successfully.
Downloading 35775606...
Downloaded 35775606 successfully.
All files have been downloaded.


In [13]:
import glob
import scanpy as sc

# Walk over the '.h5ad' files in the source-download directory and, for each
# one, print its dimensions, its layers, its X matrix and its variable names.
source_h5ad_pattern = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets_from_source/*.h5ad"

for dataset_file in glob.glob(source_h5ad_pattern):
    adata = sc.read_h5ad(dataset_file)
    # AnnData's shape is (observations, variables), i.e. (cells, genes).
    n_cells, n_genes = adata.n_obs, adata.n_vars
    print(
        "",
        "#########################################################################",
        f"Dataset: {dataset_file}",
        f"Number of genes: {n_genes}",
        f"Number of cells: {n_cells}",
        sep="\n",
    )
    print(adata.layers)
    print(adata.X)
    print(adata.var_names)


#########################################################################
Dataset: /Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets_from_source/35775507.h5ad
Number of genes: 8248
Number of cells: 1989578
Layers with keys: 
[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 4. ... 0. 0. 0.]
 [0. 1. 3. ... 0. 0. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [2. 0. 2. ... 1. 0. 1.]]
Index(['ENSG00000237491', 'ENSG00000228794', 'ENSG00000188976',
       'ENSG00000187961', 'ENSG00000188290', 'ENSG00000187608',
       'ENSG00000078808', 'ENSG00000176022', 'ENSG00000160087',
       'ENSG00000131584',
       ...
       'ENSG00000198840', 'ENSG00000212907', 'ENSG00000198886',
       'ENSG00000198786', 'ENSG00000198695', 'ENSG00000198727',
       'ENSG00000278704', 'ENSG00000278384', 'ENSG00000271254',
       'ENSG00000276345'],
      dtype='object', name='gene_id', length=8248)

#########################################################################
Dataset: /Genomics/pritykinlab

In [14]:
import glob
import scanpy as sc

# Walk over the '.h5ad' files in the source-download directory and, for each
# one, print its dimensions, its layers, its var table and its variable names.
source_h5ad_pattern = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets_from_source/*.h5ad"

for dataset_file in glob.glob(source_h5ad_pattern):
    adata = sc.read_h5ad(dataset_file)
    # AnnData's shape is (observations, variables), i.e. (cells, genes).
    n_cells, n_genes = adata.n_obs, adata.n_vars
    header_lines = [
        "",
        "#########################################################################",
        f"Dataset: {dataset_file}",
        f"Number of genes: {n_genes}",
        f"Number of cells: {n_cells}",
    ]
    print("\n".join(header_lines))
    print(adata.layers)
    print(adata.var)
    print(adata.var_names)


#########################################################################
Dataset: /Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets_from_source/35775507.h5ad
Number of genes: 8248
Number of cells: 1989578
Layers with keys: 
                  gene_name         chr   start      end           class  \
gene_id                                                                    
ENSG00000237491   LINC01409        chr1  778747   810065  gene_version10   
ENSG00000228794   LINC01128        chr1  825138   868202   gene_version9   
ENSG00000188976       NOC2L        chr1  944203   959309  gene_version11   
ENSG00000187961      KLHL17        chr1  960584   965719  gene_version14   
ENSG00000188290        HES4        chr1  998962  1000172  gene_version10   
...                     ...         ...     ...      ...             ...   
ENSG00000198727      MT-CYB        chrM   14747    15887   gene_version2   
ENSG00000278704  BX004987.1  GL000009.2   56140    58376   gene_version1   


In [2]:
import os
import pandas as pd
import scanpy as sc
import numpy as np
from scipy import sparse

# Summarize every .h5ad file in `output_dir` (gene count, cell count, whether X
# looks like raw counts, file size) and write two Excel sheets: one for the
# datasets that are kept and one for those listed in `datasets_to_discard`.

# Define the directory containing your .h5ad files
output_dir = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets/harmonized_perturb_datasets"

# List of datasets to discard
datasets_to_discard = [
    "SrivatsanTrapnell2020_sciplex3.h5ad",
    "SchraivogelSteinmetz2020_TAP_SCREEN__chromosome_8_screen.h5ad",
    "SchraivogelSteinmetz2020_TAP_SCREEN__chromosome_11_screen.h5ad",
    "FrangiehIzar2021_protein.h5ad",
    "PapalexiSatija2021_eccite_protein.h5ad",
    "PapalexiSatija2021_eccite_arrayed_protein.h5ad"
]

# Number of leading cells whose values are inspected to decide whether X holds
# raw counts; only this slice is read from disk.
n_cells_to_check = 100

summary_columns = ["Dataset", "Number of Genes", "Number of Cells", "Has Raw Counts", "Size (GB)"]

# Initialize lists to hold the data for selected and discarded files
data_summary_selected = []
data_summary_discarded = []

# Loop through each file in the output directory
for file in os.listdir(output_dir):
    if not file.endswith(".h5ad"):
        continue

    # Check if the file is selected or discarded
    if file in datasets_to_discard:
        list_to_use = data_summary_discarded
    else:
        list_to_use = data_summary_selected

    # Construct the full path to the file
    file_path = os.path.join(output_dir, file)

    # Get the file size in gigabytes
    file_size_gb = os.path.getsize(file_path) / (1024**3)  # convert bytes to gigabytes

    # Open the file in backed (on-disk) mode: only the dimensions and a few rows
    # are needed, so loading every expression matrix fully into memory is
    # unnecessary and makes this loop impractically slow on the large datasets.
    adata = sc.read_h5ad(file_path, backed="r")
    try:
        # Extract the required information
        num_genes = adata.n_vars
        num_cells = adata.n_obs

        if adata.X is None or num_cells == 0:
            # No expression values to inspect.
            has_raw_counts = "No"
        else:
            # Raw counts are integer-valued. Decide sparse/dense on the slice
            # that was actually read, because a backed sparse X is not itself a
            # scipy sparse matrix. For sparse data only stored entries need
            # checking (implicit zeros are integers already).
            sample = adata.X[:n_cells_to_check]
            values = sample.tocsr().data if sparse.issparse(sample) else np.asarray(sample)
            has_raw_counts = "No" if np.any(values % 1 != 0) else "Yes"
    finally:
        # Release the HDF5 handle before moving on to the next file.
        adata.file.close()

    # Append the information to the respective list
    list_to_use.append({
        "Dataset": file,
        "Number of Genes": num_genes,
        "Number of Cells": num_cells,
        "Has Raw Counts": has_raw_counts,
        "Size (GB)": file_size_gb  # Add the file size in gigabytes
    })

# Convert the lists to DataFrames. Explicit columns are required: a DataFrame
# built from an empty list has no "Dataset" column, and sorting by it would
# raise KeyError (e.g. when none of the discarded datasets are present).
summary_df_selected = pd.DataFrame(data_summary_selected, columns=summary_columns)
summary_df_discarded = pd.DataFrame(data_summary_discarded, columns=summary_columns)

# Sort the DataFrames by the "Dataset" column alphabetically
summary_df_selected = summary_df_selected.sort_values(by="Dataset")
summary_df_discarded = summary_df_discarded.sort_values(by="Dataset")

# Define the paths to save the Excel files
excel_path_selected = os.path.join(output_dir, "selected_datasets_summary.xlsx")
excel_path_discarded = os.path.join(output_dir, "discarded_datasets_summary.xlsx")

# Save the DataFrames to Excel files
summary_df_selected.to_excel(excel_path_selected, index=False)
summary_df_discarded.to_excel(excel_path_discarded, index=False)

print("Excel files have been created:", excel_path_selected, "and", excel_path_discarded)


KeyboardInterrupt: 

In [3]:
import os
import pandas as pd

# Add an on-disk size column to the existing summary sheet and save the result
# as a new Excel file next to it.

# Where the summary spreadsheet lives, and where the dataset files it lists live
metadata_dir = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/dataset_metadata"
datasets_dir = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets/harmonized_perturb_datasets"

# Read the existing summary sheet
excel_path = os.path.join(metadata_dir, "selected_datasets_summary.xlsx")
summary_df = pd.read_excel(excel_path)

# The file names are looked up through the "Dataset" column, so it must exist
if "Dataset" not in summary_df.columns:
    raise ValueError("The column 'Dataset' does not exist in the Excel sheet.")

# Start every row at 0.0; rows whose file is missing keep that value
summary_df["Size (GB)"] = 0.0
bytes_per_gb = 1024**3

# Fill in the size of each dataset file that is present on disk
for row_label, dataset_name in summary_df["Dataset"].items():
    candidate_path = os.path.join(datasets_dir, dataset_name)
    if not os.path.isfile(candidate_path):
        print(f"File {dataset_name} not found in the directory.")
        continue
    summary_df.at[row_label, "Size (GB)"] = os.path.getsize(candidate_path) / bytes_per_gb

# Write the augmented table to a separate file, leaving the input sheet untouched
updated_excel_path = os.path.join(metadata_dir, "updated_selected_datasets_summary.xlsx")
summary_df.to_excel(updated_excel_path, index=False)

print(f"Updated Excel file has been created at {updated_excel_path}.")


Updated Excel file has been created at /Genomics/pritykinlab/yujie/preprocessing_benchmarking/dataset_metadata/updated_selected_datasets_summary.xlsx.


## Find perturbation column names for all datasets

In [2]:
import glob
import scanpy as sc

# For each '.h5ad' file in the source-download directory, print its dimensions
# followed by its var (per-gene) table.
source_h5ad_pattern = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets_from_source/*.h5ad"

for dataset_file in glob.glob(source_h5ad_pattern):
    adata = sc.read_h5ad(dataset_file)
    # AnnData's shape is (observations, variables), i.e. (cells, genes).
    n_cells, n_genes = adata.n_obs, adata.n_vars
    print(
        "",
        "#########################################################################",
        f"Dataset: {dataset_file}",
        f"Number of genes: {n_genes}",
        f"Number of cells: {n_cells}",
        sep="\n",
    )
    print(adata.var)


#########################################################################
Dataset: /Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets_from_source/rpe1_raw_singlecell_01.h5ad
Number of genes: 8749
Number of cells: 247914
                gene_name   chr    start      end           class strand  \
gene_id                                                                    
ENSG00000188976     NOC2L  chr1   944203   959309  gene_version11      -   
ENSG00000187583   PLEKHN1  chr1   966482   975865  gene_version11      +   
ENSG00000188290      HES4  chr1   998962  1000172  gene_version10      -   
ENSG00000187608     ISG15  chr1  1001138  1014540  gene_version10      +   
ENSG00000188157      AGRN  chr1  1020120  1056118  gene_version15      +   
...                   ...   ...      ...      ...             ...    ...   
ENSG00000212907   MT-ND4L  chrM    10470    10766   gene_version2      +   
ENSG00000198886    MT-ND4  chrM    10760    12137   gene_version2      +   
ENSG00

In [3]:
import glob
import scanpy as sc

# For each '.h5ad' file in the source-download directory, print its dimensions
# followed by its obs (per-cell) table.
source_h5ad_pattern = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets_from_source/*.h5ad"

for dataset_file in glob.glob(source_h5ad_pattern):
    adata = sc.read_h5ad(dataset_file)
    # AnnData's shape is (observations, variables), i.e. (cells, genes).
    n_cells, n_genes = adata.n_obs, adata.n_vars
    header_lines = [
        "",
        "#########################################################################",
        f"Dataset: {dataset_file}",
        f"Number of genes: {n_genes}",
        f"Number of cells: {n_cells}",
    ]
    print("\n".join(header_lines))
    print(adata.obs)


#########################################################################
Dataset: /Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets_from_source/rpe1_raw_singlecell_01.h5ad
Number of genes: 8749
Number of cells: 247914
                     gem_group           gene          gene_id     transcript  \
cell_barcode                                                                    
AAACCCAAGAAACTAC-53         53         MRPS31  ENSG00000102738           P1P2   
AAACCCAAGAAGCCAC-51         51       LRRC37A3  ENSG00000176809           P1P2   
AAACCCAAGAAGCGAA-32         32          SRCAP  ENSG00000080603           P1P2   
AAACCCAAGAATACAC-44         44           WBP1  ENSG00000239779           P1P2   
AAACCCAAGAATCGAT-43         43          RRP12  ENSG00000052749           P1P2   
...                        ...            ...              ...            ...   
TTTGTTGTCTGCACCT-44         44            MAX  ENSG00000125952           P1P2   
TTTGTTGTCTGGGCAC-32         32       

In [4]:
import glob
import scanpy as sc

# For each '.h5ad' file in the datasets directory, print its dimensions
# followed by its var (per-gene) table.
datasets_h5ad_pattern = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets/*.h5ad"

for dataset_file in glob.glob(datasets_h5ad_pattern):
    adata = sc.read_h5ad(dataset_file)
    # AnnData's shape is (observations, variables), i.e. (cells, genes).
    n_cells, n_genes = adata.n_obs, adata.n_vars
    print(
        "",
        "#########################################################################",
        f"Dataset: {dataset_file}",
        f"Number of genes: {n_genes}",
        f"Number of cells: {n_cells}",
        sep="\n",
    )
    print(adata.var)


#########################################################################
Dataset: /Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets/SchraivogelSteinmetz2020_TAP_SCREEN__chromosome_8_screen.h5ad
Number of genes: 4191
Number of cells: 112260
                ncounts  ncells
gene_symbol                    
ANGPT1         519539.0  101635
ANKRD46        253381.0   94135
ASAP1          131649.0   70682
ATAD2           39721.0   32027
ATP6V1C1       552730.0  107641
...                 ...     ...
WDYHV1         249371.0   93806
YWHAZ        11406809.0  112254
ZFPM2          119970.0   56719
ZHX1           908293.0  110380
ZNF706        5329841.0  112126

[4191 rows x 2 columns]

#########################################################################
Dataset: /Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets/AdamsonWeissman2016_GSM2406681_10X010.h5ad
Number of genes: 32738
Number of cells: 65337
                   ensembl_id  ncounts  ncells
gene_symbol        

In [5]:
import glob
import scanpy as sc

# For each '.h5ad' file in the datasets directory, print its dimensions
# followed by its obs (per-cell) table.
datasets_h5ad_pattern = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets/*.h5ad"

for dataset_file in glob.glob(datasets_h5ad_pattern):
    adata = sc.read_h5ad(dataset_file)
    # AnnData's shape is (observations, variables), i.e. (cells, genes).
    n_cells, n_genes = adata.n_obs, adata.n_vars
    header_lines = [
        "",
        "#########################################################################",
        f"Dataset: {dataset_file}",
        f"Number of genes: {n_genes}",
        f"Number of cells: {n_cells}",
    ]
    print("\n".join(header_lines))
    print(adata.obs)


#########################################################################
Dataset: /Genomics/pritykinlab/yujie/preprocessing_benchmarking/datasets/SchraivogelSteinmetz2020_TAP_SCREEN__chromosome_8_screen.h5ad
Number of genes: 4191
Number of cells: 112260
                                    replicate tissue_type cell_line  cancer  \
cell_barcode                                                                  
TGATTGACAAACCTGAGAGCTATA-sample_14  sample_14   cell_line      K562    True   
TGATTGACAAACCTGAGGTGACCA-sample_14  sample_14   cell_line      K562    True   
TGATTGACAAACCTGAGGTGCTTT-sample_14  sample_14   cell_line      K562    True   
TGATTGACAAACCTGAGTCGAGTG-sample_14  sample_14   cell_line      K562    True   
TGATTGACAAACCTGCAACTTGAC-sample_14  sample_14   cell_line      K562    True   
...                                       ...         ...       ...     ...   
TCCTGAGCCCGTACTAGGTGCAAC-sample_2    sample_2   cell_line      K562    True   
TCCTGAGCCCGTACTAGTTCGATC-sample_2