In [1]:
import pandas as pd
import math
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import sys
import gzip as gz
import scipy
from importlib import reload
import glob
import tqdm
import itertools
from scipy.spatial.distance import cdist
import scanpy as sc

import sys
sys.path.append('/Genomics/pritykinlab/dillon/perturbseq/scripts/utils')
import dataloader
import umap_analysis
import reimplementation
import normalization
import adata_utils
from scipy.stats import mannwhitneyu

In [2]:
# Global matplotlib styling applied to every figure in this notebook.
# All settings are applied in a single update call, grouped by theme.
plt.rcParams.update({
    # Font sizes for titles, axis labels, and tick labels
    "axes.titlesize": 25,
    "axes.labelsize": 25,
    "xtick.labelsize": 20,
    "ytick.labelsize": 20,
    # Marker size used by line/scatter artists
    "lines.markersize": 5,
    # No background grid; draw ticks on the bottom and left axes
    "axes.grid": False,
    "xtick.bottom": True,
    "ytick.left": True,
    # Tick lengths (major / minor) for both axes
    "xtick.major.size": 5,
    "xtick.minor.size": 3,
    "ytick.major.size": 5,
    "ytick.minor.size": 3,
    # Colors for axis labels, axes frame, and x ticks
    "axes.labelcolor": "black",
    "axes.edgecolor": "black",
    "xtick.color": "black",
    # Legend text sizes and frame color
    "legend.fontsize": 15,
    "legend.title_fontsize": 15,
    "legend.edgecolor": "dimgray",
})

# Cleaning up the data

Preprocess the raw data, which is stored as a flat directory of files named `KD6_{x}_essential_*`:
```
/Genomics/pritykinlab/share/perturbseq/GW_perturbseq/test/K562_essential_other/
    KD6_10_essential_barcodes.tsv.gz
    KD6_10_essential_features.tsv.gz
    KD6_10_essential_matrix.mtx.gz
    KD6_11_essential_barcodes.tsv.gz
    KD6_11_essential_features.tsv.gz
    KD6_11_essential_matrix.mtx.gz
    KD6_12_essential_barcodes.tsv.gz
    KD6_12_essential_features.tsv.gz
    KD6_12_essential_matrix.mtx.gz
    KD6_13_essential_barcodes.tsv.gz
    KD6_13_essential_features.tsv.gz
    KD6_13_essential_matrix.mtx.gz
    KD6_14_essential_barcodes.tsv.gz
    KD6_14_essential_features.tsv.gz
    KD6_14_essential_matrix.mtx.gz
    KD6_15_essential_barcodes.tsv.gz
    KD6_15_essential_features.tsv.gz
    ...
    KD6_{x}_essential_features.tsv.gz
    KD6_{x}_essential_barcodes.tsv.gz
    KD6_{x}_essential_matrix.mtx.gz
    ...
```
Into this format, with one subdirectory per `{x}` holding the three files under their unprefixed names:
```
/Genomics/pritykinlab/share/perturbseq/GW_perturbseq/test/K562_essential_raw
    1/
        barcodes.tsv.gz
        matrix.mtx.gz
        features.tsv.gz
    2/
        barcodes.tsv.gz
        matrix.mtx.gz
        features.tsv.gz
    ...
    {x}/
        barcodes.tsv.gz
        matrix.mtx.gz
        features.tsv.gz
```

In [3]:
import os
import requests

# Redownload "ReplogleWeissman" dataset from source paper link: https://plus.figshare.com/articles/dataset/_Mapping_information-rich_genotype-phenotype_landscapes_with_genome-scale_Perturb-seq_Replogle_et_al_2022_processed_Perturb-seq_datasets/20029387
input_adata_file = ["https://plus.figshare.com/ndownloader/files/36000572",
"https://plus.figshare.com/ndownloader/files/36000888"]

# The directory where you want to save the files
output_dir = "/Genomics/pritykinlab/yujie/preprocessing_benchmarking/Replogle"

# Stream the response body in 1 MiB pieces so a whole file is never held in memory.
download_chunk_size = 1024 * 1024
# (connect timeout, read timeout) in seconds. The read timeout applies to each
# socket read, not to the whole transfer, so large files are still allowed to finish.
request_timeout = (10, 300)

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Names of files whose request did not return HTTP 200, for the final summary.
failed_downloads = []

# Loop over the list of URLs
for url in input_adata_file:
    # The saved file is named after the last path component of the URL.
    file_name = url.split('/')[-1]
    output_path = os.path.join(output_dir, file_name)
    # Write to a temporary sibling first so an interrupted download never leaves
    # a truncated file under the final name.
    partial_path = output_path + ".part"

    print(f"Downloading {file_name}...")

    # stream=True defers fetching the body until it is iterated below; the
    # context manager releases the connection when the block exits.
    with requests.get(url, stream=True, timeout=request_timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download {file_name}. Status code: {response.status_code}")
            failed_downloads.append(file_name)
            continue

        try:
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=download_chunk_size):
                    f.write(chunk)
            # Atomically move the finished download into place.
            os.replace(partial_path, output_path)
        finally:
            # If anything failed mid-transfer, remove the incomplete temporary file.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    print(f"Downloaded {file_name} successfully.")

# Only claim success when every request actually succeeded.
if failed_downloads:
    print(f"{len(failed_downloads)} file(s) failed to download: {', '.join(failed_downloads)}")
else:
    print("All files have been downloaded.")

Downloading 36000572...
Downloaded 36000572 successfully.
Downloading 36000888...
Downloaded 36000888 successfully.
All files have been downloaded.
