In [1]:
import os
from pathlib import Path

import scanpy as sc
import pandas as pd
import numpy as np

def download_file(url, output_dir, use_cache=True, timeout=60):
    """Download ``url`` into ``output_dir`` and return the local file path.

    The local file name is the last path component of ``url``, taken verbatim
    (percent-escapes such as ``%5F`` are NOT decoded).

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    output_dir : str or pathlib.Path
        Existing directory the file is written to.
    use_cache : bool, default True
        When true and the target file already exists, nothing is downloaded
        and the existing path is returned.
    timeout : float or tuple, default 60
        Passed to ``requests.get`` so a stalled connection cannot block
        forever.

    Returns
    -------
    pathlib.Path
        Path of the (possibly pre-existing) local file.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    filename = Path(output_dir) / os.path.basename(url)

    if use_cache and filename.exists():
        print(f"File already exists, skipping: {filename}")
        return filename

    # Imported lazily so that cache hits work even without `requests`.
    import requests

    # Stream into a sibling ".part" file and move it into place only once the
    # download is complete; otherwise an interrupted transfer would leave a
    # truncated file that the cache check above accepts on the next run.
    partial = filename.with_name(filename.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        os.replace(partial, filename)
    finally:
        # Only still present when the download failed part-way.
        if partial.exists():
            partial.unlink()

    print(f"Downloaded: {filename}")
    return filename

# Raw inputs are stored in a "data" folder one level above the notebook.
data_dir = Path("..", "data")
data_dir.mkdir(exist_ok=True)

# Supplementary files of GEO series GSE84498:
#   [0] experimental design table, [1] UMI table.
file_urls = [
    "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE84nnn/GSE84498/suppl/GSE84498%5Fexperimental%5Fdesign.txt.gz",
    "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE84nnn/GSE84498/suppl/GSE84498%5Fumitab.txt.gz",
]

# Fetch each file into data_dir.
for url in file_urls:
    download_file(url, data_dir)

# Experimental design table, indexed by its "well" column.
obs = pd.read_csv(
    data_dir / os.path.basename(file_urls[0]),
    sep="\t",
).set_index("well")

# UMI table: index by "gene", transpose so rows are wells, then select the
# rows in the same order as `obs`.
count_df = (
    pd.read_csv(data_dir / os.path.basename(file_urls[1]), sep="\t")
    .set_index("gene")
    .transpose()
    .loc[obs.index]
)

# Keep only the part of each column label before the first ";".
gene_names = [label.partition(";")[0] for label in count_df.columns]

adata = sc.AnnData(
    X=count_df.values.astype(np.float32),
    obs=obs,
    var=pd.DataFrame(index=gene_names),
)

# Keep columns whose summed value over all rows is at least 20.
gene_totals = adata.X.sum(axis=0)
adata = adata[:, gene_totals >= 20].copy()

# remove batches of different cells (probably non-hepatocytes)
excluded_batches = ["AB630", "AB631"]
keep_cells = ~adata.obs["batch"].isin(excluded_batches)
adata = adata[keep_cells].copy()
adata

Downloaded: ../data/GSE84498%5Fexperimental%5Fdesign.txt.gz
Downloaded: ../data/GSE84498%5Fumitab.txt.gz


AnnData object with n_obs × n_vars = 1534 × 10400
    obs: 'batch', 'seq_batch_ID', 'plate_coordinates', 'pool_barcode', 'cell_barcode', 'plate', 'subject'