In [None]:
from pathlib import Path

import scanpy as sc
import numpy as np

# Path to the `replogle2022_k562_gwps` AnnData file, resolved relative to the
# directory the notebook is run from.
project_path = Path(".")
adata_path = (
    project_path
    / ".."
    / "gpp_bench"
    / "data"
    / "prc"
    / "replogle2022_k562_gwps"
    / "replogle2022_k562_gwps_adata.h5ad"
)

# This cell only loads the data. Subsampling, normalisation / log1p / HVG
# selection and PCA each have their own cell further down; repeating them here
# made a top-to-bottom run normalise and log-transform the matrix twice.
adata = sc.read_h5ad(adata_path)

In [6]:
# Subsampling parameters: maximum number of cells to keep and the RNG seed.
n_cells, random_state = 1_000_000, 42

n_obs = adata.n_obs
rng = np.random.default_rng(random_state)

# Draw a random subset without replacement only when the dataset holds more
# cells than requested; otherwise `adata` is left untouched.
if n_cells < n_obs:
    idx = rng.choice(n_obs, n_cells, replace=False)
    adata = adata[idx].copy()

In [8]:
# normalize_total and log1p rewrite adata.X in place, so running them a second
# time (re-executing this cell, or running it after another cell that already
# applied them) would transform the data twice. sc.pp.log1p records a 'log1p'
# entry in adata.uns, which is used here as the "already done" marker.
# NOTE(review): assumes the file on disk does not already ship a 'log1p' entry
# in .uns — confirm against the stored AnnData.
if "log1p" not in adata.uns:
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)

# Flag highly variable genes on the normalised, log-transformed matrix; the
# PCA cell selects genes through the resulting `highly_variable` column.
sc.pp.highly_variable_genes(adata)



In [9]:
# PCA with 20 components, passing the `highly_variable` column as the
# variable mask.
sc.pp.pca(
    adata,
    n_comps=20,
    mask_var="highly_variable",
)

  Version(ad.__version__) < Version("0.9")


In [10]:
# Display the AnnData summary (dimensions plus the obs / var / uns / obsm /
# varm keys) to check what the preceding steps added.
adata

AnnData object with n_obs × n_vars = 1000000 × 8248
    obs: 'batch', 'target_gene', 'target_gene_ensembl', 'guide_id', 'control_class', 'total_counts', 'total_counts_log10', 'detected_genes', 'n_genes_by_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_hb', 'pct_counts_hb', 'total_counts_ribo', 'pct_counts_ribo', 'residual_expression', 'S_score', 'G2M_score', 'phase'
    var: 'chr', 'start', 'end', 'class', 'strand', 'length', 'in_matrix', 'std', 'cv', 'fano', 'ensembl_id', 'ncounts', 'ncells', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'disp', 'disp_rank', 'log10_disp', 'mean_log1p_norm_count', 'mean', 'var', 'log10_mean', 'log10_var', 'log10_predicted_var', 'predicted_var', 'predicted_std', 'std_var', 'std_var_rank', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

In [None]:
import partipy as pt

# Run partipy's shuffled-PCA routine with 25 shuffles, passing the
# `highly_variable` mask, then plot the result.
# NOTE(review): presumably used as a null reference for choosing the number
# of PCs — confirm against the partipy documentation.
pt.compute_shuffled_pca(
    adata,
    n_shuffle=25,
    mask_var="highly_variable",
)
pt.plot_shuffled_pca(adata)

In [12]:
import partipy as pt

# Point partipy at the `X_pca` embedding with 20 dimensions (the same number
# of components the PCA cell computes).
# NOTE(review): presumably this selects the input space for the archetype
# analysis — confirm against the partipy documentation.
pt.set_obsm(
    adata=adata,
    n_dimensions=20,
    obsm_key="X_pca",
)

In [None]:
# Display the matrix stored in adata.X.
# NOTE(review): the output recorded under this cell is an AnnData summary, not
# a matrix, so it looks stale — re-run the cell to refresh it.
adata.X

AnnData object with n_obs × n_vars = 1000000 × 8248
    obs: 'batch', 'target_gene', 'target_gene_ensembl', 'guide_id', 'control_class', 'total_counts', 'total_counts_log10', 'detected_genes', 'n_genes_by_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_hb', 'pct_counts_hb', 'total_counts_ribo', 'pct_counts_ribo', 'residual_expression', 'S_score', 'G2M_score', 'phase'
    var: 'chr', 'start', 'end', 'class', 'strand', 'length', 'in_matrix', 'std', 'cv', 'fano', 'ensembl_id', 'ncounts', 'ncells', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'disp', 'disp_rank', 'log10_disp', 'mean_log1p_norm_count', 'mean', 'var', 'log10_mean', 'log10_var', 'log10_predicted_var', 'predicted_var', 'predicted_std', 'std_var', 'std_var_rank', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg', 'pca', 'AA_config', 'AA_results'
    obsm: 'X_pca'
    varm: 'PCs'

In [13]:
?pt.compute_archetypes

Signature:
pt.compute_archetypes(
    adata: 'anndata.AnnData',
    n_archetypes: 'int',
    n_restarts: 'int' = 5,
    init: 'str | None' = None,
    optim: 'str | None' = None,
    weight: 'None | str' = None,
    max_iter: 'int | None' = None,
    early_stopping: 'bool' = True,
    rel_tol: 'float | None' = None,
    coreset_algorithm: 'None | str' = None,
    coreset_fraction: 'float' = 0.1,
    coreset_size: 'None | int' = None,
    delta: 'float' = 0.0,
    verbose: 'bool | None' = None,
    seed: 'int' = 42,
    n_jobs: 'int' = -1,
    save_to_anndata: 'bool' = … (output truncated)

In [14]:
# Request 5 archetypes with a single restart (the signature's default is 5
# restarts); all other options stay at their defaults, including seed=42.
pt.compute_archetypes(
    adata=adata,
    n_restarts=1,
    n_archetypes=5,
)

