In [1]:
import scanpy
import pandas as pd
import numpy as np

### Cargar datos

Leemos los datos

In [2]:
rep1 = scanpy.read_10x_mtx("WT1", make_unique=True)
rep2 = scanpy.read_10x_mtx("WT2", make_unique=True)
rep3 = scanpy.read_10x_mtx("WT3", make_unique=True)

all_data = scanpy.concat([rep1, rep2, rep3])

# Ver cómo funciona esto
all_data.obs_names_make_unique()

  utils.warn_names_duplicates("obs")


### Create gene name mapping (from "gene symbols" to TAIR IDs)

Write gene names to a text file. This will be loaded by an R script which will query Biomart for the corresponding TAIR IDs (gene IDs that start with `AT`), this is important for filtering the chloroplastic and mitochondrial genes which are recognized by their names.

In [3]:
with open("gene_names.txt", "wt") as genes_f:
    genes_f.write("\n".join(all_data.var_names.to_list()))

In [4]:
genename_mapping = pd.read_csv("gene_symbols_to_tair_ids.txt")
genename_mapping = { row.external_gene_name: row.tair_locus for index, row in genename_mapping.iterrows() }

___

In [5]:
print(f"Hay {len(rep1.obs_names)} células")
print(f"Hay {len(rep1.var_names)} genes")

print(f"Hay {len(rep2.obs_names)} células")
print(f"Hay {len(rep2.var_names)} genes")

print(f"Hay {len(rep3.obs_names)} células")
print(f"Hay {len(rep3.var_names)} genes")

print(f"Hay {len(all_data.obs_names)} células")
print(f"Hay {len(all_data.var_names)} genes")

Hay 21742 células
Hay 27546 genes
Hay 7121 células
Hay 27546 genes
Hay 12319 células
Hay 27546 genes
Hay 41182 células
Hay 27546 genes


___
Filter cells for at least 3 genes, filter genes for at least 200 cells.

In [6]:
MIN_GENES, MIN_CELLS = 3, 200

In [7]:
scanpy.pp.filter_cells(all_data, min_genes=MIN_GENES)
scanpy.pp.filter_genes(all_data, min_cells=MIN_CELLS)

In [8]:
all_data.shape

(41182, 15281)

Identify chloroplastic and mitocondrial genes:

In [19]:
from gprofiler import GProfiler
gp = GProfiler(return_dataframe=True)
genename_mapping_df = gp.convert(all_data.var_names.to_list(), organism="athaliana", target_namespace="TAIR_LOCUS")

In [30]:
genename_mapping_df.converted

0        AT1G01010
1        AT1G01020
2        AT1G01030
3        AT1G01040
4        AT1G01050
           ...    
15723    ATCG01100
15724    ATCG01110
15725    AT1G04270
15726    ATCG01120
15727    ATCG01130
Name: converted, Length: 15728, dtype: object

In [28]:
tair_ids

0           NAC001
1        AT1G01020
2        AT1G01030
3        AT1G01040
4             PPA1
           ...    
15276         NDHI
15277         NDHA
15278         NDHH
15279    AT1G04270
15280       YCF1.2
Length: 15281, dtype: object

In [None]:
def get_chloroplast_genes(adata: scanpy.AnnData):
    return adata.var_names[tair_ids.str.startswith("ATCG")].to_list()

def get_mitochondrial_genes(adata: scanpy.AnnData):
    return adata.var_names[tair_ids.str.startswith("ATMG")].to_list()

In [9]:
tair_ids = pd.Series(all_data.var_names).apply(lambda x: genename_mapping.get(x, x))

chloroplast_genes     = get_chloroplast_genes(all_data)
mitochondrial_genes   = get_mitochondrial_genes(all_data)

In [10]:
print(f"Chloroplastic genes: {chloroplast_genes}")
print(f"Mitochondrial genes: {mitochondrial_genes}")

all_data.shape

Chloroplastic genes: ['rps3', 'rpl16']
Mitochondrial genes: ['ATMGT7', 'cox2', 'atp1', 'nad6', 'ccmFC', 'nad4', 'rpl2', 'nad7', 'ccmFN1', 'nad4L']


(41182, 15281)

In [11]:
all_data.obs['percent_pt'] = (
    all_data[:, chloroplast_genes].X.sum(axis=1) / all_data.X.sum(axis=1)
) * 100  # Convert to percentage

all_data.obs['percent_mt'] = (
    all_data[:, mitochondrial_genes].X.sum(axis=1) / all_data.X.sum(axis=1)
) * 100

In [12]:
all_data.obs['percent_pt'][all_data.obs['percent_pt'] > 5]

Series([], Name: percent_pt, dtype: float64)

In [13]:
all_data.obs['percent_mt'][all_data.obs['percent_mt'] > 5]

Series([], Name: percent_mt, dtype: float64)

___

Check the docs: https://scanpy.readthedocs.io/en/stable/generated/scanpy.pp.highly_variable_genes.html

In [14]:
FLAVOR = "seurat_v3"
scanpy.pp.highly_variable_genes(all_data, flavor=FLAVOR, batch_key=None)

all_data.var

Unnamed: 0,n_cells,highly_variable,highly_variable_rank,means,variances,variances_norm
NAC001,7685,False,,0.329707,1.010399,0.887774
ARV1,2592,False,,0.076077,0.104870,0.775530
NGA3,6444,True,1934.0,0.359842,1.902006,1.441155
DCL1,702,False,,0.019377,0.027744,1.029509
PPA1,14675,False,,0.962824,5.315657,0.715945
...,...,...,...,...,...,...
NDHI,22692,False,,1.928294,18.657421,0.691271
NDHA,16873,False,,0.993007,4.941453,0.628981
NDHH,6584,False,,0.240858,0.543696,0.795501
RPS15,3897,False,,0.116750,0.161887,0.683103


___
Top highly variable genes:

How is `variances_norm` calculated?

In [15]:
all_data.var.sort_values('variances_norm').head(20)

Unnamed: 0,n_cells,highly_variable,highly_variable_rank,means,variances,variances_norm
NDHD,35110,False,,5.021854,46.351092,0.282451
PSBK,22654,False,,1.292434,3.854671,0.301896
PETD,32572,False,,3.610267,28.247706,0.32087
PSBA,41018,False,,41.801491,3070.389165,0.335894
PSBC,38152,False,,9.018843,177.750064,0.358209
ATPI,26947,False,,2.164416,12.222621,0.36429
NDHE,35040,False,,5.65599,74.943528,0.364837
PSBI,28029,False,,2.357753,14.749089,0.374113
CCSA,27152,False,,2.184644,12.806051,0.375045
PETB,40401,False,,44.875407,3962.150354,0.378521


Genes that are not highly variable:

In [16]:
all_data.var.sort_values('variances_norm').head(20)

Unnamed: 0,n_cells,highly_variable,highly_variable_rank,means,variances,variances_norm
NDHD,35110,False,,5.021854,46.351092,0.282451
PSBK,22654,False,,1.292434,3.854671,0.301896
PETD,32572,False,,3.610267,28.247706,0.32087
PSBA,41018,False,,41.801491,3070.389165,0.335894
PSBC,38152,False,,9.018843,177.750064,0.358209
ATPI,26947,False,,2.164416,12.222621,0.36429
NDHE,35040,False,,5.65599,74.943528,0.364837
PSBI,28029,False,,2.357753,14.749089,0.374113
CCSA,27152,False,,2.184644,12.806051,0.375045
PETB,40401,False,,44.875407,3962.150354,0.378521


In [17]:
scanpy.pp.pca()

TypeError: pca() missing 1 required positional argument: 'data'

In [None]:
all_data.

In [None]:
row_nnz = np.diff(all_data.X.indptr)  # Non-zero count per row
col_nnz = np.bincount(all_data.X.indices, minlength=all_data.X.shape[1])  # Non-zero count per column
len(row_nnz[row_nnz <= 300])
len(col_nnz[col_nnz <= 3])

In [None]:
all_data.to_df().astype(int)