In [None]:
import scanpy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Cargar datos

Leemos los datos

In [None]:
# Load the three wild-type replicates, each from its own 10x matrix directory.
rep1 = scanpy.read_10x_mtx("data/WT1", make_unique=True)
rep2 = scanpy.read_10x_mtx("data/WT2", make_unique=True)
rep3 = scanpy.read_10x_mtx("data/WT3", make_unique=True)

# Concatenate the replicates into a single AnnData. `label`/`keys` store each
# cell's replicate of origin in `all_data.obs["replicate"]`; without them that
# information is no longer recoverable from the merged object.
all_data = scanpy.concat(
    [rep1, rep2, rep3],
    label="replicate",
    keys=["WT1", "WT2", "WT3"],
)

# The same barcode can occur in more than one replicate. This call makes the
# cell names unique again by appending a suffix to the repeated ones.
all_data.obs_names_make_unique()

### Create gene name mapping (from "gene symbols" to TAIR IDs)

Write gene names to a text file. This will be loaded by an R script which will query Biomart for the corresponding TAIR IDs (gene IDs that start with `AT`). This is important for filtering the chloroplastic and mitochondrial genes, which are recognized by the prefix of their TAIR IDs (`ATCG` and `ATMG`).

In [None]:
# One gene name per line, no trailing newline.
gene_names_text = "\n".join(all_data.var_names.to_list())

with open("gene_names.txt", "wt") as genes_f:
    genes_f.write(gene_names_text)

In [None]:
# Disabled alternative: load a precomputed symbol -> TAIR ID table from CSV and
# turn it into a dict keyed by external gene name.
# genename_mapping = pd.read_csv("gene_symbols_to_tair_ids.txt")
# genename_mapping = { row.external_gene_name: row.tair_locus for index, row in genename_mapping.iterrows() }

___

In [None]:
# Report the number of cells and genes for each replicate, then for the
# concatenated object (same order as they were loaded).
for adata in (rep1, rep2, rep3, all_data):
    n_cells, n_genes = len(adata.obs_names), len(adata.var_names)
    print(f"Hay {n_cells} células")
    print(f"Hay {n_genes} genes")

___
Filter cells for at least 3 genes, filter genes for at least 200 cells.

In [None]:
# Quality-control thresholds used by the filtering cell.
# NOTE(review): many scanpy/Seurat workflows use min_genes=200 and min_cells=3;
# confirm that these two values are not swapped.
MIN_GENES = 3    # a cell is kept when at least this many genes are detected in it
MIN_CELLS = 200  # a gene is kept when it is detected in at least this many cells

In [None]:
# Remove cells with fewer than MIN_GENES detected genes; all_data is modified
# in place, so the shape is printed to show how many cells remain.
scanpy.pp.filter_cells(all_data, min_genes=MIN_GENES)
print(all_data.shape)
# Remove genes detected in fewer than MIN_CELLS cells, then print the new shape.
scanpy.pp.filter_genes(all_data, min_cells=MIN_CELLS)
print(all_data.shape)

Identify chloroplastic and mitochondrial genes:

In [None]:
from gprofiler import GProfiler

# Query g:Profiler for the TAIR locus of every gene name in the dataset
# (requires network access). The result is returned as a pandas DataFrame.
gp = GProfiler(return_dataframe=True)

# The gene list is passed as `query=`. The first positional parameter of
# `GProfiler.convert` is `organism`, so a positional gene list would collide
# with the `organism` keyword.
genename_mapping_df = gp.convert(
    organism="athaliana",
    query=all_data.var_names.to_list(),
    target_namespace="TAIR_LOCUS",
)

In [None]:
# Preview the first rows of the conversion table.
genename_mapping_df.head()

Creamos un diccionario para usar luego:

In [None]:
# Build {incoming gene name -> converted TAIR locus}.
# Rows without a usable conversion are skipped, so that later lookups of the
# form `genename_mapping.get(name, name)` fall back to the original name
# instead of returning a placeholder value.
# NOTE(review): unmapped queries are assumed to appear as NaN or as the string
# "None" in the `converted` column — confirm against the g:Convert output.
_has_locus = genename_mapping_df["converted"].notna() & (
    genename_mapping_df["converted"].astype(str) != "None"
)

# When a name occurs in several rows the last row wins, as with the
# row-by-row dict construction this replaces.
genename_mapping = dict(
    zip(
        genename_mapping_df.loc[_has_locus, "incoming"],
        genename_mapping_df.loc[_has_locus, "converted"],
    )
)

In [None]:
def _genes_with_locus_prefix(adata, prefix, tair_ids=None):
    """Return the names in ``adata.var_names`` whose TAIR ID starts with ``prefix``.

    Parameters
    ----------
    adata
        Object exposing ``var_names`` (a pandas Index of gene names).
    prefix
        TAIR ID prefix to match, e.g. ``"ATCG"``.
    tair_ids
        Optional sequence of TAIR IDs, positionally aligned with
        ``adata.var_names``. When omitted, the IDs are derived from
        ``adata.var_names`` through the notebook-level ``genename_mapping``
        dict, falling back to the gene name itself when it has no entry.

    Raises
    ------
    ValueError
        If ``tair_ids`` does not have one entry per gene in ``adata``.
    """
    if tair_ids is None:
        # Derive the IDs from *this* object's genes so the mask always lines up
        # with adata.var_names, instead of relying on a global Series.
        tair_ids = [genename_mapping.get(name, name) for name in adata.var_names]

    # list(...) drops any index so the mask is purely positional.
    ids = pd.Series(list(tair_ids), dtype=object)
    if len(ids) != len(adata.var_names):
        raise ValueError(
            f"tair_ids has {len(ids)} entries but adata has {len(adata.var_names)} genes"
        )

    # na=False: genes whose ID is missing or not a string never match.
    mask = ids.str.startswith(prefix, na=False).to_numpy(dtype=bool)
    return adata.var_names[mask].to_list()


def get_chloroplast_genes(adata: "scanpy.AnnData", tair_ids=None):
    """Return the gene names of ``adata`` whose TAIR ID starts with ``ATCG``."""
    return _genes_with_locus_prefix(adata, "ATCG", tair_ids)


def get_mitochondrial_genes(adata: "scanpy.AnnData", tair_ids=None):
    """Return the gene names of ``adata`` whose TAIR ID starts with ``ATMG``."""
    return _genes_with_locus_prefix(adata, "ATMG", tair_ids)

In [None]:
def _lookup_tair_id(gene_name):
    """Return the mapped TAIR ID for a gene name, or the name itself if unmapped."""
    return genename_mapping.get(gene_name, gene_name)


# One TAIR ID per gene, in the same order as all_data.var_names.
tair_ids = pd.Series(all_data.var_names).map(_lookup_tair_id)

# Organellar gene lists, selected by TAIR ID prefix.
chloroplast_genes = get_chloroplast_genes(all_data)
mitochondrial_genes = get_mitochondrial_genes(all_data)

In [None]:
# Show which genes were classified as chloroplastic / mitochondrial.
print(f"Chloroplastic genes: {chloroplast_genes}")
print(f"Mitochondrial genes: {mitochondrial_genes}")

# Cell output: current (cells, genes) shape of the dataset.
all_data.shape

In [None]:
def _percent_of_counts(adata, genes):
    """Return, per cell, the percentage of the counts in ``adata.X`` that come from ``genes``.

    The result is a 1-D array with one value per cell.
    """
    # Row sums of a sparse matrix can come back as a 2-D (n_cells, 1) matrix;
    # np.asarray(...).ravel() turns them into a plain 1-D array so the result
    # can be stored as an ordinary obs column.
    gene_counts = np.asarray(adata[:, genes].X.sum(axis=1)).ravel()
    total_counts = np.asarray(adata.X.sum(axis=1)).ravel()
    return gene_counts / total_counts * 100  # convert fraction to percentage


# Percentage of each cell's counts coming from chloroplast genes.
all_data.obs['percent_pt'] = _percent_of_counts(all_data, chloroplast_genes)

# Percentage of each cell's counts coming from mitochondrial genes.
all_data.obs['percent_mt'] = _percent_of_counts(all_data, mitochondrial_genes)

In [None]:
# Cells whose chloroplast percentage is above 5.
all_data.obs.loc[all_data.obs['percent_pt'] > 5, 'percent_pt']

In [None]:
# Cells whose mitochondrial percentage is above 5.
all_data.obs.loc[all_data.obs['percent_mt'] > 5, 'percent_mt']

___

Check the docs: https://scanpy.readthedocs.io/en/stable/generated/scanpy.pp.highly_variable_genes.html

In [None]:
FLAVOR = "seurat_v3"
N_TOP_GENES = 2000

# Flag the N_TOP_GENES most variable genes; results are written to all_data.var.
# Unused alternatives for other flavors: min_mean=0.0125, max_mean=3, min_disp=0.5
hvg_options = dict(flavor=FLAVOR, n_top_genes=N_TOP_GENES, batch_key=None)
scanpy.pp.highly_variable_genes(all_data, **hvg_options)

# Cell output: per-gene annotation table.
all_data.var

___
Top highly variable genes:

How is `variances_norm` calculated?

In [None]:
# Top of the ranking: sort descending so that the 20 genes with the HIGHEST
# normalized variance are shown (an ascending sort lists the least variable
# genes instead).
all_data.var.sort_values('variances_norm', ascending=False).head(20)

Genes that are not highly variable:

In [None]:
# Bottom of the ranking: the 20 genes with the lowest normalized variance.
all_data.var.sort_values(by='variances_norm', ascending=True).head(n=20)

In [None]:
# Seed passed to the stochastic steps below so results are reproducible.
RANDOM_STATE = 142

In [None]:
# Disabled: total-count normalization is not applied in this notebook.
# scanpy.pp.normalize_total(all_data)

Acá hay un paso de transformación de las cuentas que es opcional (pero se suele hacer). En lo que sigue hacemos normalización por residuos de Pearson, seguida de PCA.

In [None]:
# Pearson-residual normalization followed by a 50-component PCA.
# The shared seed is passed explicitly so this step is seeded the same way as
# the neighbors and UMAP steps of this notebook.
scanpy.experimental.pp.normalize_pearson_residuals_pca(
    all_data,
    n_comps=50,
    random_state=RANDOM_STATE,
)

In [None]:
# Variance stored for each principal component, in component order.
# Alternative view (cumulative ratio):
# plt.plot(np.cumsum(all_data.uns['pca']['variance_ratio']))
fig, ax = plt.subplots()
ax.plot(all_data.uns['pca']['variance'])
ax.set_xlabel("Principal component (0-based index)")
ax.set_ylabel("Variance")
ax.set_title("PCA variance per component")
plt.show()

In [None]:
# Disabled: separate PCA restricted to the highly variable genes.
# scanpy.pp.pca(all_data, n_comps=50, mask_var="highly_variable", random_state=RANDOM_STATE)

In [None]:
# Build the cell neighborhood graph: 10 neighbors per cell, using 50 principal
# components, seeded with RANDOM_STATE.
scanpy.pp.neighbors(all_data, n_neighbors=10, n_pcs=50, random_state=RANDOM_STATE)

In [None]:
# Leiden clustering at resolution 1.5; labels are stored in all_data.obs['leiden'].
# The shared seed is passed explicitly, consistent with the neighbors and UMAP
# steps of this notebook.
scanpy.tl.leiden(all_data, resolution=1.5, random_state=RANDOM_STATE)

## UMAP


In [None]:
# Compute the UMAP embedding, seeded with RANDOM_STATE.
scanpy.tl.umap(all_data, random_state=RANDOM_STATE)

In [None]:
# Create the axes ourselves and hand them to scanpy. Without `ax=`, scanpy
# draws on its own figure, so a separately created figure stays empty and its
# figsize has no effect on the UMAP.
fig, ax = plt.subplots(figsize=(10, 10))
scanpy.pl.umap(
    all_data,
    color='leiden',
    palette='Set2',  # You can choose different color palettes
    title='UMAP Colored by Clusters',
    ax=ax,
    show=False,  # defer rendering to plt.show() below
)
# plt.tight_layout()
plt.show()

In [None]:
# Cluster labels are stored as strings; cast to int so they sort numerically.
leiden_labels = all_data.obs['leiden'].unique()
sorted(leiden_labels.astype(int))

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Count the cells of every cluster in a single pass over the label column,
# instead of slicing the whole AnnData once per cluster just to read its size.
leiden_sizes = all_data.obs['leiden'].astype(str).value_counts()

# {cluster id (int) -> number of cells}, in numeric cluster order.
data = {}
for v in sorted(all_data.obs['leiden'].unique().astype(int)):
    data[v] = int(leiden_sizes[str(v)])  # divide by all_data.shape[0] and *100 for percentages

# Long-form table with one row per cluster, as expected by seaborn.
df = pd.DataFrame.from_dict(data, orient='index', columns=['cell counts'])
df['cluster'] = df.index
df = df.reset_index(drop=True)

ax = sns.barplot(data=df, x='cluster', y='cell counts')
ax.set_title("Cells per Leiden cluster")
plt.show()


In [None]:
import warnings

# Rank genes for every Leiden cluster with the Wilcoxon test ("t-test" is the
# alternative that was considered).
# Warnings are silenced only while this call runs; a global
# filterwarnings("ignore") would also hide the warnings of every later cell.
# NOTE(review): no normalization of all_data.X is visible before this call —
# confirm that ranking on the current .X is intended.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    scanpy.tl.rank_genes_groups(all_data, 'leiden', method='wilcoxon')

In [None]:
# Plot the 20 top-ranked genes of each cluster, with an independent y-axis per panel.
scanpy.pl.rank_genes_groups(all_data, n_genes=20, sharey=False)

Our clusters:

In [None]:
# Table of ranked gene names taken from the rank_genes_groups result, keeping
# only the first 200 rows. The columns are looked up by cluster label later on.
genes_in_my_clusters = pd.DataFrame(all_data.uns['rank_genes_groups']['names']).head(200)

In [None]:
def _as_tair_id(gene_name):
    """Return the mapped TAIR ID for a gene name, or the name itself if unmapped."""
    return genename_mapping.get(gene_name, gene_name)


# Translate every cell of the table, one column at a time, writing the result
# back into the same DataFrame.
for column in genes_in_my_clusters.columns:
    genes_in_my_clusters[column] = genes_in_my_clusters[column].map(_as_tair_id)

In [None]:
# Convert the table into {cluster id (int) -> set of gene IDs}.
# The name is rebound from a DataFrame to a dict here, so this cell and the
# ones above it must be run in order (re-running this cell alone fails).
cluster_gene_sets = {}
for cluster_id in range(genes_in_my_clusters.shape[1]):
    cluster_gene_sets[cluster_id] = set(genes_in_my_clusters[str(cluster_id)])
genes_in_my_clusters = cluster_gene_sets

Clusters in paper:

In [None]:
# Sheets of the paper's supplementary workbook, one per published cluster group.
sheet_names = [
    "C0C4C10C11 healthy mesophyl",
    "C1 responsive epidermal cells",
    "C2 vascular S cells",
    "C5",
    "C8 responsive mesophyl cells",
    "C9 healthy epidermal cells",
    "C13",
    "C16 guard cells",
]

# NOTE(review): user-specific location; consider a path relative to the project.
SUPP_TABLE_PATH = "~/Postdoc/Papers/AT_PST_ScienceDirect_files_06Nov2024_13-40-25.581/1-s2.0-S2590346223002043-mmc5.xlsx"

# Read the workbook once: passing a list of sheet names returns a dict
# {sheet name -> DataFrame}, instead of re-opening the file for every sheet.
supp_material_sheets = pd.read_excel(SUPP_TABLE_PATH, sheet_name=sheet_names)

# {sheet name -> set of gene models listed in that sheet}, ignoring empty cells.
genes_in_cluster = dict()

for cluster in sheet_names:
    supp_material_clusters = supp_material_sheets[cluster]
    genes = supp_material_clusters["Gene model"].dropna()
    genes_in_cluster[cluster] = set(genes.to_list())

In [None]:
# Integer matrix of zeros: one row per cluster of ours, one column per paper cluster.
n_my_clusters = len(genes_in_my_clusters)
n_paper_clusters = len(genes_in_cluster)
conting_matrix = np.zeros(shape=(n_my_clusters, n_paper_clusters), dtype=int)

In [None]:
# Entry [row, col] is the number of genes shared by our cluster `row` and the
# paper cluster `col` (both taken in dict insertion order).
for row, my_genes in enumerate(genes_in_my_clusters.values()):
    for col, paper_genes in enumerate(genes_in_cluster.values()):
        conting_matrix[row, col] = len(my_genes & paper_genes)

In [None]:
# Show the overlap table with readable labels: after the transpose the rows
# are the paper's clusters and the columns are our cluster ids.
pd.DataFrame(
    conting_matrix,
    index=list(genes_in_my_clusters),
    columns=list(genes_in_cluster),
).T

### Marker genes

In [None]:
# Marker gene lists, one per cell type. Every entry is a short cell-type tag
# (GC-, CC-, EC-, MC-, MC2-, HC-, SC-) followed by an AT... locus identifier.
# NOTE(review): because of the tag these strings are not bare locus IDs;
# presumably the prefix is stripped before any lookup — confirm at the point of use.
Guard_cell = ['GC-AT5G25980','GC-AT5G48485','GC-AT1G62480','GC-AT3G16400','GC-AT2G15830','GC-AT1G71050','GC-AT2G19810','GC-AT4G37870','GC-AT5G66400','GC-AT3G23730','GC-AT3G24140','GC-AT5G66440','GC-AT3G56620','GC-AT4G37430','GC-AT2G34655','GC-AT2G47260','GC-AT5G42970','GC-AT3G58640','GC-AT1G23170','GC-AT1G29050']
Companion_cell = ['CC-AT1G23130','CC-AT1G67865','CC-AT1G64370','CC-AT4G19840','CC-AT2G18328','CC-AT5G18600','CC-AT1G67860','CC-AT1G67870','CC-AT5G45350','CC-AT2G32870','CC-AT5G04080','CC-AT5G22090','CC-AT4G00780','CC-AT1G07610','CC-AT4G16008','CC-AT1G06830','CC-AT2G16740','CC-AT2G30540','CC-AT4G16000','CC-AT4G15690']
Epidermia_cell = ['EC-AT2G38540','EC-AT1G66100','EC-AT1G09310','EC-AT3G51600','EC-AT5G25610','EC-AT5G44020','EC-AT3G16370','EC-AT2G27385','EC-AT3G26450','EC-AT1G68530','EC-AT2G32690','EC-AT4G04840','EC-AT4G23670','EC-AT1G29660','EC-AT5G13930','EC-AT5G64770','EC-AT4G39330','EC-AT1G29670','EC-AT1G55260','EC-AT2G26250']
Mesophyl_cell = ['MC-AT2G10940','MC-AT5G38430','MC-AT3G08940','MC-AT1G72610','MC-AT3G27690','MC-AT2G05070','MC-AT1G12090','MC-AT1G29910','MC-AT2G34420','MC-AT2G34430','MC-AT1G15820','MC-AT2G21330','MC-AT1G06680','MC-AT3G59400','MC-AT2G05100','MC-AT1G67090','MC-AT3G54890','MC-AT5G66570','MC-AT4G38970','MC-AT1G44575']
Mesophyl_cell_2 = ['MC2-AT1G18740','MC2-AT1G74930','MC2-AT1G27730','MC2-AT2G44840','MC2-AT1G80840','MC2-AT3G44260','MC2-AT5G12030','MC2-AT5G12020','MC2-AT1G74450','MC2-AT4G24570','MC2-AT3G56880','MC2-AT1G71000','MC2-AT5G66650','MC2-AT4G27652','MC2-AT3G46230','MC2-AT3G12580','MC2-AT3G55980','MC2-AT4G34410','MC2-AT5G52050','MC2-AT1G07400']
# The last hydathode marker is commented out, so it is not part of the list.
Hydathode_cell = ['HC-AT3G16670','HC-AT3G05730','HC-AT3G16660','HC-AT1G56710','HC-AT3G09330','HC-AT1G22900','HC-AT1G08090','HC-AT4G36260','HC-AT4G32950','HC-AT2G43610','HC-AT4G23550','HC-AT2G19990','HC-AT1G62510','HC-AT2G33175','HC-AT2G38940','HC-AT3G14060','HC-AT3G60700','HC-AT1G19610','HC-AT5G60910']#,'HC-AT1G08757']
S_cell = ['SC-AT1G78370','SC-AT3G19710','SC-AT2G30860','SC-AT1G80520','SC-AT2G43100','SC-AT5G23010','SC-AT5G02380','SC-AT2G22330','SC-AT3G14990','SC-AT2G46650','SC-AT2G26690','SC-AT5G14200','SC-AT2G22860','SC-AT5G01600','SC-AT4G14040','SC-AT3G11930','SC-AT2G37170','SC-AT3G15450','SC-AT5G03610','SC-AT1G11580']