In [1]:
import joblib
import pandas as pd

In [2]:
gene_meta = pd.read_csv("../GeneMetadata.csv.gz", index_col=0)
gene_meta['length'] = gene_meta['end'] - gene_meta['start']

use_genes = gene_meta[gene_meta['length'] > 1000].index

In [3]:
gene_name_to_id = joblib.load('../gene_name_to_id_dict')
gene_id_to_name = joblib.load('../gene_id_to_name_dict')

## merge

In [6]:
# these 233 genes are enough to seperate major cell types
wei_gene_panel = pd.read_csv('final_panel.csv', index_col = 0)
wei_gene_panel = wei_gene_panel[wei_gene_panel['diff'] > 0.4]
wei_gene_panel = wei_gene_panel[wei_gene_panel['Name'] == 'MajorType']
wei_gene_panel['Name'].value_counts()

MajorType    233
Name: Name, dtype: int64

In [7]:
gene_table_with_source = wei_gene_panel[['Name']]
gene_table_with_source.columns = ['Source']
gene_table_with_source.head()

Unnamed: 0,Source
ENSMUSG00000029705.17,MajorType
ENSMUSG00000026872.18,MajorType
ENSMUSG00000042992.15,MajorType
ENSMUSG00000062209.15,MajorType
ENSMUSG00000026872.18,MajorType


## aDMG

In [8]:
admgs_8wk_9mo_dict = joblib.load('celltype_admgs_8wk_18mo.dict')
admgs_8wk_9mo_18mo_dict = joblib.load('celltype_admgs_8wk_9mo_18mo.dict')

In [9]:
all_admgs_8wk_18mo = []
all_admgs_8wk_9mo_18mo = []

for ct, genes in admgs_8wk_9mo_dict.items():
    all_admgs_8wk_18mo += genes

for ct, genes in admgs_8wk_9mo_18mo_dict.items():
    all_admgs_8wk_9mo_18mo += genes

all_admgs_8wk_18mo = list(set(all_admgs_8wk_18mo))
all_admgs_8wk_9mo_18mo = list(set(all_admgs_8wk_9mo_18mo))

In [10]:
celltype_admg = list(set(all_admgs_8wk_18mo + all_admgs_8wk_18mo))
len(celltype_admg)

111

In [9]:
all_panel = list(set(celltype_admg + wei_gene_panel.index.tolist()))
len(all_panel)

319

In [10]:
tmp = pd.DataFrame(index = celltype_admg)
tmp['Source'] = 'aDMG'

In [11]:
gene_table_with_source = pd.concat([gene_table_with_source, tmp])

## canonical marker

In [7]:
gene_meta = pd.read_csv("../GeneMetadata.csv.gz", index_col='gene_name')
#gene_id_to_name = gene_meta["gene_name"].to_dict()
gene_name_to_id = gene_meta["geneslop2k"].to_dict()

In [8]:
genes_names_to_include = ["Slc17a6","Slc17a7","Gad1","Gad2","Prex1","Cux2",
                  "Rorb","Ptgfrn","Xkr7","Tcerg1l","Tle4","Tshz2",
                  "Galnt10","Rerg","Fibcd1","Dock10","Lhx6","Lamp5",
                  "Vip","Sst","Pvalb","Foxp2","Drd1","Drd2","Mobp",
                  "Pdgfra","Csf1r","Cspg4","Olig2","Olig1","Rorb","Slc1a2","Aqp4","Ctss"]
len(genes_names_to_include)

34

In [9]:
genes_to_include = [gene_name_to_id[gene_name] for gene_name in genes_names_to_include]

In [11]:
celltype_markers = list(set(wei_gene_panel.index.tolist() + genes_to_include))
len(celltype_markers)

241

In [13]:
joblib.dump(celltype_markers, 'celltype_markers.list')

['celltype_markers.list']

In [15]:
tmp = pd.DataFrame(index = celltype_admg)
tmp['Source'] = 'Canonical Marker'

In [16]:
gene_table_with_source = pd.concat([gene_table_with_source, tmp])
gene_table_with_source.head()

Unnamed: 0,Source
ENSMUSG00000029705.17,MajorType
ENSMUSG00000026872.18,MajorType
ENSMUSG00000042992.15,MajorType
ENSMUSG00000062209.15,MajorType
ENSMUSG00000026872.18,MajorType


In [17]:
all_gene_panel2 = list(set(all_panel + genes_to_include))
len(all_gene_panel2)

343

## SASP genes

In [18]:
luisa_panel = pd.read_csv("../230410_panel_design_v1/published_panel/luisa.csv", index_col=0)

In [19]:
sasp_genes = luisa_panel[
    luisa_panel["type of marker"].isin(["age/sasp"])
].index.tolist()

sasp_genes.append('Il33')

In [20]:
sasp_genes_ids = []
for genea_name in sasp_genes:
    if genea_name in gene_name_to_id.keys():
        sasp_genes_ids.append(gene_name_to_id[genea_name])
len(sasp_genes_ids)        

43

In [21]:
tmp = pd.DataFrame(index = sasp_genes_ids)
tmp['Source'] = 'SASP'

In [22]:
gene_table_with_source = pd.concat([gene_table_with_source, tmp])
gene_table_with_source.head()

Unnamed: 0,Source
ENSMUSG00000029705.17,MajorType
ENSMUSG00000026872.18,MajorType
ENSMUSG00000042992.15,MajorType
ENSMUSG00000062209.15,MajorType
ENSMUSG00000026872.18,MajorType


In [23]:
all_gene_panel3 = list(set(all_gene_panel2 + sasp_genes_ids))
len(all_gene_panel3)

385

## hotspot genes

In [24]:
_dir = '/data/aging/analysis/221027_dmr/hotspot/5000_result'

In [25]:
hotspot_genes = pd.read_csv(f'{_dir}/5000_inter_genebody_2k.bed', sep = '\t', header = None, index_col = 3)
hotspot_genes.index = hotspot_genes.index.map(gene_name_to_id)
hotspot_genes = hotspot_genes.loc[list(set(use_genes)& set(hotspot_genes.index))]

In [26]:
hotspot_genes = hotspot_genes[7]
hotspot_genes = hotspot_genes.groupby(hotspot_genes.index).sum()

In [28]:
hotspot_genes = hotspot_genes.sort_values(ascending = False)[:50].index.tolist()
len(hotspot_genes)

50

In [29]:
tmp = pd.DataFrame(index = sasp_genes_ids)
tmp['Source'] = 'Hotspot'

gene_table_with_source = pd.concat([gene_table_with_source, tmp])
gene_table_with_source.head()

In [31]:
all_gene_panel4 = list(set(hotspot_genes + all_gene_panel3))
len(all_gene_panel4)

430

## add immune genes

In [33]:
immune_markers = ['Ptprc','Cd3d','Cd69','Cd19','Cd8a','Cd4','Cx3cr1','Mertk']
immune_markers_id = [gene_name_to_id[gene_name] for gene_name in immune_markers]

In [34]:
all_gene_panel5 = list(set(immune_markers_id + all_gene_panel4))
len(all_gene_panel5)

438

In [43]:
tmp = pd.DataFrame(index = immune_markers_id)
tmp['Source'] = 'immune'

gene_table_with_source = pd.concat([gene_table_with_source, tmp])

## AD genes

In [36]:
ad_markers = ['Trem2','Nlrp3','Il18','Il1b','Apoe','App','Tff1','Psen1']
# hippocuss, enthorinal cortex
ad_markers_id = [gene_name_to_id[gene_name] for gene_name in ad_markers]

In [37]:
all_gene_panel6 = list(set(ad_markers_id + all_gene_panel5))
len(all_gene_panel6)

444

In [38]:
tmp = pd.DataFrame(index = ad_markers_id)
tmp['Source'] = 'AD'

gene_table_with_source = pd.concat([gene_table_with_source, tmp])

## methylation genes

In [None]:
methyl_genes = ['Dnmt3a','Dnmt3b','Tet1','Tet2','Tet3','Mecp2']

In [None]:
# xiaowei paper
# 


## assert length > 1000

In [39]:
final_gene_panel = list(set(use_genes) & set(all_gene_panel6))
len(final_gene_panel)

434

In [74]:
gene_table_with_source.to_csv('Gene_Panel.V2.csv')

In [60]:
joblib.dump(all_gene_panel4, 'all_gene_pane.list')

['all_gene_pane.list']