In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
ad = sc.read_text("exprMatrix.tsv.gz")

In [2]:
meta = pd.read_csv("meta.tsv", sep="\t")
ad.var = meta
print(ad)

AnnData object with n_obs × n_vars = 16774 × 235121
    var: 'V1', 'Cluster', 'Sample', 'Line', 'Protocol', 'Age', 'iPSCorhESC', 'Class', 'State', 'Type', 'Subtype'


First, we find the list of cluster names:

In [5]:
subtypeNameSet = set(ad.var['Subtype'])
print('List of Subtype Names:')
print(subtypeNameSet)
#print(ad.obs.index)

List of Subtype Names:
{'DeepLayer', 'Astrocyte', 'panNeuron', 'lowquality', 'Interneuron', 'glycolyticRG', 'glycolyticneurons', 'hindbrainAstrocyte', 'Unknown', 'hindbrainRG', 'earlyRG', 'Newborn', 'UpperLayer', 'Outlier', 'panRG', 'MatureIPC'}


Second, we need a way to get the average expression value for a gene for all cells in a subtype:

In [12]:
def getAvgExprForGeneInSubtype(gene, subtype):
    exprFrameForSubtype = ad[ad.obs.index == gene, ad.var['Subtype'] == subtype].X
    return exprFrameForSubtype.mean()

#print(getAvgExprForGeneInSubtype('SOX2|SOX2', 'glycolyticRG'))
#print(ad[ad.obs.index == 'SOX2|SOX2', ad.var['Subtype'] == 'glycolyticRG'].X.tolist())

Third, we combine the average expression values for a gene for a subtype in a dataframe:

In [13]:
geneList = ['AQP4', 'SLC1A3', 'HepaCAM1', 'CD44', 'NCAM1', 'CD24', 'FUT4', \
               'CXCR4', 'FOXO4', 'PDGFRA', 'ITGB2', 'TFRC', 'PROM1', 'NKX2-2']

# gene list corrected by hand using getGeneNameVariantList
correctedGeneList = ['AQP4|AQP4', 'SLC1A3|SLC1A3', 'CD44|CD44', 'NCAM1|NCAM1', 'CD24|CD24', \
                     'FUT4|FUT4', 'CXCR4|CXCR4', 'FOXO4|FOXO4', 'PDGFRA|PDGFRA', 'TFRC|TFRC', \
                     'PROM1|PROM1', 'NKX2-2|NKX2-2']

dataFrame = pd.DataFrame(np.array([[getAvgExprForGeneInSubtype(gene, subtype) \
                                    for gene in correctedGeneList] for subtype in subtypeNameSet]), \
                                    columns = correctedGeneList, index=subtypeNameSet)

print(dataFrame)
dataFrame.to_csv('averageExpressionPerCluster.csv')

                    AQP4|AQP4  SLC1A3|SLC1A3  CD44|CD44  NCAM1|NCAM1  \
DeepLayer            0.000135       0.050407   0.002495     0.777922   
Astrocyte            0.000595       0.173420   0.051498     0.238672   
panNeuron            0.000039       0.025129   0.003257     0.548654   
lowquality           0.000000       0.077205   0.013084     0.099289   
Interneuron          0.003072       0.039280   0.000000     0.345889   
glycolyticRG         0.000000       0.145575   0.019530     0.248365   
glycolyticneurons    0.000183       0.041714   0.005226     0.220222   
hindbrainAstrocyte   0.000000       0.070035   0.077094     0.136538   
Unknown              0.006943       0.059387   0.000000     0.630887   
hindbrainRG          0.000000       0.031407   0.009649     0.188841   
earlyRG              0.000000       0.168591   0.008262     0.159442   
Newborn              0.000000       0.075730   0.007131     0.527963   
UpperLayer           0.000054       0.040705   0.002923     0.65

The following are tools to help with gene name selection for the list of target genes:

The function below returns a list of variants of a gene name found in the dataset. It checks for substrings both ways, in a case insensitive way. One tuple with a match of NONE is returned in the list if no match is found. No attempt is made to filter out gene names that are substrings of one another without being related.

In [14]:
#dataset at https://cells.ucsc.edu/?ds=organoidreportcard contains gene names
#in the form GENENAME|GENENAME and makes the comparison unhelpful
#when searching for the variable as a substring in the target gene.
def getGeneVariantList(targetGene):
    geneVariantList = []
    for variable in ad.obs.index:
        if (targetGene.lower() in variable.lower() \
            or variable.lower() in targetGene.lower()):
            geneVariantList.append((targetGene, variable))
                
    if targetGene not in [v[0] for v in geneVariantList]:
        geneVariantList.append((targetGene, 'NONE'))
        
    return geneVariantList
        
print(getGeneVariantList('HepaCAM1'))
print(getGeneVariantList('HepaCAM'))

[('HepaCAM1', 'NONE')]
[('HepaCAM', 'HEPACAM|HEPACAM')]


In the block below, a dictionary of lists of tuples is returned where genes in the target list are variants of the names in the dataset, as per getGeneVariantList.

In [None]:
geneVariantDict = {}
for targetGene in geneList:
    geneVariantDict[targetGene] = getGeneVariantList(targetGene)

for key in allGeneVariants.keys():
    print(geneVariantDict[key])
    
#print(allGeneVariants)