In [2]:
import scanpy as sc
import pandas as pd
import numpy as np
ad = sc.read_text("exprMatrix.tsv.gz")
meta = pd.read_csv("meta.tsv", sep="\t")
ad.var = meta

In [3]:
print(ad)

AnnData object with n_obs × n_vars = 2000 × 190022
    var: 'cellId', 'nCount_RNA', 'nFeature_RNA', 'Cluster', 'Cluster_name', 'Annotation', 'Dataset', 'Protocol', 'Age'


First, we find the list of cluster names:

In [4]:
clusterNameSet = set(ad.var['Cluster_name'])
print('List of Cluster Names:')
print(clusterNameSet)

List of Cluster Names:
{'IN', 'UPRC2', 'NEC4', 'GPC', 'CN3', 'CBC', 'OPC/OL', 'PGC2', 'CN2', 'NEC3', 'AS2', 'CN1', 'AS3', 'NEC2', 'NEC1', 'PGC1', 'AS1', 'Neuron', 'Inter', 'CN4', 'UPRC1', 'CN5', 'BRC', 'ME'}


Second, we need a way to get the average expression value for a gene for all cells in a cluster:

In [None]:
def getAvgExprForGeneInCluster(gene, cluster):
    exprFrameForCluster = ad[ad.obs.index == gene, ad.var['Cluster_name'] == cluster].X
    return exprFrameForCluster.mean() if not exprFrameForCluster.size == 0 else 0

print(getAvgExprForGeneInCluster('SOX2','CN3'))
#print(ad[ad.obs.index == 'SOX2', ad.var['Cluster_name'] == 'CN3'].X.tolist())

Third, we combine the average expression values for a gene for a cluster in a dataframe:

In [9]:
geneList = ['AQP4', 'SCL1A3', 'HepaCAM1', 'CD44', 'NCAM1', 'CD24', 'FUT4', \
               'CXCR4', 'FOXO4', 'PDGFRA', 'ITGB2', 'TFRC', 'PROM1', 'NKX2-2']

dataFrame = pd.DataFrame(np.array([[getAvgExprForGeneInCluster(gene, cluster) \
                                    for gene in geneList] for cluster in clusterNameSet]), \
                                    columns = geneList, index=clusterNameSet)

print(dataFrame)
#dataFrame.to_csv('averageExpressionPerCluster.csv')

            AQP4  SCL1A3  HepaCAM1      CD44     NCAM1  CD24  FUT4     CXCR4  \
IN      0.007937     0.0       0.0  0.042097  0.708314   0.0   0.0  0.296511   
UPRC2  -0.009971     0.0       0.0  0.373038  0.137948   0.0   0.0  0.312741   
NEC4    0.028726     0.0       0.0  0.228243  0.318951   0.0   0.0  0.959562   
GPC     0.025089     0.0       0.0  0.283512  0.213949   0.0   0.0  1.072408   
CN3    -0.001287     0.0       0.0  0.018976  0.795232   0.0   0.0  0.171675   
CBC     0.007456     0.0       0.0  0.086671  0.081521   0.0   0.0  0.230967   
OPC/OL  0.026053     0.0       0.0  0.167342  0.301977   0.0   0.0  0.896397   
PGC2   -0.002075     0.0       0.0  0.291515  0.158913   0.0   0.0  0.689419   
CN2     0.006827     0.0       0.0  0.012715  0.964154   0.0   0.0  0.103462   
NEC3    0.039304     0.0       0.0  0.368616  0.260555   0.0   0.0  0.918967   
AS2     0.057916     0.0       0.0  0.485789  0.172848   0.0   0.0  1.033635   
CN1     0.000362     0.0       0.0  0.01

The function below returns a boolean, to indicate whether a gene is present in the observations or not.

In [12]:
def searchGene(gene):
    return gene in ad.obs.index

print(searchGene('SCL1A3'))

False


Below, given a list of genes, the ones that are not present in the observations will be returned.

In [13]:
def getAbsentGenes(geneList):
    absentGeneList = []
    for gene in geneList:
        if gene not in ad.obs.index:
            absentGeneList.append(gene)
            
    return absentGeneList

print(getAbsentGenes(geneList))

['SCL1A3', 'HepaCAM1', 'CD24', 'FUT4', 'FOXO4', 'ITGB2', 'TFRC', 'PROM1', 'NKX2-2']
