# Gene Statistics

Collect gene stats from all datasets.

`gene_counts.txt` contains, for each gene, the number of datasets in which it appears (i.e., in which it is expressed in at least one cell).

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the previously saved per-gene counts table.
gene_counts = pd.read_csv(filepath_or_buffer='gene_counts.txt')

In [3]:
# Preview the first five rows of the counts table.
gene_counts.head(n=5)

Unnamed: 0,Gene,Count
0,SNX3,8
1,INMT,8
2,DHODH,8
3,MIEN1,8
4,EXD2,8


In [5]:
# Row of the counts table for the gene CD4.
gene_counts.loc[gene_counts['Gene'].eq('CD4')]

Unnamed: 0,Gene,Count
8886,CD4,7


In [6]:
# Row of the counts table for the gene CD3G.
gene_counts.loc[gene_counts['Gene'].eq('CD3G')]

Unnamed: 0,Gene,Count
17208,CD3G,6


## Malignant Datasets

In [2]:
# File names of the malignant datasets analysed in this notebook.
# GSE84465 is not included: it has not been analyzed yet.
malignant_datasets = [
    'GSE102130.txt',
    'GSE131928_SmartSeq2.txt',
    'GSE89567.txt',
    'GSE70630.txt',
    'GSE131928_10x.txt',
    'GSE57872.txt',
    'GSE132172_GliNS2.txt',
    'GSE103224.txt',
]

In [3]:
# Read the gene-name column of every malignant dataset into `datasets`,
# keyed by the file name up to its first '.'.
#
# Paths are built explicitly instead of os.chdir()-ing into the data folder:
# the working directory is process-global state, and if a read failed mid-loop
# the kernel would be left inside 'clean_data', so re-running this cell (or
# any later cell that uses a relative path) would break.
data_dir = 'clean_data'
datasets = {}
for dataset in malignant_datasets:
    dataset_name = dataset.split('.')[0]
    dataset_path = os.path.join(data_dir, dataset)
    if dataset_name == 'GSE103224':
        # This file is read from its second column and de-duplicated
        # (np.unique also sorts and flattens).
        # NOTE(review): presumably the first column holds a different
        # identifier and the names repeat -- confirm against the file.
        datasets[dataset_name] = np.unique(pd.read_csv(dataset_path, sep='\t', usecols=[1]).values)
    else:
        # All other files: the first column is taken as-is, as a 1-D array.
        datasets[dataset_name] = pd.read_csv(dataset_path, sep='\t', usecols=[0]).values.flatten()

In [4]:
# For each loaded dataset, report how many gene entries it has and how many
# of them are distinct, followed by a blank separator line.
for name, genes in datasets.items():
    total_line = "n_genes: {}".format(genes.shape[0])
    unique_line = "unique_genes: {}".format(np.unique(genes).shape[0])
    print(name, total_line, unique_line, "", sep="\n")

GSE102130
n_genes: 22639
unique_genes: 22639

GSE131928_SmartSeq2
n_genes: 22814
unique_genes: 22814

GSE89567
n_genes: 22695
unique_genes: 22695

GSE70630
n_genes: 22556
unique_genes: 22556

GSE131928_10x
n_genes: 24557
unique_genes: 24557

GSE57872
n_genes: 5948
unique_genes: 5948

GSE132172_GliNS2
n_genes: 18299
unique_genes: 18299

GSE103224
n_genes: 58828
unique_genes: 58828



In [5]:
# Pool the gene names of every loaded dataset into one Series and count how
# often each name occurs across the pool.
# Note: this rebinds `gene_counts` (loaded from 'gene_counts.txt' further up
# as a table) to a Series indexed by gene name.
pooled_names = np.concatenate(list(datasets.values())).ravel()
all_genes = pd.Series(pooled_names)
gene_counts = all_genes.value_counts()
gene_counts

TMEM219          8
ZNF770           8
UAP1             8
VPS29            8
SAR1A            8
                ..
RP1-66N13.1      1
CITF22-62D4.1    1
RNU6-1290P       1
RP11-326A13.1    1
RP11-570L15.1    1
Length: 61261, dtype: int64

## Pediatric Glioblastoma

In [6]:
# Gene names pooled from every loaded dataset except GSE102130.
non_pediatric_arrays = [genes for key, genes in datasets.items() if key != 'GSE102130']
all_genes_wo_pediatric = np.concatenate(non_pediatric_arrays).ravel()

In [7]:
# Gene names of the dataset treated as pediatric in this section.
pediatric_key = 'GSE102130'
pediatric = datasets[pediatric_key]

In [8]:
# Genes present in the pediatric dataset but in none of the other datasets.
# Membership is tested against a set: `gene in <ndarray>` scans the whole
# array for every gene, which makes the original comprehension quadratic.
# The order of `pediatric` is kept.
genes_in_other_datasets = set(all_genes_wo_pediatric)
only_pediatric = np.array([gene for gene in pediatric if gene not in genes_in_other_datasets])

In [9]:
# Display the gene names found in GSE102130 but in none of the other loaded datasets.
only_pediatric

array(['DUX2', 'GAGE7', 'MIR1206', 'MIR3202-1', 'MIR4678', 'SNORD114-8'],
      dtype='<U10')

## Non-malignant dataset

In [10]:
# Load the CB660 table; its first column becomes the row index.
cb660_path = 'clean_data/GSE132172_CB660.txt'
df = pd.read_csv(cb660_path, index_col=0, sep='\t')

In [11]:
# Boolean mask over `gene_counts.index`: True where the gene is absent from
# the CB660 table's index. Index.isin is the vectorized built-in for this
# membership test and replaces the per-element Python lambda.
mask = ~gene_counts.index.isin(df.index)
gene_counts.index[mask]

Index(['SH3TC2', 'NKAIN4', 'SLC24A2', 'TMEM212', 'HNMT', 'HOXC9', 'PMP2',
       'CSGALNACT1', 'SNCAIP', 'TFCP2L1',
       ...
       'CTD-2027I19.2', 'RP11-69M1.4', 'AP003176.1', 'AC009196.1',
       'RP11-272M24.2', 'RP1-66N13.1', 'CITF22-62D4.1', 'RNU6-1290P',
       'RP11-326A13.1', 'RP11-570L15.1'],
      dtype='object', length=43693)

In [20]:
# Genes counted in the malignant datasets that `mask` marks as absent from
# the CB660 table.
malignant_gene_index = gene_counts.index
gene_only_in_malignant = malignant_gene_index[mask]

In [24]:
# Genes in the CB660 table's index that never occur in `gene_counts`.
# Uses the vectorized Index.isin instead of mapping a Python lambda over
# every label; order of `df.index` is kept.
gene_only_non_malignant = df.index[~df.index.isin(gene_counts.index)]

In [29]:
# Sanity check: the two "exclusive" gene sets must not share any gene, so
# this is expected to evaluate to False. Index.isin replaces the
# per-element lambda with a vectorized membership test.
gene_only_in_malignant.isin(gene_only_non_malignant).any()

False