In [1]:
%matplotlib notebook

import pandas as pd
import json
from scipy import stats
import matplotlib.pyplot as plt

In [2]:
# Load the expression table; the first CSV column is used as the row index.
df = pd.read_csv("lgg_gbm_gene_exp.csv", index_col=0)

# Read the list of gene column names. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open.
with open("genes.json") as genes_file:
    genes = json.load(genes_file)

# Force every gene column to a numeric dtype. Entries that cannot be parsed
# become NaN (errors="coerce") rather than raising.
for gene in genes:
    df[gene] = pd.to_numeric(df[gene], errors="coerce")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Column holding each case's methylation-cluster label.
cluster_col = "Pan-Glioma DNA Methylation Cluster"

# Cluster labels that define the two groups reported below as GCIMP+ / GCIMP-.
gcimp_p_labels = ["LGm1", "LGm2", "LGm3"]
gcimp_n_labels = ["LGm4", "LGm5", "LGm6"]

# Split by study.
gbm = df[df["Study"] == "Glioblastoma multiforme"]
lgg = df[df["Study"] == "Brain Lower Grade Glioma"]

# Split by methylation cluster, across both studies.
gcimp_p = df[df[cluster_col].isin(gcimp_p_labels)]
gcimp_n = df[df[cluster_col].isin(gcimp_n_labels)]

# Study x cluster combinations used by the later t-tests.
gbm_gcimp_p = gbm[gbm[cluster_col].isin(gcimp_p_labels)]
gbm_gcimp_n = gbm[gbm[cluster_col].isin(gcimp_n_labels)]

lgg_gcimp_p = lgg[lgg[cluster_col].isin(gcimp_p_labels)]
lgg_gcimp_n = lgg[lgg[cluster_col].isin(gcimp_n_labels)]

# Single-argument print(...) calls produce the same text under Python 2 and
# Python 3. The two spaces after each colon reproduce the spacing of the
# original `print "label: ", value` statements.
print("Total number of cases:  %d" % len(df))
print("")
print("GBM number of cases:  %d" % len(gbm))
print("LGG number of cases:  %d" % len(lgg))
print("")
print("GCIMP+ number of cases:  %d" % len(gcimp_p))
print("GCIMP- number of cases:  %d" % len(gcimp_n))
print("")
print("GBM GCIMP+ number of cases:  %d" % len(gbm_gcimp_p))
print("GBM GCIMP- number of cases:  %d" % len(gbm_gcimp_n))
print("LGG GCIMP+ number of cases:  %d" % len(lgg_gcimp_p))
print("LGG GCIMP- number of cases:  %d" % len(lgg_gcimp_n))

Total number of cases:  632

GBM number of cases:  595
LGG number of cases:  27

GCIMP+ number of cases:  56
GCIMP- number of cases:  370

GBM GCIMP+ number of cases:  31
GBM GCIMP- number of cases:  368
LGG GCIMP+ number of cases:  25
LGG GCIMP- number of cases:  2


In [4]:
# Per-gene two-sample t-test comparing GBM against LGG within the GCIMP+ subset.
# Each value is the ttest_ind result: index 0 is the test statistic, index 1
# the p-value.
gcimp_p_stats = {
    gene: stats.ttest_ind(gbm_gcimp_p[gene], lgg_gcimp_p[gene])
    for gene in genes
}

# Keep (gene, result) pairs with p <= 0.05, ordered by ascending p-value.
# NOTE(review): the 0.05 cutoff is applied per gene; no multiple-comparison
# correction is performed across genes.
# NOTE(review): gene columns were coerced with errors="coerce", so they may
# contain NaN. Presumably ttest_ind then yields a NaN p-value, which never
# satisfies `<= 0.05`, silently excluding that gene — confirm this is intended.
significant_gcimp_p = sorted(
    (item for item in gcimp_p_stats.items() if item[1][1] <= 0.05),
    key=lambda item: item[1][1],
)

# Single-argument print(...) calls behave identically on Python 2 and Python 3.
print("Number of significant genes for GCIMP+:  %d" % len(significant_gcimp_p))
print("Top 10 significant genes:")
print([item[0] for item in significant_gcimp_p[:10]])

Number of significant genes for GCIMP+:  9662
Top 10 significant genes:
[u'CCDC115', u'PFDN6', u'ATG2A', u'RIC8A', u'CCDC71', u'HNRPCL1', u'LIMK1', u'MRFAP1', u'MAP3K7IP1', u'FAM134A']


In [5]:
# Per-gene two-sample t-test comparing GBM against LGG within the GCIMP- subset.
# Each value is the ttest_ind result: index 0 is the test statistic, index 1
# the p-value.
# NOTE(review): the case counts recorded in this notebook list only 2
# LGG GCIMP- cases, so one side of this test has very few samples.
gcimp_n_stats = {
    gene: stats.ttest_ind(gbm_gcimp_n[gene], lgg_gcimp_n[gene])
    for gene in genes
}

# Keep (gene, result) pairs with p <= 0.05, ordered by ascending p-value.
# NOTE(review): the 0.05 cutoff is applied per gene; no multiple-comparison
# correction is performed across genes.
# NOTE(review): gene columns were coerced with errors="coerce", so they may
# contain NaN. Presumably ttest_ind then yields a NaN p-value, which never
# satisfies `<= 0.05`, silently excluding that gene — confirm this is intended.
significant_gcimp_n = sorted(
    (item for item in gcimp_n_stats.items() if item[1][1] <= 0.05),
    key=lambda item: item[1][1],
)

# Single-argument print(...) calls behave identically on Python 2 and Python 3.
print("Number of significant genes for GCIMP-:  %d" % len(significant_gcimp_n))
print("Top 10 significant genes:")
print([item[0] for item in significant_gcimp_n[:10]])

Number of significant genes for GCIMP-:  2934
Top 10 significant genes:
[u'C1orf142', u'KCTD21', u'KIAA1183', u'MTCH1', u'CCDC115', u'CRY2', u'PARP1', u'FAM120B', u'SLC22A6', u'GSX1']


In [6]:
# Gene names from each significant list, keeping the p-value ordering of the
# source lists.
gcimp_p_genes = [entry[0] for entry in significant_gcimp_p]
gcimp_n_genes = [entry[0] for entry in significant_gcimp_n]

# Sets give constant-time membership checks; testing `name in <list>` inside
# a comprehension rescans the whole list for every element.
gcimp_p_gene_set = set(gcimp_p_genes)
gcimp_n_gene_set = set(gcimp_n_genes)

# Genes significant in both subsets, listed once in GCIMP+ order and once in
# GCIMP- order. Both lists hold the same genes, only the ordering differs.
shared_in_p_order = [name for name in gcimp_p_genes if name in gcimp_n_gene_set]
shared_in_n_order = [name for name in gcimp_n_genes if name in gcimp_p_gene_set]

# Single-argument print(...) calls behave identically on Python 2 and Python 3.
print(len(shared_in_p_order))
print(shared_in_p_order[:10])
print(shared_in_n_order[:10])

2473
[u'CCDC115', u'PFDN6', u'ATG2A', u'RIC8A', u'CCDC71', u'HNRPCL1', u'LIMK1', u'MRFAP1', u'MAP3K7IP1', u'FAM134A']
[u'C1orf142', u'KCTD21', u'KIAA1183', u'MTCH1', u'CCDC115', u'CRY2', u'PARP1', u'FAM120B', u'SLC22A6', u'GSX1']


In [7]:
# One-column table of the GCIMP+ significant gene names, written as a
# tab-separated file with a "gene" header and no index column.
# Naming the column at construction keeps this working when the significant
# list is empty; assigning `columns` afterwards fails on a zero-column frame.
g = pd.DataFrame(
    {"gene": [entry[0] for entry in significant_gcimp_p]},
    columns=["gene"],
)
g.to_csv("gcimp_positive_genes.tsv", sep="\t", index=False)

In [15]:
# Preview of the gene columns for the first ten rows of `df`, transposed so
# that genes run down the rows and cases run across the columns.
first_ten_cases = df[genes].head(10)
t = first_ten_cases.T
t

Unnamed: 0,TCGA-DB-5281,TCGA-E1-5304,TCGA-CS-4944,TCGA-DB-5274,TCGA-DH-5140,TCGA-E1-5311,TCGA-CS-5390,TCGA-DB-5275,TCGA-DH-5141,TCGA-E1-5318
ELMO2,1.632562,0.942750,1.355813,0.818500,1.489000,1.485562,1.192000,1.363375,1.308062,0.959688
CREB3L1,-0.774833,-1.491167,-0.521500,0.262000,-1.025500,-0.169333,-0.334833,-1.146500,-0.173500,0.263167
RPS11,0.212200,0.990600,-0.521100,0.476700,0.547100,0.200700,0.546200,-0.300600,-0.498300,0.301100
PNMA1,3.255200,2.442200,2.242800,2.639400,2.600600,3.090600,3.241200,3.082400,3.024800,2.953600
MMP2,-1.648375,-1.084500,-0.766250,-1.469625,-0.916125,-1.077500,-0.724375,-1.387375,-1.316250,-0.822500
C10orf90,-0.182200,1.271200,0.918800,1.173400,0.398800,0.388800,0.638400,0.157600,1.465800,-0.681000
ZHX3,1.089500,1.429250,2.173250,1.253375,1.421750,1.701750,1.208375,1.888125,1.041375,0.743500
ERCC5,0.008800,0.058000,0.042800,0.364600,0.175000,0.382800,0.353400,0.974200,-0.213400,0.146200
GPR98,1.270833,0.157083,0.691583,1.010909,1.176667,1.473167,1.213273,1.842917,1.291500,-0.632083
RXFP3,0.438333,0.445333,0.787333,0.460667,0.280667,0.487667,0.076000,0.434667,0.384667,0.487333
