In [1]:
%matplotlib notebook

import pandas as pd
import json
from scipy import stats
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("lgg_gbm_gene_exp.csv", index_col=0)

genes = json.load(open("genes.json"))

for gene in genes:
    df[gene] = pd.to_numeric(df[gene], errors="coerce")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
gcimp_p_labels = ["LGm1", "LGm2", "LGm3"]
gcimp_n_labels = ["LGm4", "LGm5", "LGm6"]

gbm = df[df["Study"]=="Glioblastoma multiforme"]
lgg = df[df["Study"]=="Brain Lower Grade Glioma"]

gcimp_p = df[df["Pan-Glioma DNA Methylation Cluster"].isin(gcimp_p_labels)]
gcimp_n = df[df["Pan-Glioma DNA Methylation Cluster"].isin(gcimp_n_labels)]

gbm_gcimp_p = gbm[gbm["Pan-Glioma DNA Methylation Cluster"].isin(gcimp_p_labels)]
gbm_gcimp_n = gbm[gbm["Pan-Glioma DNA Methylation Cluster"].isin(gcimp_n_labels)]

lgg_gcimp_p = lgg[lgg["Pan-Glioma DNA Methylation Cluster"].isin(gcimp_p_labels)]
lgg_gcimp_n = lgg[lgg["Pan-Glioma DNA Methylation Cluster"].isin(gcimp_n_labels)]

print "Total number of cases: ", len(df)
print
print "GBM number of cases: ", len(gbm)
print "LGG number of cases: ", len(lgg)
print
print "GCIMP+ number of cases: ", len(gcimp_p)
print "GCIMP- number of cases: ", len(gcimp_n)
print
print "GBM GCIMP+ number of cases: ", len(gbm_gcimp_p)
print "GBM GCIMP- number of cases: ", len(gbm_gcimp_n)
print "LGG GCIMP+ number of cases: ", len(lgg_gcimp_p)
print "LGG GCIMP- number of cases: ", len(lgg_gcimp_n)

Total number of cases:  632

GBM number of cases:  595
LGG number of cases:  27

GCIMP+ number of cases:  56
GCIMP- number of cases:  370

GBM GCIMP+ number of cases:  31
GBM GCIMP- number of cases:  368
LGG GCIMP+ number of cases:  25
LGG GCIMP- number of cases:  2


In [19]:
gcimp_p_stats = {}
for gene in genes:
    gcimp_p_stats[gene] = stats.ttest_ind(lgg_gcimp_p[gene], gbm_gcimp_p[gene])

significant_gcimp_p = [x for x in gcimp_p_stats.items() if x[1][1] <= 0.05]
significant_gcimp_p.sort(key=lambda x: x[1][1])
print "Number of significant genes for GCIMP+: ", len(significant_gcimp_p)
print "Top 10 significant genes:"
for x in significant_gcimp_p[:10]:
    print x[0], x[1]

Number of significant genes for GCIMP+:  9662
Top 10 significant genes:
CCDC115 Ttest_indResult(statistic=17.046911877750279, pvalue=2.1703374869474671e-23)
PFDN6 Ttest_indResult(statistic=-16.964529476237626, pvalue=2.707564291620078e-23)
ATG2A Ttest_indResult(statistic=16.492772817322621, pvalue=9.7469665290167026e-23)
RIC8A Ttest_indResult(statistic=15.179902479007238, pvalue=3.9352311052902408e-21)
CCDC71 Ttest_indResult(statistic=-15.107253878450079, pvalue=4.8575099781921553e-21)
HNRPCL1 Ttest_indResult(statistic=15.040818708991527, pvalue=5.8921991542475716e-21)
LIMK1 Ttest_indResult(statistic=-14.4274825771683, pvalue=3.5934641739364096e-20)
MRFAP1 Ttest_indResult(statistic=14.424689845981744, pvalue=3.623552650271319e-20)
MAP3K7IP1 Ttest_indResult(statistic=14.231457832673192, pvalue=6.4671340782550935e-20)
FAM134A Ttest_indResult(statistic=14.17670425372977, pvalue=7.6271841231513862e-20)


In [27]:
print lgg_gcimp_p["CRY2"][:10]
print gbm_gcimp_p["CRY2"][:10]

TCGA-DB-5281    1.401875
TCGA-E1-5304    0.405313
TCGA-CS-4944    1.518250
TCGA-DB-5274    1.545750
TCGA-E1-5311    1.830875
TCGA-CS-5390    1.886062
TCGA-DB-5275    1.428625
TCGA-DH-5141    2.457562
TCGA-E1-5318    1.728625
TCGA-CS-5396    1.920438
Name: CRY2, dtype: float64
TCGA-02-0080    0.774250
TCGA-02-0058    0.608250
TCGA-02-0114   -0.460500
TCGA-06-0128    0.029188
TCGA-02-0010   -0.565625
TCGA-02-0014   -0.065750
TCGA-02-0028   -0.275313
TCGA-06-0129    0.122750
TCGA-16-0849    0.712063
TCGA-16-0850   -0.080625
Name: CRY2, dtype: float64


In [21]:
gcimp_n_stats = {}
for gene in genes:
    gcimp_n_stats[gene] = stats.ttest_ind(lgg_gcimp_n[gene], gbm_gcimp_n[gene])

significant_gcimp_n = [x for x in gcimp_n_stats.items() if x[1][1] <= 0.05]
significant_gcimp_n.sort(key=lambda x: x[1][1])
print "Number of significant genes for GCIMP-: ", len(significant_gcimp_n)
print "Top 10 significant genes:"
for x in significant_gcimp_n[:10]:
    print x[0], x[1]

Number of significant genes for GCIMP-:  2934
Top 10 significant genes:
C1orf142 Ttest_indResult(statistic=7.9347894135828634, pvalue=2.5658328030507939e-14)
KCTD21 Ttest_indResult(statistic=7.4536339759674961, pvalue=6.5559063621729672e-13)
KIAA1183 Ttest_indResult(statistic=6.8931664340087693, pvalue=2.3814286955993823e-11)
MTCH1 Ttest_indResult(statistic=6.4790862251574213, pvalue=2.963409345788531e-10)
CCDC115 Ttest_indResult(statistic=6.4718222653937829, pvalue=3.0941883228076149e-10)
CRY2 Ttest_indResult(statistic=6.0775329114436811, pvalue=3.0514838780710436e-09)
PARP1 Ttest_indResult(statistic=5.959240745579427, pvalue=5.9339845411766576e-09)
FAM120B Ttest_indResult(statistic=5.8488642483325757, pvalue=1.0936558185427181e-08)
SLC22A6 Ttest_indResult(statistic=5.7765191120970298, pvalue=1.6248865125531701e-08)
GSX1 Ttest_indResult(statistic=5.7658114648016259, pvalue=1.7223834197019406e-08)


In [6]:
gcimp_p_genes = [x[0] for x in significant_gcimp_p]
gcimp_n_genes = [x[0] for x in significant_gcimp_n]
print len([x for x in gcimp_p_genes if x in gcimp_n_genes])
print [x for x in gcimp_p_genes if x in gcimp_n_genes][:10]
print [x for x in gcimp_n_genes if x in gcimp_p_genes][:10]

2473
[u'CCDC115', u'PFDN6', u'ATG2A', u'RIC8A', u'CCDC71', u'HNRPCL1', u'LIMK1', u'MRFAP1', u'MAP3K7IP1', u'FAM134A']
[u'C1orf142', u'KCTD21', u'KIAA1183', u'MTCH1', u'CCDC115', u'CRY2', u'PARP1', u'FAM120B', u'SLC22A6', u'GSX1']


In [16]:
shared = list(set(gcimp_p_genes).intersection(set(gcimp_n_genes)))
for g in shared:
    print g

PGM2L1
SFRS2IP
HSPA6
ASCC3L1
HIST1H4K
HIST1H4J
NIPA1
HIST1H4H
ATXN10
RNF11
HMG1L1
CAMK1
STK25
ZC3H10
HMGCLL1
C16orf14
DHX9
OSGEP
SIRPA
XPC
MUC1
FAM84B
ZDHHC18
MEG3
RAB40C
RAB40B
ZNF677
PRRG1
PRRG3
CLEC2B
ZNF678
SPPL2A
CTBP2
SLC46A1
C10orf25
UGCG
ALDH3A2
ITGA1
SUGT1L1
ITGA5
MGST2
LOC51035
GTF3C1
EDC4
IQSEC1
SEMA4B
SEMA4C
C15orf17
XPO6
TIPRL
HRH3
FBXL16
KSR2
KCNIP3
KCNIP2
FAM123C
GRP
HMGCS1
GTF2IRD1
C14orf2
C20orf43
COL4A2
COL4A1
CHST1
ANAPC2
ANAPC7
NUMA1
FLJ22662
RPS20
RPS6KA5
MDP-1
BCL2A1
PISD
SFT2D2
NOG
PSMD10
PSMD13
TCIRG1
GIT1
UTP20
FAM134A
C9orf114
SMAD9
NUP133
SMAD6
HNRPH3
NPHP3
ORC2L
AMMECR1L
NAPG
PELI3
ARF1
CYFIP2
SPARC
NSBP1
ERICH1
KIAA1183
C6orf49
TCEA1
COX4I1
SH2B3
SH2B1
HTF9C
PQLC3
SRPK2
LOR
RAB11FIP1
RAB11FIP4
FAM20A
RLF
DCAKD
CXXC1
SPP1
CLPTM1
C17orf39
SPHK2
EEF1A1
HMGB1
FOXO4
ABAT
ZNF780B
OR6B1
GCHFR
ZNF41
WDR45L
UBTF
UGCGL1
UGCGL2
ETNK2
SLIT1
FIP1L1
NDUFA4L2
PEX16
HEXDC
NUP93
CA13
CA11
ATG4B
SASH1
TCL1B
THAP2
EIF4E2
GPT2
RAB28
THAP9
FLJ10213
CALU
CYB5B
WDR79
ATP6V0E2
SPG

In [7]:
g = pd.DataFrame([(x[0]) for x in significant_gcimp_p])
g.columns = ["gene"]
g.to_csv("gcimp_positive_genes.tsv", sep="\t", index=False)

In [15]:
t = df[genes][:10].transpose()
# t.to_csv("temp.txt", sep="\t")
t

Unnamed: 0,TCGA-DB-5281,TCGA-E1-5304,TCGA-CS-4944,TCGA-DB-5274,TCGA-DH-5140,TCGA-E1-5311,TCGA-CS-5390,TCGA-DB-5275,TCGA-DH-5141,TCGA-E1-5318
ELMO2,1.632562,0.942750,1.355813,0.818500,1.489000,1.485562,1.192000,1.363375,1.308062,0.959688
CREB3L1,-0.774833,-1.491167,-0.521500,0.262000,-1.025500,-0.169333,-0.334833,-1.146500,-0.173500,0.263167
RPS11,0.212200,0.990600,-0.521100,0.476700,0.547100,0.200700,0.546200,-0.300600,-0.498300,0.301100
PNMA1,3.255200,2.442200,2.242800,2.639400,2.600600,3.090600,3.241200,3.082400,3.024800,2.953600
MMP2,-1.648375,-1.084500,-0.766250,-1.469625,-0.916125,-1.077500,-0.724375,-1.387375,-1.316250,-0.822500
C10orf90,-0.182200,1.271200,0.918800,1.173400,0.398800,0.388800,0.638400,0.157600,1.465800,-0.681000
ZHX3,1.089500,1.429250,2.173250,1.253375,1.421750,1.701750,1.208375,1.888125,1.041375,0.743500
ERCC5,0.008800,0.058000,0.042800,0.364600,0.175000,0.382800,0.353400,0.974200,-0.213400,0.146200
GPR98,1.270833,0.157083,0.691583,1.010909,1.176667,1.473167,1.213273,1.842917,1.291500,-0.632083
RXFP3,0.438333,0.445333,0.787333,0.460667,0.280667,0.487667,0.076000,0.434667,0.384667,0.487333


In [98]:
from sklearn.linear_model import LogisticRegression
# from sklearn.svm import LinearSVC
from sklearn.preprocessing import Imputer
import numpy as np

data = pd.concat([gcimp_p, gcimp_n])
print len(data)
x = data[shared].copy()
y = data['Study'].copy()
y[y == "Brain Lower Grade Glioma"] = 0
y[y == "Glioblastoma multiforme"] = 1
y = pd.to_numeric(y)
# y = y.as_matrix()

imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
x = imp.fit(x).transform(x)
print y[np.isnan(y)]
print df.loc["TCGA-06-0673"]["Study"]

# y[y > 0] = 1

model = LogisticRegression()
model = model.fit(x, y)

print 'score: ', model.score(x, y)
# print 'prediction :', model.predict()

conversion_weights = zip(model.coef_[0], shared)
conversion_weights.sort()
print list(reversed(conversion_weights[-10:]))
print conversion_weights[:10]

# abs_minimum_conversion = abs(conversion_weights[0][0])
# normalization_dividend_conversion = abs_minimum_conversion + conversion_weights[-1][0]
# conversion_weights = [(((n + abs_minimum_conversion) / normalization_dividend_conversion), k) for n, k in conversion_weights]
# print conversion_weights
print gbm_gcimp_p["MAP2K7"][:10]
print lgg_gcimp_p["MAP2K7"][:10]

426
Series([], Name: Study, dtype: int64)
nan
score:  1.0
[(0.090168341646180358, u'TAF15'), (0.07274054492914396, u'CTSZ'), (0.06802295863447208, u'APP'), (0.060781455397133879, u'LIMK1'), (0.053908692803140409, u'CCDC71'), (0.052868219649504201, u'UBE2E1'), (0.052322463323526733, u'CA3'), (0.047151906730934751, u'C5orf5'), (0.045527890205355144, u'SLC39A7'), (0.044601829029405238, u'MAP2K7')]
[(-0.042277592888779386, u'VEPH1'), (-0.039424848054614381, u'TNIP2'), (-0.037023161548868533, u'UNC84B'), (-0.036256313098231338, u'PPP1R12C'), (-0.035993606181577618, u'CBLN1'), (-0.035753688452940025, u'SFRP2'), (-0.03459041380877699, u'DOHH'), (-0.034168187402293944, u'NUMA1'), (-0.033289816916384156, u'AKR1B10'), (-0.032834207647177052, u'STIP1')]
TCGA-02-0080    1.4442
TCGA-02-0058    0.6122
TCGA-02-0114    2.1178
TCGA-06-0128    1.8082
TCGA-02-0010    1.3130
TCGA-02-0014    1.2146
TCGA-02-0028    0.6346
TCGA-06-0129    1.6732
TCGA-16-0849    1.5306
TCGA-16-0850    0.9364
Name: MAP2K7, dty