In [9]:
import os
import requests
import pandas as pd
from tqdm import tqdm
from collections import defaultdict, Counter
from glob import glob
import numpy as np
from itertools import chain
import pandas as pd
import matplotlib
# Set matplotlib's global default font size.
matplotlib.rcParams['font.size'] = 22

In [7]:
# Build a lookup from GO term ID to its human-readable name.
# The terms file is tab-separated, "!" starts a comment line, and the column
# names are supplied here. Judging from the displayed head, `goid2` appears to
# hold space-separated alternate IDs and `sdfsD` an "obs" flag -- TODO confirm.
go_df = pd.read_csv("http://www.geneontology.org/doc/GO.terms_alt_ids", comment="!", sep="\t",
                    names=['goid', 'goid2', 'name', 'aspect', 'sdfsD'])
# Pair the two columns directly; iterrows() would build a Series for every row
# only to read two fields from it. Only primary IDs (goid) become keys.
go_name = dict(zip(go_df['goid'], go_df['name']))
go_df.head()

Unnamed: 0,goid,goid2,name,aspect,sdfsD
0,GO:0000001,,mitochondrion inheritance,P,
1,GO:0000002,,mitochondrial genome maintenance,P,
2,GO:0000003,GO:0019952 GO:0050876,reproduction,P,
3,GO:0000005,,ribosomal chaperone activity,F,obs
4,GO:0000006,,high affinity zinc uptake transmembrane transp...,F,


In [10]:
def format_go(go_id):
    """Return the label and link fields for one GO term, separated by a tab.

    The first field is ``<go_id>(<name>)``, where the name is looked up in
    ``go_name``; an ID that is missing there is rendered with the text
    ``None``. The second field is the AmiGO term URL for ``go_id``.
    """
    name = str(go_name.get(go_id))
    url = "http://amigo.geneontology.org/amigo/term/{}".format(go_id)
    return go_id + "(" + name + ")" + "\t" + url


print(format_go("GO:0009279"))

GO:0009279(cell outer membrane)	http://amigo.geneontology.org/amigo/term/GO:0009279


In [47]:
# Gene sets built from every GO term in the annotation file (NOT goslim).
# The file is read without a header, so the positional columns used below
# are given readable names.
df = (
    pd.read_csv("../my_annotations.gaf", sep="\t", comment="!", header=None)
    .rename(columns={1: "id", 2: "symbol", 4: "go_id", 8: 'aspect', 12: "taxon"})
    .query("go_id != 'GO:0005575'")  # remove 'cellular component'
)
# Number of distinct GO terms that remain.
len(set(df.go_id))

1717

In [28]:
# Proteins observed in our experiments, one .rnk file per sample.
# Only a single column name is passed, and it is the resulting *index* that is
# collected -- presumably the protein IDs sit in the first column of each file
# and pandas promotes it to the index; verify against the .rnk files.
observed_pa, observed_bs, observed_jurkat = (
    set(pd.read_csv(rnk_file, names=["a"], sep="\t").index)
    for rnk_file in ("pa.rnk", "bs.rnk", "j.rnk")
)

In [21]:
# Map each protein ID to the set of GO terms annotated to it, kept in a
# separate mapping per sample (bs, pa, jurkat).
uniprot_go_bs = defaultdict(set)
uniprot_go_pa = defaultdict(set)
uniprot_go_jurkat = defaultdict(set)
# Taxon value -> the mapping that collects that sample's annotations.
# Rows carrying any other taxon value are skipped.
_uniprot_go_by_taxon = {
    "taxon:9606": uniprot_go_jurkat,
    'taxon:224308': uniprot_go_bs,
    "taxon:208964": uniprot_go_pa,
}
# Walk the three needed columns in lockstep; iterrows() would construct a
# Series for every row just to read these fields.
for protein_id, go_id, taxon in zip(df["id"], df["go_id"], df["taxon"]):
    target = _uniprot_go_by_taxon.get(taxon)
    if target is not None:
        target[protein_id].add(go_id)

In [34]:
# Mean number of GO terms per protein, counting only the proteins observed in
# this experiment. Printed in the order: jurkat, pa, bs.
for annotations, observed in (
    (uniprot_go_jurkat, observed_jurkat),
    (uniprot_go_pa, observed_pa),
    (uniprot_go_bs, observed_bs),
):
    # .get() with a default keeps the defaultdict from growing on a miss.
    term_counts = [len(annotations.get(protein, [])) for protein in observed]
    print(np.mean(term_counts))

5.26446280992
0.954305799649
1.01935483871


In [39]:
# Fraction of observed proteins with no GO term at all.
# Printed in the order: jurkat, pa, bs.
for annotations, observed in (
    (uniprot_go_jurkat, observed_jurkat),
    (uniprot_go_pa, observed_pa),
    (uniprot_go_bs, observed_bs),
):
    n_unannotated = sum(len(annotations.get(protein, [])) == 0 for protein in observed)
    print(n_unannotated / len(observed))

0.4413223140495868
0.45606326889279436
0.46359447004608295


In [12]:
# Map each GO term to the set of protein IDs annotated with it, kept in a
# separate mapping per sample (bs, pa, jurkat). These are the gene sets.
go_uniprot_bs = defaultdict(set)
go_uniprot_pa = defaultdict(set)
go_uniprot_jurkat = defaultdict(set)
# Taxon value -> the mapping that collects that sample's gene sets.
# Rows carrying any other taxon value are skipped.
_go_uniprot_by_taxon = {
    "taxon:9606": go_uniprot_jurkat,
    'taxon:224308': go_uniprot_bs,
    "taxon:208964": go_uniprot_pa,
}
# Walk the three needed columns in lockstep; iterrows() would construct a
# Series for every row just to read these fields.
for protein_id, go_id, taxon in zip(df["id"], df["go_id"], df["taxon"]):
    target = _go_uniprot_by_taxon.get(taxon)
    if target is not None:
        target[go_id].add(protein_id)

In [13]:
# Write one .gmt file per sample. Each line holds the fields produced by
# format_go() for a GO term, followed by the member protein IDs, all
# tab-separated.
# Members are sorted before joining: iterating a set of strings directly gives
# an order that can differ between interpreter runs, which made the files
# differ from run to run for identical input.
for gmt_path, go_uniprot in (
    ("genesets_bs.gmt", go_uniprot_bs),
    ("genesets_pa.gmt", go_uniprot_pa),
    ("genesets_jurkat.gmt", go_uniprot_jurkat),
):
    with open(gmt_path, 'w') as f:
        for go, uniprots in go_uniprot.items():
            print(format_go(go) + "\t" + "\t".join(sorted(uniprots)), file=f)

In [14]:
# Gene sets built from the GOslim-mapped annotation file.
# NOTE: this rebinds `df`, so any cell executed after this one sees the GOslim
# annotations rather than the frame loaded from ../my_annotations.gaf.
df = (
    pd.read_csv("../annotations.mapped.gaf", sep="\t", comment="!", header=None)
    .rename(columns={1: "id", 4: "go_id", 8: 'aspect', 12: "taxon"})
    .query("go_id != 'GO:0005575'")  # remove 'cellular component'
    # keep only rows for the three taxa used in this notebook
    .loc[lambda annotations: annotations.taxon.isin({'taxon:224308', 'taxon:9606', 'taxon:208964'})]
)
# Number of distinct GO terms that remain.
len(set(df.go_id))

5

In [15]:
# Rebuild the GO term -> protein ID gene sets from whatever `df` currently
# holds, one mapping per sample (bs, pa, jurkat). The mappings start empty,
# so nothing from a previous build is carried over.
go_uniprot_bs = defaultdict(set)
go_uniprot_pa = defaultdict(set)
go_uniprot_jurkat = defaultdict(set)
# Taxon value -> the mapping that collects that sample's gene sets.
# Rows carrying any other taxon value are skipped.
_go_uniprot_by_taxon = {
    "taxon:9606": go_uniprot_jurkat,
    'taxon:224308': go_uniprot_bs,
    "taxon:208964": go_uniprot_pa,
}
# Walk the three needed columns in lockstep; iterrows() would construct a
# Series for every row just to read these fields.
for protein_id, go_id, taxon in zip(df["id"], df["go_id"], df["taxon"]):
    target = _go_uniprot_by_taxon.get(taxon)
    if target is not None:
        target[go_id].add(protein_id)

In [16]:
# Write one GOslim .gmt file per sample. Each line holds the fields produced
# by format_go() for a GO term, followed by the member protein IDs, all
# tab-separated.
# Members are sorted before joining: iterating a set of strings directly gives
# an order that can differ between interpreter runs, which made the files
# differ from run to run for identical input.
for gmt_path, go_uniprot in (
    ("genesets_slim_bs.gmt", go_uniprot_bs),
    ("genesets_slim_pa.gmt", go_uniprot_pa),
    ("genesets_slim_jurkat.gmt", go_uniprot_jurkat),
):
    with open(gmt_path, 'w') as f:
        for go, uniprots in go_uniprot.items():
            print(format_go(go) + "\t" + "\t".join(sorted(uniprots)), file=f)