In [1]:
import sys
import os
from itertools import chain
from collections import defaultdict
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('precision', 3)
pd.set_option('display.max_colwidth', -1)
import gseapy as gp
import goatools

goa = goatools.obo_parser.GODag('go-basic.obo')

load obo file go-basic.obo
go-basic.obo: fmt(1.2) rel(2018-01-12) 49,288 GO Terms


In [2]:
sys.path.insert(0, "/home/gstupp/projects/metaproteomics")
from metaproteomics import utils
#from metaproteomics.analysis import build_loci

BASE = 'out/'
grouped_loci = utils.load(os.path.join(BASE,"grouped_loci_filt_annot.pkl.gz"))

In [3]:
grouped_loci[0].annotations

{'Gene3D': {'G3DSA:1.10.285.10', 'G3DSA:3.40.192.10', 'G3DSA:3.40.50.720'},
 'PANTHER': {'PTHR11606', 'PTHR11606:SF4'},
 'PIRSF': {'PIRSF000185'},
 'PRINTS': {'PR00082'},
 'Pfam': {'PF00208', 'PF02812'},
 'SMART': {'SM00839'},
 'SUPERFAMILY': {'SSF51735', 'SSF53223'},
 'go': {'GO:0006520', 'GO:0016491', 'GO:0016639', 'GO:0055114'},
 'ipa': {'IPR006095', 'IPR006096', 'IPR006097', 'IPR014362', 'IPR016040'}}

In [4]:
# i need to map to slim, and can only do it with a gaf file
# so im making a fake gaf file with all go terms in it, then ill extract the go-> goslim mappings...
with open("all_go.gaf", "w") as f:
    print("!gaf-version: 2.0", file=f)
    for x in goa.values():
        go = x.id
        line = "\t".join(["UniProtKB", go, go, "", go, "zsrg", "sdg", 
                          "", "X", "", "", "protein", "taxon:9606", "20090118", "SGD", "", ""])
        print(line, file=f)    

In [5]:
go_cats = set(map(str.strip,open("idfile_slim.txt").readlines()))
go_cats

{'GO:0005575',
 'GO:0005623',
 'GO:0005737',
 'GO:0016020',
 'GO:0043226',
 'GO:0071944'}

In [None]:
! ~/bin/owltools/OWLTools-Runner/bin/owltools go.obo --gaf all_go.gaf --map2slim --idfile idfile_slim.txt --write-gaf all_go.mapped.gaf

In [6]:
go_mapping = defaultdict(set)
go_mapping_df = pd.read_csv("all_go.mapped.gaf", comment="!", sep="\t", header=None)
for _, row in go_mapping_df.iterrows():
    go_mapping[row[1]].add(row[4])
go_mapping = dict(go_mapping)

In [7]:
list(go_mapping.items())[:5]

[('GO:0031521', {'GO:0005737', 'GO:0016020', 'GO:0043226', 'GO:0071944'}),
 ('GO:0014057', {'GO:0005623'}),
 ('GO:1903335', {'GO:0005623'}),
 ('GO:0008333', {'GO:0005623'}),
 ('GO:1990513', {'GO:0005623', 'GO:0043226'})]

In [13]:
def get_go_count(loci):
    """
    if a protein has multiple go terms, give those go terms the number of spectrral counts/# Go terms
    """
    samples = {'H1-PelletTriflic-Fl-R1', 'H1-PelletTriflic-Fl-R2', 'H1-PelletTriflic-Fl-R3',
               'H1-SonicSupTriflic-V2-R1', 'H1-SonicSupTriflic-V2-R2', 'H1-SonicSupTriflic-V2-R3',
               'H1-wholetriflic-R1', 'H1-wholetriflic-R2', 'H1-wholetriflic-R3'}
    d = {s: defaultdict(int) for s in samples}
    for locus in loci:
        # map to slim
        go_terms = locus.annotations.get("go", {})
        if go_terms:
            this_go = set(chain(*[go_mapping[x] for x in go_terms if x in go_mapping]))
        else:
            this_go = {'unknown'}
        # score
        for go in this_go:
            for sample,count in locus.quantification.items():
                d[sample][go]+=count/len(this_go)
    d = {k:dict(v) for k,v in d.items()}
    return d

In [16]:
df = pd.DataFrame(get_go_count(grouped_loci))
df.index = df.index.map(lambda x:goa[x].name if x!='unknown' else x)
df

Unnamed: 0,H1-PelletTriflic-Fl-R1,H1-PelletTriflic-Fl-R2,H1-PelletTriflic-Fl-R3,H1-SonicSupTriflic-V2-R1,H1-SonicSupTriflic-V2-R2,H1-SonicSupTriflic-V2-R3,H1-wholetriflic-R1,H1-wholetriflic-R2,H1-wholetriflic-R3
cellular_component,310.167,356.833,303.0,257.0,244.0,213.0,329.5,323.0,287.167
cell,7015.0,6850.5,6812.5,2305.833,2241.667,2263.833,6044.5,5795.5,5590.0
cytoplasm,6662.333,6718.167,7007.667,2053.833,2216.333,1872.667,6622.333,6347.667,5706.0
membrane,2058.0,2094.0,2037.167,2001.0,1895.5,1958.5,1819.167,1793.333,1655.667
organelle,3122.5,3099.0,3214.167,1225.0,1246.167,1268.667,2838.0,2853.5,2563.167
cell periphery,686.0,703.5,720.5,1008.333,994.333,1018.333,527.5,487.0,424.0
unknown,20706.0,20402.0,20361.0,31245.0,30214.0,29439.0,17889.0,16463.0,14565.0


In [17]:
df_norm = df.div(df.sum(0))
df_norm

Unnamed: 0,H1-PelletTriflic-Fl-R1,H1-PelletTriflic-Fl-R2,H1-PelletTriflic-Fl-R3,H1-SonicSupTriflic-V2-R1,H1-SonicSupTriflic-V2-R2,H1-SonicSupTriflic-V2-R3,H1-wholetriflic-R1,H1-wholetriflic-R2,H1-wholetriflic-R3
cellular_component,0.008,0.009,0.007,0.006,0.006,0.006,0.009,0.009,0.009
cell,0.173,0.17,0.168,0.058,0.057,0.06,0.168,0.17,0.182
cytoplasm,0.164,0.167,0.173,0.051,0.057,0.049,0.184,0.186,0.185
membrane,0.051,0.052,0.05,0.05,0.049,0.051,0.05,0.053,0.054
organelle,0.077,0.077,0.079,0.031,0.032,0.033,0.079,0.084,0.083
cell periphery,0.017,0.017,0.018,0.025,0.025,0.027,0.015,0.014,0.014
unknown,0.511,0.507,0.503,0.779,0.774,0.774,0.496,0.483,0.473


In [18]:
membrane = {'membrane', 'cell periphery', 'organelle'}
non_membrane = set(df.index) - membrane - {'unknown'}

df.loc['mem_cp'] = df.loc[membrane].sum()
df.loc['non_mem'] = df.loc[non_membrane].sum()
df_mem = df.loc[['mem_cp', 'non_mem', 'unknown']]
df_mem

Unnamed: 0,H1-PelletTriflic-Fl-R1,H1-PelletTriflic-Fl-R2,H1-PelletTriflic-Fl-R3,H1-SonicSupTriflic-V2-R1,H1-SonicSupTriflic-V2-R2,H1-SonicSupTriflic-V2-R3,H1-wholetriflic-R1,H1-wholetriflic-R2,H1-wholetriflic-R3
mem_cp,5866.5,5896.5,5971.833,4234.333,4136.0,4245.5,5184.667,5133.833,4642.833
non_mem,13987.5,13925.5,14123.167,4616.667,4702.0,4349.5,12996.333,12466.167,11583.167
unknown,20706.0,20402.0,20361.0,31245.0,30214.0,29439.0,17889.0,16463.0,14565.0


In [20]:
df.loc['ratio'] = np.log(df_mem.loc['mem_cp']) - np.log(df_mem.loc[['mem_cp', 'non_mem']].sum())
df

Unnamed: 0,H1-PelletTriflic-Fl-R1,H1-PelletTriflic-Fl-R2,H1-PelletTriflic-Fl-R3,H1-SonicSupTriflic-V2-R1,H1-SonicSupTriflic-V2-R2,H1-SonicSupTriflic-V2-R3,H1-wholetriflic-R1,H1-wholetriflic-R2,H1-wholetriflic-R3
cellular_component,310.167,356.833,303.0,257.0,244.0,213.0,329.5,323.0,287.167
cell,7015.0,6850.5,6812.5,2305.833,2241.667,2263.833,6044.5,5795.5,5590.0
cytoplasm,6662.333,6718.167,7007.667,2053.833,2216.333,1872.667,6622.333,6347.667,5706.0
membrane,2058.0,2094.0,2037.167,2001.0,1895.5,1958.5,1819.167,1793.333,1655.667
organelle,3122.5,3099.0,3214.167,1225.0,1246.167,1268.667,2838.0,2853.5,2563.167
cell periphery,686.0,703.5,720.5,1008.333,994.333,1018.333,527.5,487.0,424.0
unknown,20706.0,20402.0,20361.0,31245.0,30214.0,29439.0,17889.0,16463.0,14565.0
mem_cp,5866.5,5896.5,5971.833,4234.333,4136.0,4245.5,5184.667,5133.833,4642.833
non_mem,13987.5,13925.5,14123.167,4616.667,4702.0,4349.5,12996.333,12466.167,11583.167
ratio,-1.219,-1.212,-1.213,-0.737,-0.759,-0.705,-1.255,-1.232,-1.251


In [21]:
df.to_csv("membrane_ratio.csv")

In [26]:
df.loc['cytoplasm_ratio'] = ( np.log(df.loc['cytoplasm']) - 
        np.log(df.loc[['cellular_component', 'cell', 'cytoplasm', 'membrane', 'organelle', 'cell periphery']].sum()) )
df.to_csv("cytoplasm_ratio.csv")
df

Unnamed: 0,H1-PelletTriflic-Fl-R1,H1-PelletTriflic-Fl-R2,H1-PelletTriflic-Fl-R3,H1-SonicSupTriflic-V2-R1,H1-SonicSupTriflic-V2-R2,H1-SonicSupTriflic-V2-R3,H1-wholetriflic-R1,H1-wholetriflic-R2,H1-wholetriflic-R3
cellular_component,310.167,356.833,303.0,257.0,244.0,213.0,329.5,323.0,287.167
cell,7015.0,6850.5,6812.5,2305.833,2241.667,2263.833,6044.5,5795.5,5590.0
cytoplasm,6662.333,6718.167,7007.667,2053.833,2216.333,1872.667,6622.333,6347.667,5706.0
membrane,2058.0,2094.0,2037.167,2001.0,1895.5,1958.5,1819.167,1793.333,1655.667
organelle,3122.5,3099.0,3214.167,1225.0,1246.167,1268.667,2838.0,2853.5,2563.167
cell periphery,686.0,703.5,720.5,1008.333,994.333,1018.333,527.5,487.0,424.0
unknown,20706.0,20402.0,20361.0,31245.0,30214.0,29439.0,17889.0,16463.0,14565.0
mem_cp,5866.5,5896.5,5971.833,4234.333,4136.0,4245.5,5184.667,5133.833,4642.833
non_mem,13987.5,13925.5,14123.167,4616.667,4702.0,4349.5,12996.333,12466.167,11583.167
ratio,-1.219,-1.212,-1.213,-0.737,-0.759,-0.705,-1.255,-1.232,-1.251
