In [1]:
# Put the local metaproteomics checkout on the import path so the
# `metaproteomics.*` imports further down resolve.
# NOTE(review): this absolute path is machine-specific; the notebook will not
# run elsewhere unless the path is adjusted or the package is installed.
import sys
sys.path.insert(0, "/home/gstupp/projects/metaproteomics")

In [2]:
import os
import requests
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
from glob import glob
import numpy as np

In [3]:
# Project-local helpers: dta_select_parser is iterated below to read each data
# file, obo_parser provides the GO ontology container.
from metaproteomics.file_processing.blazmass_tools import dta_select_parser
from metaproteomics.goatools import obo_parser
# Build the GO DAG once; goa[go_id].name is used below to turn GO ids into
# human-readable term names.
goa = obo_parser.GODag()

load obo file /home/gstupp/projects/metaproteomics/metaproteomics/goatools/go-basic.obo
48437 nodes imported


In [4]:
# Collect every result file; paths look like data/<cell>/<method>/<replicate>.txt.
# NOTE(review): glob returns matches in arbitrary (filesystem) order, so
# files[0] is not guaranteed to be the same file on another machine.
files = glob("data/*/*/*.txt")

In [5]:
# Display the matched paths as a sanity check of the directory layout.
files

['data/PA/Triflic/PA triflic 4.txt',
 'data/PA/Triflic/PA triflic 3.txt',
 'data/PA/Triflic/PA triflic 5.txt',
 'data/PA/Sonic/PA sonic 1.txt',
 'data/PA/Sonic/PA sonic 3.txt',
 'data/PA/Sonic/PA sonic 4.txt',
 'data/Jurkat/Triflic/Jurkat triflic 1.txt',
 'data/Jurkat/Triflic/Jurkat triflic test.txt',
 'data/Jurkat/Triflic/Jurkat triflic 2.txt',
 'data/Jurkat/Sonic/Jurkat sonic 7.txt',
 'data/Jurkat/Sonic/Jurkat sonic 1.txt',
 'data/Jurkat/Sonic/Jurkat sonic 8.txt',
 'data/BS/Triflic/BS triflic 3.txt',
 'data/BS/Triflic/BS triflic 1.txt',
 'data/BS/Triflic/BS triflic 2.txt',
 'data/BS/Sonic/BS sonic 3.txt',
 'data/BS/Sonic/BS sonic 4.txt',
 'data/BS/Sonic/BS sonic 2.txt']

In [7]:
# Load the tab-separated GO annotation file (no header row, "!" starts a
# comment), give names to the three columns used later, and keep only the
# rows whose aspect is 'C'.
df = (
    pd.read_csv("annotations.mapped.gaf", sep="\t", comment="!", header=None)
    .rename(columns={1: "id", 4: "go_id", 8: "aspect"})
    .query("aspect == 'C'")
)
df.head(2)

Unnamed: 0,0,id,2,3,go_id,5,6,7,aspect,9,10,11,12,13,14,15,16
2,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0043234,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898,C,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|hCG_1994888,protein,taxon:9606,20170603,InterPro,,
3,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0005622,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898,C,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|hCG_1994888,protein,taxon:9606,20170603,InterPro,,


In [8]:
# Lookup table: protein id -> set of GO ids annotated to that protein.
go_annot = {
    protein_id: set(go_ids)
    for protein_id, go_ids in df.groupby("id")["go_id"]
}
# Spot-check a single accession.
go_annot['A0A024R161']

{'GO:0005575', 'GO:0005622', 'GO:0005886', 'GO:0043234'}

In [10]:
#{goa[x].name: x for x in set(df[df['aspect'] == 'C']['go_id'])}

In [11]:
def get_go_count(file):
    """
    Sum quantification values per GO term for one result file.

    Each protein adds its full quantification value to every GO term it is
    annotated with, so a protein with several terms is counted once per term.
    Only loci whose name contains '|' are used; the accession is taken as the
    second '|'-separated field of the name. GO annotations come from the
    notebook-level ``go_annot`` mapping and term names from ``goa``.

    Parameters
    ----------
    file : path of the file handed to ``dta_select_parser``.

    Returns
    -------
    (d, p)
        d : dict mapping GO id -> summed quantification.
        p : dict mapping accession -> dict with keys
            'go' (';'-joined GO ids), 'q' (quantification),
            'go_name' (';'-joined GO term names, same order as 'go'),
            'name' (descriptive name of the first entry in locus['loci']).
    """
    p = dict()
    d = defaultdict(int)
    for locus in dta_select_parser(file, return_reverse=False):
        if '|' not in locus['name']:
            continue
        uniprot = locus['name'].split('|')[1]
        # go_annot stores sets of strings, whose iteration order changes from
        # one interpreter start to the next (hash randomization). Sorting once
        # keeps the ';'-joined columns reproducible across runs.
        this_go = sorted(go_annot.get(uniprot, ()))
        p[uniprot] = {'go': ";".join(this_go), 'q': locus['quantification'],
                      'go_name': ";".join([goa[x].name for x in this_go]),
                      'name': locus['loci'][0]['Descriptive Name']}
        for go in this_go:
            d[go] += locus['quantification']
    return dict(d), p
def get_go_count2(file):
    """
    Version 2: split each protein's quantification evenly over its GO terms.

    If a protein has multiple GO terms, each of those terms receives
    (spectral counts of the protein) / (number of GO terms of the protein),
    so every protein contributes its quantification value exactly once in
    total. Only loci whose name contains '|' are used; the accession is taken
    as the second '|'-separated field of the name. GO annotations come from
    the notebook-level ``go_annot`` mapping and term names from ``goa``.

    Parameters
    ----------
    file : path of the file handed to ``dta_select_parser``.

    Returns
    -------
    (d, p)
        d : dict mapping GO id -> summed share of quantification.
        p : dict mapping accession -> dict with keys
            'go' (';'-joined GO ids), 'q' (quantification),
            'go_name' (';'-joined GO term names, same order as 'go'),
            'name' (descriptive name of the first entry in locus['loci']).
    """
    p = dict()
    d = defaultdict(int)
    for locus in dta_select_parser(file, return_reverse=False):
        if '|' not in locus['name']:
            continue
        uniprot = locus['name'].split('|')[1]
        # go_annot stores sets of strings, whose iteration order changes from
        # one interpreter start to the next (hash randomization). Sorting once
        # keeps the ';'-joined columns reproducible across runs.
        this_go = sorted(go_annot.get(uniprot, ()))
        p[uniprot] = {'go': ";".join(this_go), 'q': locus['quantification'],
                      'go_name': ";".join([goa[x].name for x in this_go]),
                      'name': locus['loci'][0]['Descriptive Name']}
        # Proteins without annotations never enter this loop, so the division
        # below cannot see len(this_go) == 0.
        for go in this_go:
            d[go] += locus['quantification'] / len(this_go)
    return dict(d), p
# Try both counting schemes on the first file: d uses full counts per GO
# term, d2 uses counts divided by the protein's number of GO terms.
# Note: `p` from the first call is overwritten by the second call.
d,p = get_go_count(files[0])
d2,p = get_go_count2(files[0])

In [12]:
def get_membrane_count(d, membrane=None):
    """
    Split per-GO-term totals into a membrane and a non-membrane sum.

    Parameters
    ----------
    d : dict mapping GO id -> count.
    membrane : optional collection of GO ids to treat as "membrane".
        Defaults to cell wall (GO:0005618) and plasma membrane (GO:0005886).

    Returns
    -------
    (membrane_count, not_membrane_count) : the sum of the counts whose GO id
        is in ``membrane`` and the sum of all remaining counts. Both are 0
        for an empty ``d``.
    """
    if membrane is None:
        # cell wall, plasma membrane
        membrane = {'GO:0005618', 'GO:0005886'}
    membrane_count = 0
    not_membrane_count = 0
    # Single pass; each term lands in exactly one of the two totals.
    for go, count in d.items():
        if go in membrane:
            membrane_count += count
        else:
            not_membrane_count += count
    return membrane_count, not_membrane_count

In [13]:
# For every file store (membrane total, non-membrane total, log-ratio), using
# the "divide counts across GO terms" scheme of get_go_count2.
counts = dict()
for file in tqdm(files):
    d,p = get_go_count2(file)
    membrane_count, not_membrane_count = get_membrane_count(d)
    # Third element is ln(non-membrane) - ln(membrane), i.e. the natural log of
    # the non-membrane/membrane ratio. A total of zero yields +/-inf here.
    counts[file] = (membrane_count, not_membrane_count, np.log(not_membrane_count)-np.log(membrane_count))
    # Save the per-protein table (one row per accession after the transpose)
    # next to the input as "<input path>_go_counts.csv".
    pd.DataFrame(p).T.to_csv(file + "_go_counts.csv")

100%|██████████| 18/18 [00:03<00:00,  4.96it/s]


In [16]:
# Flatten `counts` into one row per file. 'ratio' holds the third tuple
# element (the log-ratio), 'method' is derived from the file name and 'cell'
# from the first directory below data/.
_paths = list(counts.keys())
_triples = list(counts.values())
df_out = pd.DataFrame({
    'files': _paths,
    'membrane': [triple[0] for triple in _triples],
    'non-membrane': [triple[1] for triple in _triples],
    'ratio': [triple[2] for triple in _triples],
    'method': ["sonic" if "sonic" in path else "tri" for path in _paths],
    'cell': [path.split("/")[1] for path in _paths],
})
df_out = df_out.sort_values(["cell", "method"])
df_out

Unnamed: 0,cell,files,membrane,method,non-membrane,ratio
1,BS,data/BS/Sonic/BS sonic 4.txt,784.75,sonic,15111.25,2.95783
7,BS,data/BS/Sonic/BS sonic 2.txt,897.714286,sonic,16292.285714,2.898595
10,BS,data/BS/Sonic/BS sonic 3.txt,743.880952,sonic,14564.119048,2.974435
2,BS,data/BS/Triflic/BS triflic 1.txt,721.345238,tri,12720.654762,2.869864
5,BS,data/BS/Triflic/BS triflic 3.txt,1355.5,tri,25859.5,2.948508
9,BS,data/BS/Triflic/BS triflic 2.txt,1088.785714,tri,21741.214286,2.994147
0,Jurkat,data/Jurkat/Sonic/Jurkat sonic 1.txt,978.511701,sonic,20908.488299,3.061878
4,Jurkat,data/Jurkat/Sonic/Jurkat sonic 8.txt,765.967763,sonic,21727.032237,3.345172
12,Jurkat,data/Jurkat/Sonic/Jurkat sonic 7.txt,1193.21163,sonic,23654.78837,2.986917
13,Jurkat,data/Jurkat/Triflic/Jurkat triflic test.txt,311.434155,tri,8308.565845,3.283854


In [17]:
# Persist the summary table in the working directory (the row index is
# written as the first column, which is the pandas default).
df_out.to_csv("df_out.csv")