In [1]:
#make matrix for deseq
import sys
sys.path.insert(0, "/home/gstupp/projects/metaproteomics")

In [2]:
import os
import requests
import pandas as pd
from tqdm import tqdm
from collections import defaultdict, Counter
from glob import glob
import numpy as np
from itertools import chain
import pandas as pd
import matplotlib
matplotlib.rcParams.update({'font.size': 22})

In [3]:
from metaproteomics.file_processing.blazmass_tools import dta_select_parser, build_pep_quant_dict

In [4]:
files = glob("data/*/*/*.txt")
files[:2]

['data/PA/Pellet/PA pellet 2.txt', 'data/PA/Pellet/PA pellet 3.txt']

In [5]:
meta = pd.DataFrame([{"cell": s.rsplit("/", 1)[-1].split(" ")[0], 
  "method": s.rsplit("/", 1)[-1].split(" ")[1],
  "name": s.rsplit("/", 1)[-1].split(".")[0].replace(" ", "_"),
  "pretty_name": s.rsplit("/", 1)[-1].split(".")[0],
  "file": s } for s in files])
meta.index = meta.name
meta.to_csv("metadata.csv")
meta.head()

Unnamed: 0_level_0,cell,file,method,name,pretty_name
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PA_pellet_2,PA,data/PA/Pellet/PA pellet 2.txt,pellet,PA_pellet_2,PA pellet 2
PA_pellet_3,PA,data/PA/Pellet/PA pellet 3.txt,pellet,PA_pellet_3,PA pellet 3
PA_pellet_1,PA,data/PA/Pellet/PA pellet 1.txt,pellet,PA_pellet_1,PA pellet 1
PA_triflic_2,PA,data/PA/Triflic/PA triflic 2.txt,triflic,PA_triflic_2,PA triflic 2
PA_triflic_3,PA,data/PA/Triflic/PA triflic 3.txt,triflic,PA_triflic_3,PA triflic 3


In [6]:
def assign_id(locus):
    locus['id'] = locus['name'].split("|")[1] if "|" in locus['name'] else locus['name']
    return locus

In [7]:
def pick_descriptive_name(locus):
    return locus['loci'][0]['Descriptive Name']

In [8]:
uniprot_name = [{x['id']: pick_descriptive_name(assign_id(x)) for x in dta_select_parser(file, return_reverse=False)} for file in files]
uniprot_name = { k: v for d in uniprot_name for k, v in d.items() }

In [9]:
df = pd.DataFrame({file.rsplit("/", 1)[-1].split(".")[0].replace(" ", "_"): {x['id']: x['quantification'] for x in map(assign_id, dta_select_parser(file, return_reverse=False))} for file in files})
df.head()

Unnamed: 0,BS_sonic_1,BS_sonic_2,BS_sonic_3,BS_triflic_1,BS_triflic_2,BS_triflic_3,Jurkat_sonic_1,Jurkat_sonic_2,Jurkat_sonic_3,Jurkat_triflic_1,...,Jurkat_triflic_3,PA_pellet_1,PA_pellet_2,PA_pellet_3,PA_sonic_1,PA_sonic_2,PA_sonic_3,PA_triflic_1,PA_triflic_2,PA_triflic_3
A0A024QZP7,,,,,,,,,6.0,,...,,,,,,,,,,
A0A024R571,,,,,,,4.0,6.0,6.0,6.0,...,2.0,,,,,,,,,
A0A075B6E2,,,,,,,,,,4.0,...,,,,,,,,,,
A0A075B716,,,,,,,,3.0,6.0,5.0,...,,,,,,,,,,
A0A075B759,,,,,,,2.0,,,,...,,,,,,,,,,


In [10]:
df.to_csv("X.csv")

In [11]:
# run deseq.R with X.csv

In [12]:
### Add a name column and create ranked gene lists for gsea

In [13]:
#*** BS ***
bs = pd.read_csv("BS_deseq_results.csv", index_col=0)
bs['name'] = bs.index.map(uniprot_name.get)
bs.to_csv("BS_deseq_results.csv")
bs.head(2)

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,name,hgnc
P80239,747.568119,-4.366531,0.29876,-14.6155,2.2370759999999998e-48,2.427227e-45,Alkyl hydroperoxide reductase subunit C OS=Bac...,
P54375,329.408983,5.712131,0.430566,13.266565,3.617905e-40,1.962714e-37,Superoxide dismutase [Mn] OS=Bacillus subtilis...,


In [14]:
# write ranked list for gsea
(-1*np.log(bs.padj.dropna())).to_csv("bs.rnk", sep="\t")
# write filtered ranked list
(-1*np.log(bs[bs.padj<0.2].padj.dropna())).to_csv("bs_filtered.rnk", sep="\t")

In [15]:
#*** PA ***
pa = pd.read_csv("PA_deseq_results.csv", index_col=0)
pa['name'] = pa.index.map(uniprot_name.get)
pa.to_csv("PA_deseq_results.csv")
pa.head(2)

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,name
Q9HWW1,556.096497,5.233439,0.509147,10.278826,8.779037e-25,9.990544e-22,Outer membrane protein OprG OS=Pseudomonas aer...
P13794,546.292125,7.082617,0.711011,9.961336,2.250193e-23,1.28036e-20,Outer membrane porin F OS=Pseudomonas aerugino...


In [16]:
(-1*np.log(pa.padj.dropna())).to_csv("pa.rnk", sep="\t")
(-1*np.log(pa[pa.padj<0.2].padj.dropna())).to_csv("pa_filtered.rnk", sep="\t")

In [17]:
#*** Jurkat ***
j = pd.read_csv("Jurkat_deseq_results.csv", index_col=0)
j['name'] = j.index.map(uniprot_name.get)
j.to_csv("Jurkat_deseq_results.csv")
j.head(2)

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,name
Q7L7L0,14.962841,21.119766,2.63998,7.999971,1.244482e-15,7.529118e-13,Histone H2A type 3 OS=Homo sapiens GN=HIST3H2A...
Q9BVA1,104.959018,-9.760905,1.89827,-5.142,2.718287e-07,8.22282e-05,Tubulin beta-2B chain OS=Homo sapiens GN=TUBB2...


In [18]:
(-1*np.log(j.padj.dropna())).to_csv("j.rnk", sep="\t")
(-1*np.log(j[j.padj<0.2].padj.dropna())).to_csv("j_filtered.rnk", sep="\t")

In [19]:
##
# map uniprot to hgnc
def uniprot_to_hgnc(uniprot):
    try:
        url = "https://mygene.info/v3/query?q=uniprot.Swiss-Prot:{}&fields=HGNC,symbol".format(uniprot)
        return requests.get(url).json()['hits'][0]['symbol']
    except Exception:
        return ""
uniprot_to_hgnc("P42771")

'CDKN2A'

In [20]:
uniprot_hgnc = dict()
for x in tqdm(j.index):
    uniprot_hgnc[x] = uniprot_to_hgnc(x)

100%|██████████| 3762/3762 [12:55<00:00,  5.17it/s] 


In [21]:
j['hgnc'] = j.index.map(uniprot_hgnc.get)

In [22]:
j_hgnc = j.copy()
j_hgnc.index = j.hgnc
(-1*np.log(j_hgnc.padj.dropna())).to_csv("j_hgnc.rnk", sep="\t")

In [23]:
j_hgnc

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,name,hgnc
hgnc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
HIST3H2A,14.962841,21.119766,2.639980,7.999971,1.244482e-15,7.529118e-13,Histone H2A type 3 OS=Homo sapiens GN=HIST3H2A...,HIST3H2A
TUBB2B,104.959018,-9.760905,1.898270,-5.142000,2.718287e-07,8.222820e-05,Tubulin beta-2B chain OS=Homo sapiens GN=TUBB2...,TUBB2B
,66.688826,-9.106470,1.966888,-4.629888,3.658638e-06,7.378254e-04,Cofilin-1 (Fragment) OS=Homo sapiens GN=CFL1 P...,
ALDOA,44.717244,-8.529565,1.920428,-4.441493,8.933700e-06,1.351222e-03,Fructose-bisphosphate aldolase A OS=Homo sapie...,ALDOA
,12.168573,7.478029,1.995229,3.747955,1.782824e-04,1.797681e-02,T-complex protein 1 subunit beta OS=Homo sapie...,
ATPIF1,24.961021,-7.689675,2.045004,-3.760226,1.697599e-04,1.797681e-02,"ATPase inhibitor, mitochondrial OS=Homo sapien...",ATPIF1
,11.106849,-6.520122,2.029164,-3.213206,1.312621e-03,9.926698e-02,Succinate dehydrogenase [ubiquinone] flavoprot...,
,11.520893,-6.574266,2.038856,-3.224487,1.261987e-03,9.926698e-02,Eukaryotic initiation factor 4A-I (Fragment) O...,
,11.223921,7.358281,2.336850,3.148803,1.639408e-03,1.102046e-01,T-complex protein 1 subunit epsilon OS=Homo sa...,
,10.274427,7.230559,2.348761,3.078457,2.080757e-03,1.144416e-01,T-complex protein 1 subunit epsilon OS=Homo sa...,
