In [1]:
#make matrix for deseq
import sys
sys.path.insert(0, "/home/gstupp/projects/metaproteomics")

In [2]:
import os
import requests
import pandas as pd
from tqdm import tqdm
from collections import defaultdict, Counter
from glob import glob
import numpy as np
from itertools import chain
import matplotlib
matplotlib.rcParams.update({'font.size': 22})

In [3]:
from metaproteomics.file_processing.blazmass_tools import dta_select_parser, build_pep_quant_dict

In [4]:
files = glob("data/*/*/*.txt")
files[:2]

['data/PA/Pellet/PA pellet 2.txt', 'data/PA/Pellet/PA pellet 3.txt']

In [5]:
meta = pd.DataFrame([{"cell": s.rsplit("/", 1)[-1].split(" ")[0], 
  "method": s.rsplit("/", 1)[-1].split(" ")[1],
  "name": s.rsplit("/", 1)[-1].split(".")[0].replace(" ", "_"),
  "pretty_name": s.rsplit("/", 1)[-1].split(".")[0],
  "file": s } for s in files])
meta.index = meta.name
meta.to_csv("metadata.csv")
meta.head()

Unnamed: 0_level_0,cell,file,method,name,pretty_name
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PA_pellet_2,PA,data/PA/Pellet/PA pellet 2.txt,pellet,PA_pellet_2,PA pellet 2
PA_pellet_3,PA,data/PA/Pellet/PA pellet 3.txt,pellet,PA_pellet_3,PA pellet 3
PA_pellet_1,PA,data/PA/Pellet/PA pellet 1.txt,pellet,PA_pellet_1,PA pellet 1
PA_triflic_2,PA,data/PA/Triflic/PA triflic 2.txt,triflic,PA_triflic_2,PA triflic 2
PA_triflic_3,PA,data/PA/Triflic/PA triflic 3.txt,triflic,PA_triflic_3,PA triflic 3


In [6]:
def assign_id(locus):
    locus['id'] = locus['name'].split("|")[1] if "|" in locus['name'] else locus['name']
    return locus

In [7]:
def pick_descriptive_name(locus):
    return locus['loci'][0]['Descriptive Name']

In [8]:
uniprot_name = [{x['id']: pick_descriptive_name(assign_id(x)) for x in dta_select_parser(file, return_reverse=False)} for file in files]
uniprot_name = { k: v for d in uniprot_name for k, v in d.items() }

In [9]:
df = pd.DataFrame({file.rsplit("/", 1)[-1].split(".")[0].replace(" ", "_"): {x['id']: x['quantification'] for x in map(assign_id, dta_select_parser(file, return_reverse=False))} for file in files})
df.head()

Unnamed: 0,BS_sonic_1,BS_sonic_2,BS_sonic_3,BS_triflic_1,BS_triflic_2,BS_triflic_3,Jurkat_sonic_1,Jurkat_sonic_2,Jurkat_sonic_3,Jurkat_triflic_1,...,Jurkat_triflic_3,PA_pellet_1,PA_pellet_2,PA_pellet_3,PA_sonic_1,PA_sonic_2,PA_sonic_3,PA_triflic_1,PA_triflic_2,PA_triflic_3
A0A024QZP7,,,,,,,,,6.0,,...,,,,,,,,,,
A0A024R571,,,,,,,4.0,6.0,6.0,6.0,...,2.0,,,,,,,,,
A0A075B6E2,,,,,,,,,,4.0,...,,,,,,,,,,
A0A075B716,,,,,,,,3.0,6.0,5.0,...,,,,,,,,,,
A0A075B759,,,,,,,2.0,,,,...,,,,,,,,,,


In [10]:
df.to_csv("X.csv")

In [11]:
### run deseq.R 

In [12]:
# add extra 'name' column onto deseq output. Could do this in R, but I hate r, so...

In [22]:
bs = pd.read_csv("BS_deseq_results.csv", index_col=0)
bs['name'] = bs.index.map(uniprot_name.get)
bs.to_csv("BS_deseq_results.csv")
bs.head()

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,name
P80239,747.568119,4.366532,0.29876,14.615502,2.2369979999999998e-48,2.427142e-45,Alkyl hydroperoxide reductase subunit C OS=Bac...
P54375,329.408983,-5.71213,0.430566,-13.266562,3.618028e-40,1.96278e-37,Superoxide dismutase [Mn] OS=Bacillus subtilis...
P0CI73,174.453284,-5.838118,0.460517,-12.677324,7.898288e-37,2.856547e-34,Glutamine--fructose-6-phosphate aminotransfera...
P26901,280.451418,3.304183,0.278414,11.867856,1.738671e-32,4.716144e-30,Vegetative catalase OS=Bacillus subtilis (stra...
P94391,180.938691,4.242991,0.359475,11.803305,3.752764e-32,8.143498e-30,1-pyrroline-5-carboxylate dehydrogenase 2 OS=B...


In [23]:
pa = pd.read_csv("PA_deseq_results.csv", index_col=0)
pa['name'] = pa.index.map(uniprot_name.get)
pa.to_csv("PA_deseq_results.csv")
pa.head()

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,name
Q9HWW1,556.096497,-5.233437,0.509147,-10.278822,8.779375e-25,9.990929e-22,Outer membrane protein OprG OS=Pseudomonas aer...
P13794,546.292125,-7.082614,0.711011,-9.961331,2.250297e-23,1.280419e-20,Outer membrane porin F OS=Pseudomonas aerugino...
Q9HVI2,301.440154,-11.803714,1.307331,-9.028861,1.734677e-19,6.580210000000001e-17,Uncharacterized protein OS=Pseudomonas aerugin...
Q9HUM0,147.195664,-7.388025,0.830334,-8.897659,5.703680999999999e-19,1.622697e-16,RNA-binding protein Hfq OS=Pseudomonas aerugin...
Q9I4S1,49.578217,-6.486841,0.98276,-6.600635,4.094003e-11,9.31795e-09,Uncharacterized protein OS=Pseudomonas aerugin...


In [24]:
j = pd.read_csv("Jurkat_deseq_results.csv", index_col=0)
j['name'] = j.index.map(uniprot_name.get)
j.to_csv("Jurkat_deseq_results.csv")
j.head()

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,name
Q7L7L0,14.962841,-21.884964,2.639984,-8.28981,1.134298e-16,6.862504e-14,Histone H2A type 3 OS=Homo sapiens GN=HIST3H2A...
Q9BVA1,104.959018,9.760908,1.898267,5.142009,2.718158e-07,8.222428e-05,Tubulin beta-2B chain OS=Homo sapiens GN=TUBB2...
E9PQB7,66.688826,9.106474,1.966885,4.629896,3.658494e-06,0.0007377963,Cofilin-1 (Fragment) OS=Homo sapiens GN=CFL1 P...
P04075,44.717244,8.529568,1.920425,4.4415,8.933395e-06,0.001351176,Fructose-bisphosphate aldolase A OS=Homo sapie...
F5GWF6,12.168573,-7.478031,1.995232,-3.74795,0.0001782855,0.01797712,T-complex protein 1 subunit beta OS=Homo sapie...
