In [79]:
## Import Modules ##
# import wget  # TODO: Add paths to MM outputs from wget
import numpy as np
import pandas as pd
from pathlib import Path
import csv
from collections import defaultdict

## Import Custom Modules ##
from m_gen_maps import GenMap

## Input Files ##
# Tab-separated table read into the transcript -> gene lookup; when it is missing
# the notebook calls GenMap(gtf_file, 'trans_to_gene') first (presumably to create it).
trans_to_gene_file = './trans_to_gene.tsv'
gtf_file = '../jurkat_analysis/a_gencode_gene_models/gencode.v35.annotation.gtf' # only need this if trans_to_gene file does not exist
# Tab-separated table supplying the 'uniprot_gene' and 'gencode_genes' columns.
pbacc_to_gene_file = './uniprot_acc_to_gencode_gene.tsv'

## MM Inputs ##
# One AllPeptides.psmtsv per search; judging by the directory names these are the
# GENCODE, UniProt-reviewed and PacBio database searches respectively.
pep = './../Map_PacBio_Gene_Space/New_ForGloria/GENCODE_New_28Tryp/Task2-SearchTask/AllPeptides.psmtsv'
un_pep = './../Map_PacBio_Gene_Space/ForGloria/28FractionBUTrypsin/UniProtReviewed/Task2-SearchTask/AllPeptides.psmtsv'
pb_pep = './../Map_PacBio_Gene_Space/ForGloria/28FractionBUTrypsin/PacBio_new/Task2-SearchTask/AllPeptides.psmtsv'

## TODO's ##
# Fix uniprot gene mapping issues 
# Add a path argument to GenMap to control its output location
# Put all of the analysis portion (including dictionaries?) into separate module 
# Start adding in code for peptide comparison



In [97]:
## Build the lookup dictionaries used to translate MM outputs to GENCODE genes ##

# --- GENCODE: transcript_name -> gene ---
# The transcript-to-gene table is only generated when it is not already on disk.
# TODO: Add path in GenMap file to control output location
if not Path(trans_to_gene_file).is_file():
    GenMap(gtf_file, 'trans_to_gene')

# Columns are relabelled positionally: 'B' supplies the dictionary keys and 'A'
# the values (presumably transcript_name and gene -- confirm against the GenMap output).
trans_to_gene = pd.read_csv(trans_to_gene_file, sep='\t')
trans_to_gene.columns = ['A', 'B']
gdict = trans_to_gene.set_index('B')['A'].to_dict()

# --- UniProt: uniprot_gene -> [gencode_genes] ---
# Rows with any missing value are dropped before the lookup is built.
pbacc_to_gene = pd.read_csv(pbacc_to_gene_file, sep='\t').dropna()
uniprot_lookup = pbacc_to_gene[['uniprot_gene', 'gencode_genes']].set_index('uniprot_gene')
# The 'list' orientation makes every udict value a list, not a bare string.
udict = uniprot_lookup.T.to_dict('list')


In [40]:
# Import Peptides Dataset (search results read from `pep`)
cols = ['Base Sequence', 'Protein Accession', 'Decoy/Contaminant/Target', 'QValue']
# sep='\t' is a real tab, so pandas stays on the C parser; the two-character
# r"\t" is interpreted as a regex and triggers the slow python-engine fallback.
# Columns are renamed by name because read_csv returns them in file order,
# not in the order given to usecols.
data = pd.read_csv(pep, sep='\t', usecols=cols).rename(columns={
    'Base Sequence': 'seq',
    'Protein Accession': 'pb_acc',
    'Decoy/Contaminant/Target': 'dct',
    'QValue': 'qval',
})

# Filter Data: q-value <= 0.01, then rows flagged 'T' (presumably targets).
fdata = data[data['qval'] <= 0.01]
# .copy() makes tdata an independent frame so the 'gene' column added below is
# not written into a slice of `data` (avoids SettingWithCopyWarning).
tdata = fdata[fdata['dct'] == 'T'].copy()

# Extract pb_acc col and split each row on '|' or '.'
cut = tdata[['pb_acc']]
split = cut.pb_acc.str.split(r'\||\.', expand=True)

# Replace pb_acc (transcript_name) -> gene_name and remove duplicates
# TODO: There are some transcripts that don't map to genes. Ignoring those for now
gen = split.apply(lambda x: x.map(gdict, na_action='ignore'))
# Explicit dropna() so unmapped pieces are discarded regardless of how the
# installed pandas version's stack() treats missing values.
tdata['gene'] = gen.stack().dropna().groupby(level=0).apply(lambda x: x.unique().tolist())


# Gene based results: one column per gene in each row's list, plus the sequence
p = tdata.gene.apply(pd.Series)
p.insert(0, 'seq', tdata.seq.values)

# sort the sequences by gene: long format with one (seq, gene) pair per row
sort = (
    p.melt(id_vars=['seq'], value_name="gene")
     .dropna()
     .reset_index(drop=True)
     .drop(columns='variable')  # keyword form; positional axis was removed in pandas 2.0
)
pro = sort.groupby('gene')


In [109]:
# Import Peptides Dataset (search results read from `un_pep`)
cols = ['Base Sequence', 'Gene Name', 'Decoy/Contaminant/Target', 'QValue']
# sep='\t' is a real tab, so pandas stays on the C parser; the two-character
# r"\t" is interpreted as a regex and triggers the slow python-engine fallback.
# Columns are renamed by name because read_csv returns them in file order,
# not in the order given to usecols.
data = pd.read_csv(un_pep, sep='\t', usecols=cols).rename(columns={
    'Base Sequence': 'seq',
    'Gene Name': 'gene',
    'Decoy/Contaminant/Target': 'dct',
    'QValue': 'qval',
})

# Remove 'primary:' from gene column (literal text, not a regular expression)
data['gene'] = data['gene'].str.replace('primary:', '', regex=False)

# Filter Data: q-value <= 0.01, then rows flagged 'T' (presumably targets).
fdata = data[data['qval'] <= 0.01]
tdata = fdata[fdata['dct'] == 'T']
print(tdata.shape)

# Extract gene col and split each row on '|' or '.'
cut = tdata[['gene']]
split = cut.gene.str.split(r'\||\.', expand=True)


# Replace each name found in udict with its udict entry; names without an entry
# fall back to the original text from `split`.
gen = split.apply(lambda x: x.map(udict, na_action='ignore')).fillna(split)
print(gen.shape)

# Explicit dropna() so empty cells are discarded regardless of how the installed
# pandas version's stack() treats missing values.
# NOTE(review): udict values are lists, so the de-duplication step below was
# presumably disabled because .unique() cannot hash them -- confirm before re-enabling.
stack = gen.stack().dropna().groupby(level=0) #.apply(lambda x: x.unique().tolist())
print(stack)





(76690, 4)
(76690, 9)
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000002279E3F6CA0>


In [61]:
# Import Peptides Dataset (search results read from `pb_pep`)
cols = ['Base Sequence', 'Gene Name', 'Decoy/Contaminant/Target', 'QValue']
# sep='\t' is a real tab, so pandas stays on the C parser; the two-character
# r"\t" is interpreted as a regex and triggers the slow python-engine fallback.
# Columns are renamed by name because read_csv returns them in file order,
# not in the order given to usecols.
data = pd.read_csv(pb_pep, sep='\t', usecols=cols).rename(columns={
    'Base Sequence': 'seq',
    'Gene Name': 'gene',
    'Decoy/Contaminant/Target': 'dct',
    'QValue': 'qval',
})

# Remove 'primary:' from gene column (literal text, not a regular expression)
data['gene'] = data['gene'].str.replace('primary:', '', regex=False)

# Filter Data: q-value <= 0.01, then rows flagged 'T' (presumably targets).
fdata = data[data['qval'] <= 0.01]
# .copy() makes tdata an independent frame so overwriting 'gene' below does not
# write into a slice of `data` (avoids SettingWithCopyWarning).
tdata = fdata[fdata['dct'] == 'T'].copy()

# Extract gene col and split each row on '|' or '.'
cut = tdata[['gene']]
split = cut.gene.str.split(r'\||\.', expand=True)

# Remove duplicate genes: each row's 'gene' becomes a list of its unique pieces.
# Explicit dropna() so empty cells are discarded regardless of how the installed
# pandas version's stack() treats missing values.
tdata['gene'] = split.stack().dropna().groupby(level=0).apply(lambda x: x.unique().tolist())

# Gene based results: one column per gene in each row's list, plus the sequence
p = tdata.gene.apply(pd.Series)
p.insert(0, 'seq', tdata.seq.values)

# sort the sequences by gene: long format with one (seq, gene) pair per row
sort = (
    p.melt(id_vars=['seq'], value_name="gene")
     .dropna()
     .reset_index(drop=True)
     .drop(columns='variable')  # keyword form; positional axis was removed in pandas 2.0
)
pro = sort.groupby('gene')