# Enhanced protein isoform characterization through long-read proteogenomics 
## Supplementary Table 6
---
Ben Jordan

In [1]:
import data_loader
import config
import pandas as pd
PG = data_loader.ProteinGroup()


In [2]:
refined_file = f'{config.PIPELINE_RESULTS_DIRECTORY}/protein_gene_rename/jurkat_orf_refined_gene_update.tsv'

cpm_data = pd.read_table(refined_file, usecols=['base_acc', 'pr_gene','CPM'])
cpm_data = cpm_data[cpm_data['base_acc'].isin(data_loader.accs_in_hiconf_space)]
acc_cpm = pd.Series(cpm_data.CPM.values,index=cpm_data.base_acc).to_dict()
gene_cpm = cpm_data.groupby('pr_gene')['CPM'].sum().reset_index()
gene_cpm = pd.Series(gene_cpm.CPM.values,index=gene_cpm.pr_gene).to_dict()

## Rescued Groups

In [3]:
def get_all_hybrid_protein_group(protein_groups_filename, gene_map):
    protein_groups = (
        pd.read_table(
            protein_groups_filename,
            index_col=False,
            usecols=[0,1,5,6,7,8,9,16,19], 
            names=['protein_group','pg_gene','num_proteins_in_group','unique_peptides','shared_peptides','number_of_peptides','number_of_unique_peptides','dct','qval'], 
            skiprows=1)
        .query('dct == "T"')
    )
    protein_groups['accs'] = protein_groups['protein_group'].str.split('|')
    protein_groups['genes']= protein_groups['protein_group'].apply(lambda pgroup: data_loader.get_genes_in_protein_group(pgroup, gene_map))
    protein_groups['is_high_confidence'] = protein_groups['accs'].apply(lambda accs: data_loader.is_high_confidence(accs, data_loader.accs_in_hiconf_space))
    protein_groups= protein_groups.set_index('protein_group')
    return protein_groups

def table_rescued_groups(rescue_resolve_protein_groups, hybrid_protein_groups,acc_cpm, gene_cpm ):
    hybrid_group_accs = set(hybrid_protein_groups.index)
    
    rr_group_accs = set(rescue_resolve_protein_groups.index)
    rescued_group_accs = rr_group_accs - hybrid_group_accs
    rescued_groups = rescue_resolve_protein_groups.loc[rescued_group_accs]
    rescued_groups['absolute_CPM']= rescued_groups['accs'].apply(lambda accs: [round(acc_cpm[x],2) if x in acc_cpm.keys() else 0 for x in accs]  )
    rescued_groups['total_CPM'] = rescued_groups['absolute_CPM'].apply(lambda cpm: round(sum(cpm),2))
    rescued_groups['relative_CPM'] = rescued_groups.apply(lambda row: [round(abs_cpm/row['total_CPM'],2) for abs_cpm in row['absolute_CPM']],axis=1)
    rescued_groups['absolute_CPM'] = rescued_groups['absolute_CPM'].apply(lambda cpm: '|'.join([str(x) for x in cpm]))

    rescued_groups = rescued_groups.reset_index()
    rescued_groups = rescued_groups[['pg_gene','protein_group','absolute_CPM']]
    rescued_groups = rescued_groups.rename(columns = {
        'pg_gene' : 'Genes',
        'protein_group' : 'Protein Group',
        'absolute_CPM' : 'Absolute CPM',
    }) 
    rescued_groups.to_csv('stats/SuppTable6_rescue.tsv', sep='\t', index=False)
    return rescued_groups

    
pacbio_rescue_resolve_protein_group = PG.pacbio_rescue_resolve_protein_group.set_index('protein_group')
pacbio_rescue_resolve_protein_group = pacbio_rescue_resolve_protein_group[pacbio_rescue_resolve_protein_group['is_high_confidence']]
pacbio_hybrid_protein_group = get_all_hybrid_protein_group(data_loader.pacbio_hybrid_protein_group_file, data_loader.hybrid_gene_map)
rescued_table = table_rescued_groups(pacbio_rescue_resolve_protein_group, pacbio_hybrid_protein_group, acc_cpm, gene_cpm)

## Resolved Table

In [4]:
def get_relative_cpm(row, accession_cpm):
    absolute_cpm = [accession_cpm[acc] if acc in accession_cpm.keys() else 0 for acc in row['accs']]
    total_cpm = sum(absolute_cpm)
    if total_cpm > 0:
        relative_cpm = [cpm / total_cpm for cpm in absolute_cpm]
    else:
        relative_cpm = [0 for cpm in absolute_cpm]
    return relative_cpm


def get_resolved_accession(row, rescue_threshold = 0.9):
    if row['num_proteins_in_group'] == 1:
        return 'X'
    for acc, relative_cpm in zip(row['accs'], row['Relative CPM abundances']):
        if relative_cpm > rescue_threshold:
            return acc
    return 'X'

def table_resolve(rescue_resolve_protein_groups, accession_cpm):
    rescue_resolve_protein_groups['Relative CPM abundances'] = rescue_resolve_protein_groups.apply(lambda row: get_relative_cpm(row, accession_cpm), axis = 1)
    rescue_resolve_protein_groups['Resolved Accession'] = rescue_resolve_protein_groups.apply(get_resolved_accession, axis = 1)
    rescue_resolve_protein_groups['Is Resolved'] = rescue_resolve_protein_groups['Resolved Accession'].apply(lambda x: 'YES' if x != 'X' else 'NO')
    rescue_resolve_protein_groups.rename(columns = {
        'protein_group' : 'Protein Group (original)',
        'pg_gene': 'Genes'
    }, inplace=True)
    rescue_resolve_protein_groups['Relative CPM abundances'] = rescue_resolve_protein_groups['Relative CPM abundances'].apply(lambda vals: [round(x,2) for x in vals])
    rescue_resolve_protein_groups = rescue_resolve_protein_groups[['Protein Group (original)', 'Genes', 'Relative CPM abundances', 'Resolved Accession', 'Is Resolved']]
    rescue_resolve_protein_groups['Relative CPM abundances'] = rescue_resolve_protein_groups['Relative CPM abundances'].apply(lambda vals: [str(x) for x in vals])
    rescue_resolve_protein_groups['Relative CPM abundances'] = rescue_resolve_protein_groups['Relative CPM abundances'].apply(lambda vals: '|'.join(vals))
    rescue_resolve_protein_groups.to_csv('stats/SuppTable6_resolve.tsv', sep='\t', index=False)
    return rescue_resolve_protein_groups

def are_all_pb(accs):
    for acc in accs:
        if not acc.startswith('PB'):
            return False
    return True

In [5]:
pacbio_rescue_resolve_protein_group = PG.pacbio_rescue_resolve_protein_group
pacbio_rescue_resolve_protein_group['all_pb'] = pacbio_rescue_resolve_protein_group['accs'].apply(are_all_pb)
pacbio_rescue_resolve_protein_group = pacbio_rescue_resolve_protein_group[pacbio_rescue_resolve_protein_group['all_pb']]
pacbio_rescue_resolve_protein_group = pacbio_rescue_resolve_protein_group[pacbio_rescue_resolve_protein_group['num_proteins_in_group'] > 1]

# pacbio_rescue_resolve_protein_group = pacbio_rescue_resolve_protein_group[pacbio_rescue_resolve_protein_group['protein_group'].str.contains('|')]
# pacbio_rescue_resolve_protein_group = pacbio_rescue_resolve_protein_group[pacbio_rescue_resolve_protein_group['is_high_confidence']]
resolved_table = table_resolve(pacbio_rescue_resolve_protein_group.copy(), acc_cpm)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rescue_resolve_protein_groups['Relative CPM abundances'] = rescue_resolve_protein_groups['Relative CPM abundances'].apply(lambda vals: [str(x) for x in vals])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rescue_resolve_protein_groups['Relative CPM abundances'] = rescue_resolve_protein_groups['Relative CPM abundances'].apply(lambda vals: '|'.join(vals))


In [6]:
rescued_table

Unnamed: 0,Genes,Protein Group,Absolute CPM
0,IFNAR2,PB.15763.27,40.06
1,HMGN2,PB.290.1,4168.55
2,DNTT,PB.8755.10,52.87
3,ZNF512,PB.1833.9,58.87
4,CHI3L2,PB.831.39,29.71
...,...,...,...
350,ZWINT,PB.8539.24,39.52
351,SRSF1,PB.13660.1,48.51
352,BCCIP,PB.8955.7,48.24
353,RNF40,PB.12557.19,82.85


In [9]:
resolved_table.groupby('Is Resolved').size()

Is Resolved
NO     1163
YES    1434
dtype: int64

In [8]:
with pd.ExcelWriter('stats/SuppTable6.xlsx') as writer:
    rescued_table.to_excel(writer, sheet_name = 'Rescued Groups', index=False)
    resolved_table.to_excel(writer, sheet_name= 'Resolved Groups', index=False)