# Unique and Shared Isoforms
Extracts information of the number of unique and shared isoforms found within the sample, then saves analysis as tables.


In [1]:


import os
from re import sub


from huvec_analysis import huvec_config, data_loader
import pandas as pd 
import matplotlib.pyplot as plt 
import os 
import matplotlib
matplotlib.rc('font', **huvec_config.font)

# all plots go into a directory
if not os.path.exists('plot'):
    os.makedirs('plot')

# all statistics go into a directory
if not os.path.exists('stats'):
    os.makedirs('stats')


In [2]:

# read in the paths to the sqanti data 
sqanti_info_path = '../00_pre_analysis/sqanti_info/sqanti_info.tsv'
fasta_file = f'{huvec_config.PIPELINE_RESULTS_DIRECTORY}/hybrid_protein_database/huvec_hybrid.fasta'
pacbio_protein_grps_path = '../00_pre_analysis/metamorpheus_table/AllProteinGroups.PacBioHybrid.tsv'
pacbio_peps_path = f'{huvec_config.PIPELINE_RESULTS_DIRECTORY}/metamorpheus/pacbio/hybrid/search_results/Task1SearchTask/AllPeptides.huvec.hybrid.psmtsv'
pacbio_peps_filtered_path = '../00_pre_analysis/metamorpheus_table/AllPeptides.PacBioHybrid.tsv'
gencode_pr_grps_path = f'{huvec_config.PIPELINE_RESULTS_DIRECTORY}/metamorpheus/gencode/search_results/Task1SearchTask/AllQuantifiedProteinGroups.Gencode.tsv'
human_ec_gene_path = f'{huvec_config.REFERENCE_DIRECTORY}/human_ec_genes_from_karen.txt'
#read in the files 
sqanti_info = pd.read_table(sqanti_info_path)
pacbio_pr_grps = pd.read_table(pacbio_protein_grps_path, index_col= False)
pacbio_pr_grps = pacbio_pr_grps.dropna(subset=['Gene'])
# read in the GENCODE protein groups
gencode_pr_grps = pd.read_table(gencode_pr_grps_path, index_col= False)
gencode_pr_grps = gencode_pr_grps.dropna(subset=['Gene'])
human_ec_genes = pd.read_table(human_ec_gene_path, header = None)[0].to_list()


In [3]:
# expand the gene column and split on the |
# pacbio_pr_grps = pacbio_pr_grps.head()
pacbio_pr_grps['gene_list'] = pacbio_pr_grps['Gene'].str.split("|")
pacbio_pr_grps['gene_set'] = pacbio_pr_grps['gene_list'].apply(set)

# so now drop any columns where there is more than 1 value in the set 
def has_multiple(group):
    return len(group) > 1
pacbio_pr_grps['has_mult_isos'] = pacbio_pr_grps['gene_set'].apply(has_multiple)
pacbio_pr_grps['has_mult_isos'] = pacbio_pr_grps['has_mult_isos'].astype(int)
pacbio_pr_grps['gene_name'] = pacbio_pr_grps['Gene'].str.split('|').str[0]
pacbio_pr_grps['ec_priority'] = pacbio_pr_grps['Gene'].isin(human_ec_genes) * 1
#pacbio_pr_grps.to_csv('stats/01_shared_iso.csv', index=False)


In [4]:

# find all the values where has_mult_iso i true 
#multi_mapping_isos = pacbio_pr_grps[pacbio_pr_grps['has_mult_isos']==True]
#multi_mapping_isos.to_csv('stats/shared_iso_is_true')

single_gene_pr_grps_pacbio = pacbio_pr_grps[~pacbio_pr_grps['gene_set'].apply(has_multiple)]
pacbio_num_genes = single_gene_pr_grps_pacbio.groupby('gene_name').agg(number_of_groups=('Gene', 'count'))

# perform the simialar analysis for the gencode protein groups we discovered 
# so now drop any columns where there is more than 1 value in the set 
gencode_pr_grps['gene_list'] = gencode_pr_grps['Gene'].str.split("|")
gencode_pr_grps['gene_set'] = gencode_pr_grps['gene_list'].apply(set)

gencode_pr_grps['has_mult_isos'] = gencode_pr_grps['gene_set'].apply(has_multiple)
gencode_pr_grps['has_mult_isos'] = gencode_pr_grps['has_mult_isos'].astype(int)
gencode_pr_grps['gene_name'] = gencode_pr_grps['Gene'].str.split('|').str[0]
gencode_pr_grps['ec_priority'] = gencode_pr_grps['Gene'].isin(human_ec_genes) * 1

single_gene_pr_grps_gencode = gencode_pr_grps[~gencode_pr_grps['gene_set'].apply(has_multiple)]
gencode_num_genes = single_gene_pr_grps_gencode.groupby('gene_name').agg(number_of_groups=('Gene', 'count'))


In [5]:

# find for how many unique peptide found between pb and gc 
pb_uniq = pacbio_pr_grps[pacbio_pr_grps['Number of Unique Peptides']==1]
gc_uniq = gencode_pr_grps[gencode_pr_grps['Number of Unique Peptides'] == 1]

# look at the peptides file to find all the columns with 1 accession 
pacbio_peps_filtered = pd.read_table(pacbio_peps_filtered_path)

# find the occurences of | within the protein accession column 
def read_peps_file(peps_file):
    peps_table = pd.read_table(pacbio_peps_filtered_path, sep = '\t')
    sub = '|'
    peps_table['unique_isoform'] = peps_table['Protein Accession'].str.find(sub)
    gencode_peps_isoform_resolve = peps_table[peps_table['unique_isoform']== -1]
    return gencode_peps_isoform_resolve
pacbio_peps_filtered = read_peps_file(pacbio_peps_filtered_path)
pacbio_peps_filtered.to_csv('stats/test_isoform_resolve.tsv', sep = '\t')
# find for how many have multi mapping accessions (indicating for how many peps are mapping to multiple isos)


In [6]:
# read in the pacbio filtered peps 
pacbio_peps_filtered = pd.read_table(pacbio_peps_filtered_path)
pacbio_peps_filtered

# find the | substring within the protein accession 
sub = '|'
pacbio_peps_filtered['unique_isoform'] = pacbio_peps_filtered['Protein Accession'].str.find(sub)
pacbio_peps_filtered['character_count'] = pacbio_peps_filtered['Protein Accession'].str.count('\|')
pacbio_peps_filtered.to_csv('stats/pacbio_peps_unique_isos.tsv', sep = '\t')

# find how many times the unique_isoform does not equal -1
multi_map_isos = pacbio_peps_filtered[pacbio_peps_filtered['unique_isoform']!=-1]
len(multi_map_isos)
multi_map_isos.gene_name.nunique()


4790

In [7]:


# find the number of single mapping isos
single_map_isos = pacbio_peps_filtered[pacbio_peps_filtered['unique_isoform']==-1]

unique_single_isos = single_map_isos.gene_name.unique()
unique_single_isos_set = set(unique_single_isos)
pacbio_pr_grps = pd.read_table(pacbio_protein_grps_path)

# find how many unique peptides that were detected 
unique_peps = pacbio_pr_grps[pacbio_pr_grps['Number of Unique Peptides']==1]



In [8]:

# look at the protein groups file to find the multi-mapping protein accession rows 
pacbio_pr_grps['unique_isoform'] = pacbio_pr_grps['Protein Accession'].str.find(sub)
pacbio_pr_grps['ec_priority'] = pacbio_pr_grps['gene_name'].isin(human_ec_genes) * 1
pacbio_pr_grps.to_csv('stats/pacbio_pr_grp.tsv', '\t')
not_unique_pr_grps = pacbio_pr_grps[pacbio_pr_grps['unique_isoform']!=-1]
len(not_unique_pr_grps)


3975

In [10]:
# table of the peptides found in allpeps but not in AllProteingrps
common = single_map_isos['Base Sequence'].isin(pacbio_pr_grps['Unique Peptides'])

single_map_isos['peptide_is_in_common'] = single_map_isos['Base Sequence'].isin(pacbio_pr_grps['Unique Peptides'])
common_peptide_to_both = single_map_isos[single_map_isos['peptide_is_in_common']==False]
common_peptide_to_both.to_csv('stats/common_peptide_to_both_isFalse.tsv', sep = '\t')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_map_isos['peptide_is_in_common'] = single_map_isos['Base Sequence'].isin(pacbio_pr_grps['Unique Peptides'])
