In [1]:
import pandas as pd
import os
import glob
import timeit
import numpy as np
import h5py
from collections import defaultdict
from helpers_STAR import * 

In [None]:
# -------- Path STAR
# Root directory of the STAR realignment results; also handed to the helper
# functions that read the per-chromosome projected files.
path_star = '/cluster/work/grlab/projects/GTEx/rna_gencode32_realign/results'
# Star junctions - unique coordinates
# NOTE: `star_jx` holds the file path here; the STAR loading cell later rebinds
# the same name to the table read from this file.
star_jx = os.path.join(path_star, 'junctions_spladder.all_coords.sorted.uniq.tsv.gz')
# Star junctions - projected coordinates and expression
# (kept for reference only: per-chromosome file name pattern, not used in this notebook)
#projected_chr_file = os.path.join(path_star, f'junctions_spladder_projected/junctions_spladder.projected.{chrm}.hdf5')


# -------- Intermediate filtering results (threshold and merged)
# TODO: decide whether the generation matrix should be produced in this notebook
# Foreground matrix (tab-separated, gzip-compressed); also the base name of the saved output
big_matrix = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374/filtering_intermediate/complete_cancer_candidates_order_r_complete.tsv.gz'



# -------- GTEX filtering 
# Both paths are passed unchanged to `collect_expression_thresholds`.
# presumably: list of GTEx sample IDs to keep -- TODO confirm against helpers_STAR
whitelist = '/cluster/work/grlab/projects/projects2020_OHSU/sample_lists/GTEX/GTEx_sample_IDs_10-2021_lib_graph_juliannelist'
# presumably: per-sample library sizes used for normalisation -- TODO confirm against helpers_STAR
libsize = '/cluster/work/grlab/projects/TCGA/PanCanAtlas/immunopepper_paper/peptides_ccell_rerun_gtex_151220/ARCHIV_keep_runs/GTEX2019_commit_v3_TEST_merged3_372a147_medium_run_pya.0.17.1_conf2_annot_ref_chrall_cap/expression_counts.libsize.tsv'


# Passed to `collect_expression_thresholds` and `get_junction_counts`.
# presumably the library size the counts are rescaled to -- TODO confirm
normalizer = 400000
# Thresholds evaluated on the normalised counts; the `gtex_cols` labels defined
# further down list one column per value, in this order.
filter_thresholds = [0.0, 1.0, 2.0, 3.0, 5.0, 10.0]
# Optional one-column file of `coord` values; when set, the generation matrix is
# restricted to these coordinates right after loading.
jx_target_list = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374/filtering_samples/filters_22March_order_wany_wAnnot/tmp_all_experiments_jx.txt'

# Load

In [None]:
# Read the foreground (cancer) generation matrix from its tab-separated file.
mx = pd.read_csv(big_matrix, sep='\t')
display(mx.head())

# Append the split junction information (strand + both junction coordinates)
# returned by the helper to the generation table.
coord_mx = explode_immunopepper_coord(mx)
display(coord_mx.head())
junction_columns = ['strand', 'junction_coordinate1', 'junction_coordinate2']
mx = pd.concat([mx, coord_mx.loc[:, junction_columns]], axis=1)
print(mx.shape)

# When a target list is configured, keep only the rows whose `coord` is listed.
if jx_target_list:
    target_jx = pd.read_csv(jx_target_list, header=None)
    target_jx.columns = ['coord']
    mx = pd.merge(mx, target_jx, how='inner', on='coord')
print(mx.shape)

In [None]:
# Show the matrix dimensions followed by one empty line.
print(mx.shape, end='\n\n')

In [None]:
# LOAD STAR junctions (the original note mentions ~10 min for this cell)
# `star_jx` starts out as the file path from the configuration cell and is
# rebound to the loaded table below. Remember the path on the first pass so
# that re-running this cell does not hand a DataFrame to `read_csv`.
if isinstance(star_jx, str):
    star_jx_path = star_jx
star_jx = pd.read_csv(star_jx_path, sep='\t')
display(star_jx.head())

# Add the 'start:end' key that is compared against the junction_coordinate1/2
# columns of the generation matrix.
star_jx['junction_coordinate'] = (
    star_jx['junction_start'].astype(str) + ':' + star_jx['junction_end'].astype(str)
)


# Check junction presence in STAR 
- 2 exons (1 junction): the junction must be present in STAR
- 3 exons (2 junctions): both the first and the second junction must be present in STAR

In [None]:
# Foreground kmers spanning exactly one junction: the first junction slot is
# filled and the second one holds the 'None' placeholder string.
has_first_jx = mx['junction_coordinate1'].ne('None')
lacks_second_jx = mx['junction_coordinate2'].eq('None')
one_jx = mx.loc[has_first_jx & lacks_second_jx]
print(one_jx.shape)

In [None]:
# Foreground kmers spanning two junctions: neither junction slot holds the
# 'None' placeholder string.
first_jx_filled = mx['junction_coordinate1'].ne('None')
second_jx_filled = mx['junction_coordinate2'].ne('None')
two_jx = mx.loc[first_jx_filled & second_jx_filled]
print(two_jx.shape)

In [None]:
# Foreground 1 junction - junctions that ARE present in STAR
isstar_one = set(one_jx['junction_coordinate1']).intersection(set(star_jx['junction_coordinate']))  # junction coordinates
print(len(isstar_one))

# Corresponding kmer coordinates. A boolean row mask is used instead of
# `.loc[<set>]` because pandas >= 2.0 rejects sets as indexers (TypeError).
in_star = one_jx['junction_coordinate1'].isin(isstar_one)
is_star_one_jx = set(one_jx.loc[in_star, 'coord'])
print(len(is_star_one_jx))

In [None]:
# Foreground 2 junctions - junctions that ARE present in STAR
# Build the STAR coordinate set once and reuse it for both junction slots.
star_coordinates = set(star_jx['junction_coordinate'])
is_star_two1 = set(two_jx['junction_coordinate1']).intersection(star_coordinates)  # Junction coordinates
is_star_two2 = set(two_jx['junction_coordinate2']).intersection(star_coordinates)  # Junction coordinates
print(len(is_star_two1))
print(len(is_star_two2))

# Corresponding kmer coordinates: a kmer is kept when its first junction AND
# its second junction are found in STAR. Boolean row masks replace
# `.loc[<set>]`, which pandas >= 2.0 rejects with a TypeError.
first_jx_coords = set(two_jx.loc[two_jx['junction_coordinate1'].isin(is_star_two1), 'coord'])
second_jx_coords = set(two_jx.loc[two_jx['junction_coordinate2'].isin(is_star_two2), 'coord'])
is_star_two_jx = first_jx_coords.intersection(second_jx_coords)
print(len(is_star_two_jx))

In [None]:
# Number of STAR-supported kmer coordinates: one-junction first, then two-junction.
print(len(is_star_one_jx), len(is_star_two_jx), sep='\n')

In [None]:
# Foreground table - flag the kmers whose junction(s) were found in STAR (GTEx)
tic = timeit.default_timer()
star_supported_coords = list(is_star_one_jx | is_star_two_jx)
# Default every row to False, then switch on the supported coordinates via a
# temporary `coord` index.
mx = mx.assign(STAR_GTEx_jx=False).set_index('coord')
mx.loc[star_supported_coords, 'STAR_GTEx_jx'] = True
mx = mx.reset_index()
print(timeit.default_timer() - tic)

# Retrieve expression 

### Extract metadata

In [3]:
# Distinct strand / junction combinations of the STAR-flagged kmers; these are
# the junctions to look up in the STAR files.
query_columns = ['strand', 'junction_coordinate1', 'junction_coordinate2', 'STAR_GTEx_jx']
query = (
    mx.loc[mx['STAR_GTEx_jx'] == True, query_columns]
      .drop_duplicates()
      .reset_index(drop=True)
)

# Sanity check: a first junction must never be paired with more than one strand.
n_jx_strand_pairs = len(query[['junction_coordinate1', 'strand']].drop_duplicates())
n_first_jx = len(query[['junction_coordinate1']].drop_duplicates())
assert n_jx_strand_pairs == n_first_jx

NameError: name 'mx' is not defined

In [None]:
# Query-junction to strand lookup
# First-junction entries are inserted before second-junction entries, so a
# coordinate appearing in both columns ends up with the strand from the
# second-junction pass (same precedence as filling the dict row by row).
jx_strand = {}
for jx_column in ('junction_coordinate1', 'junction_coordinate2'):
    jx_strand.update(zip(query[jx_column], query['strand']))

# 'None' is the placeholder for a missing second junction. It is absent when
# every queried kmer spans two junctions, so pop with a default instead of
# raising KeyError.
jx_strand.pop('None', None)

In [15]:
# Chromosome to query-junction
# Index the STAR table by junction coordinate for fast label lookups. The guard
# keeps the cell re-runnable: once the column has become the index, calling
# `set_index` again would raise KeyError.
if 'junction_coordinate' in star_jx.columns:
    star_jx = star_jx.set_index('junction_coordinate')  # Faster

# Explicit list of labels (not a dict view) for the label-based selection.
intermediate = star_jx.loc[list(jx_strand), :]

start_time = timeit.default_timer()
# One set of queried junction coordinates per chromosome.
chr_jx = defaultdict(set)
for jx, chrm in zip(intermediate.index, intermediate['chr']):
    chr_jx[chrm].add(jx)
print(timeit.default_timer() - start_time)
    

0.016046482021920383


### Collect Expression 

In [None]:
### Thresholding experiments
# Open question: one junction can occur on several chromosomes --> which one to use?
# One junction on one chromosome can occur on several strands --> take the strand from Immunopepper.
res = collect_expression_thresholds(
    libsize,
    whitelist,
    normalizer,
    filter_thresholds,
    path_star,
    chr_jx,
    jx_strand,
)

chr1


In [None]:
# OPTIONAL: number of queried junctions per chromosome, then the overall total
for chrm, jxS in chr_jx.items():
    print(f'chrm {chrm } number junctions {len(jxS)}')
tot = sum(map(len, chr_jx.values()))
print(f'Total number junctions {tot}')

In [None]:
# Labels of the GTEx cohort-filter columns: a strict '>' for the 0.0 threshold
# and '>=' for every other threshold.
gtex_cols = ['gtexCohortfilter >0.0'] + [
    f'gtexCohortfilter >={threshold}'
    for threshold in ('1.0', '2.0', '3.0', '5.0', '10.0')
]

In [None]:
# Turn the collected expression records into a table: three identifier columns
# followed by one column per GTEx filter label.
res_columns = ['junction_coordinate', 'strand_STAR', 'chr', *gtex_cols]
df_res = pd.DataFrame(res, columns=res_columns)
print(df_res.shape)
display(df_res.head())

### Format results

In [None]:
# One-junction kmers: a direct merge on the single junction coordinate.
print(one_jx.shape)

# Drop the GTEx quantifications that came from immunopepper and collapse the
# rows that become identical without them.
base_one_jx = one_jx.drop(columns=gtex_cols).drop_duplicates()
print(base_one_jx.shape)

# Attach the GTEx quantifications computed from STAR; kmers without a STAR
# record are kept (left merge).
one_jx_quantified = pd.merge(
    base_one_jx,
    df_res,
    how='left',
    left_on='junction_coordinate1',
    right_on='junction_coordinate',
)
print(one_jx_quantified.columns)
display(one_jx_quantified.tail())

In [None]:
# Two-junction merge (not straightforward: each junction is quantified separately
# against STAR and the per-threshold maximum of the two is kept)
print(two_jx.shape)
# Remove the GTEX quantifications from immunopepper
base_two_jx = two_jx.drop(gtex_cols, axis = 1).drop_duplicates() 
# (value not displayed: only the last expression of a cell is rendered)
base_two_jx.head()
print(base_two_jx.shape)
# Add GTEX quantifications from STAR  # first junction
two_jx_quantified_left = base_two_jx.merge(df_res, left_on = 'junction_coordinate1', 
                                      right_on = 'junction_coordinate', how = 'left')


# Add GTEX quantifications from STAR  # second junction
two_jx_quantified_right = base_two_jx.merge(df_res, left_on = 'junction_coordinate2', 
                                      right_on = 'junction_coordinate', how = 'left')

# Merge keys: every column except the GTEx filter columns.
# NOTE(review): this key list still contains 'junction_coordinate', 'strand_STAR'
# and 'chr', which come from df_res. The left frame carries the first junction in
# 'junction_coordinate' and the right frame the second one, so rows of the same
# kmer may fail to pair up in the outer merge below; the max would then see a
# single side per row -- TODO confirm whether this is intended.
col_merge = [col for col in two_jx_quantified_right if col not in gtex_cols]

# Add GTEX quantifications from STAR  # both junctions
# The GTEx columns exist on both sides and are not merge keys, so they come out
# with the '_x' (first junction) and '_y' (second junction) suffixes.
two_jx_quantified = two_jx_quantified_left.merge(two_jx_quantified_right, on = col_merge, how = 'outer')

# Add GTEX quantifications from STAR  # MAX (!!) over 2 junctions
# Row-wise maximum of the two suffixed columns, ignoring NaN; the suffixed
# columns are dropped once the combined column has been written.
for col in gtex_cols:
    two_jx_quantified[col] = two_jx_quantified[[col + '_x', col + '_y']].max(skipna = True, axis = 1)
    two_jx_quantified = two_jx_quantified.drop([col + '_x', col + '_y'], axis = 1)
    
print(two_jx_quantified.columns)
display(two_jx_quantified.tail())

In [None]:
# All junctions quantified on STAR for GTEx: report the size of both parts,
# then stack the one-junction table on top of the two-junction table.
for quantified_part in (two_jx_quantified, one_jx_quantified):
    print(quantified_part.shape)
jx_quantified = pd.concat([one_jx_quantified, two_jx_quantified])
print(jx_quantified.shape)

In [None]:
### Save
# The output sits next to the input matrix, with 'STAR_GTEx' inserted in front
# of the 'tsv.gz' extension.
save_path = big_matrix.replace('tsv.gz', 'STAR_GTEx.tsv.gz')
print(save_path)
jx_quantified.to_csv(
    save_path,
    sep='\t',
    index=False,
    compression='gzip',
)

# Quality check

In [None]:
# Spot check of a single junction: recompute its normalised counts directly
# from the projected STAR files on a few chromosomes and print the recurrence
# obtained at every filter threshold.
chrm_list = ['chr2', 'chr14']
strand = '-'
jx = '102015975:102016000'
junction_start, junction_end = (int(position) for position in jx.split(':'))

for chrm in chrm_list:
    # `whitelist` and `libsize` are the names defined in the configuration cell
    # (the previous `whitelist_normal` / `libsize_normal` are not defined
    # anywhere in this notebook).
    # assumes the helper accepts the same path values that are passed to
    # `collect_expression_thresholds` -- TODO confirm against helpers_STAR
    expression_h5, index_whitelist_samples, lib_75_per_sample = preprocess_STAR_projected(
        chrm, path_star, whitelist, libsize
    )

    normalized_counts = get_junction_counts(junction_start, junction_end,
                                            chrm, strand, expression_h5,
                                            index_whitelist_samples,
                                            lib_75_per_sample, normalizer)
    # The helper can return None (no counts to report for this chromosome);
    # skip the thresholds in that case.
    if normalized_counts is not None:
        for threshold in filter_thresholds:
            print(filter_recurrence(normalized_counts, threshold))


In [None]:
# STAR records whose start and end both match the junction under inspection.
matches_start = star_jx['junction_start'] == junction_start
matches_end = star_jx['junction_end'] == junction_end
star_jx.loc[matches_start & matches_end]
