In [1]:
import pandas as pd
import os
import glob
import timeit
import numpy as np
import h5py
from collections import defaultdict
from helpers_STAR import * 
import argparse

In [None]:
##### MAIN #####
def build_arg_parser():
    """Build the command-line parser for the STAR/GTEx junction quantification run.

    Returns:
        argparse.ArgumentParser: parser exposing --path-star, --big-matrix,
        --whitelist, --libsize, --normalizer, --filter-thresholds and
        --jx-target-list.
    """
    parser = argparse.ArgumentParser(description='run specifications')
    parser.add_argument('--path-star', type=str, required=True, help='folder with star junctions')
    parser.add_argument('--big-matrix', type=str, required=False, default=None, help='Intermediate filtering matrix')
    parser.add_argument('--whitelist', type=str, required=True, help='whitelist of samples to include in recurrence analysis')
    parser.add_argument('--libsize', type=str, required=True, help='library size file')
    parser.add_argument('--normalizer', type=int, required=True, help='value to rescale after normalisation')
    parser.add_argument('--filter-thresholds', nargs='+', type=float, required=False, default=None, 
                        help='Number of reads to use as expression threshold in recurrence analysis')
    # The junction list is only used when provided (the loading step guards it
    # with `if jx_target_list:`), so it must not be a mandatory argument.
    parser.add_argument('--jx-target-list', type=str, required=False, default=None,
                        help='path to file with whitelist kmers coordinates')
    return parser


if __name__ == "__main__":
    # parse_args() reads sys.argv, so this is meant for runs where the notebook
    # is executed as a script with the flags above.
    parser = build_arg_parser()
    args = parser.parse_args()

    print(args)

In [None]:
# Unpack the parsed command-line namespace (`args`) into the plain variables
# used by the rest of the notebook.

# -------- Path STAR
path_star = args.path_star
# Star junctions - unique coordinates
star_jx = os.path.join(path_star, 'junctions_spladder.all_coords.sorted.uniq.tsv.gz')
# Star junctions - projected coordinates and expression
#projected_chr_file = os.path.join(path_star, f'junctions_spladder_projected/junctions_spladder.projected.{chrm}.hdf5')


# -------- Intermediate filtering results (threshold and merged)
# TODO: decide whether the generation matrix should be produced here.
# Foreground matrix
big_matrix = args.big_matrix



# -------- GTEX filtering 
whitelist = args.whitelist
libsize = args.libsize


normalizer = args.normalizer
filter_thresholds = args.filter_thresholds
# Optional: restrict the analysis to the junctions listed in this file
jx_target_list = args.jx_target_list

In [None]:
# -------- Load -------- 
# Loads the foreground (cancer) k-mer matrix and the STAR junction table, flags
# the k-mers whose junction(s) are found in STAR, and collects the STAR
# expression for those junctions via collect_expression_thresholds.

# Load cancer generation matrix
if big_matrix is None:
    # Fail early with an explicit message instead of an opaque read_csv error.
    raise ValueError('big_matrix is not set: provide the foreground matrix with --big-matrix')
mx = pd.read_csv(big_matrix, sep = '\t')
display(mx.head())
# Add split junction information to generation table
coord_mx = explode_immunopepper_coord(mx)
display(coord_mx.head())
mx = pd.concat([mx, coord_mx[['strand', 'junction_coordinate1', 'junction_coordinate2']]], axis = 1)
print('foreground matrix shape', mx.shape)

# Optionally keep only the k-mers whose coordinates are in the target list
if jx_target_list:
    target_jx = pd.read_csv(jx_target_list, header = None) 
    target_jx.columns = ['coord']
    mx = mx.merge(target_jx, on = 'coord', how = 'inner')
print('foreground matrix shape', mx.shape)   
    
# LOAD STAR junctions
# NOTE: `star_jx` holds the file path before this line and the table after it,
# so this cell cannot be re-run without re-running the configuration cell.
star_jx = pd.read_csv(star_jx, sep = '\t')
# Add STAR junction column 10 min
star_jx['junction_coordinate'] = star_jx['junction_start'].astype(str) + ':' + star_jx['junction_end'].astype(str)
# Build the lookup set once; it is reused for every membership test below.
star_coordinates = set(star_jx['junction_coordinate'])


# --------  Check junction presence in STAR --------  
#- 2-exons, 1st junction is present 
#- 3 exons, needs first and second junction present 

# Foreground Kmers from 1 junction (missing junctions are the string 'None')
one_jx = mx[(mx['junction_coordinate1'] != 'None') & (mx['junction_coordinate2'] == 'None')]
print('foreground matrix one junction', one_jx.shape)

# Foreground Kmers from 2 junctions
two_jx = mx[(mx['junction_coordinate1'] != 'None') & (mx['junction_coordinate2'] != 'None')]
print('foreground matrix two junctions', two_jx.shape)

# Foreground 1 junction - PRESENT IN STAR
isstar_one = set(one_jx['junction_coordinate1']).intersection(star_coordinates) # junction coordinates
print('foreground matrix one junction is in star', len(isstar_one))
# Boolean mask instead of a set passed to .loc (set indexers are rejected by pandas 2.x)
is_star_one_jx = set(one_jx.loc[one_jx['junction_coordinate1'].isin(isstar_one), 'coord']) # corresponding kmer coordinates
print('-> foreground matrix one-junction-kmers are in star', len(is_star_one_jx))

# Foreground 2 junctions - PRESENT IN STAR
is_star_two1 = set(two_jx['junction_coordinate1']).intersection(star_coordinates) # Junction coordinates
is_star_two2 = set(two_jx['junction_coordinate2']).intersection(star_coordinates) # Junction coordinates
print('foreground matrix two junctions: left junction is in star', len(is_star_two1))
print('foreground matrix two junctions: right junction is in star', len(is_star_two2))

# A two-junction k-mer counts as present only when BOTH junctions are in STAR
left_in_star = set(two_jx.loc[two_jx['junction_coordinate1'].isin(is_star_two1), 'coord'])
right_in_star = set(two_jx.loc[two_jx['junction_coordinate2'].isin(is_star_two2), 'coord'])
is_star_two_jx = left_in_star.intersection(right_in_star) # Corresponding kmer coord
print('-> foreground matrix two-junctions-kmers are in star', len(is_star_two_jx))


# Foreground Table - Create FLAG GTEX junctions
start_time = timeit.default_timer()
mx['STAR_GTEx_jx'] = False
mx = mx.set_index('coord')
mx.loc[list(is_star_one_jx.union(is_star_two_jx)), 'STAR_GTEx_jx'] = True
mx = mx.reset_index()
print('time create STAR junction presence flag', timeit.default_timer() - start_time)    

# --------  Retrieve expression -------- 

# Extract metadata

# Select junctions to query in STAR file 
query = mx.loc[mx['STAR_GTEx_jx'] == True, ['strand', 'junction_coordinate1',
                                            'junction_coordinate2', 'STAR_GTEx_jx']].drop_duplicates()\
                                                                                    .reset_index(drop = True)

# Each first junction must map to a single strand
assert(query.loc[:, ['junction_coordinate1', 'strand']].drop_duplicates().shape[0] == \
       query.loc[:, ['junction_coordinate1']].drop_duplicates().shape[0])

# Query-Junction to strand 
start_time = timeit.default_timer()
# First junctions are inserted first, then second junctions (later entries win),
# matching the order of the original row-by-row loops.
jx_strand = dict(zip(query['junction_coordinate1'], query['strand']))
jx_strand.update(zip(query['junction_coordinate2'], query['strand']))

# Drop the placeholder of single-junction k-mers; it is absent when every
# flagged k-mer spans two junctions, so do not fail in that case.
jx_strand.pop('None', None)
print('Number of query junctions:', len(jx_strand))
print('time Query-junction to strand', timeit.default_timer() - start_time)

# Chromosome to query-junction
start_time = timeit.default_timer()
star_jx = star_jx.set_index('junction_coordinate') #Faster
# Explicit list: a dict view is not a supported pandas indexer type
intermediate = star_jx.loc[list(jx_strand), :]

chr_jx = defaultdict(set)
for jx, chrm in zip(intermediate.index, intermediate['chr']):
    chr_jx[chrm].add(jx)
print('chromosomes involved', chr_jx.keys())
print('Chromosome to query-junction', timeit.default_timer() - start_time)
    

### Collect Expression 

# -------- Thresholding experiments -------- 
# Remark 1 junction can have multiple chrm --> Which 
# Remark 1 junction on 1 chrm can have multiple strands --> Take from Immunopepper

print('Start collect expression')
res = collect_expression_thresholds(libsize, whitelist, normalizer,
                                    filter_thresholds, path_star, chr_jx, jx_strand )



Unnamed: 0,kmer,gtexCohortfilter >0.0,gtexCohortfilter >=1.0,gtexCohortfilter >=2.0,gtexCohortfilter >=3.0,gtexCohortfilter >=5.0,gtexCohortfilter >=10.0,coord,junctionAnnotated,readFrameAnnotated,...,cancerCohortfilter >=2.0,cancerCohortfilter >=3.0,cancerCohortfilter >=5.0,cancerCohortfilter >=10.0,TCGA25131901A01R156513all,TCGA25131301A01R156513all,TCGA61200801A02R156813all,TCGA24143101A01R156613all,TCGA24229801A01R156913all,isAnnotated
0,RKSTQMPCT,1218.0,689.0,249.0,89.0,26.0,4.0,92379857:92379859:92611313:92611338:None:None,True,False,...,372,367,352,295,20.982616,8.422932,17.304198,16.069005,19.585264,
1,RKSTQMPCT,1279.0,274.0,70.0,28.0,5.0,1.0,92379857:92379859:92611313:92611338:None:None,True,False,...,372,367,352,295,20.982616,8.422932,17.304198,16.069005,19.585264,
2,RKSTQMPCT,7350.0,4622.0,2653.0,1745.0,1126.0,734.0,92379857:92379859:92611313:92611338:None:None,True,False,...,372,367,352,295,20.982616,8.422932,17.304198,16.069005,19.585264,
3,RKSTQMPCT,2025.0,479.0,96.0,35.0,8.0,1.0,92379857:92379859:92611313:92611338:None:None,True,False,...,372,367,352,295,20.982616,8.422932,17.304198,16.069005,19.585264,
4,RKSTQMPCT,8641.0,6957.0,4789.0,3211.0,1347.0,191.0,92379857:92379859:92611313:92611338:None:None,True,False,...,372,367,352,295,20.982616,8.422932,17.304198,16.069005,19.585264,


Unnamed: 0,0,1,2,3,4,5,strand,junction_coordinate1,junction_coordinate2,+first,+secon,-first,-secon
0,92379857,92379859,92611313,92611338,,,+,92379859:92611313,,92379859:92611313,92611338:None,92611338:92379857,None:92611313
1,92379857,92379859,92611313,92611338,,,+,92379859:92611313,,92379859:92611313,92611338:None,92611338:92379857,None:92611313
2,92379857,92379859,92611313,92611338,,,+,92379859:92611313,,92379859:92611313,92611338:None,92611338:92379857,None:92611313
3,92379857,92379859,92611313,92611338,,,+,92379859:92611313,,92379859:92611313,92611338:None,92611338:92379857,None:92611313
4,92379857,92379859,92611313,92611338,,,+,92379859:92611313,,92379859:92611313,92611338:None,92611338:92379857,None:92611313


foreground matrix shape (46132795, 27)
foreground matrix shape (542295, 27)
foreground matrix one junction (406600, 27)
foreground matrix two junctions (135695, 27)
foreground matrix one junction is in star 19382
-> foreground matrix one-junction-kmers are in star 162279
foreground matrix two junctions: left junction is in star 9809
foreground matrix two junctions: right junction is in star 9876
-> foreground matrix two-junctions-kmers are in star 62925
time create STAR junction presence flag 1.2987702080281451
Number of query junctions: 26460
time Query-junction to strand 0.6875324179418385
chromosomes involved dict_keys(['chr1', 'chr12', 'chr13', 'chr19', 'chrX', 'chr10', 'chr9', 'chr3', 'chr7', 'chr17', 'chr16', 'chr8', 'chr5', 'chr11', 'chr6', 'chr4', 'chr22', 'chr18', 'chr14', 'chr20', 'chr15', 'chr2', 'chrY', 'chr21', 'chrM'])
Chromosome to query-junction 71.80031096201856
Start collect expression
chr1


  normalized_counts = np.divide(raw_counts, lib_75_per_sample) * normalizer


....500
....1000
....1500
....2000
Error: No chr1:-:junction_start chr1:-:junction_start matching 155191880:155191939
Error: No chr1:-:junction_start chr1:-:junction_start matching 156742873:156742958
-8823.614046026953
chr12
....2500
....3000
Error: No chr12:+:junction_start chr12:+:junction_start matching 6537374:6537655
Error: No chr12:+:junction_start chr12:+:junction_start matching 6537708:6537870
Error: No chr12:+:junction_start chr12:+:junction_start matching 6770471:6770688
Error: No chr12:+:junction_start chr12:+:junction_start matching 6937330:6937507
....3500
....4000
-4295.845152077964
chr13
-486.2390420080628
chr19
....4500
....5000
Error: No chr19:+:junction_start chr19:+:junction_start matching 54207433:54207582
....5500
....6000
....6500
....7000
Error: No chr19:-:junction_start chr19:-:junction_start matching 1990046:1990163
....7500
....8000
....8500
Error: No chr19:+:junction_start chr19:+:junction_start matching 49173064:49173104
Error: No chr19:+:junction_start chr

In [1]:
# Sanity check: number of query junctions. `jx_strand` is built by the
# junction-processing cell, so that cell must have been run in the current
# kernel first (the recorded NameError shows it had not).
len(jx_strand)

NameError: name 'jx_strand' is not defined

In [None]:
# Turns the collected STAR expression (`res`) into a table, attaches it to the
# one-junction and two-junction foreground k-mers, and saves the combined table
# next to the input matrix.
# NOTE(review): these column names are hard-coded for the thresholds
# 0, 1, 2, 3, 5, 10 — confirm they match the `filter_thresholds` used for `res`.
gtex_cols = ['gtexCohortfilter >0.0', 'gtexCohortfilter >=1.0',
       'gtexCohortfilter >=2.0', 'gtexCohortfilter >=3.0',
       'gtexCohortfilter >=5.0', 'gtexCohortfilter >=10.0']

# Expression to DF
df_res = pd.DataFrame(res, columns = ['junction_coordinate', 'strand_STAR', 'chr'] + gtex_cols)
print('Junction - expressin df shape', df_res.shape)
display(df_res.head())

#-------- Format results -------- 



# One junction merge (straightforward merge on junction)
print('foreground matrix one junction - BIS', one_jx.shape)
# Remove the GTEX quantifications from immunopepper
base_one_jx = one_jx.drop(gtex_cols, axis = 1).drop_duplicates() 
print('foreground matrix one junction no GTEx', base_one_jx.shape)
# Add GTEX quantifications from STAR
one_jx_quantified = base_one_jx.merge(df_res, left_on = 'junction_coordinate1', 
                                      right_on = 'junction_coordinate', how = 'left') 
print(one_jx_quantified.columns)
display(one_jx_quantified.tail())

# Two junctions merge (not straightforward: consider both junctions separately and take the max)
print('foreground matrix two junctions - BIS', two_jx.shape)
# Remove the GTEX quantifications from immunopepper
base_two_jx = two_jx.drop(gtex_cols, axis = 1).drop_duplicates() 
print('foreground matrix two junctions no GTEx', base_two_jx.shape)
# Add GTEX quantifications from STAR  # first junction
two_jx_quantified_left = base_two_jx.merge(df_res, left_on = 'junction_coordinate1', 
                                      right_on = 'junction_coordinate', how = 'left')


# Add GTEX quantifications from STAR  # second junction
two_jx_quantified_right = base_two_jx.merge(df_res, left_on = 'junction_coordinate2', 
                                      right_on = 'junction_coordinate', how = 'left')

# NOTE(review): col_merge keeps the STAR columns 'junction_coordinate',
# 'strand_STAR' and 'chr' as merge keys. The left table carries the first
# junction there and the right table the second, so rows only pair up when
# those values agree; otherwise the outer merge keeps them as separate rows and
# the max below sees a single value per row. Confirm this is the intended
# "max over 2 junctions" behaviour.
col_merge = [col for col in two_jx_quantified_right if col not in gtex_cols]

# Add GTEX quantifications from STAR  # both junctions
two_jx_quantified = two_jx_quantified_left.merge(two_jx_quantified_right, on = col_merge, how = 'outer')

# Add GTEX quantifications from STAR  # MAX (!!) over 2 junctions
for col in gtex_cols:
    two_jx_quantified[col] = two_jx_quantified[[col + '_x', col + '_y']].max(skipna = True, axis = 1)
    two_jx_quantified = two_jx_quantified.drop([col + '_x', col + '_y'], axis = 1)
    
print(two_jx_quantified.columns)
display(two_jx_quantified.tail())

# All Junctions quantified on STAR for GTEX
print('foreground matrix one junction merged with expression', one_jx_quantified.shape)
print('foreground matrix two junctions merged with expression', two_jx_quantified.shape)

jx_quantified = pd.concat([one_jx_quantified, two_jx_quantified])
print('foreground matrix all merged with expression', jx_quantified.shape)

### Save 
# Derive the output name from the file suffix only. A plain str.replace would
# return the input path unchanged when 'tsv.gz' is absent (overwriting the
# input matrix) and would also rewrite matches inside directory names.
input_suffix = 'tsv.gz'
if big_matrix.endswith(input_suffix):
    save_path = big_matrix[:-len(input_suffix)] + 'STAR_GTEx.tsv.gz'
else:
    save_path = big_matrix + '.STAR_GTEx.tsv.gz'
print('Saving to', save_path)
jx_quantified.to_csv(save_path, compression = 'gzip', index = False, sep = '\t')



In [2]:
# # -------- Path STAR
# path_star = '/cluster/work/grlab/projects/GTEx/rna_gencode32_realign/results'
# # Star junctions - unique coordinates
# star_jx = os.path.join(path_star, 'junctions_spladder.all_coords.sorted.uniq.tsv.gz')
# # Star junctions - projected coordinates and expression
# #projected_chr_file = os.path.join(path_star, f'junctions_spladder_projected/junctions_spladder.projected.{chrm}.hdf5')


# # -------- Intermediate filtering results (threshold and merged)
# #TODO DO the generation matrix???
# # Foreground matrix
# big_matrix = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374/filtering_intermediate/complete_cancer_candidates_order_r_complete.tsv.gz'



# # -------- GTEX filtering 
# whitelist = '/cluster/work/grlab/projects/projects2020_OHSU/sample_lists/GTEX/GTEx_sample_IDs_10-2021_lib_graph_juliannelist'
# libsize = '/cluster/work/grlab/projects/TCGA/PanCanAtlas/immunopepper_paper/peptides_ccell_rerun_gtex_151220/ARCHIV_keep_runs/GTEX2019_commit_v3_TEST_merged3_372a147_medium_run_pya.0.17.1_conf2_annot_ref_chrall_cap/expression_counts.libsize.tsv'


# normalizer = 400000
# filter_thresholds = [0.0, 1.0, 2.0, 3.0, 5.0, 10.0]
# #Optional if needs to process a junction list
# jx_target_list = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374/filtering_samples/filters_22March_order_wany_wAnnot/tmp_all_experiments_jx.txt'