In [1]:
import pandas as pd
import os
import glob
import timeit
import numpy as np
import h5py
from collections import defaultdict

In [2]:
# -------- Path STAR
path_star = '/cluster/work/grlab/projects/GTEx/rna_gencode32_realign/results'
# Star junctions - unique coordinates
star_jx = os.path.join(path_star, 'junctions_spladder.all_coords.sorted.uniq.tsv.gz')
# Star junctions - projected coordinates and expression
#projected_chr_file = os.path.join(path_star, f'junctions_spladder_projected/junctions_spladder.projected.{chrm}.hdf5')


# -------- Intermediate filtering results (threshold and merged)
#TODO DO the generation matrix???
# Foreground matrix
big_matrix = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374/filtering_intermediate/complete_cancer_candidates_order_r_complete.tsv.gz'



# -------- GTEX filtering 
whitelist = '/cluster/work/grlab/projects/projects2020_OHSU/sample_lists/GTEX/GTEx_sample_IDs_10-2021_lib_graph_juliannelist'
libsize = '/cluster/work/grlab/projects/TCGA/PanCanAtlas/immunopepper_paper/peptides_ccell_rerun_gtex_151220/ARCHIV_keep_runs/GTEX2019_commit_v3_TEST_merged3_372a147_medium_run_pya.0.17.1_conf2_annot_ref_chrall_cap/expression_counts.libsize.tsv'


normalizer = 400000
filter_thresholds = [0.0, 1.0, 2.0, 3.0, 5.0, 10.0]

# Load

In [3]:
def explode_immunopepper_coord(mx):
    coord_mx = mx['coord'].str.split(':', expand=True) #7 min

    coord_mx[1] = coord_mx[1].astype('int')
    coord_mx[2] = coord_mx[2].astype('int')

    coord_mx['strand'] = None
    coord_mx.loc[coord_mx[1] < coord_mx[2], 'strand'] = '+'
    coord_mx.loc[coord_mx[1] > coord_mx[2], 'strand'] = '-'

    coord_mx['junction_coordinate1'] = None
    coord_mx['junction_coordinate2'] = None


    coord_mx = coord_mx.astype(str) # 7 min

    coord_mx['+first'] = coord_mx[1] + ':' + coord_mx[2]
    coord_mx['+secon'] = coord_mx[3] + ':' + coord_mx[4]
    coord_mx['-first'] = coord_mx[3] + ':' + coord_mx[0]
    coord_mx['-secon'] = coord_mx[5] + ':' + coord_mx[2]

    coord_mx.loc[(coord_mx['strand'] == '+'), 'junction_coordinate1'] = coord_mx['+first'] 
    coord_mx.loc[(coord_mx['strand'] == '-'), 'junction_coordinate1'] = coord_mx['-first'] 
    coord_mx.loc[(coord_mx['strand'] == '+') & (coord_mx[4] != 'None') , 'junction_coordinate2'] = coord_mx['+secon']
    coord_mx.loc[(coord_mx['strand'] == '-') & (coord_mx[4] != 'None') , 'junction_coordinate2'] = coord_mx['-secon']
    
    return coord_mx

In [4]:
# Load cancer generation matrix
mx = pd.read_csv(big_matrix, sep = '\t')
display(mx.head())
# Add split junction information to generation table
coord_mx = explode_immunopepper_coord(mx)
display(coord_mx.head())
mx = pd.concat([mx, coord_mx[['strand', 'junction_coordinate1', 'junction_coordinate2']]], axis = 1)

Unnamed: 0,0,1,2,3,4,5,strand,junction_coordinate1,junction_coordinate2,+first,+secon,-first,-secon
0,92379857,92379859,92611313,92611338,,,+,92379859:92611313,,92379859:92611313,92611338:None,92611338:92379857,None:92611313
1,92379857,92379859,92611313,92611338,,,+,92379859:92611313,,92379859:92611313,92611338:None,92611338:92379857,None:92611313
2,92379857,92379859,92611313,92611338,,,+,92379859:92611313,,92379859:92611313,92611338:None,92611338:92379857,None:92611313
3,92379857,92379859,92611313,92611338,,,+,92379859:92611313,,92379859:92611313,92611338:None,92611338:92379857,None:92611313
4,92379857,92379859,92611313,92611338,,,+,92379859:92611313,,92379859:92611313,92611338:None,92611338:92379857,None:92611313


In [5]:
# LOAD STAR junctions
star_jx = pd.read_csv(star_jx, sep = '\t')
star_jx.head()
# Add STAR junction column 10 min
star_jx['junction_coordinate'] = star_jx['junction_start'].astype(str) + ':' + star_jx['junction_end'].astype(str)


# Check junction presence in STAR 
- 2-exons, 1st junction is present 
- 3 exons, needs first and second junction present 

In [6]:
# Foreground Kmers from 1 junction
one_jx = mx[(mx['junction_coordinate1'] != 'None') & (mx['junction_coordinate2'] == 'None')]
print(one_jx.shape)

(44293227, 27)


In [7]:
# Foreground Kmers from 2 junctions
two_jx = mx[(mx['junction_coordinate1'] != 'None') & (mx['junction_coordinate2'] != 'None')]
print(two_jx.shape)

(1839568, 27)


In [8]:
# Foreground 1 junction - NOT IN STAR 
isstar_one = set(one_jx['junction_coordinate1']).intersection(set(star_jx['junction_coordinate'])) # junction coordinates
print(len(isstar_one))
is_star_one_jx = set(one_jx.set_index('junction_coordinate1').loc[isstar_one, 'coord']) # corresonding kmer coordinates
print(len(is_star_one_jx))

208958
1992044


In [9]:
# Foreground 2 junctions - NOT IN STAR 
is_star_two1 = set(two_jx['junction_coordinate1']).intersection(set(star_jx['junction_coordinate'])) # Junction coordinates
is_star_two2 = set(two_jx['junction_coordinate2']).intersection(set(star_jx['junction_coordinate'])) # Junction coordinates
print(len(is_star_two1))
print(len(is_star_two2))

is_star_two_jx = set(two_jx.set_index('junction_coordinate1').loc[is_star_two1, 'coord']).intersection(\
set(two_jx.set_index('junction_coordinate2').loc[is_star_two2, 'coord'])) # Corresponding kmer coord
print(len(is_star_two_jx))

13190
13720
102528


In [10]:
print(len(is_star_one_jx))
print(len(is_star_two_jx))

1992044
102528


In [11]:
# Foreground Table - Create FLAG GTEX junctions
start_time = timeit.default_timer()
mx['STAR_GTEx_jx'] = False
mx = mx.set_index('coord')
mx.loc[list(is_star_one_jx.union(is_star_two_jx)), 'STAR_GTEx_jx'] = True
mx = mx.reset_index()
print(timeit.default_timer() - start_time)    

247.36660489800852


# Retrieve expression 

### Extract metadata

In [12]:
# Select junctions to query in STAR file 
query = mx.loc[mx['STAR_GTEx_jx'] == True, ['strand', 'junction_coordinate1',
                                            'junction_coordinate2', 'STAR_GTEx_jx']].drop_duplicates()\
                                                                                    .reset_index()\
                                                                                    .drop('index', axis = 1)

assert(query.loc[:, ['junction_coordinate1', 'strand']].drop_duplicates().shape[0] == \
       query.loc[:, ['junction_coordinate1']].drop_duplicates().shape[0])

In [13]:
# Query-Junction to strand 
jx_strand = {}
for i in np.arange(query.shape[0]):
    jx_strand[query.loc[i, 'junction_coordinate1']] = query.loc[i, 'strand']

for i in np.arange(query.shape[0]):
    jx_strand[query.loc[i, 'junction_coordinate2']] = query.loc[i, 'strand']

jx_strand.pop('None')

'-'

In [14]:
# Chromosome to query-junction
star_jx = star_jx.set_index('junction_coordinate') #Faster

intermediate = star_jx.loc[jx_strand.keys(), :]

start_time = timeit.default_timer()
chr_jx = defaultdict(set)
for jx, chrm in zip(intermediate.index, intermediate['chr']):
    chr_jx[chrm].add(jx)
print(timeit.default_timer() - start_time)
    

0.11621462600305676


In [15]:
######tests

In [88]:
#len(set(star_jx['junction_coordinate']))

80525491

In [92]:
#foo = star_jx.groupby(['junction_coordinate', 'chr']).count()

In [102]:
# len((foo.index))

80550666

In [108]:
# jx_count = {}
# for i in foo.index:
#     if i[0] not in jx_count:
#         jx_count[i[0]]=1
#     else: 
#         jx_count[i[0]]+=1
    

In [109]:
# select = []
# for i, j in jx_count.items():
#     if j > 1:
#         select.append(i)

In [115]:
# chr_test = star_jx.set_index('junction_coordinate').loc[select]

In [112]:
# star_jx.loc[(star_jx['junction_coordinate'] == select[0]), :]

Unnamed: 0,chr,strand,junction_start,junction_end,junction_coordinate
78107623,chrX,+,1000293,1189987,1000293:1189987
80346765,chrY,+,1000293,1189987,1000293:1189987


In [118]:
# chr_test.loc[chr_test]


Unnamed: 0_level_0,chr,strand,junction_start,junction_end
junction_coordinate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
999357:1350993,chrY,+,999357,1350993
999377:1030776,chrX,-,999377,1030776
999377:1030776,chrY,-,999377,1030776
999771:999866,chr1,-,999771,999866
999771:999866,chr4,+,999771,999866


In [148]:
# chr_test.loc[(chr_test['chr']!= 'chrX') & (chr_test['chr']!= 'chrY') ].iloc[0:50]

Unnamed: 0_level_0,chr,strand,junction_start,junction_end
junction_coordinate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100153424:100153456,chr7,+,100153424,100153456
100153424:100153456,chr8,+,100153424,100153456
100498207:100499066,chr10,-,100498207,100499066
100498207:100499066,chr1,+,100498207,100499066
1005398:1005446,chr4,-,1005398,1005446
1005398:1005446,chr5,+,1005398,1005446
100555227:100555370,chr15,-,100555227,100555370
100555227:100555370,chr3,+,100555227,100555370
100555227:100558404,chr15,-,100555227,100558404
100555227:100558404,chr3,+,100555227,100558404


In [141]:
#chr_test.loc[ chr_test.loc[:, ['strand', 'junction_start', 'junction_end']].duplicated(), 'chr'].unique()

array(['chrY', 'chr8', 'chr14', 'chrM', 'chr4', 'chr19', 'chr12', 'chr11',
       'chr2', 'chr7', 'chr5', 'chr20', 'chr9', 'chr6', 'chrX', 'chr13',
       'chr3', 'chr16', 'KI270938.1', 'KI270879.1', 'chr17', 'chr1',
       'KI270913.1', 'chr22', 'chr21', 'KI270928.1', 'chr18', 'chr15',
       'KI270872.1', 'GL949752.1'], dtype=object)

In [117]:
#chr_test['chr'].unique()

array(['chrX', 'chrY', 'chr7', 'chr8', 'chr10', 'chr1', 'chr4', 'chr5',
       'chr15', 'chr3', 'chr14', 'chrM', 'chr11', 'chr19', 'chr12',
       'chr17', 'chr2', 'chr9', 'chr20', 'chr13', 'chr6', 'chr16',
       'KV766196.1', 'GL949752.1', 'KI270938.1', 'chr18', 'KI270879.1',
       'KI270913.1', 'GL949747.2', 'chr22', 'GL949753.2', 'chr21',
       'GL383582.2', 'KI270928.1', 'KI270872.1', 'GL949746.1',
       'KV575245.1'], dtype=object)

### Collect Expression 

In [16]:
def read_libsize_whitelist(libsize, whitelist):
    # Read libsize and whitelist
    libsize_normal = pd.read_csv(libsize, sep = '\t')
    whitelist_normal = pd.read_csv(whitelist, sep = '\t', header = None)
    whitelist_normal.columns = ['sample']
    return libsize_normal, whitelist_normal

In [17]:
def preprocess_STAR_projected(chrm, path_star, whitelist_normal, libsize_normal):
    # Star junctions - projected coordinates and expression
    projected_chr_file = os.path.join(path_star, f'junctions_spladder_projected/junctions_spladder.projected.{chrm}.hdf5')
    star_expr = h5py.File(projected_chr_file, 'r')

    # Whitelist samples
    index_whitelist_samples = [s for s, sample in enumerate(star_expr['samples']) if sample.decode() + 'all' in whitelist_normal['sample'].values]
    samples_decoded = [sample.decode() + 'all' for s, sample in enumerate(star_expr['samples']) if sample.decode() + 'all' in whitelist_normal['sample'].values]

    # Libsize sampple     
    lib_75_per_sample = libsize_normal.set_index('sample').loc[samples_decoded, 'libsize_75percent'].values

    return star_expr, index_whitelist_samples, lib_75_per_sample


In [18]:
def get_junction_counts(junction_start, junction_end, 
                        chrm, strand, expression_h5, 
                        index_whitelist_samples, lib_75_per_sample, normalizer):
    # Extract junction ID in hdf5 file #TODO SPEEDUP WITH SETS
    jx_idx_h5 = np.where((junction_start == expression_h5[f'{chrm}:{strand}:junction_start'][...]) & 
            (junction_end == expression_h5[f'{chrm}:{strand}:junction_end'][...]))[0]
    
    if len(jx_idx_h5) > 0:
        assert(len(jx_idx_h5) == 1) #TODO check critical
        jx_idx_h5 = jx_idx_h5[0]

        # Extract Raw count in hdf5 file
        raw_counts = expression_h5[f'{chrm}:{strand}:count'][jx_idx_h5, np.array(index_whitelist_samples)]
        normalized_counts = np.divide(raw_counts, lib_75_per_sample) * normalizer
    else:
        normalized_counts = None
        print(f'Error: No {chrm}:{strand}:junction_start {chrm}:{strand}:junction_start matching {junction_start}:{junction_end}')
    return normalized_counts

In [19]:
def filter_recurrence(array_, threshold):
    if threshold == 0:
        return np.sum(array_ > threshold)
    else:
        return np.sum(array_ >= threshold)

In [20]:
def filter_multi_thresholds(normalized_counts, filter_thresholds):
    recurrence = []
    for threshold in filter_thresholds: # Make filter threshold
        recurrence.append(filter_recurrence(normalized_counts, threshold))
    return recurrence 


In [21]:
### Thresholding experiments 
# Remark 1 junction can have multiple chrm --> Which 
# Remark 1 junction on 1 chrm can have multiple strands --> Take from Immunopepper

res = []
libsize_normal, whitelist_normal = read_libsize_whitelist(libsize, whitelist)
counter = 0
for chrm, jxS in chr_jx.items(): # Per chromosome
    print(chrm)
    # Expression file
    expression_h5, index_whitelist_samples, lib_75_per_sample = preprocess_STAR_projected(chrm, 
                                                                                      path_star, 
                                                                                      whitelist_normal, 
                                                                                      libsize_normal)
    start_time = timeit.default_timer()    
    for jx in jxS: # Per junction
        counter +=1
        if counter % 500 == 0:
            print(f'....{counter}')
        junction_start = int(jx.split(':')[0])
        junction_end = int(jx.split(':')[1])
        normalized_counts = get_junction_counts(junction_start, junction_end, 
                                                chrm, jx_strand[jx], expression_h5, 
                                                index_whitelist_samples, 
                                                lib_75_per_sample, normalizer)
        
        if normalized_counts is not None:
            metadata = [jx, jx_strand[jx], chrm] + filter_multi_thresholds(normalized_counts, filter_thresholds)
        
        res.append(metadata)
    
    expression_h5.close() 
    print(start_time - timeit.default_timer())

chr9


  normalized_counts = np.divide(raw_counts, lib_75_per_sample) * normalizer


....500
....1000
....1500
....2000
....2500
....3000
....3500
....4000
....4500
....5000
....5500
....6000
Error: No chr9:-:junction_start chr9:-:junction_start matching 127507186:127507342
....6500
....7000
....7500
....8000
....8500
-13237.52022917592
chr14
....9000
....9500
....10000
....10500
....11000
....11500
....12000
....12500
....13000
....13500
....14000
....14500
....15000
-8053.89567177603
chrX
....15500
....16000
....16500
....17000


KeyboardInterrupt: 

In [39]:
tot = 0 
for chrm, jxS in chr_jx.items():
    print(f'chrm number junctions {len(jxS)}')
    tot+= len(jxS)
print(f'Total number junctions {tot}')

chrm number junctions 8778
chrm number junctions 6377
chrm number junctions 5554
chrm number junctions 7228
chrm number junctions 22420
chrm number junctions 11530
chrm number junctions 9538
chrm number junctions 11815
chrm number junctions 5783
chrm number junctions 2454
chrm number junctions 2914
chrm number junctions 18994
chrm number junctions 14150
chrm number junctions 10065
chrm number junctions 13964
chrm number junctions 7976
chrm number junctions 5495
chrm number junctions 6594
chrm number junctions 9060
chrm number junctions 2400
chrm number junctions 13746
chrm number junctions 7737
chrm number junctions 5244
chrm number junctions 212
chrm number junctions 360
Total number junctions 210388


In [45]:
# Expression to DF
df_res = pd.DataFrame(res, columns = ['junction_coordinate', 'strand_STAR', 'chr'] + gtex_cols)
print(df_res.shape)
display(df_res.head())

### Format results

In [153]:
gtex_cols = ['gtexCohortfilter >0.0', 'gtexCohortfilter >=1.0',
       'gtexCohortfilter >=2.0', 'gtexCohortfilter >=3.0',
       'gtexCohortfilter >=5.0', 'gtexCohortfilter >=10.0']

In [154]:
# One junction merge (straightforward merge on junction)
print(one_jx.shape)
# Remove the GTEX quantifications from immunopepper
base_one_jx = one_jx.drop(gtex_cols, axis = 1).drop_duplicates() 
base_one_jx.head()
print(base_one_jx.shape)
# Add GTEX quantifications from STAR
one_jx_quantified = base_one_jx.merge(df_res, left_on = 'junction_coordinate1', 
                                      right_on = 'junction_coordinate', how = 'left') 
print(one_jx_quantified.columns)
display(one_jx_quantified.tail())

(44293227, 27)
(2431499, 21)
Index(['kmer', 'coord', 'junctionAnnotated', 'readFrameAnnotated',
       'isCrossJunction', 'batch', 'cancerCohortfilter >0.0',
       'cancerCohortfilter >=1.0', 'cancerCohortfilter >=2.0',
       'cancerCohortfilter >=3.0', 'cancerCohortfilter >=5.0',
       'cancerCohortfilter >=10.0', 'TCGA25131901A01R156513all',
       'TCGA25131301A01R156513all', 'TCGA61200801A02R156813all',
       'TCGA24143101A01R156613all', 'TCGA24229801A01R156913all', 'isAnnotated',
       'strand', 'junction_coordinate1', 'junction_coordinate2',
       'junction_coordinate', 'strand_STAR', 'chr', 'gtexCohortfilter >0.0',
       'gtexCohortfilter >=1.0', 'gtexCohortfilter >=2.0',
       'gtexCohortfilter >=3.0', 'gtexCohortfilter >=5.0',
       'gtexCohortfilter >=10.0'],
      dtype='object')


Unnamed: 0,kmer,coord,junctionAnnotated,readFrameAnnotated,isCrossJunction,batch,cancerCohortfilter >0.0,cancerCohortfilter >=1.0,cancerCohortfilter >=2.0,cancerCohortfilter >=3.0,...,junction_coordinate2,junction_coordinate,strand_STAR,chr,gtexCohortfilter >0.0,gtexCohortfilter >=1.0,gtexCohortfilter >=2.0,gtexCohortfilter >=3.0,gtexCohortfilter >=5.0,gtexCohortfilter >=10.0
2431504,ETRSPEKSV,121185317:121185337:121185025:121185032:None:None,False,True,True,2858,265,258,222,183,...,,,,,,,,,,
2431505,SFKDRHPSY,121183457:121183471:121183340:121183353:None:None,False,True,True,2858,63,35,12,5,...,,,,,,,,,,
2431506,KDSAVDFTG,121183457:121183465:121181329:121181348:None:None,True,True,True,2858,186,136,68,31,...,,,,,,,,,,
2431507,WLLLLSLFL,121181270:121181272:121180855:121180880:None:None,False,False,True,2858,44,28,10,4,...,,,,,,,,,,
2431508,GGETRSPEN,121185317:121185343:121181347:121181348:None:None,True,True,True,2858,78,36,12,4,...,,,,,,,,,,


In [203]:
# Two junctions merge (not straight forwrds, consider botrh junctions separately and take the max)
print(two_jx.shape)
# Remove the GTEX quantifications from immunopepper
base_two_jx = two_jx.drop(gtex_cols, axis = 1).drop_duplicates() 
base_two_jx.head()
print(base_two_jx.shape)
# Add GTEX quantifications from STAR  # first junction
two_jx_quantified_left = base_two_jx.merge(df_res, left_on = 'junction_coordinate1', 
                                      right_on = 'junction_coordinate', how = 'left')


# Add GTEX quantifications from STAR  # second junction
two_jx_quantified_right = base_two_jx.merge(df_res, left_on = 'junction_coordinate2', 
                                      right_on = 'junction_coordinate', how = 'left')

col_merge = [col for col in two_jx_quantified_right if col not in gtex_cols]

# Add GTEX quantifications from STAR  # both junctions
two_jx_quantified = two_jx_quantified_left.merge(two_jx_quantified_right, on = col_merge, how = 'outer')

# Add GTEX quantifications from STAR  # MAX (!!) over 2 junctions
for col in gtex_cols:
    two_jx_quantified[col] = two_jx_quantified[[col + '_x', col + '_y']].max(skipna = True, axis = 1)
    two_jx_quantified = two_jx_quantified.drop([col + '_x', col + '_y'], axis = 1)
    
print(two_jx_quantified.columns)
display(two_jx_quantified.tail())

(1839568, 27)
(188025, 21)
Index(['kmer', 'coord', 'junctionAnnotated', 'readFrameAnnotated',
       'isCrossJunction', 'batch', 'cancerCohortfilter >0.0',
       'cancerCohortfilter >=1.0', 'cancerCohortfilter >=2.0',
       'cancerCohortfilter >=3.0', 'cancerCohortfilter >=5.0',
       'cancerCohortfilter >=10.0', 'TCGA25131901A01R156513all',
       'TCGA25131301A01R156513all', 'TCGA61200801A02R156813all',
       'TCGA24143101A01R156613all', 'TCGA24229801A01R156913all', 'isAnnotated',
       'strand', 'junction_coordinate1', 'junction_coordinate2',
       'junction_coordinate', 'strand_STAR', 'chr', 'gtexCohortfilter >0.0',
       'gtexCohortfilter >=1.0', 'gtexCohortfilter >=2.0',
       'gtexCohortfilter >=3.0', 'gtexCohortfilter >=5.0',
       'gtexCohortfilter >=10.0'],
      dtype='object')


Unnamed: 0,kmer,coord,junctionAnnotated,readFrameAnnotated,isCrossJunction,batch,cancerCohortfilter >0.0,cancerCohortfilter >=1.0,cancerCohortfilter >=2.0,cancerCohortfilter >=3.0,...,junction_coordinate2,junction_coordinate,strand_STAR,chr,gtexCohortfilter >0.0,gtexCohortfilter >=1.0,gtexCohortfilter >=2.0,gtexCohortfilter >=3.0,gtexCohortfilter >=5.0,gtexCohortfilter >=10.0
196939,QSPYPCHWW,154464496:154464501:154464774:154464794:154465...,False,False,True,59971,14,5,1,0,...,154464794:154465017,154464794:154465017,+,chrX,1398.0,1358.0,757.0,443.0,125.0,7.0
196940,KLLHKFLKE,154468542:154468545:154468708:154468729:154468...,True,False,True,59971,31,12,3,1,...,154468729:154468822,154468729:154468822,+,chrX,9447.0,9447.0,9446.0,9438.0,9400.0,9280.0
196941,HTKTMQVIS,34609637:34609646:34608029:34608046:34605716:3...,True,False,True,16207,13,9,4,0,...,34605717:34608029,34605717:34608029,-,chr14,9475.0,9475.0,9475.0,9475.0,9475.0,9473.0
196942,TKTMQVISF,34609637:34609643:34608029:34608046:34605713:3...,True,False,True,16207,13,9,4,0,...,34605717:34608029,34605717:34608029,-,chr14,9475.0,9475.0,9475.0,9475.0,9475.0,9473.0
196943,KTMQVISFH,34609637:34609640:34608029:34608046:34605710:3...,True,False,True,16207,13,9,4,0,...,34605717:34608029,34605717:34608029,-,chr14,9475.0,9475.0,9475.0,9475.0,9475.0,9473.0


In [211]:
# All Junctions quantified on STAR for GTEX
print(two_jx_quantified.shape)
print(one_jx_quantified.shape)
jx_quantified = pd.concat([one_jx_quantified, two_jx_quantified])
print(jx_quantified.shape)

(2628453, 30)


In [240]:
### Save 
save_path = big_matrix.replace('tsv.gz', 'STAR_GTEx.tsv.gz')
print(save_path)
jx_quantified.to_csv(save_path, compression = 'gzip', index = False, sep = '\t')

/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374/filtering_intermediate/complete_cancer_candidates_order_r_complete.STAR_GTEx.tsv.gz


# Quality check

In [174]:
chrm_list = ['chr2', 'chr14']
strand = '-'
jx = '102015975:102016000'
junction_start = int(jx.split(':')[0])
junction_end = int(jx.split(':')[1])

for chrm in chrm_list:
    expression_h5, index_whitelist_samples, lib_75_per_sample = preprocess_STAR_projected(chrm, 
                                                                                      path_star, 
                                                                                      whitelist_normal, 
                                                                                      libsize_normal)

    normalized_counts = get_junction_counts(junction_start, junction_end, 
                                            chrm, strand, expression_h5, 
                                            index_whitelist_samples, 
                                            lib_75_per_sample, normalizer)
    if normalized_counts is not None:
        for threshold in filter_thresholds: # Make filter threshold
            print(filter_recurrence(normalized_counts, threshold))


  normalized_counts = np.divide(raw_counts, lib_75_per_sample) * normalizer


1
1
1
1
1
1
4
4
4
4
2
1


In [165]:
star_jx.loc[(star_jx['junction_start'] == junction_start) & (star_jx['junction_end'] == junction_end)]


Unnamed: 0,chr,strand,junction_start,junction_end,junction_coordinate
19214000,chr14,-,102015975,102016000,102015975:102016000
46460381,chr2,-,102015975,102016000,102015975:102016000
