In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import tarfile
import glob
from matplotlib.colors import LogNorm


Matplotlib created a temporary config/cache directory at /scratch/slurm-job.854520/matplotlib-s6w81xre because the default path (/cluster/customapps/biomed/grlab/users/prelotla/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [None]:
# Archive holding the OHSU (junction pipeline) filtering results.
tar_file_OHSU = (
    '/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/'
    'expanded-filters_OHSU-results_Mar19-2023/'
    'OHSU_March19-2023_expanded-filters_results.tar.gz'
)

In [None]:
# Directory holding the ETH (graph pipeline) per-filter result files.
base_eth = (
    '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/'
    'commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/'
    'chosen_filters_06March_order'
)

In [None]:
# One concrete ETH result file, kept as a reference example of the naming scheme.
one_file_eth = (
    '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/'
    'commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/'
    'chosen_filters_06March_order/'
    'G_TCGA-AO-A0JM-01A-21R-A056-07_SampleLim0.0CohortLimNoneAcrossNone'
    '_FiltNormalsGtexCohortCohortlim0.0Across1.tsv.gz'
)

In [None]:
# Reads > 0 or 1 sample any read
# Any read would be > 0 

In [None]:
# ETH Names
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the pairing and every downstream row order reproducible between runs.
eth_all = sorted(glob.glob(os.path.join(base_eth, 'G*')))

In [None]:
# OHSU Names
# List the name of every member stored in the OHSU archive (no extraction).
with tarfile.open(tar_file_OHSU, "r:*") as tar:
    ohsu_all = [member.name for member in tar.getmembers()]

In [None]:
# Get file pairs
# Map each ETH file to the OHSU archive member whose name contains the ETH
# base name stripped of its 'G_' marker and '.gz' suffix. When several members
# match, the last one in `ohsu_all` is kept.
file_pair = {}
for eth in eth_all:
    pattern = os.path.basename(eth).replace('G_', '').replace('.gz', '') #.replace('.0CohortLim','CohortLim')
    matches = [ohsu for ohsu in ohsu_all if pattern in ohsu]
    if matches:
        file_pair[eth] = matches[-1]

            

In [None]:
# Number of ETH files for which a matching OHSU archive member was found.
len(file_pair)

# Collect Data to plot

In [None]:
# sample = TCGAAOA0JM01A21RA05607all
# Only file pairs whose ETH path contains `restrict` are processed; set it to
# an empty string / None to process every pair.
restrict = 'TCGA-AO-A0JM-01A-21R-A056-07'

# Column-wise accumulator: one entry per processed (ETH, OHSU) file pair.
# Raw strings keep the literal backslash in the two set-difference keys without
# relying on invalid escape sequences ('\e', '\o'); the key values are unchanged.
df = {'sample' : [],
      'filter_foreground' : [],
      'filter_background' : [],
      'size_ohsu' : [],
      'size_eth' : [],
      'size_intersection' : [],
      r'size_ohsu\eth' : [],
      r'size_eth\ohsu' : []}

with tarfile.open(tar_file_OHSU, "r:*") as tar: #OHSU
    for eth, ohsu in file_pair.items(): # ETH
        if restrict and restrict not in eth: #Restrict to category of interest
            continue
        try:
            # Only the 'kmer' column is needed to compare the two result sets.
            kmers_ohsu = set(pd.read_csv(tar.extractfile(ohsu), header=0, sep="\t",
                                         usecols=['kmer'])['kmer'])
            kmers_eth = set(pd.read_csv(eth, sep="\t", usecols=['kmer'])['kmer'])
            # File name layout: <prefix>_<sample>_<foreground filter>_<background filter>.tsv
            name = os.path.basename(ohsu).replace('.tsv', '').split('_')
            row = {'sample' : name[1],
                   'filter_foreground' : name[2],
                   'filter_background' : name[3],
                   'size_ohsu' : len(kmers_ohsu),
                   'size_eth' : len(kmers_eth),
                   'size_intersection' : len(kmers_ohsu & kmers_eth),
                   r'size_ohsu\eth' : len(kmers_ohsu - kmers_eth),
                   r'size_eth\ohsu' : len(kmers_eth - kmers_ohsu)}
        except Exception as err:
            # Best effort: skip unreadable / malformed pairs, but say so instead
            # of silently dropping them (and let KeyboardInterrupt through).
            print(f'Skipping pair ({eth}, {ohsu}): {err!r}')
            continue
        # Append only once the whole row is known, so a failure part-way
        # through can never leave the columns with different lengths.
        for column, value in row.items():
            df[column].append(value)


In [None]:
# Turn the column-wise dict of lists into a DataFrame (one row per processed
# file pair). Note that this rebinds `df` from a dict to a DataFrame.
df = pd.DataFrame(df)

# Path Tests

In [None]:
# Collect the distinct foreground / background filter labels present in the
# ETH file names of one sample (fields 2 and 3 of the '_'-separated base name).
back_ = set()
foregr_ = set()
for path in eth_all:
    if 'FiltNormalsGtexC' in path and 'TCGA-AO-A0JM-01A-21R-A056-07' in path:
        fields = os.path.basename(path).split('_')
        back_.add(fields[3])
        foregr_.add(fields[2])

print('foreground')
for label in foregr_:
    print(label)
print('background')
for label in back_:
    print(label)

In [None]:
# FiltNormalsGtexCohortCohortlim0.0Across0.tsv


# Plot Set size

In [None]:
# Order experiments by intersection size and give each a running position
# ('index') that is used as the x coordinate in the set-size figure.
df = (
    df.sort_values('size_intersection')
      .assign(index=lambda frame: np.arange(len(frame)))
)

In [None]:
def plot_text(Y, T, ax=None):
    """Label runs of constant values in ``Y`` with the matching entry of ``T``.

    ``Y`` is scanned (last element excluded) for positions where the value
    differs from its predecessor; each such position starts a run. Every run
    except the last detected one gets a text label placed at the run's
    midpoint, slightly above the run's ``Y`` value. A label is only drawn when
    it differs from the previously considered label by more than 1/6 of that
    previous value, which avoids stacking near-identical numbers.

    Parameters
    ----------
    Y : sequence of numbers
        Values in plotting order (x position is the element's position).
    T : sequence
        Labels, same length as ``Y``; compared numerically with each other.
    ax : object with a ``text(x, y, s)`` method, optional
        Where to draw. Defaults to the current pyplot axes via ``plt.text``.

    Returns
    -------
    None
    """
    Y = np.asarray(Y)
    T = np.asarray(T)

    # Position 0 always opens the first run. (Comparing Y[0] with Y[-1], as a
    # plain `Y[i] != Y[i - 1]` does at i == 0, wraps around to the last element
    # and drops the first run whenever the first and last values coincide.)
    change_val = [i for i in range(len(Y) - 1) if i == 0 or Y[i] != Y[i - 1]]

    # At least two change points are needed to delimit one run; with fewer
    # there is nothing to label (and indexing with an empty array would fail).
    if len(change_val) < 2:
        return

    starts = np.array(change_val[:-1])
    ends = np.array(change_val[1:])
    X = np.floor(starts + (ends - starts) / 2)  # midpoint of each run
    Y = Y[starts]
    T = T[starts]

    target = plt if ax is None else ax
    p_prev = 0
    percent_diff = 6  # label only if it moved by more than p_prev / 6

    for x, y, p in zip(X, Y, T):
        if (p > p_prev + (p_prev / percent_diff)) or (p < p_prev - (p_prev / percent_diff)):
            # Offset of y/10 keeps the text just above the marker on a log axis.
            target.text(x - 0.5, y + (y / 10), p)
        p_prev = p

In [None]:
def plot_text_all(X, Y, T):
    """Draw every label in ``T`` at its ``(X, Y)`` position on the current axes.

    The three sequences are walked in parallel; iteration stops at the
    shortest one.
    """
    for position_and_label in zip(X, Y, T):
        plt.text(*position_and_label)


In [None]:
# Styling for the set-size figure.
colorgrid = 'grey'
alpha_grid = 0.3
marker_size = 10
mew = 4  # marker edge width
color1 = 'gold'
color2 = 'crimson'
color4 = 'mediumblue'
width = 0.4  # bar width

plt.figure(figsize=(15, 6))
# `visible` is the current name of grid()'s first parameter (formerly `b`,
# which recent Matplotlib releases no longer accept).
plt.grid(visible=True, axis='both', which='major', color=colorgrid, linestyle='-', alpha=alpha_grid)
# Passing line properties makes Matplotlib turn the grid on regardless of the
# first argument, so the minor grid has always been drawn; say so explicitly.
plt.grid(visible=True, axis='both', which='minor', color=colorgrid, linestyle='--', alpha=alpha_grid)

# Bars: kmers found by both pipelines; dash markers: total per pipeline.
plt.bar(df['index'], df['size_intersection'], width=width,
        color=color1, label='Intersection size')
plt.plot(df['index'], df['size_eth'], alpha=1, color=color4,
         linestyle = 'None', markerfacecolor='None', marker="_", markersize=marker_size, markeredgewidth=mew,
         label = 'Total set size Graph Pipeline')
plt.plot(df['index'], df['size_ohsu'], alpha=1, color=color2,
         linestyle = 'None', markerfacecolor='None', marker="_", markersize=marker_size, markeredgewidth=mew,
         label = 'Total set size Junction Pipeline')

# Annotate the total set sizes with their numeric value.
plot_text(df['size_ohsu'], df['size_ohsu'])
plot_text(df['size_eth'], df['size_eth'])
#plot_text_all(df['index'], df['size_intersection'], df['size_intersection'])

plt.xticks(np.arange(len(df['index'])))
plt.legend()
plt.ylabel('Number of kmers')
plt.xlabel('Experiment Number')
plt.yscale('log')

# Plot Intersections

In [None]:
# Show the available columns before pivoting them into heatmap tables.
df.columns

In [None]:
# One (expression, sample) pair per foreground filter setting.
# NOTE(review): presumably mirrors the CohortLim / Across values encoded in the
# foreground filter names (None, 0/1, 2/1, 0/5, 2/5) -- confirm.
_foreground_settings = [(None, None), (0, 1), (2, 1), (0, 5), (2, 5)]
foreg_expr = [setting[0] for setting in _foreground_settings]
foreg_sample = [setting[1] for setting in _foreground_settings]

In [None]:
# plt.plot(foreg_expr, np.arange(len(foreg_expr)), linestyle = 'None', marker="o")
# plt.plot(foreg_sample, np.arange(len(foreg_sample)), linestyle = 'None', marker="x")
# plt.invert_yaxis()

In [None]:

# Row order of the heatmaps (foreground filter labels).
order_foreground = [
    'SampleLim0.0CohortLimNoneAcrossNone',
    'SampleLim0.0CohortLim0.0Across1',
    'SampleLim0.0CohortLim2.0Across1',
    'SampleLim0.0CohortLim0.0Across5',
    'SampleLim0.0CohortLim2.0Across5',
]
# Column order of the heatmaps (background filter labels).
order_background = [
    'FiltNormalsGtexCohortCohortlim3.0Across10',
    'FiltNormalsGtexCohortCohortlim3.0Across2',
    'FiltNormalsGtexCohortCohortlim1.0Across10',
    'FiltNormalsGtexCohortCohortlim0.0Across10',
    'FiltNormalsGtexCohortCohortlim1.0Across2',
    'FiltNormalsGtexCohortCohortlim0.0Across2',
    'FiltNormalsGtexCohortCohortlim3.0Across1',
    'FiltNormalsGtexCohortCohortlim1.0Across1',
    'FiltNormalsGtexCohortCohortlim0.0Across1',
]

# Panel titles, in the same order as the tables collected in `df_plot`.
titles = ['Intersection size',
          'Total set size Graph Pipeline',
          'Total set size Junction Pipeline']

# One foreground-by-background table per metric, with rows and columns put in
# the hardcoded filter order above.
df_plot = [
    df.pivot(index='filter_foreground', columns='filter_background', values=metric)
      .loc[order_foreground, order_background]
    for metric in ('size_intersection', 'size_eth', 'size_ohsu')
]
    


In [None]:
# Grid of heatmaps: one panel per table in `df_plot`.
nprow = 1
npcol = 3

# Colour limits shared by all panels (log scale, so vmin must be positive).
# These are the limits the figure has always been drawn with; they are now
# actually passed to LogNorm instead of being duplicated as literals.
vmin = 1
vmax = 1e4

fig = plt.figure(figsize=(15, 6), layout='constrained')
axes = [fig.add_subplot(nprow, npcol, i) for i in range(1, npcol + 1)]

for pane_idx in range(npcol):
    ax = axes[pane_idx]
    table = df_plot[pane_idx]
    panelplot = ax.imshow(table, origin='upper',
                          cmap='inferno_r', norm=LogNorm(vmin=vmin, vmax=vmax))
    ax.set_title(titles[pane_idx])
    ax.set_xticks(range(table.shape[1]))
    ax.set_xticklabels(table.columns, rotation = 90 )
    if pane_idx == 0 :
        # Row labels are identical for all panels; show them once on the left.
        ax.set_yticks(range(table.shape[0]))
        ax.set_yticklabels(table.index, rotation = 45 )

# Every panel uses the same normalisation, so a single colorbar attached to
# all axes describes them all and keeps the panels equally sized.
fig.colorbar(panelplot, ax=axes)



In [None]:
# Table behind the 'Total set size Graph Pipeline' panel (values of 'size_eth').
df_plot[1]

In [None]:
# Table behind the 'Total set size Junction Pipeline' panel (values of 'size_ohsu').
df_plot[2]