In [1]:
import os
import sys
import pandas as pd
import seaborn as sns
import altair as alt
import matplotlib.pyplot as plt
import numpy as np

# Read log files

In [2]:
# Directory holding the statistics tables (TSV files) that the cells below load.
# The trailing slash is significant: file names are appended to this string by
# plain concatenation (directory + '<name>.tsv').
directory = '../../capturec_test_run_nc_Hex/stats/'

# Initial duplication statistics

In [3]:
# Load the deduplication statistics table; the first TSV column is used as the row index.
dedup_stats_path = os.path.join(directory, 'deduplication_stats.tsv')
df = pd.read_csv(dedup_stats_path, sep='\t', index_col=0)
df

Unnamed: 0,Read_pairs_processed,Read_pairs_unique,Read_pairs_removed
HEX1,54719030,44477419,10241611
HEX2,36907169,32011554,4895615
HEX3,49405353,39462391,9942962


In [4]:
# Long-format table for plotting: one row per (sample, stat) pair with its count.
# 'Read_pairs_processed' is dropped so that only the remaining count columns are melted.
# The index is named 'sample' *before* reset_index, so the id column is created
# directly instead of relying on pandas' default 'index' column name (which only
# appears when the index is unnamed).
df_melt = (df.rename_axis('sample')
             .reset_index()
             .drop(columns='Read_pairs_processed')
             .melt(id_vars=['sample'], var_name='stat', value_name='count'))

# Bar chart of the counts per sample, coloured by statistic.
(alt.Chart(df_melt)
    .mark_bar()
    .encode(x='count',
            y='sample',
            color='stat'))

# Read pair combination statistics

In [5]:
# Load the combined statistics and keep only the rows whose stat_type is
# 'Flashed or unflashed'.
combined_stats = pd.read_csv(os.path.join(directory, 'combined_stats.tsv'), sep='\t', index_col=0)
is_flash_row = combined_stats['stat_type'] == 'Flashed or unflashed'
df = combined_stats[is_flash_row]
df

Unnamed: 0,sample,read_type,stat_type,read_pairs
12,HEX1,flashed,Flashed or unflashed,37200037.0
13,HEX1,pe,Flashed or unflashed,6740007.0
14,HEX2,flashed,Flashed or unflashed,26853381.0
15,HEX2,pe,Flashed or unflashed,4770677.0
16,HEX3,flashed,Flashed or unflashed,33010034.0
17,HEX3,pe,Flashed or unflashed,5946293.0


In [6]:
# Bar chart of read-pair counts per sample, coloured by read type.
flash_chart = alt.Chart(df).mark_bar().encode(
    x=alt.X('read_pairs'),
    y=alt.Y('sample'),
    color=alt.Color('read_type'),
)
flash_chart

# Fastq *in silico* digestion statistics

In [7]:
# Load the digestion statistics table; the first TSV column is used as the row index.
digestion_stats_path = os.path.join(directory, 'digestion_stats.tsv')
df = pd.read_csv(digestion_stats_path, sep='\t', index_col=0)
df

Unnamed: 0,bin,stat,read_type,sample,frequency
0,0,valid,flashed,HEX1,18202869
1,0,valid,flashed,HEX2,12738198
2,0,valid,flashed,HEX3,16085156
3,0,valid,r1,HEX1,142
4,0,valid,r1,HEX2,89
...,...,...,...,...,...
135,9,valid,flashed,HEX3,3
136,10,total,flashed,HEX1,1
137,10,valid,flashed,HEX3,1
138,11,total,flashed,HEX3,1


In [8]:
# Slice count per row = bin * frequency, stored on df as a new 'n_slices' column.
# NOTE(review): presumably 'bin' is the number of slices per read - confirm.
df['n_slices'] = df['bin'] * df['frequency']

# Sum n_slices for every (sample, read_type, stat) combination.
slice_totals = df.groupby(['sample', 'read_type', 'stat'])['n_slices'].sum().reset_index()

# Spell out the read labels shown in the legend: r1 -> read_1, r2 -> read_2.
read_labels = slice_totals['read_type'].str.replace('r1', 'read_1').str.replace('r2', 'read_2')
df_summarise = slice_totals.assign(read_type=read_labels)

print('Total vs Valid slices (in silico digested read pairs)')
# One row of bars per sample: n_slices for each stat, coloured by read type.
slices_chart = alt.Chart(df_summarise).mark_bar().encode(
    x=alt.X('n_slices'),
    y=alt.Y('stat'),
    color=alt.Color('read_type'),
    row=alt.Row('sample'),
)
slices_chart

Total vs Valid slices (in silico digested read pairs)


In [9]:
# Keep only the 'valid' rows and the columns needed for the histogram.
valid_rows = df[df['stat'] == 'valid'][['sample', 'bin', 'read_type', 'frequency']]

df_hist = valid_rows.assign(
    # The 1e-12 offset keeps log10 finite if a frequency is 0.
    log10_count=np.log10(valid_rows['frequency'] + 1e-12),
    # Spell out the read labels shown in the legend: r1 -> read_1, r2 -> read_2.
    read_type=valid_rows['read_type'].str.replace('r1', 'read_1').str.replace('r2', 'read_2'),
)

# One row of bars per sample: log10 frequency for each bin (treated as nominal),
# coloured by read type.
hist_chart = alt.Chart(df_hist).mark_bar().encode(
    x=alt.X('bin:N'),
    y=alt.Y('log10_count'),
    color=alt.Color('read_type'),
    row=alt.Row('sample'),
)
hist_chart

# CCanalyser statistics

In [26]:
# Load the CCanalyser statistics table; the first TSV column is used as the row index.
# TODO: the index labels pack several '|'-separated fields; splitting them into
# separate columns was sketched here but is not implemented.
ccanalyser_stats_path = os.path.join(directory, 'ccanalyser_stats.tsv')
df = pd.read_csv(ccanalyser_stats_path, sep='\t', index_col=0)

In [27]:
# Turn the row index into a 'filtering_step' column and make its labels readable
# by replacing the '|' field separator with a space.
# regex=False is explicit because '|' is a regex metacharacter (alternation) and
# the default of `regex` differs between pandas versions; a literal replacement
# is what is intended here.
# rename_axis names the index first, so the column does not depend on pandas'
# default 'index' name for an unnamed index.
df = (df.rename_axis('filtering_step')
        .reset_index()
        .assign(filtering_step=lambda stats: stats['filtering_step'].str.replace('|', ' ', regex=False)))
df

Unnamed: 0,filtering_step,mapped,multimapping_slices,number_of_capture_slices,number_of_slices_in_blacklisted_region,number_of_slices_in_exclusion_region,unique_capture_sites,unique_fragments,unique_slices
0,HEX1 flashed mapped,34324014,0,14492395,1013850,2148692,36,18864138,34034001
1,HEX1 flashed contains_single_capture,22911045,0,11247261,679963,1998621,36,11071466,22704698
2,HEX1 flashed contains_capture_and_reporter,17979023,0,8825347,0,0,35,8776142,17928859
3,HEX1 flashed duplicate_filtered,15096704,0,7390215,0,0,35,7387304,15051398
4,HEX1 pe mapped,11705741,0,4260898,388815,736297,36,5956045,11298829
5,HEX1 pe contains_single_capture,4074992,0,1906717,160582,503928,36,1446655,3904972
6,HEX1 pe contains_capture_and_reporter,2982835,0,1447825,0,0,35,1096433,2948985
7,HEX1 pe duplicate_filtered,2155161,0,996575,0,0,35,992211,2127475
8,HEX2 flashed mapped,25614884,0,11015957,775474,1594799,36,14027624,25402373
9,HEX2 flashed contains_single_capture,17577583,0,8627333,531337,1491775,36,8504479,17423573


# Overall stats

In [12]:
# Reload the full combined statistics table (no stat_type filtering this time).
df = pd.read_csv(os.path.join(directory, 'combined_stats.tsv'), sep='\t', index_col=0)

# One row of bars per sample: read pairs for each stat_type, coloured by read type.
overall_chart = alt.Chart(df).mark_bar().encode(
    x=alt.X('read_pairs'),
    y=alt.Y('stat_type', sort=None),  # sort=None disables Altair's default sorting of the axis
    color=alt.Color('read_type'),
    order=alt.Order('read_type', sort='ascending'),
    row=alt.Row('sample'),
)
overall_chart.interactive()