In [57]:
import os
import sys
import pandas as pd
import seaborn as sns
import altair as alt
import matplotlib.pyplot as plt
import numpy as np

# Read log files

In [1]:
directory = '../../capturec_test_run_nc_Hex/stats/'

# Initial duplication statistics

In [18]:
df = pd.read_csv(directory + 'deduplication_stats.tsv', sep='\t', index_col=0)
df

Unnamed: 0,Read_pairs_processed,Read_pairs_unique,Read_pairs_removed
HEX1,54719030,44477418,10241611
HEX2,36907169,32011553,4895615
HEX3,49405353,39462390,9942962


In [20]:
df_melt = (df.reset_index()
             .drop(columns='Read_pairs_processed')
             .melt(id_vars=['index'], var_name='stat', value_name='count')
             .rename(columns={'index': 'sample'})
             .assign(sample=lambda df: df['sample']))
(alt.Chart(df_melt)
    .mark_bar()
    .encode(x='count',
            y='sample',
            color='stat'))

# Read pair combination statistics

In [30]:
df = pd.read_csv(directory + 'combined_stats.tsv', sep='\t', index_col=0)
df = df.loc[lambda df: df['stat_type'].isin(['Flashed or unflashed'])]
df

Unnamed: 0,sample,read_type,stat_type,read_pairs
12,HEX1,flashed,Flashed or unflashed,37575235.0
13,HEX1,pe,Flashed or unflashed,6807463.0
14,HEX2,flashed,Flashed or unflashed,27133326.0
15,HEX2,pe,Flashed or unflashed,4820668.0
16,HEX3,flashed,Flashed or unflashed,33348570.0
17,HEX3,pe,Flashed or unflashed,6006281.0


In [33]:
(alt.Chart(df)
    .mark_bar()
    .encode(x='read_pairs',
            y='sample',
            color='read_type'))

# Fastq *in silico* digestion statistics

In [64]:
df = pd.read_csv(directory + 'digestion_stats.tsv', sep='\t', index_col=0)
df

Unnamed: 0_level_0,read_type,0,1,10,2,3,4,5,6,7,8,9,total_read_pairs_processed,total_slices,total_valid_slices
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
HEX1,flashed,18386219.0,3854620.0,0.0,14500566.0,803370.0,29616.0,844.0,33.0,3.0,2.0,0.0,37575235.0,58722661.0,35388781.0
HEX1,read_1,145.0,4927241.0,0.0,1769707.0,107785.0,2573.0,19.0,0.0,0.0,0.0,0.0,6807463.0,9778022.0,8800397.0
HEX1,read_2,256.0,4989809.0,0.0,1702753.0,111621.0,3001.0,29.0,1.0,0.0,0.0,0.0,6807463.0,9707573.0,8742333.0
HEX2,flashed,12871566.0,2794122.0,0.0,10864318.0,581279.0,21487.0,561.0,21.0,0.0,0.0,0.0,27133326.0,42803288.0,26355474.0
HEX2,read_1,90.0,3448744.0,0.0,1287875.0,81944.0,2012.0,8.0,0.0,0.0,0.0,0.0,4820668.0,6991710.0,6278414.0
HEX2,read_2,163.0,3486352.0,0.0,1244481.0,87354.0,2308.0,14.0,1.0,0.0,0.0,0.0,4820668.0,6962989.0,6246684.0
HEX3,flashed,16250257.0,3445543.0,1.0,12964304.0,663690.0,24122.0,648.0,31.0,3.0,2.0,3.0,33348570.0,52042157.0,31465209.0
HEX3,read_1,98.0,4295934.0,0.0,1607707.0,100150.0,2370.0,29.0,0.0,0.0,0.0,0.0,6006281.0,8701958.0,7821423.0
HEX3,read_2,227.0,4360269.0,0.0,1538212.0,104710.0,2842.0,27.0,1.0,0.0,0.0,0.0,6006281.0,8635094.0,7762332.0


In [73]:
df_melt = (df.reset_index()
           [['sample', 'read_type', 'total_slices', 'total_valid_slices']]
           .melt(id_vars=['sample', 'read_type'], value_name='count', var_name='stat'))

(alt.Chart(df_melt)
    .mark_bar()
    .encode(x='count',
            y='sample',
            color='stat'))

In [65]:
df_melt = (df.reset_index()
             .loc[:, 'sample':'9']
             .melt(id_vars=['sample', 'read_type'], var_name='frequency', value_name='count')
             .assign(frequency=lambda df: df['frequency'].astype(int),
                     count=lambda df: np.log(df['count'] + 1e-6))
             .loc[lambda df: df['count'] > 0])

(alt.Chart(df_melt)
    .mark_bar()
    .encode(x='frequency:N',
            y=alt.X('count', title='log10(count)'),
            color='read_type',
            row='sample',
            order=alt.Order('frequency', sort='ascending'))
)

# CCanalyser_statistics

In [100]:
df = pd.read_csv(directory + 'ccanalyser_stats.tsv', sep='\t', index_col=0)
split = (df.reset_index()
           ['index']
           .str.split('|', expand=True)
           .rename(columns={}))

df = pd.concat([df.reset_index(), split], axis=1, ignore_index=True)

In [101]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,HEX1|flashed|mapped,34661476,0,14635802,1023915,2170141,36,19049682,34368512,HEX1,flashed,mapped
1,HEX1|flashed|contains_single_capture,23137322,0,11358334,686728,2018581,36,11180769,22928851,HEX1,flashed,contains_single_capture
2,HEX1|flashed|contains_capture_and_reporter,18156225,0,8912346,0,0,35,8862621,18105530,HEX1,flashed,contains_capture_and_reporter
3,HEX1|flashed|duplicate_filtered,15223262,0,7452086,0,0,35,7449151,15177507,HEX1,flashed,duplicate_filtered
4,HEX1|pe|mapped,11823049,0,4304105,392516,743984,36,6015650,11411871,HEX1,pe,mapped
5,HEX1|pe|contains_single_capture,4115816,0,1925780,162079,509039,36,1461136,3944019,HEX1,pe,contains_single_capture
6,HEX1|pe|contains_capture_and_reporter,3012835,0,1462414,0,0,35,1107462,2978665,HEX1,pe,contains_capture_and_reporter
7,HEX1|pe|duplicate_filtered,2174291,0,1005379,0,0,35,1000984,2146306,HEX1,pe,duplicate_filtered
8,HEX2|flashed|mapped,25880776,0,11130124,783315,1611160,36,14173336,25666163,HEX2,flashed,mapped
9,HEX2|flashed|contains_single_capture,17759721,0,8716748,536729,1507106,36,8592667,17604170,HEX2,flashed,contains_single_capture


# Overall stats

In [83]:
df = pd.read_csv(directory + 'combined_stats.tsv', sep='\t', index_col=0)
(alt.Chart(df)
    .mark_bar()
    .encode(x='read_pairs',
            y=alt.Y('stat_type', sort=None),
            color='read_type',
            order=alt.Order('read_type', sort='ascending'),
            row='sample')
    .interactive()
)