# Plot gallery

## How to use this notebook

We use the test dataset stored [here](https://drive.google.com/file/d/1CTwrjO2dPWqISxcEyCJ1oj_EE1IaDrmI/view?usp=share_link). 

Store the data in the `data` folder, as follows:

```
/main folder
    /data
        sample1.json
        sample2.json
    gallery.ipynb
```

## Imports

In [None]:
%load_ext autoreload
%autoreload 2

import os, json, sys
sys.path.append(os.path.abspath(os.path.join('..')))
import pandas as pd

import draw


# Load the dataset through draw.load_dataset() and attach it to a fresh Study.
data = draw.load_dataset()

study = draw.Study()
study.df = data
# Take the first row of the dataset as the default sample / reference / section / family
# used by the plotting cells below.
sample, reference, section, family = study.df.iloc[0][['sample', 'reference', 'section', 'family']]

# Folder receiving the exported HTML figures. It is resolved against the current
# working directory instead of a user-specific absolute path, so the notebook runs
# on any machine (assumes the notebook is launched from its own folder — TODO confirm).
path_figs = os.path.abspath('plots_figs')
# Create the folder on first run; os.listdir would raise if it did not exist.
os.makedirs(path_figs, exist_ok=True)
# remove all html files in path_figs
for file in os.listdir(path_figs):
    if file.endswith('.html'):
        os.remove(os.path.join(path_figs, file))
dim = (600, 400)
print("finished")


In [None]:
# Columns kept from the filtered dataframe.
selected_columns = ['sample','reference','section','sequence','sub_rate','deltaG','family','num_aligned','DMS_conc_mM']

# Each filter takes one value or a list of values (sample(s), reference(s), section(s), base type(s)).
filtered_rows = study.get_df(
    sample='65degrees_1_S20_L001',
    reference=['3042-O-flank_1=hp1-DB', '3043-CC-flank_1=hp1-DB'],
    section='ROI',
    base_type=['A','C'],
)
data = filtered_rows[selected_columns].reset_index(drop=True)

## Mutation fraction

In [None]:
%autoreload 2
# Call study.mutation_fraction for the default sample/reference on the 'ROI' section
# and display the figure it returns under the 'fig' key.
mutation_fraction_out = study.mutation_fraction(sample=sample, reference=reference, section='ROI')
fig = mutation_fraction_out['fig']
fig.show()

In [None]:
%reload_ext autoreload
# Call study.mutation_fraction_identity for the default sample/reference on the 'ROI'
# section and display the figure it returns under the 'fig' key.
mutation_identity_out = study.mutation_fraction_identity(sample=sample, reference=reference, section='ROI')
fig = mutation_identity_out['fig']
fig.show()

## Mutation fraction delta

In [None]:
# Call study.mutation_fraction_delta on two samples for a single reference, 'full' section.
fig = study.mutation_fraction_delta(
    sample = ['65degrees_1_S20_L001','5degrees_2_S9_L001'],
    reference =  '3042-O-flank_1=hp1-DB',  # select one or multiple reference(s)
    section='full'
)['fig']
# Show the figure in a separate statement: chaining .show() onto the assignment
# bound `fig` to the return value of show() instead of to the figure itself.
fig.show()


## DeltaG vs mutation fraction

In [None]:
# Call study.deltaG_vs_sub_rate for the default sample and family (A and C bases,
# 'ROI' section), display the figure and export it as HTML into path_figs.
deltaG_out = study.deltaG_vs_sub_rate(
    sample=sample,
    section='ROI',
    family=family,
    base_type=['A','C'],
)
fig = deltaG_out['fig']
fig.show()
fig.write_html(os.path.join(path_figs, 'deltaG_vs_sub_rate.html'))

## Aligned reads per reference

In [None]:
%reload_ext autoreload

# Call study.num_aligned_reads_per_reference_frequency_distribution for the default
# sample on the 'full' section, display the figure and export it as HTML into path_figs.
aligned_reads_out = study.num_aligned_reads_per_reference_frequency_distribution(sample=sample, section='full')
fig = aligned_reads_out['fig']

fig.show()
fig.write_html(os.path.join(path_figs, 'num_aligned_reads_per_reference_frequency_distribution.html'))

## Mutations per read per sample

In [None]:
# Call study.mutations_per_read_per_sample for the default sample, display the
# figure and export it as HTML into path_figs.
mutations_per_read_out = study.mutations_per_read_per_sample(sample=sample)
fig = mutations_per_read_out['fig']

fig.show()

fig.write_html(os.path.join(path_figs, 'mutations_per_read_per_sample.html'))

In [None]:
# Arguments common to both calls below.
shared_filters = dict(
    experimental_variable='temperature_k',
    reference=reference,
    section='ROI',
)

# Figure: additionally restricted with base_type=['A','C'] and base_pairing=False.
study.experimental_variable_across_samples(
    **shared_filters,
    base_type=['A','C'],
    base_pairing=False,
)['fig'].show()

# Table: the 'data' entry of a second call made WITHOUT the base_type / base_pairing
# arguments. NOTE(review): it may therefore not match the figure above — confirm intent.
study.experimental_variable_across_samples(**shared_filters)['data']

In [None]:
# Samples passed to study.compare_mutation_profiles for the default reference.
profile_samples = [
    '10degrees_2_S11_L001',
    '65degrees_1_S20_L001',
    '45degrees_2_S19_L001',
    '10degrees_1_S10_L001',
]
study.compare_mutation_profiles(sample=profile_samples, reference=reference, section='full')['fig'].show()

In [None]:
# Distinct combinations of the listed metadata columns found in the study dataframe.
metadata_columns = ['DMS_conc_mM','temperature_k','buffer','cell_line','exp_env','inc_time_tot_secs','sample']
study.df[metadata_columns].drop_duplicates()

In [None]:
# Display the column labels of the study dataframe.
study.df.columns

In [None]:
import tqdm
# Time study.get_df under three filter combinations (10 calls each, one tqdm bar per
# combination): base_index only, base_type only, then both together.
filter_sets = (
    {'base_index': range(20,40)},
    {'base_type': 'AC'},
    {'base_index': range(20,40), 'base_type': 'AC'},
)
for filters in filter_sets:
    for _ in tqdm.tqdm(range(10)):
        data = study.get_df(**filters)


In [None]:
import numpy as np
def __find_base_in_sequenceNumpy(sequence, base_type):
    """Find the index of a base in a sequence
    
    Example:
    sequence = 'ACACGATCGATCGATCACGATCAGGCATGCTACG'
    base_type = 'AC'
    >>> array([ 0,  1,  2,  3,  5,  7,  9, 11, 13, 15, 16, 17, 19, 21, 22, 25, 26,
       29, 31, 32])
    """
    sequence_arr = np.array(list(sequence))
    return np.where(np.isin(sequence_arr, list(base_type)))[0]

def __find_base_in_sequence(sequence, base_type):
    return [i for i, base in enumerate(sequence) if base in base_type]

# Rough speed comparison: call each implementation 100000 times on the same input,
# pure-Python version first, then the NumPy version (one tqdm bar per implementation).
benchmark_sequence = 'ACACGATCGATCGATCACGATCAGGCATGCTACG'
for finder in (__find_base_in_sequence, __find_base_in_sequenceNumpy):
    for _ in tqdm.tqdm(range(100000)):
        finder(benchmark_sequence, 'AC')