In [1]:
import os
import sys
from pathlib import Path

import dash_core_components as dcc
import dash_html_components as html
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from dash.dependencies import Input, Output
from jupyter_dash import JupyterDash

In [2]:
# Make the hmmtuf checkout importable for the project imports that follow.
# The location defaults to the original path and can be overridden with the
# HMMTUF_ROOT environment variable, so the notebook also runs on other machines.
hmmtuf_root = os.environ.get("HMMTUF_ROOT") or "/home/alex/qi3/hmmtuf"
if hmmtuf_root not in sys.path:
    sys.path.append(hmmtuf_root)

In [3]:
from db.sqlite3_db_connector import SQLiteDBConnector
from compute_engine.src.utils import INFO

In [4]:
# Open the SQLite file (path is relative to the notebook's working directory)
db_connector = SQLiteDBConnector(
    db_file="../../play_ground.sqlite3",
)

In [5]:
# List the tables the connector reports, one log line per table
tbl_names = db_connector.get_table_names()

for table_name in tbl_names:
    print(f"{INFO} DB table: {table_name}")

INFO: DB table: distance_metric_type
INFO: DB table: distance_sequence_type
INFO: DB table: repeats
INFO: DB table: hmm_state_types
INFO: DB table: repeats_distances
INFO: DB table: repeats_info
INFO: DB table: gquads_info


In [6]:
# Fetch every row of distance_sequence_type and log it
sql = "SELECT * from distance_sequence_type"
rows = db_connector.fetch_all(sql=sql)

for seq_type_row in rows:
    print(f"{INFO} Sequence type {seq_type_row}")

INFO: Sequence type (1, 'NORMAL')
INFO: Sequence type (2, 'PURINE')
INFO: Sequence type (3, 'AMINO')
INFO: Sequence type (4, 'WEAK_HYDROGEN')


In [7]:
# Snapshot the sequence-type rows before `rows` is reused by the next query
uique_seq_types = list(rows)

In [8]:
# Fetch every row of distance_metric_type and log it
sql = "SELECT * from distance_metric_type"
rows = db_connector.fetch_all(sql=sql)

for metric_row in rows:
    print(f"{INFO} Metric type {metric_row}")

INFO: Metric type (1, 'Hamming', 'ham')
INFO: Metric type (2, 'MLIPNS', 'mlipns')
INFO: Metric type (3, 'Levenshtein', 'lev')
INFO: Metric type (4, 'DamerauLevenshtein', 'damlev')
INFO: Metric type (5, 'JaroWinkler', 'jwink')
INFO: Metric type (6, 'StrCmp95', 'str')
INFO: Metric type (7, 'NeedlemanWunsch', 'nw')
INFO: Metric type (8, 'SmithWaterman', 'sw')
INFO: Metric type (9, 'Gotoh', 'got')
INFO: Metric type (10, 'Jaccard', 'jac')
INFO: Metric type (11, 'Sorensen', 'sor')
INFO: Metric type (12, 'Tversky', 'tve')
INFO: Metric type (13, 'Overlap', 'ov')
INFO: Metric type (14, 'Tanimoto', 'tan')
INFO: Metric type (15, 'Cosine', 'cos')
INFO: Metric type (16, 'MongeElkan', 'mon')
INFO: Metric type (17, 'Bag', 'bag')
INFO: Metric type (18, 'LCSSeq', 'lcsseq')
INFO: Metric type (19, 'LCSStr', 'lcsstr')
INFO: Metric type (20, 'RatcliffObershelp', 'rat')
INFO: Metric type (21, 'ArithNCD', 'ari')
INFO: Metric type (22, 'RLENCD', 'rle')
INFO: Metric type (23, 'BWTRLENCD', 'bwt')
INFO: Metric t

In [9]:
# Snapshot the metric-type rows before `rows` is reused by the next query
uique_dist_types = list(rows)

In [10]:
# Fetch every row of hmm_state_types and log it
sql = "SELECT * from hmm_state_types"
rows = db_connector.fetch_all(sql=sql)

for state_row in rows:
    print(f"{INFO} State type {state_row}")

INFO: State type (1, 'NORMAL')
INFO: State type (2, 'TUF')
INFO: State type (3, 'DELETION')
INFO: State type (4, 'DUPLICATION')


In [11]:
# Copy the state rows, drop DUPLICATION and relabel DELETION as CORE.
# `remove` raises ValueError if the DUPLICATION row is not present.
unique_state_types = list(rows)
unique_state_types.remove((4, 'DUPLICATION'))
unique_state_types = [
    (3, 'CORE') if state == (3, 'DELETION') else state
    for state in unique_state_types
]

for state in unique_state_types:
    print("{0} State type {1}".format(INFO, state))

INFO: State type (1, 'NORMAL')
INFO: State type (2, 'TUF')
INFO: State type (3, 'CORE')


In [12]:
# the application: the JupyterDash instance that the layout and the callback
# in the following cells are attached to
app = JupyterDash(__name__)

In [13]:
# Sanity check: show the (id, name) rows that populate the sequence-type dropdown
print(uique_seq_types)

[(1, 'NORMAL'), (2, 'PURINE'), (3, 'AMINO'), (4, 'WEAK_HYDROGEN')]


In [14]:
# Page layout: a title, two dropdown selectors (sequence type and distance
# metric) and then one panel per state with a row counter and a bar chart.
# Dropdown rows are indexed as row[0] = id (used as value), row[1] = name (label).
selector_components = [
    html.H1("Distances Plot"),
    html.H3("Sequence type"),
    dcc.Dropdown(
        id="dropdown-sequence",
        options=[{"label": row[1], "value": row[0]} for row in uique_seq_types],
        value=uique_seq_types[0][0],
        clearable=False,
    ),
    html.H3("Distance type"),
    dcc.Dropdown(
        id="dropdown-distance",
        options=[{"label": row[1], "value": row[0]} for row in uique_dist_types],
        value=uique_dist_types[0][0],
        clearable=False,
    ),
]

# (heading, counter element id, graph element id) for each state panel
state_panel_specs = (
    ("Normal state", "normal-n-distances", "normal-bar-chart"),
    ("TUF state", "tuf-n-distances", "tuf-bar-chart"),
    ("Core", "core-n-distances", "core-bar-chart"),
)

state_panel_components = []
for heading, counter_id, graph_id in state_panel_specs:
    state_panel_components.extend([
        html.H3(heading),
        html.Div(children=[html.H5("Number of sequences"), html.Div(id=counter_id)]),
        dcc.Graph(id=graph_id),
    ])

app.layout = html.Div(selector_components + state_panel_components)

In [15]:
def create_figure_plot(state_type_id, metric_type_id, sequence_type_id):
    """Build the distance histogram figure for a single HMM state.

    Selects ``value`` from ``repeats_distances`` for the rows where both
    ``hmm_state_id_1`` and ``hmm_state_id_2`` equal ``state_type_id`` and the
    metric and sequence type match, bins the values into 35 bins and draws
    the bin counts as a vertical bar chart positioned at the bin centres.

    Parameters
    ----------
    state_type_id : int
        Id matched against both ``hmm_state_id_1`` and ``hmm_state_id_2``.
    metric_type_id : int
        Id matched against ``metric_type_id``.
    sequence_type_id : int
        Id matched against ``sequence_type_id``.

    Returns
    -------
    tuple
        ``(fig, n_rows)``: the plotly figure and the number of fetched rows.

    Raises
    ------
    TypeError, ValueError
        If one of the ids cannot be converted to ``int``.
    """

    # The ids arrive from a Dash callback, i.e. from the browser, and are
    # interpolated into the SQL text below. Coercing them to int guarantees
    # that nothing but an integer literal can end up in the statement.
    state_type_id = int(state_type_id)
    metric_type_id = int(metric_type_id)
    sequence_type_id = int(sequence_type_id)

    sql = "SELECT value FROM repeats_distances WHERE \
         hmm_state_id_1 = {0} AND  hmm_state_id_2 = {0} \
        AND metric_type_id={1} AND sequence_type_id={2}".format(state_type_id,
                                                                metric_type_id,
                                                                sequence_type_id)

    print("{0} Executing sql={1}".format(INFO, sql))
    rows = db_connector.fetch_all(sql=sql)
    print("{0} Fetched number of rows={1}".format(INFO, len(rows)))

    # np.histogram flattens its input, so the fetched rows are binned directly
    counts, bins = np.histogram(rows, bins=35)
    # replace the 36 bin edges with the 35 bin centres used as bar positions
    bins = 0.5 * (bins[:-1] + bins[1:])

    fig = px.bar(x=bins, y=counts, orientation='v', labels={'x':'distance', 'y':'count'}, range_x=[0,1])
    fig.update_layout(xaxis = dict(
            tickmode = 'linear',
            tick0 = 0.0,
            dtick = 0.15))

    return fig, len(rows)
    
    

In [16]:
@app.callback(
    Output("normal-bar-chart", "figure"),
    Output("normal-n-distances", "children"),
    Output("tuf-bar-chart", "figure"),
    Output("tuf-n-distances", "children"),
    Output("core-bar-chart", "figure"),
    Output("core-n-distances", "children"),
    [
        Input("dropdown-sequence", "value"),
        Input("dropdown-distance", "value"),
    ],
)
def update_bar_chart(seq_type, distance_type):
    """Refresh all three state panels when either dropdown changes.

    Parameters
    ----------
    seq_type
        Current value of the ``dropdown-sequence`` dropdown.
    distance_type
        Current value of the ``dropdown-distance`` dropdown.

    Returns
    -------
    tuple
        Six values in the order of the declared outputs: figure and row
        count for state id 1, then for state id 2, then for state id 3.
    """
    panel_values = []
    # State ids 1, 2 and 3 feed the normal, TUF and core panels respectively
    for state_id in (1, 2, 3):
        figure, n_rows = create_figure_plot(
            state_type_id=state_id,
            metric_type_id=distance_type,
            sequence_type_id=seq_type,
        )
        panel_values.extend((figure, n_rows))

    return tuple(panel_values)

In [17]:
# Run the app. With mode='external' the app is served at a local URL (printed
# in the cell output) to be opened in a browser, not embedded in the notebook.
app.run_server(mode='external', use_reloader=False)

Dash app running on http://127.0.0.1:8050/
INFO: Executing sql=SELECT value FROM repeats_distances WHERE          hmm_state_id_1 = 1 AND  hmm_state_id_2 = 1         AND metric_type_id=1 AND sequence_type_id=1
INFO: Fetched number of rows=189544
INFO: Executing sql=SELECT value FROM repeats_distances WHERE          hmm_state_id_1 = 2 AND  hmm_state_id_2 = 2         AND metric_type_id=1 AND sequence_type_id=1
INFO: Fetched number of rows=683427
INFO: Executing sql=SELECT value FROM repeats_distances WHERE          hmm_state_id_1 = 3 AND  hmm_state_id_2 = 3         AND metric_type_id=1 AND sequence_type_id=1
INFO: Fetched number of rows=318229
INFO: Executing sql=SELECT value FROM repeats_distances WHERE          hmm_state_id_1 = 1 AND  hmm_state_id_2 = 1         AND metric_type_id=1 AND sequence_type_id=2
INFO: Fetched number of rows=189544
INFO: Executing sql=SELECT value FROM repeats_distances WHERE          hmm_state_id_1 = 2 AND  hmm_state_id_2 = 2         AND metric_type_id=1 AND se