## Purpose of creating this notebook
Because the dataset has so many variables (columns), I built an **interactive dashboard** to help us explore the data.

This is a work in progress. Please leave feedback to help improve it.

**The dashboard has to be run from your own notebook, so feel free to copy it if you're interested.**

Loading the dataset may take about 3 minutes because it is very large.

Please upvote if you find it helpful.

## Showcasing what the dashboard looks like

1. Select the investment_id you're interested in.
2. Select the two variables (columns) whose association over time_id you'd like to explore.
3. Click the 'Start' button, and the chart will show up!
4. You can zoom in on a subset of the data points by dragging a rectangle over them.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IPython.core.display import display, HTML
import ipywidgets as widgets
from IPython.display import display,clear_output
from ipywidgets import Output, Button
from ipywidgets import TwoByTwoLayout
# Utils widgets
from ipywidgets import Button, Layout, jslink, IntText, IntSlider, Box, VBox

from scipy.stats import pearsonr
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
pio.templates

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Declare a compact dtype for every column up front so pandas does not have
# to infer types while parsing; the intent is to make loading faster.
dtypes_dict = {
    'row_id': 'str',
    'time_id': 'uint16',
    'investment_id': 'uint16',
    'target': 'float32',
    # the 300 anonymised feature columns f_0 ... f_299
    **{f'f_{i}': 'float32' for i in range(300)},
}

# Load the training data, reading only the columns declared above.
df = pd.read_csv(
    '../input/ubiquant-market-prediction/train.csv',
    usecols=dtypes_dict.keys(),
    dtype=dtypes_dict,
)


In [None]:
def _add_histogram_panel(fig, values, label, bar_color, col, y_axis_range):
    """Add a histogram of ``values`` with mean/median markers to row 2 of ``fig``.

    Args:
        fig: Figure created by ``make_subplots`` with a second row of panels.
        values: pandas Series holding the values to bin.
        label: Name used for the histogram trace and in the marker trace names.
        bar_color: Colour of the histogram bars.
        col: Column (1 or 2) of the second subplot row to draw into.
        y_axis_range: Height of the vertical marker lines; the text labels are
            placed at fixed fractions of this height.
    """
    fig.add_trace(go.Histogram(x=values, marker_color=bar_color, name=label),
                  row=2,
                  col=col)

    # (label text, statistic, colour, fraction of y_axis_range for the label)
    markers = (
        ("mean", values.mean(), "RoyalBlue", 0.75),
        ("median", values.median(), "LightSeaGreen", 0.25),
    )
    for stat_name, stat_value, stat_color, height_fraction in markers:
        # Text label for the statistic; x is scaled by 1.2 so the label does
        # not sit exactly on the line.
        # NOTE(review): the 1.2 factor gives no offset when the statistic is 0
        # and shifts left for negative values - confirm this is acceptable.
        fig.add_trace(go.Scatter(
                    x=[stat_value * (1.2)],
                    y=[y_axis_range * height_fraction],
                    text=[stat_name],
                    name=stat_name + "/" + str(label),
                    hoverinfo='skip',
                    mode="text",
                    textfont=dict(color=stat_color)),
                      row=2,
                      col=col)

        # Vertical line marking the statistic.
        fig.add_shape(type="line",
                      x0=stat_value, y0=0, x1=stat_value, y1=y_axis_range,
                      line=dict(color=stat_color, width=2),
                      row=2,
                      col=col)


def draw_scatter(df,invest_id,v1,v2,template,min_time,max_time):
    """Plot two columns of one investment over time, plus their histograms.

    The rows of ``df`` are filtered to ``investment_id == invest_id`` and
    ``min_time <= time_id <= max_time``. The top panel (spanning both columns)
    is a scatter of ``v1`` and ``v2`` against ``time_id``; the bottom panels
    are histograms of ``v1`` and ``v2`` with mean and median markers. The
    Pearson correlation between the two columns is shown as an annotation.

    Args:
        df: DataFrame with ``investment_id``, ``time_id`` and the columns
            named by ``v1`` and ``v2``.
        invest_id: Value of ``investment_id`` to select.
        v1: Name of the first column to plot.
        v2: Name of the second column to plot.
        template: Plotly template name applied to the figure layout.
        min_time: Lowest ``time_id`` to include (inclusive).
        max_time: Highest ``time_id`` to include (inclusive).

    Returns:
        None. The figure is rendered with ``fig.show()``. If the filter
        selects no rows, a message is printed and nothing is drawn.
    """
    # select investment id and time period
    selected = (
        (df['investment_id'] == invest_id)
        & (df['time_id'] >= min_time)
        & (df['time_id'] <= max_time)
    )
    dataset = df.loc[selected]

    # Nothing to plot: say so instead of failing inside pearsonr.
    if dataset.empty:
        print(f"No rows for investment_id {invest_id} with time_id in "
              f"[{min_time}, {max_time}]; nothing to plot.")
        return

    # correlation value; pearsonr needs at least two observations
    if len(dataset) < 2:
        corr_v = 'Pearsons correlation: n/a'
    else:
        corr, _ = pearsonr(dataset[v1], dataset[v2])
        corr_v = 'Pearsons correlation: %.3f' % corr

    # Fixed height used for the mean/median lines and their labels; it is not
    # derived from the actual histogram counts.
    y_axis_range = 200

    # make subplots: one wide panel on top, two panels below
    fig = make_subplots(
    rows=2, cols=2,
    specs=[[{"colspan": 2}, None],[{}, {}]],

    # title
    subplot_titles=('Feature values change over time  '+ "(" + str(v1) + " vs. " + str(v2) + ")",
                    str(v1),
                    str(v2)))

    # scatter plot of both columns against time_id
    time_ids = list(dataset['time_id'])
    for column, color in ((v1, "#DEBAE6"), (v2, "#C54DFD")):
        fig.add_trace(go.Scatter(x=time_ids,
                                 y=list(dataset[column]),
                                 name=column,
                                 marker_color=color,
                                 mode='markers'),
                      row=1,
                      col=1)

    # hist 1 and hist 2, each with mean/median markers
    _add_histogram_panel(fig, dataset[v1], v1, "#DEBAE6", 1, y_axis_range)
    _add_histogram_panel(fig, dataset[v2], v2, "#C54DFD", 2, y_axis_range)

    # add correlation text
    fig.add_annotation(text= corr_v,
                  xref="paper", yref="paper",
                  x=0.15, y=0.95, showarrow=False)

    # layout
    fig.update_layout(title= "investment_id: "+ str(invest_id),
                  template = template,
                 hovermode="x unified")

    # generic x,y axis titles (applied to every subplot)
    fig.update_xaxes(
        title_text = "time id",
        title_font = {"size": 15})

    fig.update_yaxes(
        title_text = "value",
        title_font = {"size": 20},
        title_standoff = 25)

    # override the generic titles for the individual panels
    fig['layout']['xaxis2']['title']='value'
    fig['layout']['xaxis3']['title']='value'
    fig['layout']['yaxis']['title']='value'
    fig['layout']['yaxis2']['title']='frequency'
    fig['layout']['yaxis3']['title']='frequency'

    fig.show()

# --- options and widgets for the interactive dashboard ---

# choices offered by the dropdowns
investment_ids = df['investment_id'].unique()
templates = ["plotly", "plotly_white", "plotly_dark", "ggplot2", "seaborn", "simple_white", "none"]

# time period covered by the loaded data
max_time = df['time_id'].max()
min_time = df['time_id'].min()

# plottable columns: the 300 feature columns followed by the target
features = [f'f_{i}' for i in range(300)]
features.append('target')


# output area for the chart and the button that triggers drawing
output = Output()
start = Button(description="Start")
start.style.button_color = 'lightblue'

v1_widget = widgets.Dropdown(
    options=features,
    value=features[5],
    description='Variable 1',
    disabled=False,
)

v2_widget = widgets.Dropdown(
    options=features,
    value=features[10],
    description='Variable 2',
    disabled=False,
)

template_widget = widgets.Dropdown(
    options=templates,
    value=templates[2],
    description='Style',
    disabled=False,
)

invest_widget = widgets.Dropdown(
    options=sorted(investment_ids),
    value=list(investment_ids)[0],
    description='Invest_id',
    disabled=False,
)

# Slider bounds follow the time_id range found in the data instead of fixed
# numbers. They are widened to multiples of the step so that the first and
# last time_id can always be included in the selection.
time_step = 10
slider_min = (int(min_time) // time_step) * time_step
slider_max = -(-int(max_time) // time_step) * time_step  # ceiling division

# Default selection of 150..750, clamped into the available range.
default_start = min(max(150, slider_min), slider_max)
default_end = max(min(750, slider_max), slider_min)

time_widgets = widgets.IntRangeSlider(
    value=[default_start, default_end],
    min=slider_min,
    max=slider_max,
    step=time_step,
    description='Time period',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d',
)

def click_start(b):
    """Redraw the chart from the current widget selections.

    Registered as the click handler of the Start button; ``b`` is the
    argument passed by the button callback and is not used. The previous
    chart is cleared and the new one is rendered inside ``output``.
    """
    period_start, period_end = time_widgets.value
    with output:
        clear_output()
        draw_scatter(
            df=df,
            invest_id=invest_widget.value,
            v1=v1_widget.value,
            v2=v2_widget.value,
            template=template_widget.value,
            min_time=period_start,
            max_time=period_end,
        )

# Clicking the Start button draws the chart for the current selections.
start.on_click(click_start)

# Shared layout for every row of widgets: a horizontal flex row.
box_layout = Layout(
    display='flex',
    flex_flow='row',
    align_items='stretch',
    width='70%',
)

# Widgets grouped by the row they appear in.
item_a = [invest_widget, time_widgets]
item_b = [v1_widget, v2_widget]
item_d = [template_widget]
item_e = [start]

# One Box per row, created in display order.
box_1, box_0, box_2, box_3 = (
    Box(children=row_items, layout=box_layout)
    for row_items in (item_a, item_b, item_d, item_e)
)

# Stack the rows vertically.
vbox = VBox([box_1, box_0, box_2, box_3])

# Show the controls followed by the area the chart is rendered into.
display(vbox, output)

## Showcasing what the dashboard looks like

1. Select the investment_id you're interested in.
2. Select the two variables (columns) whose association over time_id you'd like to explore.
3. Click the 'Start' button, and the chart will show up!
4. You can zoom in on a subset of the data points by dragging a rectangle over them.
5. The example below shows the scatter plot of **f_100** and **target** for **investment_id 100** over time_id 150 to 750.

In [None]:
# Static example: f_100 against target for investment_id 100, time_id 150-750.
example_settings = dict(
    invest_id=100,
    v1='f_100',
    v2='target',
    template='simple_white',
    min_time=150,
    max_time=750,
)
draw_scatter(df=df, **example_settings)