In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from bokeh.plotting import figure, show, output_notebook, output_file
from bokeh.transform import factor_cmap, factor_mark
import dask.dataframe as dd
from IPython.core.display import display, HTML
import ipywidgets as widgets
from IPython.display import display,clear_output
from ipywidgets import Output, Button
from bokeh.io import show, output_file

# Configure Bokeh to render figures inline in the notebook, so that the
# show() calls in later cells display their charts below the cell.
output_notebook()

# Introduction

This notebook creates visualizations to build an initial understanding of the training dataset.

# Load dataset and make a sample chart
Plot a scatter chart between **time_id** and **f_0**

In [None]:
# Explicit, compact dtypes for every column so that loading the CSV is faster.
# The four id/target columns come first, followed by the 300 feature columns
# f_0 ... f_299, which all share the same float dtype.
dtypes_dict = {
    'row_id': 'str',
    'time_id': 'uint16',
    'investment_id': 'uint16',
    'target': 'float32',
    **{f'f_{feature_no}': 'float32' for feature_no in range(300)},
}

In [None]:
# Parse the training CSV, restricted to the columns listed in dtypes_dict and
# using the dtype declared there for each of them.
df = pd.read_csv(
    '../input/ubiquant-market-prediction/train.csv',
    usecols=list(dtypes_dict),
    dtype=dtypes_dict,
)

In [None]:
# get a smaller random sample (1 %)
# The fixed random_state makes the sample -- and every chart derived from it --
# identical after a kernel restart, so the notebook is reproducible.
df_sample = df.sample(n=int(df.shape[0] * 0.01), random_state=42)

In [None]:
# draw a scatter plot of f_0 against time_id, using the sampled rows
p = figure(width=400, height=400, title="Scatter plot - time_id vs. f_0")

p.circle(
    df_sample['time_id'],
    df_sample['f_0'],
    size=2,
    color="navy",
    alpha=0.5,
)
p.xaxis.axis_label = "time_id"
# the plotted column is f_0, so the axis must say f_0 (it used to read "f_1")
p.yaxis.axis_label = "f_0"

# show the results
show(p)

# Select any two variables you're interested in

So what about the association between any other two variables? **For now, you may need to fork the notebook and run it yourself to try it out.**

It might take you a couple of minutes to load the data since it is huge.

In [None]:
# distinct investment_id values found in the loaded data; they become the
# options of the investment_id dropdown created further down in this cell
investment_ids = df['investment_id'].unique()

# create a function drawing the scatter plot
def draw_scatter(dataset,v1,v2,v3):
    """Show a scatter plot of column `v2` against column `v1` for one investment.

    Parameters
    ----------
    dataset : DataFrame that contains `v1`, `v2` and an 'investment_id' column.
    v1 : name of the column plotted on the x axis.
    v2 : name of the column plotted on the y axis.
    v3 : investment_id to keep; converted with int() before comparing.

    The figure is displayed with show(); nothing is returned.
    """
    # Build the row filter on the source frame and pull each axis column on its
    # own. Selecting [[v1, v2, 'investment_id']] in one go produces duplicate
    # column labels when v1 == v2 or when either is 'investment_id', and the
    # lookups that follow then return DataFrames instead of Series.
    is_selected = dataset['investment_id'] == int(v3)
    x_values = dataset.loc[is_selected, v1]
    y_values = dataset.loc[is_selected, v2]

    p = figure(width=400, height=400, title=f"{v1} vs. {v2} (investment_id = {v3})")

    # add a circle renderer with a size, color, and alpha
    p.circle(x_values, y_values, size=2, color="navy", alpha=0.5)
    p.xaxis.axis_label = v1
    p.yaxis.axis_label = v2

    # show the results
    show(p)
    
# create the input controls: a start button, two column dropdowns (x and y
# axis) and an investment_id dropdown; `output` is the area the chart is
# rendered into
output = Output()
start = Button(description="Start")
column_names = list(dtypes_dict)

# v1 is passed to draw_scatter as the x-axis column and v2 as the y-axis
# column, so the two dropdowns get distinct labels
v1_widget = widgets.Dropdown(
    options=column_names,
    value=column_names[5],
    description='X column:',
    disabled=False,
)

v2_widget = widgets.Dropdown(
    options=column_names,
    value=column_names[10],
    description='Y column:',
    disabled=False,
)

investment_id_options = list(investment_ids)
v3_widget = widgets.Dropdown(
    options=investment_id_options,
    value=investment_id_options[0],
    description='Invest_id:',
    disabled=False,
)

def click_start(b):
    """Click handler: redraw the scatter plot for the current dropdown values.

    `b` is the argument supplied to the click callback; it is not used.
    The previous content of the output area is cleared before drawing.
    """
    selections = (v1_widget.value, v2_widget.value, v3_widget.value)
    with output:
        clear_output()
        draw_scatter(df, *selections)

       
start.on_click(click_start)

# show the three dropdowns, the start button and the output area; clicking the
# start button draws the selected scatter plot inside the output area
display(v1_widget, v2_widget, v3_widget, start, output)

# Not every investment_id has data for every time_id
As the data description of this competition states -

> Not all investment have data in all time IDs.

So how about the distribution of number of time_id per investment_id then?

- The chart below indicates that the distribution is skewed to the right, but some investment_ids have only a handful of time_ids.

In [None]:
# select relevant columns from the frame that is already in memory; `df` was
# loaded with the same dtypes, so re-parsing the huge CSV for these two columns
# would only add loading time
df_time_invest = df[['time_id', 'investment_id']]

# get number of distinct time_ids per investment_id, largest count first
df_time_invest_aggr = (
    df_time_invest
    .groupby(['investment_id'])
    .nunique()
    .reset_index()
    .sort_values(by='time_id', ascending=False)
)

# draw a histogram showing the distribution of number of time_ids grouped by investment_id
# (density=True: bar heights are densities, not raw counts)
hist, edges = np.histogram(df_time_invest_aggr['time_id'], density=True, bins=50)

p = figure(width=400, height=400, title="Distribution of number of time_ids by investment_id")
p.quad(
    top=hist,
    bottom=0,
    left=edges[:-1],
    right=edges[1:],
    line_color="pink",
)
p.xaxis.axis_label = "number of time_ids per investment_id"
p.yaxis.axis_label = "density"

show(p)

# Correlation between the target and each feature variable

Which feature variable has the strongest positive or negative association with the target?

In [None]:
# get correlation data
# Keep only numeric columns before calling corr(): row_id is a string column,
# and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
corr_target = (
    df_sample
    .select_dtypes(include='number')
    .corr()
    .sort_values(by='target')[['target']]
)
# drop the target itself and the two id columns so only features remain
corr_target = corr_target.drop(['target','investment_id','time_id'])

# x, y axis
x = corr_target['target']
factors = list(corr_target.index)

# plot a chart showing correlation between each feature with target
p = figure(width=400, height=2680, y_range=factors, title = "Correlation between each feature variable and the target")

p.circle(x, factors, size=5, fill_color="pink", line_color="blue", line_width=1)
p.xaxis.axis_label = "correlation with target"

show(p)

# Mean and Median of each feature variable

From the chart below, we can see that most of the means are close to 0, while the median values fluctuate along the y axis, which lists the variable names.

In [None]:
# keep only the feature columns; errors='ignore' lets this cell be re-run even
# though it rebinds df_sample (on a second run the columns are already gone)
df_sample = df_sample.drop(
    columns=['target','investment_id','time_id','row_id'],
    errors='ignore',
)
df_sample_aggr = pd.DataFrame({
    'mean': df_sample.mean(),      # get mean of each feature
    'median': df_sample.median(),  # get median of each feature
})
df_sample_aggr = df_sample_aggr.sort_values(by = 'mean') # sort the variable by mean value

# Plot chart showing mean/median per feature
x1 = df_sample_aggr['mean']
x2 = df_sample_aggr['median']

factors = list(df_sample_aggr.index)

p = figure(width=400, height=2680, y_range=factors, title = "Mean and Median of each feature variable")

# the 'mean' series must be drawn from x1; the previous code passed `x`, which
# holds the correlation values computed in the preceding cell
p.circle(x1, factors, size=5, fill_color="pink", line_color="blue", line_width=1 , legend_label='mean')
p.circle(x2, factors, size=5, fill_color="blue", line_color="black", line_width=1, legend_label='median')
p.legend.location = "top_left"

show(p)

# More content is coming
Please leave your feedback or anything you would like me to visualize. I'll give it a try :)

Stay tuned!