This kernel uses two approaches:
 - [Pandas-Bokeh](https://github.com/PatrikHlobil/Pandas-Bokeh) - if possible
 - [Native Bokeh](https://bokeh.pydata.org/en/latest/docs/user_guide.html) - otherwise

## 1. Load libraries

In [None]:
#!pip install pandas-bokeh  # requires internet access to be turned on for the kernel; alternatively install the package from the kernel settings

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from ipywidgets import (Checkbox, interactive, interactive_output, 
                        ToggleButton, IntSlider, VBox, HBox)
import ipywidgets as widgets

import pandas_bokeh
#pandas_bokeh.output_notebook()

from bokeh.models import (ColumnDataSource, HoverTool, Legend, 
                          LegendItem, LabelSet, Label, Panel, Tabs)
from bokeh.io import show, output_notebook, push_notebook
output_notebook()
from bokeh.plotting import figure
from bokeh.layouts import layout, grid, gridplot
from bokeh.transform import dodge
from bokeh.core.properties import value
from bokeh.models.widgets import Paragraph, Div

## 2. Load data

In [None]:
# Coupling pairs used throughout the notebook; the `id` column becomes the index.
train = pd.read_csv('../input/train.csv', index_col='id')
# test = pd.read_csv('../input/test.csv', index_col='id')
# Auxiliary tables; each one is left-joined onto `train` by the *_merge
# helpers defined in the next cell.
structures = pd.read_csv('../input/structures.csv')
scalar_coupling_contributions = pd.read_csv('../input/scalar_coupling_contributions.csv')
potential_energy = pd.read_csv('../input/potential_energy.csv')
mulliken_charges = pd.read_csv('../input/mulliken_charges.csv')
magnetic_shielding_tensors = pd.read_csv('../input/magnetic_shielding_tensors.csv')
dipole_moments = pd.read_csv('../input/dipole_moments.csv')

In [None]:
def scalar_coupling_contributions_merge(df):
    """Attach the scalar coupling contribution columns to ``df``.

    Left-joins the module-level ``scalar_coupling_contributions`` frame,
    matching rows on ``molecule_name``, ``atom_index_0``, ``atom_index_1``
    and ``type``.  Rows of ``df`` without a match are kept.

    Returns the merged frame; ``df`` itself is not modified.
    """
    pair_keys = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type']
    return df.merge(scalar_coupling_contributions, how='left', on=pair_keys)

def structures_merge(df, atom_idx):
    """Attach the ``structures`` columns for one atom of each pair.

    Left-joins the module-level ``structures`` frame on ``molecule_name``
    and ``atom_index_{atom_idx}`` (matched against ``atom_index``).  The
    helper ``atom_index`` column is removed afterwards and ``atom``, ``x``,
    ``y``, ``z`` are renamed to ``struct_<name>_{atom_idx}`` so the function
    can be applied once per atom position without column clashes.

    Returns the merged frame; ``df`` itself is not modified.
    """
    renamed = {
        name: f'struct_{name}_{atom_idx}' for name in ('atom', 'x', 'y', 'z')
    }

    merged = df.merge(
        structures,
        how='left',
        left_on=['molecule_name', f'atom_index_{atom_idx}'],
        right_on=['molecule_name', 'atom_index'],
    )

    return merged.drop(columns='atom_index').rename(columns=renamed)

def potential_energy_merge(df):
    """Attach the per-molecule columns of ``potential_energy`` to ``df``.

    Left-joins the module-level ``potential_energy`` frame on
    ``molecule_name``.  Returns the merged frame; ``df`` is not modified.
    """
    return df.merge(potential_energy, how='left', on=['molecule_name'])

def mulliken_charges_merge(df, atom_idx):
    """Attach the Mulliken charge of one atom of each pair.

    Left-joins the module-level ``mulliken_charges`` frame on
    ``molecule_name`` and ``atom_index_{atom_idx}`` (matched against
    ``atom_index``), drops the helper ``atom_index`` column and renames
    ``mulliken_charge`` to ``mulliken_charge_{atom_idx}``.

    Returns the merged frame; ``df`` itself is not modified.
    """
    merged = df.merge(
        mulliken_charges,
        how='left',
        left_on=['molecule_name', f'atom_index_{atom_idx}'],
        right_on=['molecule_name', 'atom_index'],
    )

    charge_column = {'mulliken_charge': f'mulliken_charge_{atom_idx}'}
    return merged.drop(columns='atom_index').rename(columns=charge_column)

def magnetic_shielding_tensors_merge(df, atom_idx):
    """Attach the magnetic shielding tensor of one atom of each pair.

    Left-joins the module-level ``magnetic_shielding_tensors`` frame on
    ``molecule_name`` and ``atom_index_{atom_idx}`` (matched against
    ``atom_index``), drops the helper ``atom_index`` column and renames the
    nine tensor component columns (``XX`` ... ``ZZ``) to
    ``mst_<component>_{atom_idx}``.

    Returns the merged frame; ``df`` itself is not modified.
    """
    components = ('XX', 'YX', 'ZX', 'XY', 'YY', 'ZY', 'XZ', 'YZ', 'ZZ')
    renamed = {comp: f'mst_{comp}_{atom_idx}' for comp in components}

    merged = df.merge(
        magnetic_shielding_tensors,
        how='left',
        left_on=['molecule_name', f'atom_index_{atom_idx}'],
        right_on=['molecule_name', 'atom_index'],
    )

    return merged.drop(columns='atom_index').rename(columns=renamed)

def dipole_moments_merge(df):
    """Attach the per-molecule dipole moment columns to ``df``.

    Left-joins the module-level ``dipole_moments`` frame on
    ``molecule_name`` and renames its ``X``/``Y``/``Z`` columns to
    ``dipole_X``/``dipole_Y``/``dipole_Z``.

    Returns the merged frame; ``df`` itself is not modified.
    """
    renamed = {axis: f'dipole_{axis}' for axis in ('X', 'Y', 'Z')}
    merged = df.merge(dipole_moments, how='left', on=['molecule_name'])
    return merged.rename(columns=renamed)

# Enrich `train` with every auxiliary table.  The order of the calls fixes the
# column order of the resulting frame; per-atom tables are joined once for
# each atom position (0 and 1) of the pair.
train = scalar_coupling_contributions_merge(train)
for atom_idx in (0, 1):
    train = structures_merge(train, atom_idx)
train = potential_energy_merge(train)
for atom_idx in (0, 1):
    train = mulliken_charges_merge(train, atom_idx)
for atom_idx in (0, 1):
    train = magnetic_shielding_tensors_merge(train, atom_idx)
train = dipole_moments_merge(train)

# The auxiliary frames are no longer needed once merged.
del structures, scalar_coupling_contributions, potential_energy
del mulliken_charges, magnetic_shielding_tensors, dipole_moments

# Preview the first five rows, transposed so that columns read as rows.
train.head(5).T

- molecule_name – 85 003 unique values in train. Train and test contain different molecules.
- atom_index_0/1 – the indices of the two atoms of the pair that creates the coupling (and hence the scalar_coupling_constant). 28 unique values.
- type – coupling type between the two atoms in a molecule. 8 unique values.
- **scalar_coupling_constant** – effectively the magnetic interaction between two atoms in a molecule. Target variable.
- from structures.csv (map by molecule_name, atom_index_0/1; the columns are named struct_atom_0/1, struct_x_0/1, ... after the merge):
  - atom_0/1 – element types of the two atoms in a molecule whose interaction must be predicted. One molecule can contain more than two atoms, but it is only necessary to predict the interaction for the two listed in the train and test files. atom_0 - 1 unique value (H - hydrogen), atom_1 - 3 unique values (C - carbon, H - hydrogen, N - nitrogen).
  - x_0/1, y_0/1, z_0/1 – Cartesian coordinates of each atom.
  
  
**--- The data below is available for train only (not for test) ---**


- from scalar_coupling_contributions.csv (map by molecule_name, atom_index_0, atom_index_1, type):
  - fc – high correlation with the target variable
  - sd
  - pso
  - dso
- from potential_energy.csv (map by molecule_name):
  - potential_energy
- from mulliken_charges.csv (map by molecule_name, atom_index_0/1):
  - mulliken_charge_0
  - mulliken_charge_1
- from magnetic_shielding_tensors.csv (map by molecule_name, atom_index_0/1; after the merge each column is named mst_<component>_0/1):
  - XX, YX, ZX
  - XY, YY, ZY
  - XZ, YZ, ZZ
- from dipole_moments.csv (map by molecule_name):
  - dipole_X
  - dipole_Y
  - dipole_Z

## 3.1. Distribution

In [None]:
# Fill colour of each coupling type in the stacked histogram.
colors = {
    '1JHC': '#1f77b4',
    '2JHH': '#ff7f0e',
    '1JHN': '#2ca02c',
    '2JHN': '#d62728',
    '2JHC': '#9467bd',
    '3JHH': '#8c564b',
    '3JHC': '#e377c2',
    '3JHN': '#7f7f7f',
}


def cor_plot(bins, types=None):
    """Build a stacked histogram of ``scalar_coupling_constant`` by type.

    Reads the module-level ``train`` frame and ``colors`` mapping.

    Parameters
    ----------
    bins : int
        Number of equal-width bins shared by all coupling types.
    types : iterable of str, optional
        Coupling types to draw, stacked bottom-to-top in the given order.
        When omitted, every value of ``train['type']`` is used (looked up at
        call time).

    Returns
    -------
    The Bokeh figure; it is not shown by this function.
    """
    if types is None:
        types = train['type'].unique()

    # Common value range so that all types share the same bin edges.
    selected = train.query('type in @types')['scalar_coupling_constant']
    scc_min, scc_max = selected.min(), selected.max()

    # Running top of the stack; each type's bars start where the previous
    # type's bars ended.
    bottom = np.zeros(bins, dtype=int)
    parts = []

    for cop_type in types:
        data = train.query('type == @cop_type')['scalar_coupling_constant'].dropna()
        hist, edges = np.histogram(
            data, density=False, bins=bins, range=(scc_min, scc_max)
        )

        top = bottom + hist

        parts.append(pd.DataFrame({
            'top': top,
            'bottom': bottom,
            'left': edges[:-1],
            'right': edges[1:],
            'color': colors[cop_type],
            'type': cop_type,
        }))

        bottom = top

    # One concat instead of DataFrame.append inside the loop: append was
    # removed in pandas 2.0 and copied the accumulated frame on every call.
    ds = pd.concat(parts) if parts else pd.DataFrame()

    p = figure(
        x_axis_label='Scalar Coupling Constant',
        title='"Scalar Coupling Constant" Distribution by "Type"',
        plot_width=805, plot_height=350
    )

    # NOTE(review): `legend=` with a column name is the spelling used by the
    # Bokeh version this notebook targets; newer releases appear to use
    # `legend_field` / `legend_group` instead - confirm before upgrading.
    p.quad(
        source=ds,
        top='top', bottom='bottom',
        left='left', right='right',
        color='color', line_color='white',
        legend='type'
    )

    p.xgrid.grid_line_color = None
    # Plain numbers instead of scientific notation on the left (count) axis.
    p.left[0].formatter.use_scientific = False

    return p

#del ui, out, tb, slider, p, plot

In [None]:
# Stacked histogram over all coupling types, using 100 bins.
p = cor_plot(100)
show(p);

In [None]:
# One tab per coupling type, each holding a 40-bin histogram of that type alone.
cor_tabs = []

for t in train['type'].unique():
    # a single-element list restricts the plot to this coupling type
    p = cor_plot(40, types=[t])
    cor_tabs.append(Panel(title=t, child=p))

show(Tabs(tabs=cor_tabs))

## 3.2. Count

In [None]:
# How often each atom index occurs in position 0 and in position 1.
# Building the frame from the two Series (not from their bare `.values`)
# aligns the counts on the actual atom index: an index value that occurs in
# only one of the two columns gets a 0 in the other instead of shifting every
# following count by one row (or failing on a length mismatch).
atom_index_value_counts = pd.DataFrame({
    'atom_index_0': train['atom_index_0'].value_counts(),
    'atom_index_1': train['atom_index_1'].value_counts(),
}).sort_index().fillna(0).astype(int)
atom_index_value_counts.index.name = 'Atom Index Num'

aivc = atom_index_value_counts.plot_bokeh(
    kind='bar',
    figsize=(805, 350),
    title='Atom Index Count',
    xlabel='Atom Index Num',
    disable_scientific_axes='y',
    zooming=False, panning=False,
    show_figure=False
);

# Rows per (type, atom_index_0) pair, reshaped to one column per coupling
# type; combinations that never occur stay NaN.
groupby_ta0 = (
    train.groupby(['type', 'atom_index_0'])['atom_index_1']
    .count()
    .unstack('type')
)

groupby_ta0_plot = groupby_ta0.plot_bokeh(
    kind='line',
    figsize=(805, 350),
    title='"Atom Index 0" Count by Type',
    xlabel='Atom Index 0 Num',
    disable_scientific_axes='y',
    zooming=False, panning=False,
    colormap='Paired',
    show_figure=False
);

# Same breakdown for the second atom of the pair.
groupby_ta1 = (
    train.groupby(['type', 'atom_index_1'])['atom_index_0']
    .count()
    .unstack('type')
)

groupby_ta1_plot = groupby_ta1.plot_bokeh(
    kind='line',
    figsize=(805, 350),
    title='"Atom Index 1" Count by Type',
    xlabel='Atom Index 1 Num',
    disable_scientific_axes='y',
    zooming=False, panning=False,
    colormap='Paired',
    show_figure=False
);

# Stack the three figures vertically.
pandas_bokeh.plot_grid([
    [aivc],
    [groupby_ta0_plot],
    [groupby_ta1_plot]
])

del groupby_ta0, groupby_ta1, groupby_ta0_plot, groupby_ta1_plot
del atom_index_value_counts

In [None]:
# Number of rows per coupling type (index) and element of atom 1 (columns).
# Combinations that never occur are filled with 0 so the counts stay integers.
df_group = (
    train.pivot_table(
        values='scalar_coupling_constant',
        index=['type'],
        columns=['struct_atom_1'],
        aggfunc='count',
    )
    .rename_axis(None, axis=1)  # drop the 'struct_atom_1' label of the column axis
    .fillna(0)
    .astype(int)
)

df_group.plot_bokeh(
    kind='bar',
    stacked=True,
    title='"Type" Count by "Atom 1"',
    xlabel='Type', ylabel='',
    legend='top_right',
    figsize=(805, 350),
    disable_scientific_axes='y',
    zooming=False, panning=False
);

del df_group

In [None]:
# Number of rows per element of atom 1 (one row each) and coupling type
# (columns); `struct_atom_1` is kept as a regular column for the x axis.
df_group = (
    train.pivot_table(
        values='scalar_coupling_constant',
        index=['struct_atom_1'],
        columns=['type'],
        aggfunc='count',
    )
    .reset_index()
    .rename_axis(None, axis=1)  # drop the 'type' label of the column axis
    .fillna(0)
)

df_group.plot_bokeh(
    kind='bar',
    x='struct_atom_1',
    stacked=True,
    title='"Atom 1" Count by "Type"',
    xlabel='Atom 1', ylabel='',
    legend="top_right",
    figsize=(805, 350),
    disable_scientific_axes='y',
    zooming=False, panning=False
);

del df_group

## 3.4. Scatter Plots

In [None]:
# The coupling constant drives the circle size, so it is shifted by
# |global minimum| + 1 (which makes every size >= 1 when the minimum is
# negative).  The offset comes from the full column, but only the 10 000
# sampled rows are shifted; `assign` builds a new frame instead of writing
# into a column selection of `train`.
scc_min = abs(train['scalar_coupling_constant'].min())

t_aa_scc = train[
    ['atom_index_0', 'atom_index_1', 'type', 'scalar_coupling_constant']
].sample(10_000)
t_aa_scc = t_aa_scc.assign(
    scalar_coupling_constant=t_aa_scc['scalar_coupling_constant'] + scc_min + 1
)

t_aa_scc.plot_bokeh(
    kind='scatter',

    x='atom_index_0',
    y='atom_index_1',
    category='type',
    size='scalar_coupling_constant',

    xlim=(-3, 30), ylim=(-7, 30),
    figsize=(805, 780),
    title='Correlation between atom_index_0, atom_index_1 and scalar_coupling_constant (circle size) by type (ver.1)',
    xlabel='atom_index_0', ylabel='atom_index_1',
    hovertool=False,
    alpha=0.5,
    line_width=0.2,
    zooming=False, panning=False,
    disable_scientific_axes='y'
);

del t_aa_scc

In [None]:
# One small atom_index_0 / atom_index_1 point plot per coupling type, built
# from a 10 000-row random sample of that type and arranged in a 3-column grid.
t_aa_list = {}
colors = [
    '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
    '#9467bd', '#8c564b', '#e377c2', '#7f7f7f',
]

for i, cop_type in enumerate(train['type'].unique()):
    # atom_index_1 is renamed to the coupling type, which makes the type the
    # name of the plotted y column
    t_aa_df = (
        train.query('type == @cop_type')
        .sample(10_000)[['atom_index_0', 'atom_index_1']]
        .rename(columns={'atom_index_1': cop_type})
    )

    # only the first figure carries the shared title
    title = 'Atom Index corellation by type' if i == 0 else ''

    t_aa_plot = t_aa_df.plot_bokeh(
        kind='point',
        x='atom_index_0',
        ylabel='atom_index_1',
        title=title,
        color=colors[i],
        alpha=0.01,
        xticks=range(0, 29, 5),
        xlim=(-1, 28), ylim=(-1, 28),
        hovertool=False,
        zooming=False, panning=False,
        show_figure=False
    );

    t_aa_list[cop_type] = t_aa_plot

grid_rows = [
    ['1JHC', '1JHN', '2JHH'],
    ['2JHN', '2JHC', '3JHH'],
    ['3JHC', '3JHN'],
]
pandas_bokeh.plot_grid(
    [[t_aa_list[key] for key in row] for row in grid_rows],
    plot_width=268, plot_height=268
);

del t_aa_list, t_aa_df, t_aa_plot

In [None]:
# Per coupling type: scalar_coupling_constant (x) against both atom indices
# (y), drawn from a 5 000-row random sample and laid out two figures per row.
t_aa_list = {}
# one colour pair per coupling type, passed to plot_bokeh as its colormap
colors = [
    ['#1f77b4', '#b45c1f'],
    ['#ff7f0e', '#0e8eff'],
    ['#2ca02c', '#a02ca0'],
    ['#d62728', '#27d6d5'],
    ['#9467bd', '#90bd67'],
    ['#8c564b', '#4b818c'],
    ['#e377c2', '#77e398'],
    ['#7f7f7f', '#de2020'],
]

for i, cop_type in enumerate(train['type'].unique()):
    t_aa_df = train.query('type == @cop_type').sample(5_000)[
        ['atom_index_0', 'atom_index_1', 'scalar_coupling_constant']
    ]

    scc_values = t_aa_df['scalar_coupling_constant']
    index_values = t_aa_df[['atom_index_0', 'atom_index_1']].stack()

    # axis limits: integer-truncated extremes of the sample, padded by 2
    x_lo = scc_values.min().astype(int) - 2
    x_hi = scc_values.max().astype(int) + 2
    y_lo = index_values.min().astype(int) - 2
    y_hi = index_values.max().astype(int) + 2

    t_aa_plot = t_aa_df.plot_bokeh(
        kind='point',
        x='scalar_coupling_constant',
        ylabel='atom_index_0/1',
        title=cop_type,
        colormap=colors[i],
        alpha=0.1,
        xticks=range(x_lo, x_hi, 25),
        xlim=(x_lo, x_hi), ylim=(y_lo, y_hi),
        hovertool=False,
        zooming=False, panning=False,
        show_figure=False
    );

    t_aa_list[cop_type] = t_aa_plot


# Bold heading rendered above the grid.
heading = Div(text="""<b>Correlation between atom_index_0, atom_index_1 and scalar_coupling_constant by type (ver.2)</b>""", width=805, height=15)

grid_rows = [
    ['1JHC', '1JHN'],
    ['2JHH', '2JHN'],
    ['2JHC', '3JHH'],
    ['3JHC', '3JHN'],
]
# NOTE: this rebinds `grid`, which was imported from bokeh.layouts.
grid = pandas_bokeh.plot_grid(
    [[t_aa_list[key] for key in row] for row in grid_rows],
    plot_width=402, plot_height=402, show_plot=False
);

show(layout([
    [heading],
    [grid]
]))

del t_aa_list, t_aa_df, t_aa_plot

In [None]:
# Per coupling type: scalar_coupling_constant (x) against both Mulliken
# charges (y), drawn from a 5 000-row random sample, two figures per row.
t_aa_list = {}
# one colour pair per coupling type, passed to plot_bokeh as its colormap
colors = [
    ['#1f77b4', '#b45c1f'],
    ['#ff7f0e', '#0e8eff'],
    ['#2ca02c', '#a02ca0'],
    ['#d62728', '#27d6d5'],
    ['#9467bd', '#90bd67'],
    ['#8c564b', '#4b818c'],
    ['#e377c2', '#77e398'],
    ['#7f7f7f', '#de2020'],
]

for i, cop_type in enumerate(train['type'].unique()):
    t_aa_df = train.query('type == @cop_type').sample(5_000)[
        ['mulliken_charge_0', 'mulliken_charge_1', 'scalar_coupling_constant']
    ]

    scc_values = t_aa_df['scalar_coupling_constant']
    charge_values = t_aa_df[['mulliken_charge_0', 'mulliken_charge_1']].stack()

    # axis limits: integer-truncated extremes of the sample, padded by 1
    x_lo = scc_values.min().astype(int) - 1
    x_hi = scc_values.max().astype(int) + 1
    y_lo = charge_values.min().astype(int) - 1
    y_hi = charge_values.max().astype(int) + 1

    t_aa_plot = t_aa_df.plot_bokeh(
        kind='point',
        x='scalar_coupling_constant',
        ylabel='mulliken_charge_0/1',
        title=cop_type,
        colormap=colors[i],
        alpha=0.1,
        xticks=range(x_lo, x_hi, 25),
        xlim=(x_lo, x_hi), ylim=(y_lo, y_hi),
        hovertool=False,
        zooming=False, panning=False,
        show_figure=False
    );

    t_aa_list[cop_type] = t_aa_plot


# Bold heading rendered above the grid.
heading = Div(text="""<b>Correlation between mulliken_charge_0, mulliken_charge_1 and scalar_coupling_constant by type</b>""", width=805, height=15)

grid_rows = [
    ['1JHC', '1JHN'],
    ['2JHH', '2JHN'],
    ['2JHC', '3JHH'],
    ['3JHC', '3JHN'],
]
# NOTE: this rebinds `grid`, which was imported from bokeh.layouts.
grid = pandas_bokeh.plot_grid(
    [[t_aa_list[key] for key in row] for row in grid_rows],
    plot_width=402, plot_height=402, show_plot=False
);

show(layout([
    [heading],
    [grid]
]))

del t_aa_list, t_aa_df, t_aa_plot

In [None]:
def cor_plot(field, types=train['type'].unique()):
    """Scatter ``field`` against ``scalar_coupling_constant`` per coupling type.

    For every entry of ``types`` a random sample of 3 000 rows of that type
    is taken from the module-level ``train`` frame and drawn as
    semi-transparent circles in the type's colour.  Clicking a legend entry
    hides the corresponding glyphs.

    Note that this definition replaces the histogram helper of the same name
    defined earlier in the notebook.

    Parameters
    ----------
    field : str
        Column of ``train`` plotted on the x axis.
    types : iterable of str
        Coupling types to draw; defaults to all values of ``train['type']``
        as they were when the function was defined.

    Returns
    -------
    The Bokeh figure; it is not shown by this function.
    """
    palette = {
        '1JHC': '#1f77b4', '2JHH': '#ff7f0e',
        '1JHN': '#2ca02c', '2JHN': '#d62728',
        '2JHC': '#9467bd', '3JHH': '#8c564b',
        '3JHC': '#e377c2', '3JHN': '#7f7f7f',
    }
    target = 'scalar_coupling_constant'

    fig = figure(
        title=f'Corellation between "{field}" and "scalar_coupling_constant" by "type"',
        x_axis_label=field,
        y_axis_label=target,
        plot_width=805, plot_height=805
    )

    for t in types:
        points = train.query('type == @t').sample(3_000)[[field, target]]
        fig.circle(
            x=field,
            y=target,
            source=ColumnDataSource(points),
            legend=t,
            color=palette[t],
            size=5,
            alpha=0.1
        )

    # the legend only exists once glyphs with a legend entry have been added
    fig.legend.location = 'top_left'
    fig.legend.click_policy = 'hide'

    return fig

In [None]:
# Scatter of the `fc` column against the target, all coupling types in one figure.
show(cor_plot('fc'))

In [None]:
# One tab per coupling type, each with the `fc` scatter of that type alone.
cor_tabs = []

for t in train['type'].unique():
    # a single-element list restricts the plot to this coupling type
    p = cor_plot('fc', types=[t])
    cor_tabs.append(Panel(title=t, child=p))

show(Tabs(tabs=cor_tabs))

Native Bokeh cons:
- No native 3D chart (custom ones do not work in Kaggle).
- To build a histogram, you must compute the bins manually.
- No separate color management for the legend and the chart.
- Some Bokeh widgets do not work in Kaggle notebooks.
- Drawing a large number of Bokeh plots in Kaggle notebooks causes a lot of problems.