# CHAMPS Dataset Scalar Coupling

- Michael Follari
- [Predicting Molecular Properties](https://www.kaggle.com/c/champs-scalar-coupling)
- UNCG Physics 2020
- Dr. Ajay Covell

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model

More plot styling options can be found in the [Matplotlib customization tutorial](https://matplotlib.org/tutorials/introductory/customizing.html).

In [3]:
# Apply the 'classic' matplotlib style sheet to figures drawn after this cell.
plt.style.use('classic')

# Data Sets
* structures.csv - `structures_df` - Contains the xyz coordinates of each atom within each molecule
* train.csv - `train_df` - Contains the coupling `type` and `scalar_coupling_constant` for each atom pair listed within each molecule.

In [4]:
# Load the per-atom structure table. The raw-string prefix keeps the Windows
# backslashes literal instead of relying on unrecognized escapes such as "\d"
# and "\s", which newer Python versions warn about. The path value is unchanged.
structures_df = pd.read_csv(r'D:\data\champs\zip\structures.zip')

In [100]:
# Load the training table of atom pairs. A raw string removes the need for the
# hand-escaped "\\t" (which stopped "\t" from being read as a tab) and for the
# other unrecognized escapes in the path. The path value is unchanged.
train_df = pd.read_csv(r'D:\data\champs\zip\train.zip')

In [10]:
# Preview the structure table (the rendered output elides the middle rows).
structures_df

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.002150,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397
...,...,...,...,...,...,...
2358870,dsgdb9nsd_133885,11,H,-1.454004,-0.967309,1.459246
2358871,dsgdb9nsd_133885,12,H,0.277779,-2.697872,0.195770
2358872,dsgdb9nsd_133885,13,H,2.515854,-1.151784,0.527369
2358873,dsgdb9nsd_133885,14,H,0.013699,1.199431,-1.680192


In [15]:
# Preview the training table (the rendered output elides the middle rows).
train_df

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.807600
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257000
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.254800
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.254300
4,4,dsgdb9nsd_000001,2,0,1JHC,84.807400
...,...,...,...,...,...,...
4659071,4659071,dsgdb9nsd_133884,17,4,2JHC,3.543450
4659072,4659072,dsgdb9nsd_133884,17,5,3JHC,0.568997
4659073,4659073,dsgdb9nsd_133884,17,6,3JHC,1.173370
4659074,4659074,dsgdb9nsd_133884,17,7,2JHC,4.762010


### Merging Structure data into train_df

In [102]:
# Per-atom lookup for the first atom of each pair, keyed by (molecule_name, atom_index_0):
# the element symbol (renamed to atom_0) plus that atom's x, y, z coordinates.
struct_atom_0 = structures_df[['molecule_name', 'atom_index', 'atom', 'x', 'y', 'z']].rename(
    columns={'atom': 'atom_0', 'atom_index': 'atom_index_0'}
)
# Per-atom lookup for the second atom of each pair, keyed by (molecule_name, atom_index_1):
# only the element symbol (renamed to atom_1) is taken here.
# NOTE(review): the x, y, z columns in mol_df therefore belong to atom_0 only; atom_1's
# coordinates are not merged, so a pair distance cannot be computed from mol_df alone.
struct_atom_1 = structures_df[['molecule_name', 'atom_index', 'atom']].rename(
    columns={'atom': 'atom_1', 'atom_index': 'atom_index_1'}
)

# Merge structure information into train_df. The key columns have the same names on
# both sides, so `on=` replaces the duplicated left_on/right_on lists. `validate`
# makes pandas raise if a (molecule_name, atom_index) key is duplicated in the
# structure lookup, which would otherwise silently multiply coupling rows.
mol_df = train_df.merge(struct_atom_0, on=['molecule_name', 'atom_index_0'], validate='many_to_one')
mol_df = mol_df.merge(struct_atom_1, on=['molecule_name', 'atom_index_1'], validate='many_to_one')

# The default inner merge silently drops pairs whose atom is missing from
# structures_df; fail loudly instead of continuing with a shortened table.
assert len(mol_df) == len(train_df), 'some coupling rows have no matching atom in structures_df'

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x,y,z,atom_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.807600,H,0.002150,-0.006031,0.001976,C
1,4,dsgdb9nsd_000001,2,0,1JHC,84.807400,H,1.011731,1.463751,0.000277,C
2,7,dsgdb9nsd_000001,3,0,1JHC,84.809300,H,-0.540815,1.447527,-0.876644,C
3,9,dsgdb9nsd_000001,4,0,1JHC,84.809500,H,-0.523814,1.437933,0.906397,C
4,1,dsgdb9nsd_000001,1,2,2JHH,-11.257000,H,0.002150,-0.006031,0.001976,H
...,...,...,...,...,...,...,...,...,...,...,...
4659071,4659040,dsgdb9nsd_133884,13,17,3JHH,3.062820,H,2.582409,-1.260138,0.293499,H
4659072,4659067,dsgdb9nsd_133884,16,17,3JHH,0.789559,H,-0.084531,1.110807,-1.796741,H
4659073,4659038,dsgdb9nsd_133884,13,14,3JHH,1.005250,H,2.582409,-1.260138,0.293499,H
4659074,4659039,dsgdb9nsd_133884,13,15,3JHH,1.005260,H,2.582409,-1.260138,0.293499,H


In [103]:
# Preview the merged table: the train_df columns plus atom_0, x, y, z and atom_1.
mol_df

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x,y,z,atom_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.807600,H,0.002150,-0.006031,0.001976,C
1,4,dsgdb9nsd_000001,2,0,1JHC,84.807400,H,1.011731,1.463751,0.000277,C
2,7,dsgdb9nsd_000001,3,0,1JHC,84.809300,H,-0.540815,1.447527,-0.876644,C
3,9,dsgdb9nsd_000001,4,0,1JHC,84.809500,H,-0.523814,1.437933,0.906397,C
4,1,dsgdb9nsd_000001,1,2,2JHH,-11.257000,H,0.002150,-0.006031,0.001976,H
...,...,...,...,...,...,...,...,...,...,...,...
4659071,4659040,dsgdb9nsd_133884,13,17,3JHH,3.062820,H,2.582409,-1.260138,0.293499,H
4659072,4659067,dsgdb9nsd_133884,16,17,3JHH,0.789559,H,-0.084531,1.110807,-1.796741,H
4659073,4659038,dsgdb9nsd_133884,13,14,3JHH,1.005250,H,2.582409,-1.260138,0.293499,H
4659074,4659039,dsgdb9nsd_133884,13,15,3JHH,1.005260,H,2.582409,-1.260138,0.293499,H


# Exploration 1
* Linear Regression of Bond Type & Magnitude of Bond Distance.

### The different kinds of bonds present in the data

In [8]:
# List each distinct value of the `type` column, in order of first appearance.
coupling_types = train_df['type'].unique()
print(coupling_types)

['1JHC' '2JHH' '1JHN' '2JHN' '2JHC' '3JHH' '3JHC' '3JHN']


In [43]:
# Keep only the rows whose coupling type is '2JHH'. The boolean mask is built from
# train_df itself: the previous mask referenced `champs_df`, a name this notebook
# never defines, so the cell failed with NameError on a fresh kernel.
twojhh_df = train_df[train_df['type'] == '2JHH']

In [45]:
# Preview the rows selected for type '2JHH' (the rendered output elides the middle rows).
twojhh_df

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257000
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.254800
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.254300
5,5,dsgdb9nsd_000001,2,3,2JHH,-11.254100
6,6,dsgdb9nsd_000001,2,4,2JHH,-11.254800
...,...,...,...,...,...,...
4658880,4658880,dsgdb9nsd_133882,9,10,2JHH,-7.875290
4658937,4658937,dsgdb9nsd_133883,9,10,2JHH,-0.753917
4658989,4658989,dsgdb9nsd_133883,15,16,2JHH,-8.667180
4659004,4659004,dsgdb9nsd_133884,9,10,2JHH,-0.833881


In [47]:
# Show just the first atom's index next to the coupling constant for the 2JHH rows.
twojhh_df.loc[:, ['atom_index_0', 'scalar_coupling_constant']]

Unnamed: 0,atom_index_0,scalar_coupling_constant
1,1,-11.257000
2,1,-11.254800
3,1,-11.254300
5,2,-11.254100
6,2,-11.254800
...,...,...
4658880,9,-7.875290
4658937,9,-0.753917
4658989,15,-8.667180
4659004,9,-0.833881


# Linear Regression
- [Scikit Learn Linear Regression](https://scikit-learn.org/stable/modules/linear_model.html)