# CHAMPS Dataset Scalar Coupling

- Michael Follari
- [Predicting Molecular Properties](https://www.kaggle.com/c/champs-scalar-coupling)
- UNCG Physics 2020
- Dr. Ajay Covell

In [17]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

More plot styling can be found [here.](https://matplotlib.org/tutorials/introductory/customizing.html)

In [2]:
plt.style.use('seaborn')
plt.rcParams["figure.figsize"] = [16,9]
plt.rcParams.update({'font.size': 16})

# Data Sets
* structures.csv - `structures_df` - Contains the xyz coordinates of each atom within each molecule
* train.csv - `train_df` - Contains the type and scalar_coupling_constant between every atoms pair within each molecule.

### The following code imports both the above datasets, merges, and adds additional columns.
* Only added column is currently displacement between the two atoms in the bond.

In [3]:
structure_path = 'D:\data\champs\zip\structures.zip'
train_path = 'D:\data\champs\zip\\train.zip'
test_path = 'D:\data\champs\zip\\test.zip'
train_bond_path = 'D:\data\champs\zip\\train_bond.gz'
test_bond_path = 'D:\data\champs\zip\\test_bond.gz'

In [4]:
# Load in Structure and Train CSV files, merge, calculate values, and save to Molecules CSV
def merge_struct_dataset(path_struct, path_train, path_merged):
    
    # load in struct and train datasets
    structures_df = pd.read_csv(path_struct)
    train_df = pd.read_csv(path_train)

    # Merge structure data onto train_df for each atom (atom_index_0 and atom_index_1). Hold in mol_df
    mol_df = train_df.merge(structures_df, left_on=['molecule_name','atom_index_0'], right_on=['molecule_name','atom_index'])
    mol_df = mol_df.merge(structures_df, left_on=['molecule_name','atom_index_1'], right_on=['molecule_name','atom_index'])

    # drop extra columns from merge and rename
    mol_df.drop(['atom_index_x','atom_index_y'], axis=1, inplace=True)
    mol_df.rename(columns={'atom_x':'atom_0','atom_y':'atom_1','x_x':'x_0','y_x':'y_0','z_x':'z_0','x_y':'x_1','y_y':'y_1','z_y':'z_1'}, inplace=True)
    
    # Append new columns with calculated values
    mol_df = append_molecule_calculations(mol_df)
    
    # Save to CSV
    mol_df.to_csv(path_merged, compression="gzip")
    
# adds new columns with calculated values to molecule df
def append_molecule_calculations(mol_df):
    mol_df = calculate_molecule_displacement(mol_df)
    return mol_df
    
# calculates the displacement for each atom interaction 
def calculate_molecule_displacement(mol_df):
    mol_df['displacement'] = mol_df.apply(lambda row: calc_disp(row), axis=1)
    return mol_df
    
# calculcates displacement on a passed row
def calc_disp(row):
    return np.linalg.norm(np.array([row['x_1']-row['x_0'],row['y_1']-row['y_0'],row['z_1']-row['z_0']]))

### Merge strucutre data with train and test data sets.
* Only need to do once to generaete and save to dataframes as CSV files, then load as normla CSV

In [5]:
# merge_struct_dataset(structure_path, train_path, train_bond_path)
# merge_struct_dataset(structure_path, test_path, test_bond_path)

#### Load merged datasets

In [6]:
bond_df = pd.read_csv( train_bond_path )

In [None]:
# bond_df

In [8]:
# test_bond_df = pd.read_csv( test_bond_path )
# test_bond_df.head()

In [10]:
# test_df = pd.read_csv( test_path )
# struct_df = pd.read_csv( structure_path )

# Exploration
* Simple Regression of Scalar Coupling & Atom Displacement.

In [121]:
def score_type( y_test, y_calc ):
    y = list(zip(y_test, y_calc))
    error = sum( [abs(i - j) for i, j in y] )
    return np.log10( error / len(y) )

def score_total(sum_score, num_type):
    return sum_score / num_type

def score(types_data):
    
    total_score = 0
    for typ in types_data:
        total_score += score_type(typ)
    return total_score / len(types_data)

### The different kinds of bonds present in the data

In [12]:
bond_types = bond_df.type.unique()
print(bond_types)

['1JHC' '2JHH' '1JHN' '2JHN' '2JHC' '3JHH' '3JHC' '3JHN']


#### Fetching and train/test splitting of data for regressions

In [100]:
def get_xy(df, bond_type):
    df = df[ df.type == bond_type].sort_values(by=['displacement'])
    # Split and return into x and y arrays
    x = df['displacement'].values
    y = df['scalar_coupling_constant'].values
    return [x,y]

def get_xy_split(df, bond_type):
    x, y = get_xy(df, bond_type)
    return train_test_split( x, y, random_state=42)

#### Plotting data points and models

In [106]:
def plot_model(x_test, y_test, y_regs, title, x_bound=[None,None], y_bound=[None,None]):
    
    xmin = x_test.min() if x_bound[0] is None else x_bound[0]
    xmax = x_test.max() if x_bound[1] is None else x_bound[1]
    ymin = y_test.min() if y_bound[0] is None else y_bound[0]
    ymax = y_test.max() if y_bound[1] is None else y_bound[1]
    
    plt.scatter(x_test,y_test, c='b')
    
    for y_reg in y_regs:
        plt.plot(x_test, y_reg, linewidth=3, linestyle='solid')
    
    plt.xlim([xmin * 0.99,xmax * 1.01])
    plt.ylim([ymin * 0.99,ymax* 1.01])
    plt.title(title, size=22)
    plt.xlabel('Displacement', size=22)
    plt.ylabel('Scalar Coupling Constant', size=22)
    plt.show()

# Linear Regression
- [scikit Learn Linear Regression](https://scikit-learn.org/stable/modules/linear_model.html)

In [114]:
def linear_regression(x_train, y_train, x_test):
    # Train linear regression model
    linear_reg = train_linear_regression(x_train, y_train)
    
    # Use linear regression to generate y value
    return np.array(line_data_points(x_test,linear_reg.coef_, linear_reg.intercept_)).T[0]
    
def train_linear_regression(x,y):
    # Linear Regression fit
    reg = linear_model.LinearRegression()
    reg.fit( x.reshape(-1,1), y )
    return reg

def line_data_points(x_array, coef, intercept):
    return [line_func(x, coef, intercept) for x in x_array]

def line_func(x, coef, intercept):
    return x*coef + intercept

# Random Forest Regression
* [scikit Learn Random Forest Regression](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)

In [109]:
def random_forest(x_train, y_train, x_test):
    # Train Random Forest regression model
    random_forest_reg = train_rand_forest(x_train, y_train)
    
    # Use Random Forest regression to generate y values
    return random_forest_reg.predict(x_test.reshape(-1,1))

def train_rand_forest(x, y):
    # Train Random Forest
    reg = RandomForestRegressor(max_depth=2, random_state=0)
    reg.fit(x.reshape(-1,1), y)
    return reg

### Trains models for data

In [119]:
def evaluate_models(plot=False):

    title = "Regression of Scalar-Coupling-Constant vs. Bond Displacement : "

    sum_score = 0;
    for bond_type in bond_types:

        # Get x y test/train. Displacement and Bond strength
        x_train, x_test, y_train, y_test = get_xy_split(bond_df, bond_type )
        x_test.sort() # Sort training values (mostly for plotting)

        # Train and predict Linear regression
        y_linear_regression = linear_regression(x_train, y_train, x_test)

        # Train and predict Random Forest
        y_random_forest_regression = random_forest(x_train, y_train, x_test)

        # Calculate score for regression on type
        linear_score = score_type( y_test, y_linear_regression) 
        forest_score = score_type( y_test, y_random_forest_regression) 

        # Print result
        print(bond_type + ' has score :     Linear: ', linear_score, '     Forest: ', forest_score)

        # Plot only if desired
        if plot:
            y_regs = [y_linear_regression, y_random_forest_regression]
            plot_model(x_test, y_test, y_regs, title+bond_type,[x_train.min(),x_train.max()], [y_train.min(),y_train.max()])

In [120]:
evaluate_models()

1JHC has score :     Linear:  1.1931855488967258      Forest:  1.1817036573495254
2JHH has score :     Linear:  0.4867483028763783      Forest:  0.5064864649994729
1JHN has score :     Linear:  1.0693895782065483      Forest:  1.0797471051012908
2JHN has score :     Linear:  0.4783139288899586      Forest:  0.49426840518159426
2JHC has score :     Linear:  0.4691411105228811      Forest:  0.4859395946249071
3JHH has score :     Linear:  0.4970802957301517      Forest:  0.5386676362207091
3JHC has score :     Linear:  0.4126741785216499      Forest:  0.42200057386748496
3JHN has score :     Linear:  0.0053978364621969765      Forest:  0.018640509332309395


<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>
<br/>

<small><small><small>This holds space so I can scroll further</small></small></small>