In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import gc
import psutil
from ase.io import read
import ase
import os
from heapq import nsmallest
%matplotlib inline
print(os.listdir("../../inputs"))
print('ase version', ase.__version__)

def show_ram_usage():
    py = psutil.Process(os.getpid())
    print('RAM usage: {} GB'.format(py.memory_info()[0]/2. ** 30))
show_ram_usage()

['test.csv', 'train.csv', 'potential_energy.csv', 'magnetic_shielding_tensors.csv', 'scalar_coupling_contributions.csv', 'sample_submission.csv', 'mulliken_charges.csv', 'structures.csv', 'dipole_moments.csv', 'structures']
ase version 3.17.0
RAM usage: 0.12063217163085938 GB


In [2]:
%%time
train_dtypes = {
    'molecule_name': 'category',
    'atom_index_0': 'int8',
    'atom_index_1': 'int8',
    'type': 'category',
    'scalar_coupling_constant': 'float32'
}
# Load the data
train = pd.read_csv('../../inputs/train.csv', index_col='id', dtype=train_dtypes)
# ---------------------------------------------------------------
show_ram_usage()

# Let's work with a single dataframe 1JHN to begin with

def build_type_dataframe(base, coupling_type):
    base = base[base['type'] == coupling_type].drop('type', axis=1).copy()
    #base = base.drop(['atom_index_0', 'atom_index_1', 'molecule_name'], axis=1)
    return base

df_1JHN = build_type_dataframe(train, '1JHN')
print(df_1JHN.shape)
print(display(df_1JHN.head()))

del train
gc.collect()
# ---------------------------------------------------------------
# Extract mol_name, [atom_index0, atom_index1] as a list
def extract_base(base):
    mol_name = base['molecule_name'].to_numpy()
    atom_index0 = base['atom_index_0'].to_numpy()
    atom_index1 = base['atom_index_1'].to_numpy()
    return mol_name, atom_index0, atom_index1

mol_train, atom_index0_train, atom_index1_train = extract_base(df_1JHN)

#-----------------------------------------------------------------
print(f'There are {df_1JHN.shape[0]} rows in train data.')
print('')
print(f'There are {mol_train.shape[0]} rows in mol_train data.')
print('')
print(f"There are {df_1JHN['molecule_name'].nunique()} distinct molecules in train data.")
print(f"There are {df_1JHN['atom_index_0'].nunique()} unique atoms.")
print('')
print(f'There are {atom_index0_train.shape[0]} rows in atom_index0_train data.')
print(f'There are {atom_index1_train.shape[0]} rows in atom_index1_train data.')
print('')
# ---------------------------------------------------------------
show_ram_usage()

  mask |= (ar1 == a)


RAM usage: 0.28217315673828125 GB
(43363, 4)


Unnamed: 0_level_0,molecule_name,atom_index_0,atom_index_1,scalar_coupling_constant
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,dsgdb9nsd_000002,1,0,32.6889
13,dsgdb9nsd_000002,2,0,32.689098
15,dsgdb9nsd_000002,3,0,32.690498
97,dsgdb9nsd_000012,3,0,55.5252
101,dsgdb9nsd_000012,4,0,54.735901


None
There are 43363 rows in train data.

There are 43363 rows in mol_train data.

There are 28560 distinct molecules in train data.
There are 24 unique atoms.

There are 43363 rows in atom_index0_train data.
There are 43363 rows in atom_index1_train data.

RAM usage: 0.211090087890625 GB
CPU times: user 2.59 s, sys: 264 ms, total: 2.85 s
Wall time: 2.52 s


In [4]:
%%time
# Define functions
# Convert  to mol object using ase
def convert_xyz_to_mol(mol_id):
    mol = read(f'../../inputs/structures/{mol_id}.xyz')
    #natom = len(mol)
    #atoms = mol.get_atomic_numbers()
    return mol

# Distance r_b with atom indices
def get_rb(mol, atom_index0, atom_index1):
    r_b = mol.get_distance(atom_index0, atom_index1)
    return r_b

# Acquire r_b, r_i, theta_i for the molecule
def get_ri(mol, rb, atom_index0, atom_index1):
    r_i = []
    angles = []
    #theta_i = []
    for i in range((len(mol))):
        if i == atom_index0:
            #theta_i.append(0)
            r_i.append(0)
            continue
        if i == atom_index1:
            #theta_i.append(0)
            r_i.append(0)
            continue
        angle1 = mol.get_angle(atom_index0, i, atom_index1)
        angle2 = mol.get_angle(atom_index0, atom_index1, i)
        angle3 = mol.get_angle(atom_index1, atom_index0, i)
        angles.append([angle1, angle2, angle3])
        # Get theta from center of bond
        theta = 180 - angle3 - (angle1/2.0)
        #theta_i.append(theta)
        # Get distance from center of bond
        r = (rb/2.0)*(np.sin(np.radians(angle3)))/(np.sin(np.radians(angle1/2.0)))
        r_i.append(r) 
    return r_i

def get_theta_i(mol, r_i, atom_index0, atom_index1):
    #angles = []
    theta_i = []
    for i in range((len(mol))):
        if i == atom_index0:
            theta_i.append(0)
            continue
        if i == atom_index1:
            theta_i.append(0)
            continue
        angle1 = mol.get_angle(atom_index0, i, atom_index1)
        #angle2 = mol.get_angle(atom_index0, atom_index1, i)
        angle3 = mol.get_angle(atom_index1, atom_index0, i)
        #angles.append([angle1, angle2, angle3])
        # Get theta from center of bond
        theta = 180 - angle3 - (angle1/2.0)
        theta_i.append(theta) 
    return theta_i

def get_G0(r_b, eta, mu):
    G0_list = []
    for val2 in mu:
        for val1 in eta:
            G0 = np.exp(-val1*(r_b-val2)**2)
            G0_list.append(G0)
    return G0_list

# Define the cut-off function
def fcut(Ri):
    if Ri <= Rc:
        y = 0.5*(np.cos(np.pi*Ri/Rc)+1.0)
    else:
        y = 0
    return y

# Let's Define the radial function G1_atomic

def get_G1(mol, rb, index_0, index_1):
    natom = len(mol)
    atoms = mol.get_atomic_numbers()
    r_i = get_ri(mol, rb, index_0, index_1)
    G1_par = np.zeros(shape=(np1, 5)) # H C N O F 
    for i in range(natom):
        if i == index_0:
            continue
        if i == index_1:
            continue
        for j in range(np1):
            eta_m = p1[j][0]
            mu_m =  p1[j][1]
            G1_temp = np.exp(-eta_m*(r_i[i]-mu_m)**2)*fcut(r_i[i])
            if atoms[i] == 1:
                G1_par[j][0] = G1_par[j][0] + G1_temp
                continue
            if atoms[i] == 6:
                G1_par[j][1] = G1_par[j][1] + G1_temp
                continue
            if atoms[i] == 7:
                G1_par[j][2] = G1_par[j][2] + G1_temp
                continue
            if atoms[i] == 8:
                G1_par[j][3] = G1_par[j][3] + G1_temp
                continue
            if atoms[i] == 9:
                G1_par[j][4] = G1_par[j][4] + G1_temp
    return G1_par

def get_G1_G2(mol, rb, index_0, index_1):
    natom = len(mol)
    atoms = mol.get_atomic_numbers()
    r_i = get_ri(mol, rb, index_0, index_1)    
    theta_i = get_theta_i(mol, r_i, index_0, index_1)
    G1_par = np.zeros(shape=(np1, 5)) # H C N O F 
    G2_pos = np.zeros(shape=(np1, 5)) # H C N O F
    G2_neg = np.zeros(shape=(np1, 5)) # H C N O F
    G2_pos1 = np.zeros(shape=(np1, 5)) # H C N O F
    G2_neg1 = np.zeros(shape=(np1, 5)) # H C N O F
    for i in range(natom):
        if i == index_0:
            continue
        if i == index_1:
            continue
        for j in range(np1):
            eta_m = p1[j][0]
            mu_m =  p1[j][1]
            G1_temp = np.exp(-eta_m*(r_i[i]-mu_m)**2)*fcut(r_i[i])
            G2pos_temp = ((1 + abs(np.cos(np.radians(theta_i[i]))))**1.0)*G1_temp
            G2neg_temp = ((1 + abs(np.cos(np.radians(theta_i[i]))))**4.0)*G1_temp
            G2pos1_temp = ((1 + abs(np.cos(np.radians(theta_i[i]))))**8.0)*G1_temp
            G2neg1_temp = ((1 + abs(np.cos(np.radians(theta_i[i]))))**16.0)*G1_temp
            if atoms[i] == 1:
                G1_par[j][0] = G1_par[j][0] + G1_temp
                G2_pos[j][0] = G2_pos[j][0] + G2pos_temp
                G2_neg[j][0] = G2_neg[j][0] + G2neg_temp
                G2_pos1[j][0] = G2_pos1[j][0] + G2pos_temp
                G2_neg1[j][0] = G2_neg1[j][0] + G2neg_temp
                continue
            if atoms[i] == 6:
                G1_par[j][1] = G1_par[j][1] + G1_temp
                G2_pos[j][1] = G2_pos[j][1] + G2pos_temp
                G2_neg[j][1] = G2_neg[j][1] + G2neg_temp
                G2_pos1[j][1] = G2_pos1[j][1] + G2pos_temp
                G2_neg1[j][1] = G2_neg1[j][1] + G2neg_temp
                continue
            if atoms[i] == 7:
                G1_par[j][2] = G1_par[j][2] + G1_temp
                G2_pos[j][2] = G2_pos[j][2] + G2pos_temp
                G2_neg[j][2] = G2_neg[j][2] + G2neg_temp
                G2_pos1[j][2] = G2_pos1[j][2] + G2pos_temp
                G2_neg1[j][2] = G2_neg1[j][2] + G2neg_temp
                continue
            if atoms[i] == 8:
                G1_par[j][3] = G1_par[j][3] + G1_temp
                G2_pos[j][3] = G2_pos[j][3] + G2pos_temp
                G2_neg[j][3] = G2_neg[j][3] + G2neg_temp
                G2_pos1[j][3] = G2_pos1[j][3] + G2pos_temp
                G2_neg1[j][3] = G2_neg1[j][3] + G2neg_temp
                continue
            if atoms[i] == 9:
                G1_par[j][4] = G1_par[j][4] + G1_temp
                G2_pos[j][4] = G2_pos[j][4] + G2pos_temp
                G2_neg[j][4] = G2_neg[j][4] + G2neg_temp
                G2_pos1[j][4] = G2_pos1[j][4] + G2pos_temp
                G2_neg1[j][4] = G2_neg1[j][4] + G2neg_temp
    return G1_par, G2_pos, G2_neg, G2_pos1, G2_neg1,  r_i

                

### Define Globals
Rc = 6.0

# Define eta, mu
#eta0 = [10, 1000, 2500, 5000, 7500, 10000, 50000, 100000] 
#mu0 = [1.00]
#eta0 = [10, 50, 100, 250, 500] # 5 values defined
#mu0 = [0.97]
eta0 = [1000] # 5 values defined
mu0 = [1.0,   1.04,  1.08, 1.12, 1.15]
np0 = len(eta0) * len(mu0)


#eta1 = [0.001, 0.01, 0.1, 0.5, 1.0, 4.0, 40.0] 
#mu1 = [0.0, 1.0]
eta1 = [6.3775510]
mu1 = np.arange(0.0, 5.6, 0.25)
np1 = len(eta1) * len(mu1)
p1 =  [[i,j] for i in eta1 for j in mu1]

zeta = 1.0

eta2 = [0.1, 0.5]
mu2 = [0.0]
np2 = len(eta2) * len(mu2)
p2 =  [[i,j] for i in eta2 for j in mu2]

    
    
rb_train = []
ri_list = []
G0_train_list = []
G1_train_list = []
G1a_train_list = []
G2pos_train_list = []
G2neg_train_list = []
G2pos1_train_list = []
G2neg1_train_list = []
for i, m in enumerate(mol_train):
    mol = convert_xyz_to_mol(m)
    rb = get_rb(mol, atom_index0_train[i], atom_index1_train[i])
    rb_train.append(rb)
    G0 = get_G0(rb, eta0, mu0)
    G0_train_list.append(G0)
    G1 = get_G1(mol, rb, atom_index0_train[i], atom_index1_train[i])
    G1a, G2_pos, G2_neg, G2_pos1, G2_neg1, ri = get_G1_G2(mol, rb, atom_index0_train[i], atom_index1_train[i])
    G1_train_list.append(G1)
    G1a_train_list.append(G1a)
    G2pos_train_list.append(G2_pos)
    G2neg_train_list.append(G2_neg)
    G2pos1_train_list.append(G2_pos1)
    G2neg1_train_list.append(G2_neg1)
    ri_list.append(ri)
    
G0_train = np.array(G0_train_list)
G1_array = np.array(G1_train_list)
G1a_array = np.array(G1a_train_list)
G2pos_array = np.array(G2pos_train_list)
G2neg_array = np.array(G2neg_train_list)
G2pos1_array = np.array(G2pos1_train_list)
G2neg1_array = np.array(G2neg1_train_list)

G1_train = np.zeros(shape=(len(G1_array), np1*5))
G1a_train = np.zeros(shape=(len(G1a_array), np1*5))
G2pos_train = np.zeros(shape=(len(G2pos_array), np1*5))
G2neg_train = np.zeros(shape=(len(G2neg_array), np1*5))
G2pos1_train = np.zeros(shape=(len(G2pos1_array), np1*5))
G2neg1_train = np.zeros(shape=(len(G2neg1_array), np1*5))
for i, row1 in enumerate(G1_array):
    G1_train[i] = row1.flatten()
for i, row1 in enumerate(G1a_array):
    G1a_train[i] = row1.flatten()
for i, row1 in enumerate(G2pos_array):
    G2pos_train[i] = row1.flatten()
for i, row1 in enumerate(G2neg_array):
    G2neg_train[i] = row1.flatten()
for i, row1 in enumerate(G2pos1_array):
    G2pos1_train[i] = row1.flatten()
for i, row1 in enumerate(G2neg1_array):
    G2neg1_train[i] = row1.flatten()
ri_0 = []
ri_1 = []
for i, val in enumerate(ri_list):
    ri_0_temp = nsmallest(4, val)[2]
    ri_1_temp = nsmallest(4, val)[3]
    ri_0.append(ri_0_temp)
    ri_1.append(ri_1_temp)
# ---------------------------------------------------------------
print('Length of rb :')
print(len(rb_train))
print('Length of G0 :')
print(len(G0_train))
print(len(G0_train[0])) # Should be (eta0 x mu0)
print(len(G0_train[1]))
print('Length of G1 :')
print(len(G1_train))
print(len(G1_train[0])) # Should be  (eta1 x mu1)
print(len(G1_train[1]))
print('Length of G2pos :')
print(len(G2pos_train))
print(len(G2pos_train[0])) # Should be  (eta1 x mu1)
print(len(G2pos_train[1]))
print('Length of G2neg :')
print(len(G2neg_train))
print(len(G2neg_train[0])) # Should be  (eta1 x mu1)
print(len(G2neg_train[1]))
# ---------------------------------------------------------------
show_ram_usage()

Length of rb :
43363
Length of G0 :
43363
5
5
Length of G1 :
43363
115
115
Length of G2pos :
43363
115
115
Length of G2neg :
43363
115
115
RAM usage: 0.9079475402832031 GB
CPU times: user 9min 16s, sys: 760 ms, total: 9min 17s
Wall time: 9min 17s


In [5]:
%%time
# Build complete feature set on original data frame
df_1JHN['rb'] = rb_train
print('rb added!')
print('')
df_1JHN['ri0'] = ri_0
df_1JHN['ri1'] = ri_1
print('Lowest two ri added!')
print('')
for i in range(np0):
    df_1JHN[f'g0_{i}'] = G0_train[:, i]

print('G0 added!')
print('')
for i in range(np1*5):
    df_1JHN[f'g1_{i}'] = G1_train[:, i]
    df_1JHN[f'g1a_{i}'] = G1a_train[:, i]
    df_1JHN[f'g2p_{i}'] = G2pos_train[:, i]
    df_1JHN[f'g2n_{i}'] = G2neg_train[:, i]
    df_1JHN[f'g2p1_{i}'] = G2pos1_train[:, i]
    df_1JHN[f'g2n1_{i}'] = G2neg1_train[:, i]
    #df_1JHN = df_1JHN.drop(['atom_index_0', 'atom_index_1', 'molecule_name', 'type'], axis=1)
print('G1, G2 added!')
print('')
print(df_1JHN.shape)
print(display(df_1JHN.head(10)))

del G0_train_list, G1_train_list, G1_train, G1_array, G1a_train_list, G1a_train, G1a_array, ri_0, ri_1, ri_list
del G2pos_train_list, G2pos_train, G2pos_array, G2neg_train_list, G2neg_train, G2neg_array
del G2pos1_train_list, G2pos1_train, G2pos1_array, G2neg1_train_list, G2neg1_train, G2neg1_array
# ---------------------------------------------------------------
show_ram_usage()

rb added!

Lowest two ri added!

G0 added!

G1, G2 added!

(43363, 587)


Unnamed: 0_level_0,molecule_name,atom_index_0,atom_index_1,scalar_coupling_constant,rb,ri0,ri1,g0_0,g0_1,g0_2,...,g1_113,g2p_113,g2n_113,g2p1_113,g2n1_113,g1_114,g2p_114,g2n_114,g2p1_114,g2n1_114
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,dsgdb9nsd_000002,1,0,32.6889,1.01719,0.963805,0.963848,0.744162,0.594345,0.019349,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,dsgdb9nsd_000002,2,0,32.689098,1.017187,0.963808,0.963848,0.744234,0.594268,0.019343,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,dsgdb9nsd_000002,3,0,32.690498,1.017208,0.963826,0.963829,0.743704,0.59483,0.019393,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,dsgdb9nsd_000012,3,0,55.5252,1.007511,0.96955,1.32663,0.945144,0.34801,0.005223,...,1.964827e-30,2.479053e-30,4.9793249999999995e-30,2.479053e-30,4.9793249999999995e-30,0.0,0.0,0.0,0.0,0.0
101,dsgdb9nsd_000012,4,0,54.735901,1.004933,0.972237,1.329141,0.975955,0.292391,0.003571,...,9.469145999999999e-30,1.785087e-29,1.195929e-28,1.785087e-29,1.195929e-28,0.0,0.0,0.0,0.0,0.0
225,dsgdb9nsd_000019,7,2,54.063999,1.004771,0.970822,1.339172,0.977495,0.28907,0.003485,...,4.873246e-30,9.229622e-30,6.270191e-29,9.229622e-30,6.270191e-29,0.0,0.0,0.0,0.0,0.0
229,dsgdb9nsd_000019,8,2,56.186001,1.006952,0.968546,1.334675,0.952823,0.335481,0.004815,...,9.005393e-31,1.101953e-30,2.01904e-30,1.101953e-30,2.01904e-30,0.0,0.0,0.0,0.0,0.0
389,dsgdb9nsd_000032,4,0,37.719002,1.014969,0.964092,1.419738,0.79925,0.534441,0.014567,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
395,dsgdb9nsd_000032,5,0,38.349499,1.015277,0.963764,1.420755,0.791858,0.542673,0.01516,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
696,dsgdb9nsd_000050,5,0,59.300999,1.004448,1.344792,1.344793,0.980413,0.28253,0.003319,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


None
RAM usage: 0.7217140197753906 GB
CPU times: user 3.79 s, sys: 728 ms, total: 4.52 s
Wall time: 1.49 s


In [6]:
%%time
# Use lightGBM to train
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error

X_data = df_1JHN.drop(['scalar_coupling_constant', 'atom_index_0', 'atom_index_1', 'molecule_name'], axis=1)
y_data = df_1JHN['scalar_coupling_constant']

X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.2, random_state=128)
X_train.shape, X_val.shape, y_train.shape, y_val.shape
# ---------------------------------------------------------------
show_ram_usage()

from lightgbm import LGBMRegressor
LGB_PARAMS = {
    'objective': 'regression',
    'metric': 'mae',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'learning_rate': 0.2,
    'num_leaves': 128,
    'min_child_samples': 79,
    'max_depth': 9,
    'subsample_freq': 1,
    'subsample': 0.9,
    'bagging_seed': 11,
    'reg_alpha': 0.1,
    'reg_lambda': 0.3,
    'colsample_bytree': 1.0
}


model = LGBMRegressor(**LGB_PARAMS, n_estimators=1500, n_jobs = -1)
model.fit(X_train, y_train, 
        eval_set=[(X_train, y_train), (X_val, y_val)], eval_metric='mae',
        verbose=100, early_stopping_rounds=200)
print('')
print('Model fit!')
print('')
# ---------------------------------------------------------------
show_ram_usage()

y_pred = model.predict(X_val)
print(np.log(mean_absolute_error(y_val, y_pred)))

RAM usage: 1.0694427490234375 GB
Training until validation scores don't improve for 200 rounds.
[100]	training's l1: 0.341053	valid_1's l1: 0.480543
[200]	training's l1: 0.261194	valid_1's l1: 0.443936
[300]	training's l1: 0.212033	valid_1's l1: 0.426706
[400]	training's l1: 0.177842	valid_1's l1: 0.416441
[500]	training's l1: 0.151489	valid_1's l1: 0.4089
[600]	training's l1: 0.130607	valid_1's l1: 0.404184
[700]	training's l1: 0.111588	valid_1's l1: 0.399734
[800]	training's l1: 0.0962543	valid_1's l1: 0.397215
[900]	training's l1: 0.0836028	valid_1's l1: 0.39496
[1000]	training's l1: 0.0727595	valid_1's l1: 0.393186
[1100]	training's l1: 0.0640915	valid_1's l1: 0.391767
[1200]	training's l1: 0.0555163	valid_1's l1: 0.390442
[1300]	training's l1: 0.0480524	valid_1's l1: 0.389442
[1400]	training's l1: 0.0422654	valid_1's l1: 0.388754
[1500]	training's l1: 0.0369733	valid_1's l1: 0.388066
Did not meet early stopping. Best iteration is:
[1500]	training's l1: 0.0369733	valid_1's l1: 0.38

In [None]:
%%time
import seaborn as sns
cols = list(X_data.columns)
feat_imp = model.feature_importances_
print(feat_imp[0:5])
df_importance = pd.DataFrame({'feature': cols, 'importance': feat_imp})
df_importance = df_importance.sort_values('importance', ascending=False)[:20]
sns.barplot(x="importance", y="feature", data=df_importance)

In [None]:
print(len(rb_train))
print(rb_train[0:5])

In [None]:
%%time
eta0 = [1000] # 5 values defined
mu0 = [1.0,   1.04,  1.08, 1.12, 1.15]
np0 = len(eta0) * len(mu0)

G0_train_list = []
for i, m in enumerate(rb_train):
    G0 = get_G0(m, eta0, mu0)
    G0_train_list.append(G0)

G0_train = np.array(G0_train_list)

for i in range(np0):
    df_1JHN[f'g0_{i}'] = G0_train[:, i]
print('G0 added!')
print('')
# ---------------------------------------------------------------
show_ram_usage()

In [None]:
%%time
# Use lightGBM to train
X_data = df_1JHN.drop(['scalar_coupling_constant', 'atom_index_0', 'atom_index_1', 'molecule_name'], axis=1)
y_data = df_1JHN['scalar_coupling_constant']

X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.2, random_state=128)
X_train.shape, X_val.shape, y_train.shape, y_val.shape
# ---------------------------------------------------------------
show_ram_usage()

model = LGBMRegressor(**LGB_PARAMS, n_estimators=1500, n_jobs = -1)
model.fit(X_train, y_train, 
        eval_set=[(X_train, y_train), (X_val, y_val)], eval_metric='mae',
        verbose=100, early_stopping_rounds=200)
print('')
print('Model fit!')
print('')
# ---------------------------------------------------------------
show_ram_usage()

y_pred = model.predict(X_val)
print(np.log(mean_absolute_error(y_val, y_pred)))

cols = list(X_data.columns)
feat_imp = model.feature_importances_
print(feat_imp[0:5])
df_importance = pd.DataFrame({'feature': cols, 'importance': feat_imp})
df_importance = df_importance.sort_values('importance', ascending=False)[:20]
sns.barplot(x="importance", y="feature", data=df_importance)

In [None]:
%%time

Rc = 6.0
eta1 = [6.3775510]
mu1 = np.arange(0.5, 7.6, 0.75)
np1 = len(eta1) * len(mu1)
p1 =  [[i,j] for i in eta1 for j in mu1]

G1_train_list = []
for i, m in enumerate(mol_train):
    mol = convert_xyz_to_mol(m)
    rb = rb_train[i]
    G1, ri =  get_G1(mol, rb, atom_index0_train[i], atom_index1_train[i])
    G1_train_list.append(G1)
    
G1_array = np.array(G1_train_list)

G1_train = np.zeros(shape=(len(G1_array), np1*5))
for i, row1 in enumerate(G1_array):
    G1_train[i] = row1.flatten()
for i in range(np1*5):
    df_1JHN[f'g1_{i}'] = G1_train[:, i]
print('G1 added!')
print('')
# ---------------------------------------------------------------
show_ram_usage()

In [None]:
%%time
# Use lightGBM to train
X_data = df_1JHN.drop(['scalar_coupling_constant', 'atom_index_0', 'atom_index_1', 'molecule_name'], axis=1)
y_data = df_1JHN['scalar_coupling_constant']

X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.2, random_state=128)
X_train.shape, X_val.shape, y_train.shape, y_val.shape
# ---------------------------------------------------------------
show_ram_usage()

model = LGBMRegressor(**LGB_PARAMS, n_estimators=1500, n_jobs = -1)
model.fit(X_train, y_train, 
        eval_set=[(X_train, y_train), (X_val, y_val)], eval_metric='mae',
        verbose=100, early_stopping_rounds=200)
print('')
print('Model fit!')
print('')
# ---------------------------------------------------------------
show_ram_usage()

y_pred = model.predict(X_val)
print(np.log(mean_absolute_error(y_val, y_pred)))

cols = list(X_data.columns)
feat_imp = model.feature_importances_
print(feat_imp[0:5])
df_importance = pd.DataFrame({'feature': cols, 'importance': feat_imp})
df_importance = df_importance.sort_values('importance', ascending=False)[:20]
sns.barplot(x="importance", y="feature", data=df_importance)

In [None]:
%%time
# Let's plot r_b vs G_0 values
ax = plt.gca()
for i in range(np0):
    df_1JHN.plot(kind='scatter',x='rb',y=f'g0_{i}', s = 0.5, ax=ax)
    plt.title('')
    plt.xlabel('rb')
    plt.ylabel('G0')
plt.show()
plt.figure()
ax1 = plt.gca()
for i in range(np0):
    df_1JHN.plot(kind='scatter',x='rb',y=f'g0_{i}', s = 0.5, ax=ax1)
    plt.title('')
    plt.xlim(1.00,1.06)
    plt.xlabel('rb')
    plt.ylabel('G0')
plt.show()



In [None]:
# Create data
x, y = y_val, y_pred
# Plot
plt.scatter(x, y)
plt.title('')
plt.xlabel('y_val')
plt.ylabel('y_pred')
plt.show()