In [None]:
import numpy as np
import pandas as pd
import os
from numba import jit

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
pd.options.display.precision = 15

import lightgbm as lgb
import xgboost as xgb
import time
import datetime
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn import metrics
from sklearn import linear_model
import gc
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import dask.dataframe as dd

from tqdm import tqdm_notebook

from itertools import product

from IPython.display import HTML
import json

import glob
import scipy
import altair as alt

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that fits.

    Integer columns are narrowed to the smallest of int8/int16/int32/int64
    whose range contains the column's min and max. Float columns are narrowed
    to float16 or float32 by the same range test and otherwise cast to float64.
    Columns whose dtype is not listed in ``numerics`` are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.

    Notes
    -----
    The float test only looks at the value range, so narrowing to
    float16/float32 can drop significant digits.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's min/max still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df
    


In [None]:
def metric(df, preds):
    """Score predictions as the mean, over coupling types, of log(MAE).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``type`` and ``scalar_coupling_constant``.
        A ``prediction`` column is written to it as a side effect.
    preds : array-like
        Predicted values, one per row of ``df`` and in the same row order.

    Returns
    -------
    float
        Average of the per-type log mean absolute errors. Each per-type MAE is
        floored at 1e-9 before the logarithm so a perfectly predicted type
        contributes a finite value instead of ``-inf``.

    Notes
    -----
    The MAE is computed with numpy; NaN in either column propagates to the
    score rather than raising.
    """
    mae_floor = 1e-9
    df["prediction"] = preds
    log_maes = []
    for coupling_type in df["type"].unique():
        group = df[df["type"] == coupling_type]
        abs_err = np.abs(group["scalar_coupling_constant"].values - group["prediction"].values)
        log_maes.append(np.log(max(abs_err.mean(), mae_floor)))
    return np.mean(log_maes)

def give_mass(atom_val):
    """Return the atomic mass stored in the module-level ``mass_dict`` for an element symbol.

    Parameters
    ----------
    atom_val : str
        Element symbol, e.g. ``"H"`` or ``"C"``.

    Returns
    -------
    float
        The value of ``mass_dict[atom_val]``.

    Raises
    ------
    ValueError
        If ``atom_val`` is not a key of ``mass_dict``.
    """
    try:
        return mass_dict[atom_val]
    except KeyError:
        raise ValueError(f"Unknown atom symbol: {atom_val!r}") from None


def map_atom_info(df, atom_idx):
    """Left-join the module-level ``structures`` table onto ``df`` for one atom of each pair.

    Rows are matched on ``molecule_name`` and ``atom_index_{atom_idx}`` (left)
    against ``molecule_name`` and ``atom_index`` (right). The joined
    ``atom_index`` column is dropped, and ``atom``/``x``/``y``/``z`` are
    renamed with an ``_{atom_idx}`` suffix.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table containing ``molecule_name`` and ``atom_index_{atom_idx}``.
    atom_idx : int
        Which atom of the pair to look up.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    merged = df.merge(
        structures,
        how='left',
        left_on=['molecule_name', f'atom_index_{atom_idx}'],
        right_on=['molecule_name', 'atom_index'],
    )
    suffixed = {name: f'{name}_{atom_idx}' for name in ('atom', 'x', 'y', 'z')}
    return merged.drop(columns='atom_index').rename(columns=suffixed)


def create_features(df):
    """Add group-wise distance statistics to a pair table and downcast the result.

    New columns are written onto ``df`` itself. Each statistic is a
    ``groupby(...).transform(...)`` broadcast back to the rows, optionally
    followed by a ``*_diff`` (statistic minus the row value) and/or a ``*_div``
    (statistic divided by the row value) column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must already contain ``id``, ``molecule_name``, ``atom_index_0``,
        ``atom_index_1``, ``atom_1``, ``type``, ``type_0``, ``dist``,
        ``inv_dist``, ``x_1``, ``y_1`` and ``z_1``.

    Returns
    -------
    pandas.DataFrame
        The result of ``reduce_mem_usage(df)``.
    """
    # Per-molecule counts and distance / inverse-distance summaries.
    df['molecule_couples'] = df.groupby('molecule_name')['id'].transform('count')
    df['molecule_dist_mean'] = df.groupby('molecule_name')['dist'].transform('mean')
    df['molecule_inv_dist_mean'] = df.groupby('molecule_name')['inv_dist'].transform('mean')
    df['molecule_dist_min'] = df.groupby('molecule_name')['dist'].transform('min')
    df['molecule_inv_dist_min'] = df.groupby('molecule_name')['inv_dist'].transform('min')
    df['molecule_dist_max'] = df.groupby('molecule_name')['dist'].transform('max')
    df['molecule_inv_dist_max'] = df.groupby('molecule_name')['inv_dist'].transform('max')
    # Number of pairs each atom takes part in, as first or second atom.
    df['atom_0_couples_count'] = df.groupby(['molecule_name', 'atom_index_0'])['id'].transform('count')
    df['atom_1_couples_count'] = df.groupby(['molecule_name', 'atom_index_1'])['id'].transform('count')
    
    # Grouped by (molecule, atom_index_0): coordinates of the partner atom, then distances.
    df[f'molecule_atom_index_0_x_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['x_1'].transform('std')
    df[f'molecule_atom_index_0_y_1_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('mean')
    df[f'molecule_atom_index_0_y_1_mean_diff'] = df[f'molecule_atom_index_0_y_1_mean'] - df['y_1']
    df[f'molecule_atom_index_0_y_1_mean_div'] = df[f'molecule_atom_index_0_y_1_mean'] / df['y_1']
    df[f'molecule_atom_index_0_y_1_max'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('max')
    df[f'molecule_atom_index_0_y_1_max_diff'] = df[f'molecule_atom_index_0_y_1_max'] - df['y_1']
    df[f'molecule_atom_index_0_y_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('std')
    df[f'molecule_atom_index_0_z_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['z_1'].transform('std')
    df[f'molecule_atom_index_0_dist_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('mean')
    df[f'molecule_atom_index_0_dist_mean_diff'] = df[f'molecule_atom_index_0_dist_mean'] - df['dist']
    df[f'molecule_atom_index_0_dist_mean_div'] = df[f'molecule_atom_index_0_dist_mean'] / df['dist']
    df[f'molecule_atom_index_0_dist_max'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('max')
    df[f'molecule_atom_index_0_dist_max_diff'] = df[f'molecule_atom_index_0_dist_max'] - df['dist']
    df[f'molecule_atom_index_0_dist_max_div'] = df[f'molecule_atom_index_0_dist_max'] / df['dist']
    df[f'molecule_atom_index_0_dist_min'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    df[f'molecule_atom_index_0_dist_min_diff'] = df[f'molecule_atom_index_0_dist_min'] - df['dist']
    df[f'molecule_atom_index_0_dist_min_div'] = df[f'molecule_atom_index_0_dist_min'] / df['dist']
    df[f'molecule_atom_index_0_dist_std'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('std')
    df[f'molecule_atom_index_0_dist_std_diff'] = df[f'molecule_atom_index_0_dist_std'] - df['dist']
    df[f'molecule_atom_index_0_dist_std_div'] = df[f'molecule_atom_index_0_dist_std'] / df['dist']
    # Grouped by (molecule, atom_index_1): distance statistics.
    df[f'molecule_atom_index_1_dist_mean'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('mean')
    df[f'molecule_atom_index_1_dist_mean_diff'] = df[f'molecule_atom_index_1_dist_mean'] - df['dist']
    df[f'molecule_atom_index_1_dist_mean_div'] = df[f'molecule_atom_index_1_dist_mean'] / df['dist']
    df[f'molecule_atom_index_1_dist_max'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('max')
    df[f'molecule_atom_index_1_dist_max_diff'] = df[f'molecule_atom_index_1_dist_max'] - df['dist']
    df[f'molecule_atom_index_1_dist_max_div'] = df[f'molecule_atom_index_1_dist_max'] / df['dist']
    df[f'molecule_atom_index_1_dist_min'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('min')
    df[f'molecule_atom_index_1_dist_min_diff'] = df[f'molecule_atom_index_1_dist_min'] - df['dist']
    df[f'molecule_atom_index_1_dist_min_div'] = df[f'molecule_atom_index_1_dist_min'] / df['dist']
    df[f'molecule_atom_index_1_dist_std'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('std')
    df[f'molecule_atom_index_1_dist_std_diff'] = df[f'molecule_atom_index_1_dist_std'] - df['dist']
    df[f'molecule_atom_index_1_dist_std_div'] = df[f'molecule_atom_index_1_dist_std'] / df['dist']
    
    # Grouped by (molecule, atom_1), i.e. by the value in the atom_1 column.
    df[f'molecule_atom_1_dist_mean'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('mean')
    df[f'molecule_atom_1_inv_dist_mean'] = df.groupby(['molecule_name', 'atom_1'])['inv_dist'].transform('mean')

    df[f'molecule_atom_1_dist_min'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('min')
    df[f'molecule_atom_1_inv_dist_min'] = df.groupby(['molecule_name', 'atom_1'])['inv_dist'].transform('min')

    df[f'molecule_atom_1_dist_min_diff'] = df[f'molecule_atom_1_dist_min'] - df['dist']
    df[f'molecule_atom_1_inv_dist_min_diff'] = df[f'molecule_atom_1_inv_dist_min'] - df['inv_dist']

    df[f'molecule_atom_1_dist_min_div'] = df[f'molecule_atom_1_dist_min'] / df['dist']
    # Note the word order in this column name ("dist_inv", not "inv_dist").
    df[f'molecule_atom_1_dist_inv_min_div'] = df[f'molecule_atom_1_inv_dist_min'] / df['inv_dist']

    df[f'molecule_atom_1_dist_std'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('std')
    df[f'molecule_atom_1_inv_dist_std'] = df.groupby(['molecule_name', 'atom_1'])['inv_dist'].transform('std')

    df[f'molecule_atom_1_dist_std_diff'] = df[f'molecule_atom_1_dist_std'] - df['dist']
    df[f'molecule_atom_1_inv_dist_std_diff'] = df[f'molecule_atom_1_inv_dist_std'] - df['inv_dist']

    # Grouped by (molecule, type_0).
    df[f'molecule_type_0_dist_std'] = df.groupby(['molecule_name', 'type_0'])['dist'].transform('std')
    df[f'molecule_type_0_inv_dist_std'] = df.groupby(['molecule_name', 'type_0'])['inv_dist'].transform('std')

    df[f'molecule_type_0_dist_std_diff'] = df[f'molecule_type_0_dist_std'] - df['dist']
    df[f'molecule_type_0_inv_dist_std_diff'] = df[f'molecule_type_0_inv_dist_std'] - df['inv_dist']

    # Grouped by (molecule, type).
    df[f'molecule_type_dist_mean'] = df.groupby(['molecule_name', 'type'])['dist'].transform('mean')
    df[f'molecule_type_inv_dist_mean'] = df.groupby(['molecule_name', 'type'])['inv_dist'].transform('mean')

    df[f'molecule_type_dist_mean_diff'] = df[f'molecule_type_dist_mean'] - df['dist']
    df[f'molecule_type_inv_dist_mean_diff'] = df[f'molecule_type_inv_dist_mean'] - df['inv_dist']

    df[f'molecule_type_dist_mean_div'] = df[f'molecule_type_dist_mean'] / df['dist']
    df[f'molecule_type_inv_dist_mean_div'] = df[f'molecule_type_inv_dist_mean'] / df['inv_dist']

    df[f'molecule_type_dist_max'] = df.groupby(['molecule_name', 'type'])['dist'].transform('max')
    df[f'molecule_type_inv_dist_max'] = df.groupby(['molecule_name', 'type'])['inv_dist'].transform('max')

    df[f'molecule_type_dist_min'] = df.groupby(['molecule_name', 'type'])['dist'].transform('min')
    df[f'molecule_type_inv_dist_min'] = df.groupby(['molecule_name', 'type'])['inv_dist'].transform('min')

    df[f'molecule_type_dist_std'] = df.groupby(['molecule_name', 'type'])['dist'].transform('std')
    df[f'molecule_type_inv_dist_std'] = df.groupby(['molecule_name', 'type'])['inv_dist'].transform('std')

    df[f'molecule_type_dist_std_diff'] = df[f'molecule_type_dist_std'] - df['dist']
    df[f'molecule_type_inv_dist_std_diff'] = df[f'molecule_type_inv_dist_std'] - df['inv_dist']

    # Downcast everything (including the new columns) before returning.
    df = reduce_mem_usage(df)
    return df

# Boltzmann-style weights of a distance d:
#   exp(-beta * d)
#   exp(-beta * d**2)
def boltzmann_ns(dist):
    """Return ``exp(-beta * dist)`` using the module-level ``beta``."""
    return np.exp(-beta*dist)

def boltzmann_sq(dist):
    """Return ``exp(-beta * dist**2)`` using the module-level ``beta``."""
    squared = dist**2
    return np.exp(-beta*squared)

# Module-level constants read by give_mass and the Boltzmann helpers.

# Atomic masses keyed by element symbol (presumably unified atomic mass units — TODO confirm).
mass_dict = {"H":1.00794, "C":12.0107, "F":18.9984032, "O":15.9994, "N":14.0067}
# Atomic radii keyed by element symbol (units are not stated here — TODO confirm).
radii_dict = {"H":0.23, "C":0.68, "F":0.64, "O":0.68, "N":0.68}
# Thermal energy kT and its inverse beta = 1/kT, used by boltzmann_ns / boltzmann_sq.
kT = 0.59248490241  #k * 298.15  kcal/mol
beta = 1./kT

In [None]:
# Load the precomputed per-atom structure table and downcast it.
# NOTE(review): reduce_mem_usage may narrow float columns to float16, which
# drops precision in any coordinates stored here — confirm this is acceptable.
structures = pd.read_csv("../input/create-structure-file/structures_done.csv")
structures = reduce_mem_usage(structures)
# Preview as the last expression so the cell actually displays it.
structures.head()

In [None]:
# Load the raw competition pair tables with pandas; they are not downcast at this point.
train = pd.read_csv("../input/champs-scalar-coupling/train.csv")
test = pd.read_csv("../input/champs-scalar-coupling/test.csv")

In [None]:
# List the distinct coupling types present in the training table.
train.type.unique()

In [None]:
# Keep only the 1JHC coupling type. The index is reset so row labels run 0..n-1:
# the meta-feature files loaded below carry a default RangeIndex, and pandas
# aligns column assignment on index labels, not on position.
train = train.loc[train["type"] == "1JHC"].reset_index(drop=True)
test = test.loc[test["type"] == "1JHC"].reset_index(drop=True)

In [None]:
# Load the "sd" meta-feature files for 1JHC: one file for the training rows
# (*_oof.csv) and one for the test rows (*_pred.csv).
sd_train = pd.read_csv("../input/modelling-types-1-metafeatures-sd/1JHC_sd_oof.csv")
sd_test  = pd.read_csv("../input/modelling-types-1-metafeatures-sd/1JHC_sd_pred.csv")

sd_train.head()

In [None]:
# Load the "dso" meta-feature files for 1JHC (training rows / test rows).
dso_train = pd.read_csv("../input/modelling-types-1-metafeatures-dso/1JHC_dso_oof.csv")
dso_test  = pd.read_csv("../input/modelling-types-1-metafeatures-dso/1JHC_dso_pred.csv")

In [None]:
# Load the "fc" meta-feature files for 1JHC (training rows / test rows).
# NOTE(review): the file names inside the fc dataset contain "dso", not "fc" —
# confirm these are really the fc outputs and not a copy-paste of the dso paths.
fc_train = pd.read_csv("../input/modelling-types-1-metafeatures-fc/1JHC_dso_oof.csv")
fc_test  = pd.read_csv("../input/modelling-types-1-metafeatures-fc/1JHC_dso_pred.csv")

In [None]:
# Preview the fc meta-feature frame loaded for the training rows.
fc_train.head()

In [None]:
# Attach the sd / dso / fc meta-features as new columns.
# .values makes each assignment positional: train/test were filtered by type
# and may keep their original row labels, while the meta-feature frames have a
# fresh 0..n-1 index, so assigning the Series directly would align on labels.
# A length mismatch now raises instead of silently producing NaN.
# Assumes the files are in the same row order as the filtered frames — TODO confirm.
train["sd"] = sd_train["target"].values
test["sd"] = sd_test["prediction"].values
train["dso"] = dso_train["target"].values
test["dso"] = dso_test["prediction"].values
train["fc"] = fc_train["target"].values
test["fc"] = fc_test["prediction"].values

In [None]:
# Preview the training table after the sd / dso / fc columns were attached.
train.head()

In [None]:
# Join the structure columns for both atoms of every pair, first atom 0 and
# then atom 1, on the training and the test table alike.
for atom_idx in (0, 1):
    train = map_atom_info(train, atom_idx)

for atom_idx in (0, 1):
    test = map_atom_info(test, atom_idx)


In [None]:
# Widen the pandas display limits so wide feature tables are shown in full.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [None]:
# Columns to label-encode; names that are absent from the frame are skipped by
# the encoding loop.
# NOTE(review): "max_inv_dist_type_yx" looks like a typo of "max_inv_dist_type_y"
# (which is also listed last) — confirm against the structure file's columns.
cat_cols = [
    "type",
    "atom_0",
    "max_dist_type_x",
    "id_max_overlap_types_x",
    "min_dist_type_x",
    "id_min_overlap_types_x",
    "id_inv_max_overlap_types_x",
    "max_inv_dist_type_x",
    "min_inv_dist_type_x",
    "id_inv_min_overlap_types_x",
    "atom_1",
    "max_dist_type_y",
    "id_max_overlap_types_y",
    "min_dist_type_y",
    "id_min_overlap_types_y",
    "id_inv_max_overlap_types_y",
    "max_inv_dist_type_yx",
    "min_inv_dist_type_y",
    "id_inv_min_overlap_types_y",
    "max_inv_dist_type_y",
]

In [None]:
def add_distance_features(df):
    """Add pairwise geometry columns to ``df`` in place and return it.

    Reads ``x_0, y_0, z_0, x_1, y_1, z_1`` and ``type`` and writes, in this
    order: ``dist`` (Euclidean distance between the two atoms), ``inv_dist``
    (its reciprocal), then for each axis ``dist_<axis>`` (squared coordinate
    difference) and ``inv_<axis>`` (its reciprocal), and finally ``type_0``
    (first character of ``type``).
    """
    p_0 = df[['x_0', 'y_0', 'z_0']].values
    p_1 = df[['x_1', 'y_1', 'z_1']].values
    # Compute the norm once and reuse it for the reciprocal.
    dist = np.linalg.norm(p_0 - p_1, axis=1)
    df['dist'] = dist
    df['inv_dist'] = 1.0 / dist
    for axis in ('x', 'y', 'z'):
        sq_diff = (df[f'{axis}_0'] - df[f'{axis}_1']) ** 2
        df[f'dist_{axis}'] = sq_diff
        # Becomes inf where the two atoms share this coordinate.
        df[f'inv_{axis}'] = 1.0 / sq_diff
    df['type_0'] = df['type'].str[0]
    return df


# Identical geometry + group-statistics pipeline for both tables.
train = create_features(add_distance_features(train))
test = create_features(add_distance_features(test))

In [None]:
# Downcast both tables (create_features already ends with the same call, so
# this is a repeat pass over the same columns).
train, test = [reduce_mem_usage(frame) for frame in (train, test)]


In [None]:
# Replace missing values with 0 in place (for example the NaN produced by
# 'std' transforms over single-row groups).
train.fillna(value=0, inplace=True)
test.fillna(value=0, inplace=True)

In [None]:
# Drop the structures table; its columns were already joined onto train/test
# by map_atom_info, which is the only function here that reads it.
del structures

In [None]:
# Boltzmann weights of the pair distance and of the per-axis squared offsets.
# .apply is kept (rather than calling the helpers on the whole column) so the
# element-wise evaluation and resulting dtype stay as before.
for source_col, prefix in (
    ("dist", "boltzmann"),
    ("dist_x", "boltzmann_x"),
    ("dist_y", "boltzmann_y"),
    ("dist_z", "boltzmann_z"),  # previously computed from dist_y by mistake
):
    train[f"{prefix}_ns"] = train[source_col].apply(boltzmann_ns)
    train[f"{prefix}_sq"] = train[source_col].apply(boltzmann_sq)

In [None]:
# Boltzmann weights of the pair distance and of the per-axis squared offsets.
# .apply is kept (rather than calling the helpers on the whole column) so the
# element-wise evaluation and resulting dtype stay as before.
for source_col, prefix in (
    ("dist", "boltzmann"),
    ("dist_x", "boltzmann_x"),
    ("dist_y", "boltzmann_y"),
    ("dist_z", "boltzmann_z"),  # previously computed from dist_y by mistake
):
    test[f"{prefix}_ns"] = test[source_col].apply(boltzmann_ns)
    test[f"{prefix}_sq"] = test[source_col].apply(boltzmann_sq)

In [None]:
# Load the precomputed angle table (keyed by molecule and atom index in the
# merge further down) and downcast it.
angles = pd.read_csv("../input/feature-engineering-1/structures_angles.csv")
angles = reduce_mem_usage(angles)
angles.head()

In [None]:
# The raw coordinates are not wanted from the angle table; drop them before merging.
coordinate_cols = ["x", "y", "z"]
angles = angles.drop(columns=coordinate_cols)
angles.head()

In [None]:
# Left-join the angle columns onto each pair via the pair's first atom
# (molecule_name + atom_index_0 against molecule_name + atom_index).
train = train.merge(
    angles,
    how="left",
    left_on=["molecule_name", "atom_index_0"],
    right_on=["molecule_name", "atom_index"],
)
test = test.merge(
    angles,
    how="left",
    left_on=["molecule_name", "atom_index_0"],
    right_on=["molecule_name", "atom_index"],
)

In [None]:
# Downcast again now that the angle columns have been merged in.
train, test = [reduce_mem_usage(frame) for frame in (train, test)]

In [None]:
# Products of avg_summ_elec_atom_<side> with the overlap / distance summaries
# of the same side. Despite the "log_" prefix in the column names, no
# logarithm is applied.
for side in ("x", "y"):
    for label, source in (
        ("maxoverlap", "max_overlaps"),
        ("minoverlap", "min_overlaps"),
        ("avgdist", "avg_dist"),
        ("avgoverlaps", "avg_overlaps"),
    ):
        train[f"log_elec_{label}_{side}"] = train[f"avg_summ_elec_atom_{side}"] * train[f"{source}_{side}"]


In [None]:
# Products of avg_summ_elec_atom_<side> with the overlap / distance summaries
# of the same side. Despite the "log_" prefix in the column names, no
# logarithm is applied.
for side in ("x", "y"):
    for label, source in (
        ("maxoverlap", "max_overlaps"),
        ("minoverlap", "min_overlaps"),
        ("avgdist", "avg_dist"),
        ("avgoverlaps", "avg_overlaps"),
    ):
        test[f"log_elec_{label}_{side}"] = test[f"avg_summ_elec_atom_{side}"] * test[f"{source}_{side}"]


In [None]:
# Difference between the _x and _y variant of each product feature.
# Note that the last target name has no underscore after "elec".
for target, stem in (
    ("diff_elec_maxoverlap", "log_elec_maxoverlap"),
    ("diff_elec_minoverlap", "log_elec_minoverlap"),
    ("diff_elec_avgdist", "log_elec_avgdist"),
    ("diff_elecavgoverlaps", "log_elec_avgoverlaps"),
):
    train[target] = train[f"{stem}_x"] - train[f"{stem}_y"]
train.head()

In [None]:
# Difference between the _x and _y variant of each product feature.
# Note that the last target name has no underscore after "elec".
for target, stem in (
    ("diff_elec_maxoverlap", "log_elec_maxoverlap"),
    ("diff_elec_minoverlap", "log_elec_minoverlap"),
    ("diff_elec_avgdist", "log_elec_avgdist"),
    ("diff_elecavgoverlaps", "log_elec_avgoverlaps"),
):
    test[target] = test[f"{stem}_x"] - test[f"{stem}_y"]
test.head()

In [None]:
# Ratios of avg_summ_elec_atom_<side> to the overlap / distance summaries of
# the same side, followed by the _x minus _y difference of each ratio.
for side in ("x", "y"):
    for label, source in (
        ("maxoverlap", "max_overlaps"),
        ("minoverlap", "min_overlaps"),
        ("avgdist", "avg_dist"),
        ("avgoverlaps", "avg_overlaps"),
    ):
        train[f"ratio_elec_{label}_{side}"] = train[f"avg_summ_elec_atom_{side}"] / train[f"{source}_{side}"]

# The last target name has no underscore after "elec".
for target, stem in (
    ("diff_ratio_elec_maxoverlap", "ratio_elec_maxoverlap"),
    ("diff_ratio_elec_minoverlap", "ratio_elec_minoverlap"),
    ("diff_ratio_elec_avgdist", "ratio_elec_avgdist"),
    ("diff_ratio_elecavgoverlaps", "ratio_elec_avgoverlaps"),
):
    train[target] = train[f"{stem}_x"] - train[f"{stem}_y"]
train.head()


In [None]:
# Ratios of avg_summ_elec_atom_<side> to the overlap / distance summaries of
# the same side, followed by the _x minus _y difference of each ratio.
for side in ("x", "y"):
    for label, source in (
        ("maxoverlap", "max_overlaps"),
        ("minoverlap", "min_overlaps"),
        ("avgdist", "avg_dist"),
        ("avgoverlaps", "avg_overlaps"),
    ):
        test[f"ratio_elec_{label}_{side}"] = test[f"avg_summ_elec_atom_{side}"] / test[f"{source}_{side}"]

# The last target name has no underscore after "elec".
for target, stem in (
    ("diff_ratio_elec_maxoverlap", "ratio_elec_maxoverlap"),
    ("diff_ratio_elec_minoverlap", "ratio_elec_minoverlap"),
    ("diff_ratio_elec_avgdist", "ratio_elec_avgdist"),
    ("diff_ratio_elecavgoverlaps", "ratio_elec_avgoverlaps"),
):
    test[target] = test[f"{stem}_x"] - test[f"{stem}_y"]
test.head()


In [None]:
# Remove the raw atom coordinates from both tables, in place.
coord_cols = ["x_0", "y_0", "z_0", "x_1", "y_1", "z_1"]
train.drop(columns=coord_cols, inplace=True)
test.drop(columns=coord_cols, inplace=True)

In [None]:
# Label-encode every categorical column that exists in the training table.
# Each encoder is fitted on train and test values together so both tables
# share one mapping.
for col in cat_cols:
    if col not in train.columns:
        continue
    encoder = LabelEncoder()
    encoder.fit(list(train[col].values) + list(test[col].values))
    train[col] = encoder.transform(list(train[col].values))
    test[col] = encoder.transform(list(test[col].values))

# Fill whatever is still missing (e.g. unmatched rows of the left joins) with 0.
train.fillna(value=0, inplace=True)
test.fillna(value=0, inplace=True)

In [None]:
# Columns kept for modelling: the selected features, then the three
# meta-features, then the id / molecule / target columns that are split off
# again before training.
cols = [
    "dist_z",
    "std_inv_overlaps_x",
    "max_angle",
    "diff_elec_maxoverlap",
    "mean_dihedral",
    "max_dihedral",
    "std_boltz_dihedral",
    "std_dihedral",
    "dist_y",
    "mean_boltz_dihedral",
    "std_inv_dist_x",
    "std_angle",
    "mean_angle",
    "min_angle",
    "dist_x",
    "diff_elecavgoverlaps",
    "std_inv_dist_y",
    "std_inv_overlaps_y",
    "min_dihedral",
    "diff_elec_avgdist",
    "std_overlaps_x",
    "diff_ratio_elec_maxoverlap",
    "mean_noboltz_dihedral",
    "diff_ratio_elec_avgdist",
    "std_devs_x",
    "diff_elec_minoverlap",
    "std_noboltz_dihedral",
    "mean_boltz_angle",
    "molecule_atom_1_dist_std",
    "std_boltz_angle",
    "molecule_atom_1_inv_dist_std",
    "std_noboltz_angle",
    "molecule_atom_index_1_dist_std_div",
    "diff_ratio_elec_minoverlap",
    "std_devs_y",
    "std_overlaps_y",
    "boltzmann_y_ns",
    "avg_inv_dist_x",
    "fc",
    "sd",
    "dso",
    "id",
    "molecule_name",
    "scalar_coupling_constant",
]

In [None]:
# Restrict the training table to the selected modelling columns (raises
# KeyError if any name in `cols` is missing from `train`).
train_df = train[cols]

In [None]:
# Split into the feature matrix and the regression target; the identifier
# columns are not used as features.
y = train_df['scalar_coupling_constant']
X = train_df.drop(columns=['id', 'molecule_name', 'scalar_coupling_constant'])

In [None]:

import tensorflow as tf
from keras.layers import Dense, Input, Activation
from keras.layers import BatchNormalization,Add,Dropout
from keras.optimizers import Adam
from keras.models import Model, load_model
from keras import callbacks
from keras import backend as K
from keras.layers.advanced_activations import LeakyReLU
import warnings
# Silence all warnings for the rest of the notebook. The blanket "ignore"
# filter already covers the two category-specific filters that follow.
warnings.filterwarnings("ignore")
warnings.filterwarnings(action="ignore",category=DeprecationWarning)
warnings.filterwarnings(action="ignore",category=FutureWarning)

In [None]:
def plot_history(history, label):
    """Plot the training and validation loss curves of one fit.

    Parameters
    ----------
    history : object
        Must expose a ``history`` mapping with ``'loss'`` and ``'val_loss'``
        sequences (one value per epoch), such as the object returned by
        ``nn_model.fit``.
    label : str
        Text inserted into the plot title.
    """
    for curve_key in ('loss', 'val_loss'):
        plt.plot(history.history[curve_key])
    plt.title('Loss for %s' % label)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

def create_nn_model(input_shape):
    """Build the (uncompiled) fully connected regression network.

    Every hidden stage is Dense -> BatchNormalization -> LeakyReLU(alpha=0.05),
    followed by Dropout for all stages except the fifth. The head is a single
    linear unit.

    Parameters
    ----------
    input_shape : int
        Number of input features.

    Returns
    -------
    Model
        Model mapping a ``(input_shape,)`` input to one scalar output.
    """
    # (units, dropout rate or None) for each hidden stage, in order.
    hidden_stages = (
        (256, 0.4),
        (1024, 0.2),
        (1024, 0.2),
        (512, 0.4),
        (512, None),
        (256, 0.4),
    )
    inp = Input(shape=(input_shape,))
    x = inp
    for units, drop_rate in hidden_stages:
        x = Dense(units)(x)
        x = BatchNormalization()(x)
        x = LeakyReLU(alpha=0.05)(x)
        if drop_rate is not None:
            x = Dropout(drop_rate)(x)
    # Single linear output: the predicted scalar coupling constant.
    out2 = Dense(1, activation="linear")(x)
    return Model(inputs=inp, outputs=[out2])

In [None]:
# Running record of validation scores (log of the validation MAE).
cv_score = []
cv_score_total = 0

# Training hyper-parameters.
epoch_n = 300
verbose = 0  # not read by the fit call in this notebook, which passes verbose=1 directly
batch_size = 2048

# Set to True if we want to train from scratch.  False will reuse saved models as a starting point.
retrain = True

# Set up GPU preferences (TensorFlow 1.x session API).
config = tf.ConfigProto(device_count={'GPU': 1, 'CPU': 2})
config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.6
sess = tf.Session(config=config)
K.set_session(sess)

X.fillna(0, inplace=True)
# Standard Scaler from sklearn does seem to work better here than other Scalers.
# The fitted scaler is kept under a name so the identical transform can be
# applied to the test features later.
scaler = StandardScaler()
input_data = scaler.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split

# This notebook models a single coupling type: train/test were filtered to it
# right after loading. Used only as the plot label below.
mol_type = "1JHC"

target_data = y.values

# Simple split to provide us a validation set to do our CV checks with
train_index, cv_index = train_test_split(np.arange(len(X)), random_state=111, test_size=0.1)

# Split all our input and targets by train and cv indexes
train_input = input_data[train_index]
cv_input = input_data[cv_index]
train_target_1 = target_data[train_index]
cv_target_1 = target_data[cv_index]

# Test features: same columns, same NaN fill and same scaling as the training
# matrix. Re-fitting a StandardScaler on X is deterministic, so it reproduces
# the statistics that produced input_data.
test_input = StandardScaler().fit(X).transform(test[X.columns].fillna(0))

# Build the Neural Net
nn_model = create_nn_model(train_input.shape[1])

# If retrain==False, then we load a previous saved model as a starting point.
if not retrain:
    # NOTE(review): model_name_rd is not defined anywhere in the visible
    # notebook; set it to the saved model's path before using retrain=False.
    nn_model = load_model(model_name_rd)

nn_model.compile(loss='mae', optimizer=Adam())

# Callback for Early Stopping... May want to raise the min_delta for small numbers of epochs
es = callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=8, verbose=1,
                             mode='auto', restore_best_weights=True)
# Callback for Reducing the Learning Rate... when the monitor levels out for 'patience' epochs, then the LR is reduced
rlr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=7, min_lr=1e-6,
                                  mode='auto', verbose=1)

history = nn_model.fit(train_input, [train_target_1],
        validation_data=(cv_input, [cv_target_1]),
        callbacks=[es, rlr], epochs=epoch_n, batch_size=batch_size, verbose=1)

cv_predict = nn_model.predict(cv_input)
plot_history(history, mol_type)

# `accuracy` is the validation mean absolute error; its log is the competition-style score.
accuracy = np.mean(np.abs(cv_target_1 - cv_predict[:, 0]))
cv_score.append(np.log(accuracy))
cv_score_total += np.log(accuracy)

# Predict on the test data set using our trained model
test_predict = nn_model.predict(test_input)

K.clear_session()

In [None]:

# Validation mean absolute error, recomputed here for display.
# The training cell has already appended log(accuracy) to cv_score and added
# it to cv_score_total, so it is not recorded a second time here.
accuracy = np.mean(np.abs(cv_target_1 - cv_predict[:, 0]))


In [None]:
# Show the validation mean absolute error.
accuracy

In [None]:
# Show the recorded log-MAE validation scores.
cv_score