In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import time
import datetime

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import os
print(os.listdir("./input"))

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import KFold


['dipole_moments.csv', 'magnetic_shielding_tensors.csv', 'mulliken_charges.csv', 'potential_energy.csv', 'sample_submission.csv', 'scalar_coupling_contributions.csv', 'structures.csv', 'structures.zip', 'test.csv', 'train.csv']


In [2]:
# Load the coupling-pair tables, using the `id` column of each CSV as the row index.
train = pd.read_csv('./input/train.csv', index_col='id')
test = pd.read_csv('./input/test.csv', index_col='id')

# Preview the first rows of the training table.
display(train.head())

  mask |= (ar1 == a)


Unnamed: 0_level_0,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,dsgdb9nsd_000001,2,0,1JHC,84.8074


In [3]:
# Load the per-atom table; it is joined onto train/test by (molecule_name, atom_index).
structures = pd.read_csv('./input/structures.csv')
display(structures.head())

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


In [4]:
# Map the atom structure data into train and test files
def map_atom_info(df, atom_idx, structures_df=None):
    """Attach the element and coordinates of one atom of each pair to ``df``.

    Left-joins the atom table onto ``df`` using ``molecule_name`` and
    ``atom_index_{atom_idx}``, then renames the joined ``atom``, ``x``, ``y``
    and ``z`` columns with an ``_{atom_idx}`` suffix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``molecule_name`` and ``atom_index_{atom_idx}``.
    atom_idx : int
        Which atom of the pair to look up (the notebook uses 0 and 1).
    structures_df : pandas.DataFrame, optional
        Atom table with columns ``molecule_name``, ``atom_index``, ``atom``,
        ``x``, ``y``, ``z``. When omitted, the notebook-level ``structures``
        table is used, which keeps existing two-argument calls working.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified. Because this is a
        column-on-column merge, the result has a fresh integer index rather
        than the index of ``df``.
    """
    if structures_df is None:
        # Backward-compatible default: fall back to the notebook global.
        structures_df = structures

    merged = pd.merge(df, structures_df, how='left',
                      left_on=['molecule_name', f'atom_index_{atom_idx}'],
                      right_on=['molecule_name', 'atom_index'])

    # `atom_index` duplicates `atom_index_{atom_idx}` after the join.
    merged = merged.drop(columns='atom_index')
    return merged.rename(columns={'atom': f'atom_{atom_idx}',
                                  'x': f'x_{atom_idx}',
                                  'y': f'y_{atom_idx}',
                                  'z': f'z_{atom_idx}'})

# Enrich both tables with the element and coordinates of atom 0, then atom 1,
# of every pair.
for atom_idx in (0, 1):
    train = map_atom_info(train, atom_idx)
    test = map_atom_info(test, atom_idx)

In [5]:
# Preview the training table after the atom_0 / atom_1 columns were merged in.
display(train.head())

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1
0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277
2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644
3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397
4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001


Distance

In [6]:
def add_pair_features(df):
    """Add distance and coupling-type features to ``df`` in place.

    Columns written (in this order):

    * ``dist`` - Euclidean distance between atom 0 and atom 1.
    * ``dist_x`` / ``dist_y`` / ``dist_z`` - squared coordinate difference
      along each axis (squared, not absolute, despite the name).
    * ``type_0`` - first character of ``type``.
    * ``type_1`` - remainder of ``type`` after the first character.

    ``df`` must contain ``x_0, y_0, z_0, x_1, y_1, z_1`` and a string
    ``type`` column. Returns the same ``df`` object for convenience.
    """
    p_0 = df[['x_0', 'y_0', 'z_0']].values
    p_1 = df[['x_1', 'y_1', 'z_1']].values
    df['dist'] = np.linalg.norm(p_0 - p_1, axis=1)

    for axis in ('x', 'y', 'z'):
        df[f'dist_{axis}'] = (df[f'{axis}_0'] - df[f'{axis}_1']) ** 2

    #Type Read more here - https://www.kaggle.com/artgor/molecular-properties-eda-and-models You can find more info there
    # The vectorised `.str` accessor replaces a per-row Python lambda.
    df['type_0'] = df['type'].str[0]
    df['type_1'] = df['type'].str[1:]
    return df


# Identical feature engineering for both tables, so the columns cannot drift apart.
add_pair_features(train)
add_pair_features(test)

In [7]:
def group_mean_log_mae(y_true, y_pred, groups, floor=1e-9):
    """
    Mean over groups of the log of each group's mean absolute error.

    fast metric from https://www.kaggle.com/uberkinder/efficient-metric

    `y_true` must be a pandas Series (its `.abs()` / `.groupby()` are used);
    `groups` supplies the group key for each row. Per-group MAEs below `floor`
    are raised to `floor` before the log is taken, so a perfect group does not
    produce `-inf`.
    """
    abs_error = (y_true - y_pred).abs()
    per_group_mae = abs_error.groupby(groups).mean()
    floored_mae = per_group_mae.clip(lower=floor)
    return np.log(floored_mae).mean()

def get_score(y_true, y_pred):
    """Score predictions with `group_mean_log_mae`, grouped by the `type` column of `train`."""
    coupling_types = train['type']
    return group_mean_log_mae(y_true, y_pred, groups=coupling_types)

In [8]:
# Preview the training table with the dist* and type_0 / type_1 feature columns added.
display(train.head())

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,dist,dist_x,dist_y,dist_z,type_0,type_1
0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001,1.091953,0.00022,1.192105,3.6e-05,1,JHC
1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277,1.78312,1.019253,2.160261,3e-06,2,JHH
2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644,1.783147,0.294812,2.112831,0.771973,2,JHH
3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397,1.783157,0.276638,2.085032,0.817978,2,JHH
4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001,1.091952,1.049455,0.142844,6e-05,1,JHC


In [9]:
# Take the molecule identifiers out of the feature tables
# (kept in `molecules` for train, discarded for test).
molecules = train.pop('molecule_name')
test = test.drop(columns='molecule_name')

# Separate the regression target from the feature columns.
y = train.pop('scalar_coupling_constant')

# Label Encoding: each encoder is fitted on the train and test values together
# so both tables share one integer code per category.
categorical_columns = ['type', 'atom_0', 'atom_1', 'type_0', 'type_1']
for col in categorical_columns:
    encoder = LabelEncoder()
    train_values = train[col].tolist()
    test_values = test[col].tolist()
    encoder.fit(train_values + test_values)
    train[col] = encoder.transform(train_values)
    test[col] = encoder.transform(test_values)

# Note: the model cell passes cat_features=[], so these encoded columns are
# treated as plain numeric features.

In [10]:
# Preview the feature table after label encoding (target and molecule_name removed).
display(train.head())

Unnamed: 0,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,dist,dist_x,dist_y,dist_z,type_0,type_1
0,1,0,0,0,0.00215,-0.006031,0.001976,0,-0.012698,1.085804,0.008001,1.091953,0.00022,1.192105,3.6e-05,0,0
1,1,2,3,0,0.00215,-0.006031,0.001976,1,1.011731,1.463751,0.000277,1.78312,1.019253,2.160261,3e-06,1,1
2,1,3,3,0,0.00215,-0.006031,0.001976,1,-0.540815,1.447527,-0.876644,1.783147,0.294812,2.112831,0.771973,1,1
3,1,4,3,0,0.00215,-0.006031,0.001976,1,-0.523814,1.437933,0.906397,1.783157,0.276638,2.085032,0.817978,1,1
4,2,0,0,0,1.011731,1.463751,0.000277,0,-0.012698,1.085804,0.008001,1.091952,1.049455,0.142844,6e-05,0,0


In [11]:
# Out-of-fold predictions for train and fold-averaged predictions for test.
yoof = np.zeros(len(train))
prediction = np.zeros(len(test))

n_fold = 3
# NOTE(review): folds are split by row, so pairs from the same molecule can land
# in both the training and validation parts of a fold - confirm this is intended.
folds = KFold(n_splits=n_fold, shuffle=True, random_state=11)

# Loop-invariant: every column of `train` is a feature at this point.
columns = train.columns
# Fail fast (before hours of training) if test features are not in the same
# order as the training features that the model will be fitted on.
assert list(test.columns) == list(columns), 'train/test feature columns differ'

for fold_n, (train_index, valid_index) in enumerate(folds.split(train)):
    print(f'Fold {fold_n + 1} started at {time.ctime()}')

    X_train, X_valid = train.iloc[train_index], train.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    model = CatBoostRegressor(eval_metric='MAE',
                              loss_function='MAE',
                              learning_rate=1,
                              max_depth=13,
                              n_estimators=1000)
    # NOTE(review): the validation fold is used both as eval_set with
    # use_best_model=True and for the out-of-fold score below, so the reported
    # score is likely optimistic - confirm against a held-out set.
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)

    y_pred_valid = model.predict(X_valid)
    yoof[valid_index] = y_pred_valid.reshape(-1,)
    # Accumulate test predictions; they are averaged over folds after the loop.
    prediction += model.predict(test)

prediction /= n_fold

# Last expression of the cell: displays the out-of-fold score.
get_score(y, yoof)

Fold 1 started at Wed Jul 10 23:05:19 2019
Fold 2 started at Thu Jul 11 02:34:16 2019
Fold 3 started at Thu Jul 11 06:06:14 2019


0.48382367679724636

Experiment log (out-of-fold score from `get_score`, the group mean log MAE; lower is better):  
For basic model (RandomForestRegressor) I have score = 0.7359028441092138  
For basic model (CatBoostRegressor with 100 iterations) I have score = 1.901559536331473  
For basic model (CatBoostRegressor with 1000 iterations) I have score = 1.428464893938429  
For basic model (CatBoostRegressor with 100 iterations and learning_rate=0.2) I have score = 1.694573105272936  
For basic model (CatBoostRegressor with 100 iterations and learning_rate=1) I have score = 1.2947936824780637  
For basic model (CatBoostRegressor with 100 iterations, learning_rate=1 and max_depth=7) I have score = 1.2835000710486957  
For basic model (CatBoostRegressor with 100 iterations, learning_rate=1 and max_depth=13) I have score = 1.227871170653178  
For basic model (CatBoostRegressor with 100 iterations, learning_rate=1 and max_depth=13, without cat_features) I have score = 1.0016903454005388  
For basic model (CatBoostRegressor with 1000 iterations, learning_rate=1 and max_depth=13, without cat_features) I have score = 0.4975116615340085

For basic model (CatBoostRegressor with 10000 iterations, learning_rate=1 and max_depth=13, without cat_features) I have score = 0.48382367679724636  
Fold 1 started at Wed Jul 10 23:05:19 2019  
Fold 2 started at Thu Jul 11 02:34:16 2019  
Fold 3 started at Thu Jul 11 06:06:14 2019  

In [12]:
# The sample submission supplies the `id` index that the output file must carry.
sample_submission = pd.read_csv('./input/sample_submission.csv', index_col='id')

# Build the submission as a new frame with the fold-averaged predictions in the
# target column, then write it out.
benchmark = sample_submission.assign(scalar_coupling_constant=prediction)
benchmark.to_csv('atomic_distance_benchmark.csv')