# Naive Kernel for Magnetic Interaction Prediction
This is a work-in-progress kernel.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import HuberRegressor
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['test.csv', 'mulliken_charges.csv', 'dipole_moments.csv', 'train.csv', 'structures.csv', 'magnetic_shielding_tensors.csv', 'potential_energy.csv', 'sample_submission.csv', 'scalar_coupling_contributions.csv', 'structures']


## Load data sets

In [2]:
trainSet = pd.read_csv('../input/train.csv')
display(trainSet.head())

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


In [3]:
testSet = pd.read_csv('../input/test.csv')
display(testSet.head())

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type
0,4658147,dsgdb9nsd_000004,2,0,2JHC
1,4658148,dsgdb9nsd_000004,2,1,1JHC
2,4658149,dsgdb9nsd_000004,2,3,3JHH
3,4658150,dsgdb9nsd_000004,3,0,1JHC
4,4658151,dsgdb9nsd_000004,3,1,2JHC


In [4]:
structures = pd.read_csv('../input/structures.csv')
display(structures.head())

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


### Atomic distance
https://www.kaggle.com/inversion/atomic-distance-benchmark/

In [5]:
# Map the atom structure data into train and test files

def map_atom_info(df, atom_idx):
    df = pd.merge(df, structures, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x_{atom_idx}',
                            'y': f'y_{atom_idx}',
                            'z': f'z_{atom_idx}'})
    return df

trainSet = map_atom_info(trainSet, 0)
trainSet = map_atom_info(trainSet, 1)

testSet = map_atom_info(testSet, 0)
testSet = map_atom_info(testSet, 1)


In [6]:
display(trainSet.head())
display(testSet.head())

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001


Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1
0,4658147,dsgdb9nsd_000004,2,0,2JHC,H,-1.661639,0.0,1.0,C,0.599539,0.0,1.0
1,4658148,dsgdb9nsd_000004,2,1,1JHC,H,-1.661639,0.0,1.0,C,-0.599539,0.0,1.0
2,4658149,dsgdb9nsd_000004,2,3,3JHH,H,-1.661639,0.0,1.0,H,1.661639,0.0,1.0
3,4658150,dsgdb9nsd_000004,3,0,1JHC,H,1.661639,0.0,1.0,C,0.599539,0.0,1.0
4,4658151,dsgdb9nsd_000004,3,1,2JHC,H,1.661639,0.0,1.0,C,-0.599539,0.0,1.0


In [7]:
# https://www.kaggle.com/jazivxt/all-this-over-a-dog
# https://www.kaggle.com/artgor/molecular-properties-eda-and-models
train_p0 = trainSet[['x_0', 'y_0', 'z_0']].values
train_p1 = trainSet[['x_1', 'y_1', 'z_1']].values
test_p0 = testSet[['x_0', 'y_0', 'z_0']].values
test_p1 = testSet[['x_1', 'y_1', 'z_1']].values

trainSet['dist'] = np.linalg.norm(train_p0 - train_p1, axis=1)
testSet['dist'] = np.linalg.norm(test_p0 - test_p1, axis=1)

trainSet['dist_to_type_mean'] = trainSet['dist'] / trainSet.groupby('type')['dist'].transform('mean')
testSet['dist_to_type_mean'] = testSet['dist'] / testSet.groupby('type')['dist'].transform('mean')

### Atom types

In [8]:
# All atom_0 are hydrogens
assert all(trainSet["atom_0"].astype('category').cat.categories == ['H'])
assert all(testSet["atom_0"].astype('category').cat.categories == ['H'])

In [9]:
# atom_1 are carbon, hydrogen or nitrogen
print(trainSet["atom_1"].astype('category').cat.categories)
print(testSet["atom_1"].astype('category').cat.categories)

Index(['C', 'H', 'N'], dtype='object')
Index(['C', 'H', 'N'], dtype='object')


In [10]:
# We use the interaction types, that already include the type of atoms involved
print(testSet["type"].astype('category').cat.categories)
print(trainSet["type"].astype('category').cat.categories)

Index(['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN'], dtype='object')
Index(['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN'], dtype='object')


In [11]:
for i in trainSet["type"].astype('category').cat.categories.values:
    trainSet['type_'+str(i)] = (trainSet['type'] == i)
    testSet['type_'+str(i)] = (testSet['type'] == i)

## Huber regression
Robust linear regression (tries to ignore outliers)

In [12]:
model = HuberRegressor()

In [13]:
# Features to include (regressors)
regressors = ['type_1JHC', 'type_1JHN', 'type_2JHC', 'type_2JHH', 'type_2JHN', 
                                       'type_3JHC', 'type_3JHH', 'dist', 'dist_to_type_mean']

In [14]:
# Add bias, interaction term and quadratic and cubic terms
polyFeat = PolynomialFeatures(degree=3, interaction_only=False, include_bias=True)

In [15]:
trainX = polyFeat.fit_transform(np.array(trainSet[regressors]))

In [17]:
# Some features are uninformative:
# Interaction type features don't (statistically) interact as they are mutually exclusive
usefulFeatures = [i for i,x in enumerate(np.abs(np.sum(trainX, axis = 0))) if x > 0]
trainX = trainX[:,usefulFeatures]
trainX.shape

(4658147, 80)

In [18]:
# NB: no need to include type_3JHN as this is redundant: this is always true when all other types are false
fitDist = model.fit(trainX, 
                    trainSet['scalar_coupling_constant'])

In [19]:
# Display factors to learn what is important for the prediction
fitDist.coef_

array([ 3.63569422,  5.84504608,  2.09238279, -1.8242938 , -1.74029367,
       -0.3457709 ,  0.16912838,  0.69360345,  0.05552272,  3.97611851,
        5.84504608,  6.11673802,  5.59679817,  2.09238279,  2.09527947,
        2.06866565, -1.8242938 , -2.17663709, -0.99371819, -1.74029367,
       -2.67520433, -1.5072463 , -0.3457709 , -0.89100497, -0.41715252,
        0.16912838, -0.09581594, -0.03111868,  0.69360345,  0.46723723,
        0.17291493, -4.93992349,  2.12156263,  4.75937427,  5.84504608,
        6.11673802,  5.59679817,  6.39253145,  5.84914839,  5.35195441,
        2.09238279,  2.09527947,  2.06866565,  2.09795829,  2.07131044,
        2.04500106, -1.8242938 , -2.17663709, -0.99371819, -0.6657286 ,
       -0.3039306 , -0.13875596, -1.74029367, -2.67520433, -1.5072463 ,
       -3.99272563, -2.24955562, -1.26743006, -0.3457709 , -0.89100497,
       -0.41715252, -2.22822347, -1.0432142 , -0.48841415,  0.16912838,
       -0.09581594, -0.03111868,  0.33275323,  0.10807014,  0.03

### Evaluate performance


In [22]:
# See https://www.kaggle.com/uberkinder/efficient-metric
def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
    maes = (y_true-y_pred).abs().groupby(types).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()

In [23]:
group_mean_log_mae(trainSet['scalar_coupling_constant'], 
                   model.predict(trainX), trainSet['type'])

1.1736439116975004

In [24]:
# Control: this should perform better than outputing the same overfitted value for all interactions
print(group_mean_log_mae(trainSet['scalar_coupling_constant'], trainSet['scalar_coupling_constant'].median(), trainSet['type']))
print(group_mean_log_mae(trainSet['scalar_coupling_constant'], 0.85, trainSet['type']))

1.9912352863119418
1.9398503397862858


In [27]:
testX = polyFeat.transform(np.array(testSet[regressors]))[:,usefulFeatures]
resultSet = pd.DataFrame( { "id" : testSet['id'],
                            "scalar_coupling_constant" : model.predict(testX)} )

## Export results

In [28]:
resultSet.to_csv("results.csv", index = False, header = True)

In [29]:
# Check content of the output file
with open("results.csv", "r") as f:
    for i, line in enumerate(f):
        print(line)
        if i > 5:
            break

id,scalar_coupling_constant

4658147,-0.06845633415777419

4658148,85.34186653935089

4658149,7.898494382693768

4658150,85.34186653935089

4658151,-0.06845633415777419

4658152,89.46557882396796

