# Summary

This kernel uses `DScribe`'s descriptor generation to get features for Champs. I referred to [@educatedguessing's kernel here](https://www.kaggle.com/educatedguessing/diy-atom-centered-symmetry-functions) to generate the labels for the features. In that kernel, the feature generation queries the `structures` dataframe; here I read the `xyz` files directly, which is much faster.

The features generated are **Atom-centered Symmetry Functions (ACSF)**; the columns involving Fluorine are removed from the get-go to avoid taking too much memory.

ACSFs are atom-based, in the actual features for the two atoms in a coupling pair, I used:

- A simple mapping of the features to each atom: `acsf_feat_x` for the Hydrogen atom, `acsf_feat_y` for the Carbon/Nitrogen/second Hydrogen atom.
- The charge-weighted harmonic mean $2ab/(a+b)$ and geometric mean $\sqrt{ab}$ of the two. The charge weighting was suggested in [WACSF - Weighted Atom-Centered Symmetry Functions as Descriptors in Machine Learning Potentials](https://arxiv.org/abs/1712.05861). Since ACSFs all have exponential weights, it is not sensible to use a simple arithmetic mean.
- Various distances between the two atoms' $G^2$ and $G^4$ columns for the same atom combinations.

These features alone, together with feature selection techniques, are able to achieve -2.05 in the private LB using LGBM.

Reference:
- [DIY: Atom-Centered Symmetry Functions](https://www.kaggle.com/educatedguessing/diy-atom-centered-symmetry-functions)

In [None]:
import pandas as pd
import numpy as np
from numpy.linalg import eig
from numpy.linalg import svd
from sklearn.decomposition import PCA
from tqdm import tqdm_notebook, tqdm
import gc
import os

import time, copy

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [None]:
import ase
from ase import Atoms
import ase.visualize
from ase.build import molecule

In [None]:
from dscribe.descriptors import ACSF, LMBTR, SOAP

In [None]:
def view(molecule):
    '''
    Render one molecule of the global ``structures`` table with ASE's x3d viewer.

    molecule: value compared against ``structures['molecule_name']``.

    Prints the molecule name and returns whatever
    ``ase.visualize.view(..., viewer="x3d")`` returns.
    '''
    selected = structures.loc[structures['molecule_name'] == molecule]

    # Columns are addressed by position: column 2 is passed as the chemical
    # symbols, columns 3 onwards as the cartesian coordinates.
    system = Atoms(symbols=selected.iloc[:, 2].values,
                   positions=selected.iloc[:, 3:].values)

    print('Molecule Name: %s.' % molecule)
    return ase.visualize.view(system, viewer="x3d")

In [None]:
# Per-atom structure table; `view` and the dataframe-based benchmark read from it.
structures = pd.read_csv('../input/structures.csv')

In [None]:
# Distinct molecule names; later used to build the .xyz file paths.
all_molecule_names = structures['molecule_name'].unique()

In [None]:
# NOTE(review): _DEBUG is not referenced by any of the cells visible in this notebook.
_DEBUG = True
# Directory with one .xyz file per molecule. The trailing slash matters:
# paths are later built as `folder + name + '.xyz'`.
folder = '../input/structures/'

# Testing I/O for one file

In [None]:
%%time
# Parse a single .xyz file by hand and echo every line, to check the layout
# before processing all molecules.
positions = []
symbols = []
# `with` closes the handle even if parsing fails part-way
# (the previous version never closed the file).
with open(f"{folder}/dsgdb9nsd_133885.xyz") as f:
    for row, line in enumerate(f):
        print(row, line.replace('\n', ''))

        fields = line.split(' ')
        # The first two lines are header lines, not atom records.
        if row < 2:
            continue
        # Remaining rows: chemical symbol followed by the coordinates.
        positions.append(fields[1:4])
        # Printed before the append, so this shows the symbols collected so far.
        print(f"{fields[0]} -> {symbols}\n")
        symbols.append(fields[0])

print(len(symbols))

# ACSF generation examples

## Feature count

Let $N := (\# \,\text{unique atoms})$ which is the minimum among the number of `species` argument in `ACSF` and the actual unique atoms in a molecule, then the total number of features for *one* atom is:

$$
N\times \Big(\underbrace{1}_{G^1}
+ \underbrace{(\#\, \eta, R_S \text{ couplets})}_{G^2}\Big)
+ \underbrace{N(N+1)/2\times (\#\, \eta, \zeta, \lambda \text{ triplets})}_{G^4}
$$

For the $i$-th atom, let $\mathcal{S}$ be the set of atoms of the same symbol other than this atom of interest:
$$G_{i}^{1}=\sum_{j\in\mathcal{S} }\; f_{\mathrm{c}}\left(R_{i j}\right),$$
and similarly,
$$G_{i}^{2}=\sum_{j\in \mathcal{S}}\; e^{-\eta\left(R_{i j}-R_{s}\right)^{2}} \cdot f_{c}\left(R_{i j}\right).$$
$$
 G_{i}^{4}= 2^{1-\zeta} \sum_{j, k \in \mathcal{S}\backslash\{ i\} }^{\text { all }}\left(1+\lambda \cos \theta_{i j k}\right)^{\zeta} \cdot e^{-\eta\left(R_{i j}^{2}+R_{i k}^{2}+R_{j k}^{2}\right)} \cdot f_{\mathrm{c}}\left(R_{i j}\right) \cdot f_{\mathrm{c}}\left(R_{i k}\right) \cdot f_{\mathrm{c}}\left(R_{j k}\right) 
$$

The features are arranged first by whether they are radial or angular, then by the `species` parameter: first $G^1$ together with $G^2$ for `H`, `C`, `N`, `O`, `F`, then the $G^4$ for every pair of species drawn from `H`, `C`, `N`, `O`, `F`.



### Testing G^1

Example 1: water

In [None]:
# Cutoff radius R_c shared by all ACSF functions
rcut = 10.0

# Species handed to the descriptor
species = ["H", "C", "N", "O", "F"]

# No G2/G4 parameter sets for this first test (section "Testing G^1")
g2_params = None
g4_params = None

# G3/G5 are not passed to ACSF in this cell; the None values are read later
# by the labelling cell when it builds its `g_params` dictionary.
g3_params = None
g5_params = None

acsf = ACSF(
    species=species, # order of atomic number
    rcut=rcut,
    g2_params=g2_params,
    g4_params=g4_params,
)

In [None]:
# Water from ASE's molecule collection; ACSF is evaluated at atoms 0, 1 and 2.
water = molecule("H2O")
acsf_water = acsf.create(water, positions=[0, 1, 2])

print(acsf_water.shape, "\n")
print(water.get_all_distances(), "\n")
# One line per centre, labelled O, H, H in row order.
for atom_symbol, atom_features in zip("OHH", acsf_water):
    print(f"Atom {atom_symbol}: {atom_features} \n")

### Testing G^2

In [None]:
# G2 parameters: one [eta, R_s] couple per G2 function
g2_params = [[0.5, 2], [0.1, 2]]

# G4 parameters would be [eta, zeta, lambda] triplets; left disabled in this
# cell (g4_params=None is passed explicitly below):
# g4_params = [[1, 4,  1], [0.1, 4,  1], [0.01, 4,  1]]

acsf = ACSF(
    species=["H", "C", "N", "O", "F"], # order of atomic number
    rcut=rcut,
    g2_params=g2_params,
    g4_params=None,
)

In [None]:
# Re-evaluate water with the G2-enabled descriptor.
acsf_water = acsf.create(water, positions=[0, 1, 2])

print(acsf_water.shape, "\n")
print(water.get_all_distances(), "\n")
# One block per centre, labelled O, H, H in row order.
for atom_symbol, atom_features in zip("OHH", acsf_water):
    print(f"Atom {atom_symbol}\n {atom_features} \n")

### Testing G^4

In [None]:
# G2 parameters: one [eta, R_s] couple per G2 function
g2_params = [[2, 1], [0.5, 1]]

# G4 parameters: one [eta, zeta, lambda] triplet per G4 function
g4_params = [[1, 4,  1], [0.1, 4,  1], [0.01, 4,  1]]

# Descriptor with both radial (G2) and angular (G4) parameter sets enabled;
# `species` and `rcut` are the globals defined in the G^1 cell.
acsf = ACSF(
    species=species, # order of atomic number
    rcut=rcut,
    g2_params=g2_params,
    g4_params=g4_params,
)

In [None]:
# Re-evaluate water with the G2 + G4 descriptor.
acsf_water = acsf.create(water, positions=[0, 1, 2])

print(acsf_water.shape, "\n")
# One block per centre, labelled O, H, H in row order.
for atom_symbol, atom_features in zip("OHH", acsf_water):
    print(f"Atom {atom_symbol}\n {atom_features} \n")

Example 2: formic acid

In [None]:
# Second example: formic acid from ASE's molecule collection, shown with the x3d viewer.
acid = molecule("HCOOH")
ase.visualize.view(acid, viewer="x3d")

In [None]:
# Create ACSF output for formic acid. Unlike the water cells, no `positions`
# argument is given — presumably every atom is then used as a centre
# (TODO confirm against the DScribe documentation).
acsf_acid = acsf.create(acid)

In [None]:
# k selects which atom's feature row is printed in detail at the end.
k=1
print(acsf_acid.shape, "\n")
print(acid.get_atomic_numbers(),"\n")
print(acid.get_all_distances(), "\n")
print(f"All atoms {acid.get_chemical_symbols()}")
print(f"Atom {acid.get_chemical_symbols()[k]}:\n {acsf_acid[k]} \n")

## Labeling features

The unsung hero: [DIY: Atom-Centered Symmetry Functions](https://www.kaggle.com/educatedguessing/diy-atom-centered-symmetry-functions).

In [None]:
# Throw-away Atoms object, used only to translate symbols into atomic numbers.
tmp_system = Atoms(species, [[0, 0, 0]] * len(species))

nr_to_symbol = dict(zip(tmp_system.get_atomic_numbers(),
                        tmp_system.get_chemical_symbols()))
atomic_numbers = sorted(tmp_system.get_atomic_numbers())

# Parameter entries per ACSF family; a family set to None contributes no label.
g_params = {
    'g1': [rcut],
    'g2': g2_params,
    'g3': g3_params,
    'g4': g4_params,
    'g5': g5_params,
}

label_feats = []

# Pass 1: single-species families, species in ascending atomic number.
for atom_nr in atomic_numbers:
    atom_id = nr_to_symbol[atom_nr]
    for g in ("g1", "g2", "g3"):
        if g_params[g] is None:
            continue
        label_feats.extend(f'acsf_{g}_{atom_id}_{para}' for para in g_params[g])

# Pass 2: species-pair families.
for atom_nr in atomic_numbers:
    atom_id = nr_to_symbol[atom_nr]
    # This is worth noting: the partner only runs over species whose atomic
    # number does not exceed atom_nr, so each unordered pair appears once.
    for i in range(atom_nr + 1):
        if i not in atomic_numbers:
            continue
        atom_id_2 = nr_to_symbol[i]
        for g in ("g4", "g5"):
            if g_params[g] is None:
                continue
            label_feats.extend(
                f'acsf_{g}_{atom_id}_{atom_id_2}_{para}' for para in g_params[g])

In [None]:
def create_feature_labels(species, rcut,
                          g2_params=None,
                          g3_params=None,
                          g4_params=None,
                          g5_params=None,
                          transform_to_symbols=True):
    '''
    Build one text label per ACSF feature column.

    Labels are emitted in two passes over the species sorted by atomic number:
    first ``acsf_{g1|g2|g3}_{atom}_{param}`` for every species, then
    ``acsf_{g4|g5}_{atom}_{atom2}_{param}`` for every species pair in which
    ``atom2`` has an atomic number not larger than that of ``atom``.

    species: chemical symbols accepted by ``ase.Atoms``
    rcut: cutoff value; it is the only "parameter" entry of the g1 labels
    g2_params .. g5_params: iterables of parameter entries, or None to
        contribute no labels for that family
    transform_to_symbols: when equal to True the labels carry chemical
        symbols, otherwise atomic numbers

    returns: list of label strings
    '''
    family_params = {
        'g1': [rcut],
        'g2': g2_params,
        'g3': g3_params,
        'g4': g4_params,
        'g5': g5_params,
    }

    # A throw-away Atoms object converts the symbols into atomic numbers.
    probe = ase.Atoms(species, [[0, 0, 0]] * len(species))
    numbers = probe.get_atomic_numbers()
    nr_to_symbol = dict(zip(numbers, probe.get_chemical_symbols()))
    atomic_numbers = sorted(numbers)

    def name_of(atom_nr):
        # Kept as an `== True` comparison (not truthiness) so that the
        # accepted flag values stay exactly as before.
        if transform_to_symbols == True:
            return nr_to_symbol[atom_nr]
        return atom_nr

    feature_label = []

    # Pass 1: single-species families.
    for atom_nr in atomic_numbers:
        centre = name_of(atom_nr)
        for family in ("g1", "g2", "g3"):
            entries = family_params[family]
            if entries is None:
                continue
            feature_label.extend(
                f'acsf_{family}_{centre}_{entry}' for entry in entries)

    # Pass 2: species-pair families; the partner never exceeds atom_nr.
    for atom_nr in atomic_numbers:
        first = name_of(atom_nr)
        for candidate in range(atom_nr + 1):
            if candidate not in atomic_numbers:
                continue
            second = name_of(candidate)
            for family in ("g4", "g5"):
                entries = family_params[family]
                if entries is None:
                    continue
                feature_label.extend(
                    f'acsf_{family}_{first}_{second}_{entry}' for entry in entries)

    return feature_label

In [None]:
# Same labels via the function. The species list is given in a different order
# (O before N) than the one used for ACSF; create_feature_labels sorts by
# atomic number internally, so the order given here does not matter.
labels_tmp = create_feature_labels(["H", "C", "O", "N", "F"], rcut,
                          g2_params=g2_params,
                          g3_params=None,
                          g4_params=g4_params,
                          g5_params=None,
                          transform_to_symbols=True)

In [None]:
## sanity check: the function must reproduce the hand-built label list exactly
## (element-by-element list comparison, evaluates to True/False)
labels_tmp == label_feats

In [None]:
# Attach the generated labels to the formic-acid ACSF matrix; this only works
# if the number of labels equals the number of feature columns.
acid_feat = pd.DataFrame(acsf_acid, columns=label_feats)

In [None]:
# Preview the first rows of the labelled formic-acid feature table.
acid_feat.head()

# Benchmark using few molecules

In [None]:
# .xyz paths of the first 100 molecules only (benchmark subset).
filenames = [folder + name + '.xyz' for name in all_molecule_names[:100]]

## Final parameters for CHAMPS competition

Based on the original paper: 

- Jörg Behler. Atom-centered symmetry functions for constructing high-dimensional neural network potentials. J. Chem. Phys., 134(7):074106, 2011.

I decided to use the following parameters to cover all three coupling types.

In [None]:
rcut = 9.0

species = ["H", "C", "N", "O", "F"]

# G2 - [eta, R_s] couples: eta runs over 5, 2, 0.5 for R_s = 1, then for R_s = 3.
g2_params = [[eta, r_s] for r_s in (1, 3) for eta in (5, 2, 0.5)]

# G4 - [eta, zeta, lambda] triplets: the lambda = +1 block first, then
# lambda = -1; inside each block eta = 0.5 then 0.05, with zeta over 2, 6, 16.
g4_params = [[eta, zeta, lam]
             for lam in (1, -1)
             for eta in (0.5, 0.05)
             for zeta in (2, 6, 16)]

acsf = ACSF(
    rcut=rcut,
    species=species,  # order of atomic number
    g2_params=g2_params,
    g4_params=g4_params,
)

# Column labels for the descriptor above (G3 and G5 are not used).
label_feats = create_feature_labels(
    species=["H", "C", "O", "N", "F"],
    rcut=rcut,
    g2_params=g2_params,
    g3_params=None,
    g4_params=g4_params,
    g5_params=None,
    transform_to_symbols=True,
)

## Iterating in xyz file, numpy array then dataframe

In [None]:
%%time
images = []

# Open and parse every .xyz file, then compute the ACSF matrix of the molecule.
for filename in tqdm_notebook(filenames):
    # Per-file accumulators.
    positions = []
    symbols = []
    with open(filename) as f:
        for row, line in enumerate(f):
            # The first two lines of each file are header lines (rows 0 and 1).
            if row < 2:
                continue
            # split() without an argument tolerates repeated spaces or tabs and
            # strips the trailing newline; blank lines are skipped.
            fields = line.split()
            if not fields:
                continue
            # Atom record: chemical symbol followed by the three coordinates.
            symbols.append(fields[0])
            positions.append(fields[1:4])
    # Make an atoms object from each file.
    atom = Atoms(positions=np.array(positions, dtype=float),
                 symbols=symbols)
    features = acsf.create(atom, n_jobs=2)
    # structure of return is [[#acsf features] for each position in molecule_system]
    images.append(features)

# Stack the per-molecule matrices row-wise into one labelled table.
structures_acsf = pd.DataFrame(np.concatenate(images),columns=label_feats)
print(structures_acsf.shape,'\n')

## Query the dataframe to generate ACSF

Slow. Slow. Slow.

https://www.kaggle.com/educatedguessing/diy-atom-centered-symmetry-functions

In [None]:
def calculate_symmetric_functions(df_structure, rcut, g2_params=None,
                                  g3_params=None,
                                  g4_params=None,
                                  g5_params=None):
    '''
    Compute ACSF features for every atom row of a structure dataframe.

    df_structure: dataframe with at least the columns ``molecule_name`` and
        ``atom_index`` (used here) plus ``atom``, ``x``, ``y``, ``z`` (read by
        calculate_acsf_single_molecule)
    rcut, g2_params .. g5_params: forwarded unchanged to ``ACSF`` and to
        ``create_feature_labels``

    returns: a new dataframe sorted by molecule_name and atom_index with one
        extra column per ACSF label; an empty dataframe if there are no
        molecules.
    '''
    species = ["H", "C", "O", "N", "F"]

    acsf = ACSF(
        species=species,
        rcut=rcut,
        g2_params=g2_params,
        g3_params=g3_params,
        g4_params=g4_params,
        g5_params=g5_params,
    )

    structure_molecules = df_structure.molecule_name.unique()

    acsf_feature_labels = create_feature_labels(species=species,
                                                rcut=rcut,
                                                g2_params=g2_params,
                                                g3_params=g3_params,
                                                g4_params=g4_params,
                                                g5_params=g5_params,
                                                )

    # reindex adds the (still empty) feature columns up front so that the
    # per-molecule assignments further down only fill existing columns.
    df_structure = df_structure.reindex(
        columns=df_structure.columns.tolist() + acsf_feature_labels)

    df_structure = df_structure.sort_values(['molecule_name', 'atom_index'])

    acsf_structure_chunks = calculate_acsf_in_chunks(
        structure_molecules, df_structure, acsf, acsf_feature_labels)

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # concat rejects an empty list, so keep the old "empty frame" result.
    if not acsf_structure_chunks:
        return pd.DataFrame()

    return pd.concat(acsf_structure_chunks)

def calculate_acsf_in_chunks(structure_molecules, 
                             df_structure, acsf, acsf_feature_labels, 
                             step_size=2000):
    '''
    Process the molecules in consecutive batches of ``step_size`` names.

    For every batch the matching rows of ``df_structure`` are copied, filled
    by calculate_acsf_multiple_molecules, and a copy of the result is stored.

    returns: list with one filled dataframe per batch
    '''
    total = len(structure_molecules)
    filled_batches = []

    offset = 0
    while offset < total:
        batch_names = structure_molecules[offset:offset + step_size]

        # Work on a private copy so the caller's frame is never written to.
        batch_rows = df_structure.loc[
            df_structure.molecule_name.isin(batch_names), :].copy()

        batch_filled = calculate_acsf_multiple_molecules(
            batch_names, batch_rows, acsf, acsf_feature_labels)
        filled_batches.append(batch_filled.copy())

        offset += step_size

    return filled_batches


def calculate_acsf_multiple_molecules(molecule_names, df_structure, acsf, acsf_feature_labels):
    '''
    Fill the ACSF columns of ``df_structure`` molecule by molecule.

    The frame is modified in place and also returned. Rows are selected by
    comparing ``molecule_name`` with each entry of ``molecule_names``.
    '''
    for molecule_name in molecule_names:
        # One boolean mask serves both the read and the write below.
        in_molecule = df_structure.molecule_name == molecule_name

        acsf_values = calculate_acsf_single_molecule(
            df_structure.loc[in_molecule, :], acsf)

        df_structure.loc[in_molecule, acsf_feature_labels] = copy.copy(acsf_values)

    return df_structure

def calculate_acsf_single_molecule(df_molecule, acsf):
    '''
    Build an ``Atoms`` object from one molecule's rows and return
    ``acsf.create`` for it.

    df_molecule: rows of a single molecule with columns ``atom``, ``x``,
        ``y`` and ``z``
    acsf: descriptor object providing ``create(system, n_jobs=...)``
    '''
    system = Atoms(symbols=df_molecule.loc[:, 'atom'],
                   positions=df_molecule.loc[:, ['x', 'y', 'z']])
    return acsf.create(system, n_jobs=2)

In [None]:
# Same first-100-molecule subset as `filenames`; copied so that later writes
# do not touch `structures`.
struct_small = structures.loc[structures.molecule_name.isin(all_molecule_names[:100])].copy()

In [None]:
%%time
# Dataframe-based pipeline with the competition parameters (g3/g5 keep their
# None defaults). `struct_small` is rebound to the returned frame.
struct_small = calculate_symmetric_functions(struct_small, 
                              rcut, 
                              g2_params=g2_params, 
                              g4_params=g4_params)

In [None]:
# Preview the structure rows together with their new ACSF columns.
struct_small.head(5)

In [None]:
## sanity check: both pipelines should give the same numbers.
## NOTE(review): the comparison looks positional, so it presumably relies on
## both tables listing the atoms in the same row order — confirm.
np.allclose(struct_small[label_feats], structures_acsf)

## Checking correlation of ACSF features

In [None]:
def get_correlated_cols(df,threshold=0.98):
    '''
    List the columns that are highly correlated with an earlier column.

    df: dataframe whose columns are compared pairwise via ``df.corr()``
    threshold: threshold to remove correlated variables; a column is
        reported when its absolute correlation with any column to its left
        is strictly greater than this value

    returns: list of column names (also prints how many were found)
    '''

    # Absolute value correlation matrix
    corr_matrix = df.corr().abs()

    # Strict upper triangle (k=1 leaves out the diagonal), so every pair is
    # looked at once and a column is never compared with itself. The builtin
    # `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # Select columns with correlations above threshold
    cols_to_drop = [column for column in upper.columns
                    if (upper[column] > threshold).any()]

    print('There are {} columns to remove.'.format(len(cols_to_drop)))
    return cols_to_drop

In [None]:
# Columns of the 100-molecule benchmark table whose absolute correlation with
# an earlier column exceeds 0.97.
corr_cols = get_correlated_cols(structures_acsf, threshold=0.97)
print(corr_cols)

In [None]:
# Ten columns most correlated (in absolute value) with the second flagged
# column. NOTE(review): corr_cols[1] needs at least two flagged columns.
np.abs(structures_acsf.corrwith(structures_acsf[corr_cols[1]])).sort_values(ascending=False)[:10]

In [None]:
# Same ranking, this time against the first feature column of the table.
np.abs(structures_acsf.corrwith(structures_acsf.iloc[:,0])).sort_values(ascending=False)[:10]

# Generation for all molecules

In [None]:
# .xyz paths for every molecule in the structure table.
filenames = [folder + name + '.xyz' for name in all_molecule_names]

In [None]:
%%time
images = []

# Open and parse every .xyz file, then compute the ACSF matrix of the molecule.
for filename in tqdm_notebook(filenames):
    # Per-file accumulators.
    positions = []
    symbols = []
    with open(filename) as f:
        for row, line in enumerate(f):
            # The first two lines of each file are header lines (rows 0 and 1).
            if row < 2:
                continue
            # split() without an argument tolerates repeated spaces or tabs and
            # strips the trailing newline; blank lines are skipped.
            fields = line.split()
            if not fields:
                continue
            # Atom record: chemical symbol followed by the three coordinates.
            symbols.append(fields[0])
            positions.append(fields[1:4])
    # Make an atoms object from each file.
    atom = Atoms(positions=np.array(positions, dtype=float),
                 symbols=symbols)
    features = acsf.create(atom, n_jobs=2)
    # structure of return is [[#acsf features] for each position in molecule_system]
    images.append(features)

# Stack the per-molecule matrices row-wise into one labelled table.
structures_acsf = pd.DataFrame(np.concatenate(images),columns=label_feats)
print('\n', structures_acsf.shape,'\n')

In [None]:
## drop Fluorine columns
## Labels have the form acsf_{g}_{atom}_{param} or acsf_{g}_{atom}_{atom2}_{param},
## so '_F_' matches the labels in which F is the first species after the family tag.
## NOTE(review): F has the highest atomic number in the species list, so it
## should always be first in pair labels — confirm no 'X_F' label exists.
F_columns = [col for col in structures_acsf.columns if '_F_' in col]
structures_acsf = structures_acsf.drop(columns=F_columns)

In [None]:
# Persist the feature table. index=False leaves the row index out of the file,
# so rows can only be matched back to atoms by their position.
structures_acsf.to_csv('structures_acsf.csv', index=False)

# Addendum: feature generation functions

After mapping `structures_acsf` to `train` and `test`, the following feature generation functions are applied.

In [None]:
# Small constant added to denominators so that all-zero rows do not divide by zero.
TOL = 1e-12


def get_chi2_distance(v1, v2):
    '''
    Row-wise chi-square distance: sum_k (v1_k - v2_k)^2 / (v1_k + v2_k + TOL).

    inputs: two n by m arrays; all columns must be non-negative
    returns: array of length n
    '''
    diff = ((v1 - v2)**2)/(v1+v2+TOL)

    return diff.sum(axis=1)


def get_angular_distance(v1, v2):
    '''
    Row-wise cosine of the angle between v1 and v2 (along axis 1).

    Despite the name, the value returned is the cosine similarity itself
    (1 for parallel rows, 0 for orthogonal or all-zero rows), not 1 - cosine.

    inputs: two n by m arrays
    returns: array of length n
    '''
    # np.linalg.norm is spelled out because no bare `norm` is imported.
    norms = np.linalg.norm(v1, axis=1)*np.linalg.norm(v2, axis=1)
    cosine = (v1*v2).sum(axis=1)/(norms+TOL)

    return cosine


def get_tanimoto_distance(v1, v2):
    '''
    Row-wise Tanimoto similarity: <v1,v2> / (<v1,v1> + <v2,v2> - <v1,v2> + TOL).

    inputs: two n by m arrays
    returns: array of length n
    '''
    a = (v1*v2).sum(axis=1)
    b = (v1*v1).sum(axis=1)
    c = (v2*v2).sum(axis=1)

    return a/(b + c - a + TOL)


def _split_by_atom(cols, token=None):
    '''Split cols into (atom-0, atom-1) lists by their `_x` / `_y` suffix,
    optionally keeping only the names that contain `_{token}_`.'''
    if token is not None:
        marker = '_' + token + '_'
        cols = [col for col in cols if marker in col]
    cols_atom0 = [col for col in cols if col.endswith('_x')]
    cols_atom1 = [col for col in cols if col.endswith('_y')]
    return cols_atom0, cols_atom1


def _add_distance_columns(df, cols_atom0, cols_atom1, family, tag=''):
    '''Write the chi-square / cosine / Tanimoto columns for one column group.'''
    v1 = df[cols_atom0].values
    v2 = df[cols_atom1].values

    df['acsf_' + family + '_diff' + tag] = get_chi2_distance(v1, v2)
    df['acsf_' + family + '_cos' + tag] = get_angular_distance(v1, v2)
    df['acsf_' + family + '_tanimoto' + tag] = get_tanimoto_distance(v1, v2)


def add_acsf_features(df):
    '''
    Add distance features between the ACSF vectors of the two atoms of a pair.

    Columns whose name contains 'acsf' are used; the `_x` suffix marks the
    first atom and `_y` the second (the same convention add_prod_features
    relies on). For every group the chi-square distance, the cosine and the
    Tanimoto similarity between the two atoms' vectors are written:

    - G1: one group over all species     -> acsf_g1_{diff,cos,tanimoto}
    - G2: one group per species          -> acsf_g2_{...}_{symbol}
    - G4: one group per species pair     -> acsf_g4_{...}_{pair}

    A group without matching columns yields zeros.

    df is modified in place and returned.
    '''
    acsf_cols = [col for col in df.columns if 'acsf' in col]

    #### G1 difference features
    g1_cols = [col for col in acsf_cols if 'g1' in col]
    g1_atom0, g1_atom1 = _split_by_atom(g1_cols)
    _add_distance_columns(df, g1_atom0, g1_atom1, 'g1')

    #### G2 difference features
    g2_cols = [col for col in acsf_cols if 'g2' in col]
    for symbol in ['H', 'C', 'N', 'O', 'F']:
        g2_atom0, g2_atom1 = _split_by_atom(g2_cols, symbol)
        _add_distance_columns(df, g2_atom0, g2_atom1, 'g2', '_' + symbol)

    #### G4 difference features
    g4_cols = [col for col in acsf_cols if 'g4' in col]

    # Pair tokens are written heavier species first ('C_H', never 'H_C'),
    # matching the acsf_g4_{atom}_{atom2}_{param} label layout.
    all_symbol = ['H', 'C', 'N', 'O']
    g4_pairs = [s + '_' + all_symbol[j]
                for i, s in enumerate(all_symbol)
                for j in range(i + 1)]

    for pair in g4_pairs:
        # Filter on `pair`. The previous version filtered on the `symbol`
        # left over from the G2 loop, so every pair used the same columns.
        g4_atom0, g4_atom1 = _split_by_atom(g4_cols, pair)
        _add_distance_columns(df, g4_atom0, g4_atom1, 'g4', '_' + pair)

    return df

def add_prod_features(df, cols=None, weights=None):
    '''
    Add the weighted geometric mean of each `{col}_x` / `{col}_y` pair.

    For every name in cols for which both `{col}_x` and `{col}_y` exist,
    writes `{col}_prod` = sqrt(w0 * w1 * x * y).

    df: dataframe, modified in place and returned
    cols: base column names (without the `_x` / `_y` suffix); None does nothing
    weights: n by 2 array or DataFrame with the weights of the two atoms per
        row; None means unit weights (previously None raised a TypeError as
        soon as a column pair matched)
    '''
    if cols is None:
        return df

    if weights is None:
        weight_prod = 1.0
    else:
        if isinstance(weights, pd.DataFrame):
            weights = weights.values
        # The same for every column, so computed once.
        weight_prod = weights[:, 0]*weights[:, 1]

    for col in cols:
        x_name, y_name = col+'_x', col+'_y'
        if x_name in df.columns and y_name in df.columns:
            df[col+'_prod'] = np.sqrt(weight_prod*df[x_name]*df[y_name])
    return df

def add_mean_features(df, cols=None, weights=None):
    '''
    Add the weighted harmonic mean of each `{col}_x` / `{col}_y` pair.

    For every name in cols for which both `{col}_x` and `{col}_y` exist,
    writes `{col}_hmean` = hmean(|w0 * x|, |w1 * y|), where magnitudes below
    1e-13 are raised to 1e-13 first so the harmonic mean stays defined.

    df: dataframe, modified in place and returned
    cols: base column names (without the `_x` / `_y` suffix); None does nothing
    weights: n by 2 array or DataFrame with the weights of the two atoms per
        row; None means unit weights (previously None raised a TypeError as
        soon as a column pair matched)
    '''
    if cols is None:
        return df

    # Imported here because the file has no top-level scipy import; the bare
    # `hmean` name used before was never defined.
    from scipy.stats import hmean

    if weights is None:
        w0 = w1 = 1.0
    else:
        if isinstance(weights, pd.DataFrame):
            weights = weights.values
        w0, w1 = weights[:, 0], weights[:, 1]

    floor = 1e-13
    for col in cols:
        x_name, y_name = col+'_x', col+'_y'
        if x_name in df.columns and y_name in df.columns:
            val_atom_0 = np.maximum(np.abs(w0*df[x_name]), floor)
            val_atom_1 = np.maximum(np.abs(w1*df[y_name]), floor)
            df[col+'_hmean'] = hmean(np.c_[val_atom_0, val_atom_1], axis=1)
    return df