# Faster Distance Matrix  
Distance matrix between atoms of a single molecule are used in many ML algorithms.This notebook show a faster way to compute distance matrix reference to https://www.kaggle.com/cpmpml/ultra-fast-distance-matrix-computation

In [52]:
# tests help notebooks stay managable
import doctest
import copy
import functools

# docktesting
def autotest(func):
    globs = copy.copy(globals())
    globs.update({func.__name__: func})
    doctest.run_docstring_examples(
        func, globs, verbose=True, name=func.__name__)
    return func

In [53]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings("ignore")

In [54]:
@autotest
def distance_matrix(data):
    """
    :param data: data, pd.DataFrame
    :return: diantance matrix, np.ndarray
    >>> data = np.asarray( [[1,1,1],[2,2,2],[3,3,3]])
    >>> columns = ['x','y','z']
    >>> data = pd.DataFrame(data=data,columns=columns)
    >>> distance_matrix(data.values)
    array([[0., 3., 6.],
           [3., 0., 3.],
           [6., 3., 0.]])
    """
    rows = data.shape[0]
    data_tile = np.tile(data.T, (rows,1,1))
    dist_mat = np.sqrt((data_tile - data_tile.T)**2).sum(axis=1)
    return dist_mat    


Finding tests in distance_matrix
Trying:
    data = np.asarray( [[1,1,1],[2,2,2],[3,3,3]])
Expecting nothing
ok
Trying:
    columns = ['x','y','z']
Expecting nothing
ok
Trying:
    data = pd.DataFrame(data=data,columns=columns)
Expecting nothing
ok
Trying:
    distance_matrix(data.values)
Expecting:
    array([[0., 3., 6.],
           [3., 0., 3.],
           [6., 3., 0.]])
ok


In [55]:
data = np.asarray( [['dsgdb9nsd_000001',	0,	'C',	-0.012698,	1.085804,   0.008001],
                     ['dsgdb9nsd_000001',	1,	'H',	0.002150,	-0.006031,	0.001976],
                     ['dsgdb9nsd_000001',	2,	'H',	1.011731,	1.463751,	0.000277],
                     ['dsgdb9nsd_000001',	3,	'H',	-0.540815,	1.447527,	-0.876644],
                     ['dsgdb9nsd_000001',	4,	'H',	-0.523814,	1.437933,	0.906397],
                     ['dsgdb9nsd_000002',	0,	'N',	-0.040426,	1.024108,	0.062564],
                     ['dsgdb9nsd_000002',	1,	'H',	0.017257,	0.012545,	-0.027377]])

columns = ['molecule_name'	,'atom_index',	'atom',	'x','y','z']
data = pd.DataFrame(data=data,columns=columns).apply(pd.to_numeric, errors='ignore')

In [56]:
# covert DataFrame to np.ndarray is expensive, it cost losts of time
distance_matrix(data[['x','y','z']].values)

array([[0.      , 1.112708, 1.4101  , 1.774485, 1.761641, 0.143987,
        1.138592],
       [1.112708, 0.      , 2.481062, 2.875143, 2.874349, 1.133303,
        0.063036],
       [1.4101  , 2.481062, 0.      , 2.445691, 2.467483, 1.554087,
        2.473334],
       [1.774485, 2.875143, 2.445691, 0.      , 1.809636, 1.863016,
        2.842321],
       [1.761641, 2.874349, 2.467483, 1.809636, 0.      , 1.741046,
        2.900233],
       [0.143987, 1.133303, 1.554087, 1.863016, 1.741046, 0.      ,
        1.159187],
       [1.138592, 0.063036, 2.473334, 2.842321, 2.900233, 1.159187,
        0.      ]])