## 02725 HW 1 Molecular Graph kernels


## Imports

In [2]:
from rdkit import Chem
from rdkit.Chem import Draw, AllChem
import py3Dmol
import numpy as np

##### Load the .mol files for Penicillin A, Penicillin G, Caffeine, and Ethane

In [3]:
def read_mol_file(file_path):
    """Return the entire text content of the file at ``file_path``.

    The file is opened in text mode for reading and is closed again before
    the function returns.
    """
    with open(file_path, mode='r') as mol_handle:
        contents = mol_handle.read()
    return contents

# Read the raw text of each MOL file once. The paths are relative, so the
# files must sit in the notebook's working directory. The functions in this
# notebook take these strings (not file paths) as their ``mol_file`` argument.
caffeine_mol_file = read_mol_file('caffeine.mol')
penicillinA_mol_file = read_mol_file('penicillinA.mol')
penicillinG_mol_file = read_mol_file('penicillinG.mol')
# Ethane is the small worked example referred to in the assignment text.
ethane_mol_file = read_mol_file('ethane.mol')

## 1.2 Computing Kernels 

### 1. Write a script to compute the adjacency matrix for a molecular graph given a V2000 MOL file. Indices should correspond to the implicit indices in the MOL file (for the ethane example, the first row/column in the adjacency matrix should represent the first carbon in the MOL file). You may temporarily ignore hydrogens that are not explicit (you don’t need to add implicit hydrogens yet). Show the adjacency matrix your code outputs for Caffeine.

In [5]:
def implicit_adjacency_matrix(mol_file):
    """Return the 0/1 adjacency matrix for the atoms listed in a MOL file.

    The bond-order matrix produced by ``implicit_matrix`` is taken and every
    entry greater than 1 is collapsed to 1, so the result only records
    whether two atoms are bonded, not the bond order. Hydrogens that are not
    listed in the file are not added.
    """
    bond_orders = implicit_matrix(mol_file)
    return [[1 if entry > 1 else entry for entry in row] for row in bond_orders]


def implicit_matrix(mol_file):
    """Build the symmetric bond-order matrix for the atoms in a MOL file.

    The fourth line of the text (the counts line) supplies the number of
    atoms and bonds. The bond records start right after the atom records;
    each one contributes its third field (the bond type) at both
    ``[a - 1][b - 1]`` and ``[b - 1][a - 1]``, where ``a`` and ``b`` are the
    1-based atom numbers in its first two fields.

    NOTE(review): fields are separated with ``str.split()``. V2000 fields are
    presumably fixed-width, so values that need three digits may abut and be
    misread -- confirm before using on molecules with 100+ atoms.

    Returns:
        A ``num_atoms x num_atoms`` list of lists of ints (0 = no bond).
    """
    rows = mol_file.splitlines()
    header_fields = rows[3].split()
    atom_total = int(header_fields[0])
    bond_total = int(header_fields[1])

    # Start from an all-zero square matrix, one independent row per atom.
    bond_orders = [[0] * atom_total for _ in range(atom_total)]

    bond_start = 4 + atom_total
    for record in rows[bond_start:bond_start + bond_total]:
        fields = record.split()
        # MOL atom numbers are 1-based; matrix indices are 0-based.
        src = int(fields[0]) - 1
        dst = int(fields[1]) - 1
        order = int(fields[2])
        bond_orders[src][dst] = order
        bond_orders[dst][src] = order

    return bond_orders

# Penicillin A: only the shape of the matrix is printed (the call is wrapped
# in .shape). Caffeine: the full 0/1 matrix is printed, as the question asks.
print("Implicit adjacency matrix for Penicillin A:\n", np.array(implicit_adjacency_matrix(penicillinA_mol_file)).shape)
#print("Implicit adjacency matrix for Penicillin G:\n", np.array(implicit_adjacency_matrix(penicillinG_mol_file)))
print("Implicit adjacency matrix for Caffeine:\n", np.array(implicit_adjacency_matrix(caffeine_mol_file)))


Implicit adjacency matrix for Penicillin A:
 (24, 24)
Implicit adjacency matrix for Caffeine:
 [[0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 1 0 0 1 0 0 0 0 0 0 0 0]
 [0 1 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 1 0 0 0 0 1 0 0 0]
 [0 1 0 0 1 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 1 0 0 0 0 0 1]
 [0 0 0 0 0 0 1 0 1 1 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 1 0 1 0]
 [0 0 0 0 1 0 0 0 0 1 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0]]


### 2. Make all implicit hydrogens explicit. You may assume nitrogen and sulfur always have valences 3 and 2 respectively. Give your hydrogens indices according to the indices of the heavy atom they’re connected to. For example, in ethane the hydrogens connected to carbon 1 will get indices 3, 4, and 5. Similarly carbon 2’s hydrogens will get indices 6, 7, and 8. If any hydrogens are already explicit in the MOL file you do not need to reassign their indices. Show the new adjacency matrix for Caffeine.

In [None]:
def explicit_adjacency_matrix(mol_type, mol_file):
    """Return the 0/1 adjacency matrix with implicit hydrogens added as atoms.

    Atoms listed in the MOL file keep their positions (row ``i`` is atom
    ``i + 1`` of the file). One new row/column is appended for every missing
    hydrogen, grouped by the atom it is attached to, in file order.

    The number of hydrogens added to an atom is its assumed valence
    (C 4, N 3, O 2, S 2, H 1) minus the sum of its bond orders, never less
    than zero.

    Args:
        mol_type: When ``True``, atoms labelled ``"Na"`` and the atom at
            zero-based position 3 receive no hydrogens. In this notebook
            ``True`` is only passed for ``penicillinG.mol``.
            NOTE(review): position 3 is presumably the charged carboxylate
            oxygen of that file -- confirm before reusing with other input.
        mol_file: Text of a V2000 MOL file.

    Returns:
        A square list of lists of 0/1 ints whose size is the number of atoms
        in the file plus the number of hydrogens added.

    Raises:
        KeyError: If a non-skipped atom's element is not C, N, O, S or H.
    """
    lines = mol_file.splitlines()
    num_atoms = int(lines[3].split()[0])
    atom_lines = lines[4:4 + num_atoms]

    valence = {'N': 3, 'S': 2, 'C': 4, 'O': 2, 'H': 1}
    bond_orders = implicit_matrix(mol_file)
    implicit_adj_mat = implicit_adjacency_matrix(mol_file)

    # (zero-based atom index, hydrogens to attach), in file order.
    hydrogens_needed = []
    for i, line in enumerate(atom_lines):
        atom_type = line.split()[3]
        if mol_type == True and (atom_type == "Na" or i == 3):
            continue
        missing = valence[atom_type] - sum(bond_orders[i])
        # An atom whose bond orders already exceed the assumed valence needs
        # no hydrogens. Without the clamp a negative count shrinks the matrix
        # below the size required by the writes further down (IndexError).
        hydrogens_needed.append((i, max(0, missing)))

    size = num_atoms + sum(count for _, count in hydrogens_needed)
    adjacency_matrix_explicit = [[0] * size for _ in range(size)]

    # Top-left block: the bonds between the atoms listed in the file.
    for i in range(num_atoms):
        for j in range(num_atoms):
            adjacency_matrix_explicit[i][j] = implicit_adj_mat[i][j]

    # Added hydrogens occupy the indices after the listed atoms.
    hydrogen_index = num_atoms
    for atom_index, count in hydrogens_needed:
        for _ in range(count):
            adjacency_matrix_explicit[atom_index][hydrogen_index] = 1
            adjacency_matrix_explicit[hydrogen_index][atom_index] = 1
            hydrogen_index += 1

    return adjacency_matrix_explicit

# Show the hydrogen-explicit adjacency matrix for Caffeine (question 1.2.2).
# The label previously read "Implicit", which contradicted the matrix being
# printed; re-run the cell to refresh the saved output.
# print("Explicit adjacency matrix for Penicillin A:\n", np.array(explicit_adjacency_matrix(False, penicillinA_mol_file)))
# print("Explicit adjacency matrix for Penicillin G:\n", np.array(explicit_adjacency_matrix(True, penicillinG_mol_file)))
print("Explicit adjacency matrix for Caffeine:\n", np.array(explicit_adjacency_matrix(False, caffeine_mol_file)))


Implicit adjacency matrix for Caffeine:
 [[0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0]
 [1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0

### 3.  Using your new adjacency matrix compute the following mappings discussed in lecture for Penicillin A, Penicillin G, and Caffeine

* Molecular formula (consider atoms C, O, N, H and S)

In [None]:
def calc_similarity_molecular_formula_mapping(mol, mol_type):
    """Return the atom counts ``[C, O, N, H, S]`` for a molecule.

    Atoms listed in the MOL text are tallied by element symbol. The hydrogen
    count additionally includes every row that ``explicit_adjacency_matrix``
    appended beyond the listed atoms.

    Args:
        mol: Text of a V2000 MOL file.
        mol_type: Forwarded to ``explicit_adjacency_matrix``; when ``True``,
            atoms labelled ``"Na"`` are left out of the tally.

    Raises:
        KeyError: If a counted atom's element is not C, O, N, H or S.
    """
    explicit_matrix = explicit_adjacency_matrix(mol_type, mol)
    records = mol.splitlines()
    listed_atoms = int(records[3].split()[0])
    atom_records = records[4:4 + listed_atoms]

    tally = dict.fromkeys(("C", "O", "N", "H", "S"), 0)
    for record in atom_records:
        symbol = str(record.split()[3])
        if mol_type == True and symbol == "Na":
            continue
        tally[symbol] += 1

    # Rows beyond the listed atoms are the hydrogens that were made explicit.
    tally["H"] += len(explicit_matrix) - len(atom_records)

    return list(tally.values())

# Molecular-formula vectors, each ordered [C, O, N, H, S]. Penicillin G is the
# only molecule processed with mol_type=True. These three variables are
# reused by the kernel cells in section 4.
pennA_mol_formula = calc_similarity_molecular_formula_mapping(penicillinA_mol_file,  False)
print("Penniclin_A", pennA_mol_formula)
caffeine_mol_formula = calc_similarity_molecular_formula_mapping(caffeine_mol_file,  False)
print("Caffeine", caffeine_mol_formula)
pennG_mol_formula = calc_similarity_molecular_formula_mapping(penicillinG_mol_file,  True)
print("Penniclin_G", pennG_mol_formula)

    

Penniclin_A [16, 4, 2, 26, 1]
Caffeine [8, 2, 4, 10, 0]
Penniclin_G [16, 4, 2, 17, 1]


* Label paired with length = 3

In [None]:
def calc_labeled_pair_mapping(mol, mol_type):
    """Count length-3 walks between every pair of atom labels.

    A 5 x n 0/1 label matrix ``L`` (rows ordered C, O, N, H, S; one column
    per row of the hydrogen-explicit adjacency matrix ``A``) is built, and
    ``L @ A^3 @ L.T`` is returned. Entry ``[a][b]`` is therefore the number
    of walks of exactly three bonds that start on an atom labelled ``a`` and
    end on an atom labelled ``b``.

    Args:
        mol: Text of a V2000 MOL file.
        mol_type: Forwarded to ``explicit_adjacency_matrix``.

    Returns:
        A 5 x 5 numpy integer array. Atoms with any other element (for
        example ``"Na"``) carry no label and do not contribute as endpoints.
    """
    label_row = {"C": 0, "O": 1, "N": 2, "H": 3, "S": 4}

    mol_lines = mol.splitlines()
    mol_num_atoms = int(mol_lines[3].split()[0])
    mol_atom_lines = mol_lines[4:4 + mol_num_atoms]
    adjacency = np.array(explicit_adjacency_matrix(mol_type, mol))

    L_mol = np.zeros((5, len(adjacency)), dtype=int)
    # Label every atom listed in the file at its own index. Hydrogens written
    # explicitly in the file are labelled here too, so they are correct
    # wherever they appear in the atom block (the earlier count-based slice
    # was only right when all of them came after the heavy atoms).
    for i, line in enumerate(mol_atom_lines):
        row = label_row.get(str(line.split()[3]))
        if row is not None:
            L_mol[row, i] = 1
    # Every row appended after the listed atoms is an added hydrogen.
    L_mol[label_row["H"], mol_num_atoms:] = 1

    # A^3 counts walks of length three between each pair of atoms.
    adjacency_matrix_length3 = adjacency @ adjacency @ adjacency
    return L_mol @ adjacency_matrix_length3 @ L_mol.T

# Length-3 label-pair matrices (rows/columns ordered C, O, N, H, S). The three
# results are reused by the label-pair kernel cells in section 4.
pennG_labeled_pair = calc_labeled_pair_mapping(penicillinG_mol_file, True)
print("Label Paired with length = 3 for Penicillin G:\n", np.array(pennG_labeled_pair))
pennA_labeled_pair = calc_labeled_pair_mapping(penicillinA_mol_file, False)
print("Label Paired with length = 3 for Penicillin A:\n", np.array(pennA_labeled_pair))
caffeine_labeled_pair = calc_labeled_pair_mapping(caffeine_mol_file, False)
print("Label Paired with length = 3 for Caffeine:\n", np.array(caffeine_labeled_pair))


Label Paired with length = 3 for Penicillin G:
 [[218  19  34  99  14]
 [ 19   0   3   6   0]
 [ 34   3   4   8   2]
 [ 99   6   8  12   8]
 [ 14   0   2   8   0]]
Label Paired with length = 3 for Penicillin A:
 [[220  20  34 144  14]
 [ 20   0   3   9   0]
 [ 34   3   4   8   2]
 [144   9   8  56   8]
 [ 14   0   2   8   0]]
Label Paired with length = 3 for Caffeine:
 [[38 13 69 60  0]
 [13  0  1  0  0]
 [69  1  6  0  0]
 [60  0  0  0  0]
 [ 0  0  0  0  0]]


* Depth first search (all cycles, double traverse, no compression, depth = 2). This should be a binary (0/1) map

In [None]:
def DFS(mol_type, mol_file, atoms_order):
    """Return a binary presence map over all label paths of 2 and 3 atoms.

    For every sequence of two or three labels drawn from C, O, N, H, S (as
    produced by ``paths_key``) the map records 1 when the hydrogen-explicit
    graph contains bonded atoms carrying those labels in that order, else 0.
    A walk may return to the atom it started from.

    Args:
        mol_type: Forwarded to ``explicit_adjacency_matrix`` and
            ``get_atoms_order``.
        mol_file: Text of a V2000 MOL file.
        atoms_order: Accepted for compatibility with existing callers but not
            used; the order is recomputed with ``get_atoms_order``.

    Returns:
        A dict whose keys are ``str(path)`` (e.g. ``"['C', 'O']"``) in the
        order produced by ``paths_key`` and whose values are 0 or 1.
    """
    l = ['C', 'O', 'N', 'H', 'S']
    possible_paths = paths_key(l)
    adj_mat = explicit_adjacency_matrix(mol_type, mol_file)
    order = get_atoms_order(mol_type, mol_file)

    # Row indices of the atoms carrying each label. This does not depend on
    # the path being tested, so it is built once. "Na" has no label.
    atom_indices = {label: [] for label in l}
    for index, atom_type in enumerate(order):
        if atom_type == "Na":
            continue
        atom_indices[atom_type].append(index)

    paths = {}
    for p in possible_paths:
        # `reachable` holds the atoms at which a walk matching the labels
        # consumed so far can end. Advancing label by label gives the same
        # yes/no answer as testing every index combination, without building
        # the full cartesian product first.
        reachable = atom_indices[p[0]]
        for label in p[1:]:
            reachable = [
                nxt for nxt in atom_indices[label]
                if any(adj_mat[cur][nxt] != 0 for cur in reachable)
            ]
            if not reachable:
                break
        paths[str(p)] = 1 if reachable else 0
    return paths


def get_atoms_order(mol_type, mol_file):
    """Return the element symbol for each row of the hydrogen-explicit matrix.

    The list starts with the symbols of the atoms in the MOL file, in file
    order (``"Na"`` included), followed by one ``'H'`` per row that
    ``explicit_adjacency_matrix`` appended.

    The hydrogen count is taken from the size of that matrix rather than
    recomputed from valences here, so the labels cannot drift out of
    alignment with the matrix rows they describe.

    Args:
        mol_type: Forwarded to ``explicit_adjacency_matrix``.
        mol_file: Text of a V2000 MOL file.
    """
    lines = mol_file.splitlines()
    num_atoms = int(lines[3].split()[0])
    atoms_order = [line.split()[3] for line in lines[4:4 + num_atoms]]

    num_H = len(explicit_adjacency_matrix(mol_type, mol_file)) - num_atoms
    return atoms_order + ['H'] * num_H

def paths_key(l):
    """Enumerate every ordered label sequence of length 2 and then length 3.

    Labels may repeat within a sequence. All pairs come first, followed by
    all triples; within each group the last position varies fastest.

    Returns:
        A list of lists, e.g. ``['C', 'N']`` or ``['C', 'N', 'N']``.
    """
    pairs = [[first, second] for first in l for second in l]
    triples = [
        [first, second, third]
        for first in l
        for second in l
        for third in l
    ]
    return pairs + triples


def cartesian_product(path, indices):
    """Return every combination that picks one index per step of ``path``.

    Args:
        path: Sequence of labels; only its length is used.
        indices: One list of candidate indices per step. Entries beyond
            ``len(path)`` are ignored.

    Returns:
        A list of lists, each of length ``len(path)``, with the last position
        varying fastest. Any path length is supported (previously only
        lengths 2 and 3 were, and anything else raised UnboundLocalError).
        An empty path yields one empty combination.

    Raises:
        IndexError: If ``indices`` has fewer entries than ``path``.
    """
    # Imported here so the function does not depend on another cell.
    from itertools import product

    depth = len(path)
    if len(indices) < depth:
        raise IndexError("indices must supply one candidate list per path step")
    return [list(combo) for combo in product(*indices[:depth])]

# Binary path fingerprints for the three molecules. Each printed list follows
# the key order of the dict returned by DFS. The *_DFS dicts are reused by
# the kernel cells in section 4.
pennA_atoms_order = get_atoms_order(False, penicillinA_mol_file)
pennA_DFS = DFS(False, penicillinA_mol_file, pennA_atoms_order)
print("Depth First Search for Penicillin A\n:", list(pennA_DFS.values()))

# Penicillin G is the only molecule processed with mol_type=True.
pennG_atoms_order = get_atoms_order(True, penicillinG_mol_file)
pennG_DFS = DFS(True, penicillinG_mol_file, pennG_atoms_order)
print("Depth First Search for Penicillin G\n:", list(pennG_DFS.values()))

caffeine_atoms_order = get_atoms_order(False, caffeine_mol_file)
caffeine_DFS = DFS(False, caffeine_mol_file, caffeine_atoms_order)
print("Depth First Search for Caffeine \n:", list(caffeine_DFS.values()))

    

Depth First Search for Penicillin A
: [1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Depth First Search for Penicillin G
: [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Depth First Search for

### 4. Using your mappings compute the following kernels k(G1,G2) for all $\binom{3}{2} = 3$ pairs of Penicillin A, Penicillin G, and Caffeine

* Molecular formula

In [None]:
def molecular_formula_mapping(mol1_mol_formula, mol2_mol_formula):
    """Return the dot product of two molecular-formula count vectors."""
    first_counts = np.array(mol1_mol_formula)
    second_counts = np.array(mol2_mol_formula)
    return first_counts.dot(second_counts)

# Dot-product kernel on the molecular-formula vectors, one line per pair.
print("k(G1,G2) using the molecular formula mapping for Penicillin A and Caffeine is ", molecular_formula_mapping(pennA_mol_formula, caffeine_mol_formula))
print("k(G1,G2) using the molecular formula mapping for Penicillin G and Caffeine is ", molecular_formula_mapping(pennG_mol_formula, caffeine_mol_formula))
print("k(G1,G2) using the molecular formula mapping for Penicillin A and Penicillin G is ", molecular_formula_mapping(pennA_mol_formula, pennG_mol_formula))

k(G1,G2) using the molecular formula mapping for Penicillin A and Caffeine is  404
k(G1,G2) using the molecular formula mapping for Penicillin G and Caffeine is  314
k(G1,G2) using the molecular formula mapping for Penicillin A and Penicillin G is  719


* Molecular formula + MinMax

In [None]:
def mol_formula_min_max_mapping(mol1_mol_formula, mol2_mol_formula):
    """Return the MinMax similarity of two count vectors.

    The result is the sum of element-wise minima divided by the sum of
    element-wise maxima. Pairing stops at the shorter input.
    """
    smaller_total = sum(min(a, b) for a, b in zip(mol1_mol_formula, mol2_mol_formula))
    larger_total = sum(max(a, b) for a, b in zip(mol1_mol_formula, mol2_mol_formula))
    return smaller_total / larger_total
# MinMax kernel on the molecular-formula vectors, one line per pair.
print("Molecular Formula + Min Max k(G1,G2) for Pennicilin A and Caffeine is ", mol_formula_min_max_mapping(pennA_mol_formula, caffeine_mol_formula))
print("Molecular Formula + Min Max k(G1,G2) for Pennicilin G and Caffeine is ", mol_formula_min_max_mapping(pennG_mol_formula, caffeine_mol_formula))
print("Molecular Formula + Min Max k(G1,G2) for Pennicilin A and Pennicilin G is ", mol_formula_min_max_mapping(pennA_mol_formula, pennG_mol_formula))


Molecular Formula + Min Max k(G1,G2) for Pennicilin A and Caffeine is  0.43137254901960786
Molecular Formula + Min Max k(G1,G2) for Pennicilin G and Caffeine is  0.5238095238095238
Molecular Formula + Min Max k(G1,G2) for Pennicilin A and Pennicilin G is  0.8163265306122449


* Label paired

In [None]:
def labeled_pair_mapping(mol1_labeled_pair, mol2_labeled_pair):
    """Return the dot product of two label-pair matrices after flattening.

    Each input is converted to a numpy array and flattened to one dimension
    before the product is taken.
    """
    flattened = [
        np.array(table).flatten()
        for table in (mol1_labeled_pair, mol2_labeled_pair)
    ]
    return np.dot(flattened[0], flattened[1])

# Dot-product kernel on the flattened label-pair matrices, one line per pair.
print("The Label paired k(G1,G2) for Penicillin A and Caffeine is "  ,labeled_pair_mapping(pennA_labeled_pair,  caffeine_labeled_pair))
print("The Label paired k(G1,G2) for Penicillin G and Caffeine is "  ,labeled_pair_mapping(pennG_labeled_pair,  caffeine_labeled_pair))
print("The Label paired k(G1,G2) for Penicillin A and Penicillin G is "  ,labeled_pair_mapping(pennA_labeled_pair,  pennG_labeled_pair))

The Label paired k(G1,G2) for Penicillin A and Caffeine is  30882
The Label paired k(G1,G2) for Penicillin G and Caffeine is  25380
The Label paired k(G1,G2) for Penicillin A and Penicillin G is  81014


* Label paired + MinMax

In [None]:
def label_pair_min_max(mol1_labeled_pair, mol2_labeled_pair):
    """Return the MinMax similarity of two label-pair matrices.

    Both inputs are converted to numpy arrays and flattened; the result is
    the sum of element-wise minima divided by the sum of element-wise maxima.
    """
    first_flat = np.array(mol1_labeled_pair).flatten()
    second_flat = np.array(mol2_labeled_pair).flatten()

    smaller_total = 0
    larger_total = 0
    for left, right in zip(first_flat, second_flat):
        smaller_total += min(left, right)
        larger_total += max(left, right)

    return smaller_total / larger_total

# MinMax kernel on the flattened label-pair matrices, one line per pair.
print("Label Pair + Min Max k(G1,G2) for Pennicilin A and Caffeine is " ,label_pair_min_max(pennA_labeled_pair, caffeine_labeled_pair))
print("Label Pair + Min Max k(G1,G2) for Pennicilin G and Caffeine is " ,label_pair_min_max(pennG_labeled_pair, caffeine_labeled_pair))
print("Label Pair + Min Max k(G1,G2) for Pennicilin A and Pennicilin G is " ,label_pair_min_max(pennA_labeled_pair, pennG_labeled_pair))

Label Pair + Min Max k(G1,G2) for Pennicilin A and Caffeine is  0.30861244019138756
Label Pair + Min Max k(G1,G2) for Pennicilin G and Caffeine is  0.37283236994219654
Label Pair + Min Max k(G1,G2) for Pennicilin A and Pennicilin G is  0.8115183246073299


* Depth first search

In [None]:
def DFS_mapping(mol1_bit_vector, mol2_bit_vector):
    """Return the dot product of two fingerprint dicts' values.

    The values of each dict are taken in the dict's own iteration order, so
    both dicts are expected to list their paths in the same order.
    """
    first_bits, second_bits = (
        list(fingerprint.values())
        for fingerprint in (mol1_bit_vector, mol2_bit_vector)
    )
    return np.dot(first_bits, second_bits)

# Dot-product kernel on the DFS fingerprints, one line per pair.
print("Depth First Search k(G1,G2) for Pennicilin A and Caffeine is " , DFS_mapping(pennA_DFS, caffeine_DFS))
print("Depth First Search k(G1,G2) for Pennicilin G and Caffeine is " , DFS_mapping(pennG_DFS, caffeine_DFS))
print("Depth First Search k(G1,G2) for Pennicilin A and Pennicilin G is " , DFS_mapping(pennA_DFS, pennG_DFS))

Depth First Search k(G1,G2) for Pennicilin A and Caffeine is  22
Depth First Search k(G1,G2) for Pennicilin G and Caffeine is  22
Depth First Search k(G1,G2) for Pennicilin A and Pennicilin G is  40


* Depth first search + Tanimoto

In [None]:
def DFS_tanimoto(mol1_bit_vector, mol2_bit_vector):
    """Return the Tanimoto similarity of two fingerprint dicts.

    With ``a`` and ``b`` the sums of each dict's values and ``c`` the number
    of positions where both values equal 1, the result is
    ``c / (a + b - c)``. Values are paired by position in each dict's
    iteration order.
    """
    first_bits = list(mol1_bit_vector.values())
    second_bits = list(mol2_bit_vector.values())
    first_total = sum(first_bits)
    second_total = sum(second_bits)

    shared = 0
    for position in range(len(first_bits)):
        if first_bits[position] == 1 and second_bits[position] == 1:
            shared += 1

    return shared / (first_total + second_total - shared)
# Tanimoto kernel on the DFS fingerprints, one line per pair.
print("DFS + Tanimoto Similarity k(G1,G2) for Pennicilin A and Caffeine is ",DFS_tanimoto(pennA_DFS, caffeine_DFS))
print("DFS + Tanimoto Similarity k(G1,G2) for Pennicilin G and Caffeine is ",DFS_tanimoto(pennG_DFS, caffeine_DFS))
print("DFS + Tanimoto Similarity k(G1,G2) for Pennicilin A and Pennicilin G is ",DFS_tanimoto(pennA_DFS, pennG_DFS))

DFS + Tanimoto Similarity k(G1,G2) for Pennicilin A and Caffeine is  0.4782608695652174
DFS + Tanimoto Similarity k(G1,G2) for Pennicilin G and Caffeine is  0.55
DFS + Tanimoto Similarity k(G1,G2) for Pennicilin A and Pennicilin G is  0.8695652173913043
