# Molecular Fingerprint Generation

This notebook assumes that the RDKit Python module is already installed.

In [1]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols

You can generate Mol objects from SMILES strings and other molecular representations.  The input structures (e.g., the SMILES strings used below) need to be downloaded from PubChem beforehand.

In [2]:
# Example molecules, each parsed from a SMILES string into an RDKit Mol object.
ms = [
    Chem.MolFromSmiles(smiles)
    for smiles in (
        'C1=CC=CC=C1',       # benzene, written in Kekule form
        'c1ccccc1',          # benzene, written in aromatic form
        'C1CCCCC1',          # cyclohexane
        'C1=CC=C(C=C1)C=O',  # benzaldehyde
        # atorvastatin (two stereocentres specified)
        'CC(C)C1=C(C(=C(N1CC[C@H](C[C@H](CC(=O)O)O)O)C2=CC=C(C=C2)F)C3=CC=CC=C3)C(=O)NC4=CC=CC=C4',
    )
]

# MACCS Keys

See https://www.rdkit.org/docs/GettingStartedInPython.html#maccs-keys

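Ignore the very first bit (bit 0).  RDKit returns 167 bits so that bit *n* corresponds to MACCS key *n* (1–166); bit 0 is a placeholder that is never set.  See https://github.com/rdkit/rdkit/issues/1726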

In [3]:
# One MACCS-key bit vector per molecule.
fps = [MACCSkeys.GenMACCSKeys(mol) for mol in ms]

# Full bit string of the first molecule, and the same string with the
# leading character (bit 0) removed.
fpbitstr167 = fps[0].ToBitString()
fpbitstr166 = fpbitstr167[1:]

print(f"{len(fpbitstr167)} vs. {len(fpbitstr166)}")
print(fpbitstr166)

167 vs. 166
0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000011010


Read about the data type ExplicitBitVect
<br>
https://www.rdkit.org/docs/source/rdkit.DataStructs.cDataStructs.html?highlight=rdkit%20datastructs%20cdatastructs%20explicitbitvect#rdkit.DataStructs.cDataStructs.ExplicitBitVect

In [4]:
# Sanity check of the bit-vector accessors on the fourth molecule.
# Expects `fps` to still hold bit-vector objects (not bit strings).
print(
    fps[3].GetNumBits(),     # total number of bits
    fps[3].GetNumOffBits(),  # bits that are 0
    fps[3].GetNumOnBits(),   # bits that are 1
    fps[3].ToBinary(),       # serialized bytes representation
    fps[3].ToBitString(),    # text of '0'/'1' characters
    sep="\n",
)

167
162
5
b'\xe0\xff\xff\xff\xa7\x00\x00\x00\x05\x00\x00\x00i\x00\x0e\x00\x00\x00\x02'
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000011110


The above code can be reduced to one line.

In [5]:
# Generate the keys, convert to a bit string and drop bit 0 in one expression
# per molecule; `fps` now holds plain strings.
fps = [MACCSkeys.GenMACCSKeys(mol).ToBitString()[1:] for mol in ms]
print(len(fps[0]), fps[0], sep="\n")

166
0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000011010


# Topological Fingerprint (Daylight Analogue)

Read about the Topological Fingerprint. <br>
https://www.rdkit.org/docs/GettingStartedInPython.html#topological-fingerprints
<br>
Do not use FingerprintMol(), which is used in the above document.  Instead, use rdmolops.RDKFingerprint() as recommended there.
<br>
https://www.rdkit.org/docs/source/rdkit.Chem.rdmolops.html#rdkit.Chem.rdmolops.RDKFingerprint

In [6]:
from rdkit.Chem import rdmolops
# Topological (path-based) fingerprints: 2048 bits, paths of length 1 to 7,
# stored as bit strings.
fps = [
    rdmolops.RDKFingerprint(mol, fpSize=2048, minPath=1, maxPath=7).ToBitString()
    for mol in ms
]
print(len(fps[0]), fps[0], sep="\n")

2048
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000100000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000000000000000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000000000000000000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

# Morgan Fingerprint (ECFP)

Read about the circular fingerprint (ECFP/FCFP) 
<br> 
https://www.rdkit.org/docs/GettingStartedInPython.html#morgan-fingerprints-circular-fingerprints
<br>
From the manual, ...
<br>
"When comparing the ECFP/FCFP fingerprints and the Morgan fingerprints generated by the RDKit, remember that the 4 in ECFP4 corresponds to the diameter of the atom environments considered, while the Morgan fingerprints take a radius parameter. So the examples [in the manual], with radius=2, are roughly equivalent to ECFP4 and FCFP4."

In [7]:
from rdkit.Chem import AllChem
# Morgan (circular) fingerprints folded to 1024 bits, stored as bit strings.
# NOTE(review): the second positional argument is the Morgan *radius*. A radius
# of 4 corresponds roughly to ECFP8 (diameter 8); pass 2 instead if ECFP4 is
# what is wanted -- confirm the intended diameter.
fps = [
    AllChem.GetMorganFingerprintAsBitVect(mol, 4, nBits=1024).ToBitString()
    for mol in ms
]
print(len(fps[0]), fps[0], sep="\n")

1024
00000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

# Morgan Fingerprint (FCFP)

In [8]:
from rdkit.Chem import AllChem
# Feature-based Morgan fingerprints (useFeatures=True) folded to 1024 bits,
# stored as bit strings.
# NOTE(review): the second positional argument is the Morgan *radius*. A radius
# of 4 corresponds roughly to FCFP8 (diameter 8); pass 2 instead if FCFP4 is
# what is wanted -- confirm the intended diameter.
fps = [
    AllChem.GetMorganFingerprintAsBitVect(
        mol, 4, nBits=1024, useFeatures=True
    ).ToBitString()
    for mol in ms
]
print(len(fps[0]), fps[0], sep="\n")

1024
00001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010

# PubChem FP Decoding

Read the last page of the PubChem Fingerprint specification
<br>
ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.pdf
<br>
"Decoding PubChem Fingerprints
PubChem fingerprints are currently 881 bits in length. Binary data is stored in
one byte increments. The fingerprint is, therefore, 111 bytes in length (888
bits), which includes padding of seven bits at the end to complete the last byte.
A four-byte prefix, containing the bit length of the fingerprint (881 bits),
increases the stored PubChem fingerprint size to 115 bytes (920 bits).
When PubChem fingerprints are encoded in base64 format, the base64-encoded
fingerprints are 156 bytes in length. The last two bytes are padding so that the
base64 length is divisible by four (156 bytes – 2 bytes = 154 bytes). Each base64
byte encodes six binary bits (154 bytes * 6 bits/byte = 924 bits). The last four
bits are padding to complete the last base64 byte (924 bits – 4 bits = 920 bits).
The resulting 920 binary bits (115 bytes) are described in the previous paragraph."


In [3]:
from base64 import b64decode

def PCFP_BitString(pcfp_base64):
    """Decode a base64-encoded PubChem fingerprint into a '0'/'1' string.

    The decoded bytes are written out as 8 binary digits each (most
    significant bit first). The first 32 characters (the four-byte length
    prefix) are skipped and everything from index 913 onwards (trailing
    padding) is dropped, so a complete PubChem fingerprint yields 881
    characters. Shorter inputs simply yield a shorter (possibly empty) string.

    Parameters
    ----------
    pcfp_base64 : str or bytes
        Base64 text of the fingerprint, as accepted by ``base64.b64decode``.

    Returns
    -------
    str
        The fingerprint bits as a string of '0' and '1' characters.
    """
    raw_bytes = b64decode(pcfp_base64)
    all_bits = "".join(format(byte, "08b") for byte in raw_bytes)
    # Skip the 32-bit length prefix; keep at most the next 881 bits.
    return all_bits[32:913]
    

In [4]:
# Two example PubChem fingerprints in their base64-encoded form.
pcfps = [
    'AAADcYBgAAAAAAAAAAAAAAAAAAAAAAAAAAAwAAAAAAAAAAABAAAAGAAAAAAACACAEAAwAIAAAACAACBCAAACAAAgAAAIiAAAAIgIICKAERCAIAAggAAIiAcAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA==',
    'AAADcfB/OQAAAAAAAAAAAAAAAAAAAWAAAAAwYMAAAAAAAAAB1AAAHwAQCAAADRzhng4yyJPMEgCoAyXyXACCgCAlAiAI2CE4ZNgIMP7A1ZGEYYhntADIyUec/ObOgAAAAAAAAAAAAAAAAAAAAAAAAAAAAA==',
]

In [5]:
# Decode the first example fingerprint once and reuse the result for both
# the length check and the display (previously it was decoded twice).
pcfp_bits = PCFP_BitString(pcfps[0])
print(len(pcfp_bits))
print(pcfp_bits)

881
10000000011000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000010000000000000000000000000001100000000000000000000000000000000000000000000000100000000000100000000001000000000000001100000000000010000000000000000000000000000000100000000000000000100000010000100000000000000000000000100000000000000000001000000000000000000000000010001000100000000000000000000000000010001000000010000010000000100010100000000001000100010000100000000010000000000000001000001000000000000000000010001000100000000111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000


# Atom Pairs

Converting this fingerprint into a bit string results in a very long string (the sparse vector below has 8,388,608 bits), which makes the process very inefficient.  Do not try this.

In [12]:
from rdkit.Chem.AtomPairs import Pairs
# One atom-pair fingerprint (bit-vector form) per molecule.
fps = [
    Pairs.GetAtomPairFingerprintAsBitVect(mol)
    for mol in ms
]

In [13]:
# Inspect the first atom-pair fingerprint: the object itself, its bit counts
# and its serialized bytes representation.
print(
    fps[0],
    fps[0].GetNumBits(),
    fps[0].GetNumOffBits(),
    fps[0].GetNumOnBits(),
    fps[0].ToBinary(),
    sep="\n",
)

<rdkit.DataStructs.cDataStructs.SparseBitVect object at 0x0000016CF5DA8BC8>
8388608
8388605
3
b'\xe0\xff\xff\xff\x00\x00\x80\x00\x03\x00\x00\x00\x0b&R\x00\x00\xe7\xd1\xa9\x02'


In [14]:
# Demonstration of why the full bit string should not be printed: the output
# is large enough to exceed the notebook server's IOPub data rate limit.
# To allow it anyway, raise the limit when starting the server, e.g.
#   --NotebookApp.iopub_data_rate_limit=10000000.0
print(fps[0].ToBitString())

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

