# This notebook demonstrates a Python function that adds a column of Morgan fingerprints to a dataframe containing a SMILES column.

### Load libraries

In [33]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

### Load data and create dataframe

In [34]:
# Read the tab-separated SMILES file directly from GitHub into a dataframe.
# The column names are supplied explicitly, so the first line of the file is
# treated as data rather than as a header.
data = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/cptlab/ASCEPT_2023_comptox_workshop/main/W109_Machine_learning_QSAR_for_toxicity_prediction/data/smiles_cas_N6512_corrected.smi',
    sep='\t',
    names=['SMILES', 'CASRN', 'Ames'],
)

### Create function to make fps column from SMILES column

In [35]:
import pandas as pd
import numpy as np

def _smiles_to_morgan_fingerprint(smiles, radius, fp_length):
    """
    Converts a single SMILES value into a Morgan fingerprint bit vector.

    Args:
        smiles: Value taken from the 'SMILES' column. Expected to be a string,
            but may be NaN/None when the cell is missing.
        radius (int): Radius of the fingerprint.
        fp_length (int): Number of bits in the fingerprint.

    Returns:
        The fingerprint returned by RDKit, or np.nan when the value is not a
        string, cannot be parsed, or raises ValueError during processing.
    """
    # Missing cells come out of pandas as float NaN or None. MolFromSmiles only
    # accepts strings, so treat anything else as invalid up front instead of
    # letting one empty cell abort the whole run.
    if not isinstance(smiles, str):
        return np.nan

    try:
        mol = Chem.MolFromSmiles(smiles)
        # RDKit signals an unparsable SMILES by returning None (it only logs
        # the parse error), so this must be checked explicitly.
        if mol is None:
            return np.nan
        return AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=fp_length)
    except ValueError:
        # Handle invalid SMILES gracefully
        return np.nan


def generate_fingerprint_column_in_df(data, radius=2, fp_length=2048):
    """
    Generates Morgan fingerprints from a pandas dataframe containing SMILES strings.

    The dataframe is modified in place: an "fps" column is added (or
    overwritten) with one entry per row. Rows whose SMILES is missing,
    not a string, or cannot be parsed receive np.nan instead of a fingerprint.

    Args:
        data (DataFrame): DataFrame with 'SMILES' column.
        radius (int, optional): Radius of the fingerprints. Defaults to 2.
        fp_length (int, optional): Length of the fingerprints. Defaults to 2048.

    Returns:
        None. The result is written to data['fps'].

    Raises:
        KeyError: If the dataframe has no 'SMILES' column.
    """
    # Build the full list first and assign once, so the column is positionally
    # aligned with the rows regardless of the dataframe's index labels.
    data['fps'] = [
        _smiles_to_morgan_fingerprint(smiles, radius, fp_length)
        for smiles in data['SMILES'].tolist()
    ]




### Run function on a dataframe

In [36]:
# Add the 'fps' column to `data` in place (the function has no return value),
# then preview the first six rows to confirm the new column is populated.
generate_fingerprint_column_in_df(data)
data.head(6)

Unnamed: 0,SMILES,CASRN,Ames,fps
0,O=C1c2ccccc2C(=O)c3c1ccc4c3[nH]c5c6C(=O)c7cccc...,2475-33-4,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,NNC(=O)CNC(=O)\C=N#N,820-75-7,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,O=C1NC(=O)\C(=N#N)\C=N1,2435-76-9,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,NC(=O)CNC(=O)\C=N#N,817-99-2,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,CCCCN(CC(O)C1=C\C(=N#N)\C(=O)C=C1)N=O,116539-70-9,1,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,NC(COC(=O)\C=N#N)C(=O)O,115-02-6,1,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


### Test error handling

In [37]:
# Generate a separate test df containing an error to test error handling.
# An explicit .copy() is needed here: plain assignment (data2 = data) only
# binds a second name to the same object, so the edit below would also
# overwrite the SMILES in `data`.
data2 = data.copy()
# Replace the SMILES at row label 2 with a string RDKit cannot parse
# (the ring-bond digit 9 is opened but never closed).
data2.at[2, 'SMILES'] = "CC9"
data2.head(3)

Unnamed: 0,SMILES,CASRN,Ames,fps
0,O=C1c2ccccc2C(=O)c3c1ccc4c3[nH]c5c6C(=O)c7cccc...,2475-33-4,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,NNC(=O)CNC(=O)\C=N#N,820-75-7,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,CC9,2435-76-9,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [38]:
# Recompute the fingerprints on the frame holding the malformed SMILES.
# RDKit logs a parse error for 'CC9', but no exception reaches the notebook.
generate_fingerprint_column_in_df(data2)

[11:02:23] SMILES Parse Error: unclosed ring for input: 'CC9'


In [39]:
# Row 2 (the malformed SMILES) should now show a missing value in 'fps',
# while the rows above it keep their fingerprints.
data2.head(3)

Unnamed: 0,SMILES,CASRN,Ames,fps
0,O=C1c2ccccc2C(=O)c3c1ccc4c3[nH]c5c6C(=O)c7cccc...,2475-33-4,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,NNC(=O)CNC(=O)\C=N#N,820-75-7,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,CC9,2435-76-9,1,
