## Summary
This notebook reworks code Shanna wrote to process the screening data. It does the following:

1. Load data from DrugBank, ReFRAME, and Broad
2. Create RDKit molecules from the SMILES, then standardize the molecules and sanitize them by removing salts.
3. Calculate Morgan Fingerprints.
4. Save fingerprints, molecule names, and source dataset.

In [None]:
import numpy as np
import pandas as pd
import rdkit

from molvs import Standardizer
from rdkit.Chem import PandasTools, SaltRemover, rdMolDescriptors, MolFromSmiles
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')  # silence every RDKit log channel matching 'rdApp.*' (errors and warnings), e.g. per-molecule parse messages

from tqdm import tqdm
tqdm.pandas()  # registers `progress_apply` on pandas objects; the processing cells in this notebook rely on it

### 1. Load Data

In [None]:
# Read the three screening libraries from disk.
# DrugBank is an SDF file; per the original author's note, LoadSDF sanitizes
# molecules automatically, so no separate sanitization pass is run on it.
drugbank = PandasTools.LoadSDF('../data/screening_data/drugbank.sdf')
# ReFRAME is a CSV that is decoded as latin-1.
reframe = pd.read_csv(
    '../data/screening_data/reframe.csv',
    encoding='latin1',
)
# The Broad file carries a .csv extension but is parsed as tab-separated.
broad = pd.read_csv(
    '../data/screening_data/broad.csv',
    sep="\t",
)
# Report how many rows each source contributed.
print('len(drugbank) =', drugbank.shape[0])
print('len(reframe) =', reframe.shape[0])
print('len(broad) =', broad.shape[0])

In [None]:
# Combine the three libraries into one long table with a shared schema:
#   source - library the row came from ('drugbank', 'reframe' or 'broad')
#   name   - compound name column of that library
#   smiles - SMILES column of that library
# The frame is built in a single constructor call instead of assigning onto an
# empty frame through attribute access (`.source`, `.name`, `.smiles`), which
# only works because those names already exist as columns and is easy to
# confuse with pandas' own attributes.
screening_data = pd.DataFrame({
    'source': ['drugbank']*len(drugbank) + ['reframe']*len(reframe) + ['broad']*len(broad),
    'name': pd.concat([drugbank.GENERIC_NAME, reframe.Name, broad.pert_iname], ignore_index=True),
    'smiles': pd.concat([drugbank.SMILES, reframe.SMILES, broad.smiles], ignore_index=True),
})

# Rows with ANY missing field are removed (as before), so the report must count
# all of them: the previous message counted only missing SMILES and silently
# under-reported rows that were dropped because of a missing name.
missing_smiles = screening_data['smiles'].isna()
missing_any = screening_data.isna().any(axis=1)
print(
    f"Dropping {missing_any.sum()} rows with missing values "
    f"({missing_smiles.sum()} missing SMILES, "
    f"{(missing_any & ~missing_smiles).sum()} missing only a name)"
)
# Original row labels are kept (no index reset). `.copy()` yields an independent
# frame so that adding columns later does not raise SettingWithCopyWarning.
screening_data = screening_data[~missing_any].copy()

### 2. Create, standardize and sanitize molecules

In [None]:
# Parse every SMILES string into an RDKit molecule, with a tqdm progress bar.
screening_data['rdkit_mol'] = (
    screening_data['smiles']
    .progress_apply(MolFromSmiles)
)
# Entries that are missing after parsing are reported as failures and removed.
print(f"Dropping {screening_data['rdkit_mol'].isna().sum()} rows which failed molecule creation")
screening_data.dropna(axis=0, how='any', inplace=True)

In [None]:
# Standardize every parsed molecule with MolVS. One Standardizer instance is
# created here and its bound `standardize` method is applied row by row.
screening_data['rdkit_mol'] = (
    screening_data['rdkit_mol']
    .progress_apply(Standardizer().standardize)
)

In [None]:
# Strip salt fragments from every molecule using RDKit's SaltRemover,
# constructed with no arguments (i.e. its built-in salt definitions).
# NOTE(review): StripMol is called with default arguments; presumably a molecule
# consisting solely of salt fragments comes back empty and would then receive an
# all-zero fingerprint - confirm, and consider `dontRemoveEverything=True` or
# dropping empty molecules.
screening_data['rdkit_mol'] = (
    screening_data['rdkit_mol']
    .progress_apply(SaltRemover.SaltRemover().StripMol)
)

### 3. Calculate Morgan Fingerprints

In [None]:
def calculate_morgan_fingerprint(mol, radius=2, n_bits=2048, use_chirality=True):
    """Compute a Morgan fingerprint for one molecule as a dense 0/1 array.

    Args:
        mol: RDKit molecule to fingerprint.
        radius: Morgan radius forwarded to RDKit. Defaults to 2, the value
            that was previously hard-coded.
        n_bits: Length of the bit vector, forwarded as `nBits`. Defaults to
            2048, RDKit's own default when `nBits` is omitted, so calls
            without this argument behave as before.
        use_chirality: Forwarded as `useChirality`. Defaults to True, the
            value that was previously hard-coded.

    Returns:
        1-D `np.ndarray` of dtype `uint8` with one entry per bit (0 or 1).
    """
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        mol, radius=radius, nBits=n_bits, useChirality=use_chirality
    )
    # ToBitString() yields a string of '0'/'1' characters. Viewing its ASCII
    # bytes as uint8 and subtracting the code of '0' converts the whole vector
    # in one vectorised step instead of a Python-level loop over every bit.
    # The subtraction also produces a fresh, writable array.
    ascii_bits = np.frombuffer(fp.ToBitString().encode('ascii'), dtype=np.uint8)
    return ascii_bits - np.uint8(ord('0'))
# Fingerprint every molecule in the table; each cell of the new column holds
# the array returned by calculate_morgan_fingerprint for that row.
screening_data['morgan_fingerprint'] = (
    screening_data['rdkit_mol']
    .progress_apply(calculate_morgan_fingerprint)
)

### 4. Save Results

In [None]:
# Final sanity check: no missing value may remain in any column.
assert not screening_data.isna().to_numpy().any()
# Drop the raw SMILES strings and the RDKit molecule objects, then pickle the
# remaining columns.
(
    screening_data
    .drop(['smiles', 'rdkit_mol'], axis=1)
    .to_pickle('../processed_data/screening_data_processed.pkl')
)