## KRASi data preprocessing

In [None]:
import pandas as pd

In [None]:
# Load the raw compound table; the path is relative to the notebook's working directory.
df = pd.read_csv('all.csv')

In [None]:
# Count rows per ID prefix (the text before the first underscore) to see which sources are present.
df.ID.str.split('_').str[0].value_counts()

US           425
azaq         365
tetra        220
quina        161
646          111
31678         61
paper         34
625           29
BI-2865        1
precursor      1
BI-0474        1
Sotorasib      1
Name: ID, dtype: int64

In [None]:
# Series.replace with a dict matches whole values, so only these four exact IDs change.
# The shared 'nature_' prefix makes them count as a single source when IDs are split on '_'.
df['ID'] = df.ID.replace({'BI-2865':'nature_BI-2865',
                      'precursor_1':'nature_precursor1',
                      'BI-0474':'nature_BI-0474',
                      'Sotorasib':'nature_Sotorasib'})

In [None]:
# Rewrite a leading '31678_' as '678_'; the '^' anchor restricts the change to the start of the ID.
df['ID'] = df.ID.str.replace("^31678_", "678_", regex=True)

In [None]:
# Recount the ID prefixes to confirm the renames took effect.
df.ID.str.split('_').str[0].value_counts()

US        425
azaq      365
tetra     220
quina     161
646       111
678        61
paper      34
625        29
nature      4
Name: ID, dtype: int64

In [None]:
# Store the ID prefix (text before the first underscore) as an explicit 'source' column.
df['source'] = df.ID.str.split('_').str[0]

## Remove similar compounds and average their target values

In [None]:
!pip install rdkit -qq

Collecting rdkit
  Downloading rdkit-2023.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.4


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

In [None]:
# Show the column names available at this point.
df.columns

Index(['ID', 'SMILES', 'Kd', 'IC50', 'erk_IC50', 'source'], dtype='object')

In [None]:
# NOTE: tanimoto() is defined in a later cell, so on a fresh kernel that cell must be run before this one.
# Scores every pair of rows; 'IC50' is carried along so each pair's two target values sit side by side.
out = tanimoto(df,target_col = 'IC50')

In [None]:
# Write the pairs whose similarity score exceeds 0.9 to 'ss.csv' (row index omitted).
out.loc[out.SimilarityScore>0.9].to_csv('ss.csv',index=False)

In [None]:
def tanimoto(df, #pandas DataFrame with SMILES and ID columns.
             smiles_col='SMILES', #name of the column containing the SMILES strings (default: 'SMILES').
             id_col='ID', #name of the column containing the molecule IDs (default: 'ID').
             target_col=None, #name of the column containing the target values (default: None).
             radius=2, #radius of the Morgan fingerprint (default: 2).
            ):
    """
    Calculate the Tanimoto similarity between all pairs of molecules in a DataFrame.

    Each SMILES string is parsed with RDKit and converted to a Morgan bit-vector
    fingerprint of the given radius; every unordered pair of rows is then scored.
    The input frame is not modified.

    Returns:
        A DataFrame with one row per pair, sorted by 'SimilarityScore' in
        descending order with a fresh 0..n-1 index. Columns are 'ID1', 'ID2',
        'SMILES1', 'SMILES2', 'SimilarityScore', plus 'Target1' and 'Target2'
        when `target_col` is given.

    Raises:
        ValueError: if any value in `smiles_col` cannot be parsed into a molecule.
    """
    # Pull the columns out as plain lists. Access is then positional, so the
    # result does not depend on the frame having a default 0..n-1 index, and
    # the pair loop avoids a pandas scalar lookup per element.
    ids = df[id_col].tolist()
    smiles = df[smiles_col].tolist()
    targets = df[target_col].tolist() if target_col is not None else None

    # MolFromSmiles returns None on a parse failure; non-string values
    # (e.g. NaN from a missing cell) are treated as unparsable as well.
    mols = [Chem.MolFromSmiles(s) if isinstance(s, str) else None for s in smiles]
    unparsed = [(mol_id, s) for mol_id, s, mol in zip(ids, smiles, mols) if mol is None]
    if unparsed:
        preview = ', '.join(repr(pair) for pair in unparsed[:5])
        raise ValueError(
            f"Could not parse SMILES for {len(unparsed)} molecule(s); "
            f"first (ID, SMILES) entries: {preview}"
        )

    fingerprints = [AllChem.GetMorganFingerprintAsBitVect(mol, radius) for mol in mols]

    # Score each molecule against all later ones in a single bulk call;
    # pairs are emitted in the same (i, j) order as a nested index loop.
    rows = []
    for i in range(len(fingerprints) - 1):
        scores = DataStructs.BulkTanimotoSimilarity(fingerprints[i], fingerprints[i + 1:])
        for j, score in enumerate(scores, start=i + 1):
            row = (ids[i], ids[j], smiles[i], smiles[j], score)
            if targets is not None:
                row += (targets[i], targets[j])
            rows.append(row)

    columns = ['ID1', 'ID2', 'SMILES1', 'SMILES2', 'SimilarityScore']
    if targets is not None:
        columns += ['Target1', 'Target2']
    result_df = pd.DataFrame(rows, columns=columns)

    # Most similar pairs first.
    return result_df.sort_values('SimilarityScore', ascending=False).reset_index(drop=True)

In [None]:
from rdkit.Chem.Scaffolds import MurckoScaffold

In [None]:
# Generate scaffolds
def generate_scaffold(smiles, include_chirality=False):
    """
    Return the Murcko scaffold of a molecule as a SMILES string.

    Args:
        smiles: SMILES string of the molecule.
        include_chirality: forwarded to RDKit as `includeChirality`.

    Raises:
        ValueError: if `smiles` cannot be parsed into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending input so the bad row is easy to locate.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")
    return MurckoScaffold.MurckoScaffoldSmiles(mol=mol, includeChirality=include_chirality)



In [None]:
# Compute one scaffold SMILES per molecule (include_chirality keeps its default).
df['Scaffold'] = df['SMILES'].apply(generate_scaffold)

In [None]:
# Number of molecules sharing each scaffold, most common first.
df['Scaffold'].value_counts()

c1ccc2c(-c3cc4nc(OCC56CCCN5CCC6)nc(N5CC6CCC(C5)N6)c4cn3)cccc2c1    103
c1ccc(-c2cc3nc(OCC45CCCN4CCC5)nc(N4CC5CCC(C4)N5)c3cn2)cc1           70
c1ccc2c(-c3cc4nc(OCC56CCCN5CCC6)nc(N5CCCCC5)c4cn3)cccc2c1           70
c1ccc2c(-c3cc4nc(OCC5CCCN5)nc(N5CC6CCC(C5)N6)c4cn3)cccc2c1          45
c1ccc2c(N3CCc4c(nc(OCC5CCCN5)nc4N4CC5CCC(C4)N5)C3)cccc2c1           40
                                                                  ... 
c1ccc2c(-c3ccc4c(N5CCNCC5)nc(OCC5CCCN5)nc4c3)cccc2c1                 1
c1ccc2c(-c3cc4nc(OCC56CCCN5CCC6)nc(Nc5ccon5)c4cn3)cccc2c1            1
c1cc(OCC2CCCN2)nc(-c2noc(C3CCCc4sccc43)n2)n1                         1
c1ccc(C2CC2)c(-c2cc3nc(OCC45CCCN4CCC5)nc(NCC4CCC4)c3cn2)c1           1
c1ccc(COc2nc(N3CC4CCC(C3)N4)c3cnc(-c4cccc5ccccc45)cc3n2)cc1          1
Name: Scaffold, Length: 490, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Split the distinct scaffolds rather than individual rows. test_size=0.2 is
# therefore a fraction of scaffolds, not of molecules, so the row-level split can
# differ from 80/20. random_state fixes the shuffle so the split is repeatable.
scaffold_series = df['Scaffold']
scaffolds = scaffold_series.unique()
train_scaffolds, test_scaffolds = train_test_split(scaffolds, test_size=0.2, random_state=42)


In [None]:
def in_scaffold_set(scaffold, scaffold_set):
    """Report whether `scaffold` is contained in `scaffold_set`.

    Membership is decided by the container's own `in` semantics.
    """
    is_member = scaffold in scaffold_set
    return is_member

# Assign molecules to the train or test set based on their scaffold.
# Only the 'Scaffold' column is needed, so map over that Series instead of
# building a row object per molecule with apply(axis=1); the set turns each
# membership test into a hash lookup rather than a scan of the array.
train_scaffold_lookup = set(train_scaffolds)
df['Set'] = df['Scaffold'].map(lambda s: 'Train' if in_scaffold_set(s, train_scaffold_lookup) else 'Test')


In [None]:
# Reuse the pairwise comparison with 'Set' as the carried column, so Target1/Target2
# hold the Train/Test label of each molecule in the pair.
out = tanimoto(df,target_col = 'Set')

In [None]:
# Write the pairs whose similarity score exceeds 0.9 to 'ss2.csv' (row index omitted).
out.loc[out.SimilarityScore>0.9].to_csv('ss2.csv',index=False)