In [3]:
import pandas as pd
import numpy as np
from rdkit import Chem

# Load the raw BitterDB compound-property table.
# Forward slashes are used on purpose: "\B" inside a regular (non-raw) string
# literal is an invalid escape sequence (SyntaxWarning on Python 3.12+), and a
# backslash-separated path only resolves on Windows, while "/" works everywhere.
bitter_props = pd.read_csv("Data/BitterDB/BitterCompoundsPropA_2024.csv")

# Drop unnecessary columns (errors='ignore' skips any name missing from the file)
columns_to_drop = ['comment', 'related_pubChemID', 'more_identifiers', 'related_bitterCompounds', 'IUPharLink']
bitter_props = bitter_props.drop(columns=columns_to_drop, errors='ignore')

# Convert Num_Rings to numeric; values that cannot be parsed become NaN and are
# filled by the median imputation below
bitter_props['Num_Rings'] = pd.to_numeric(bitter_props['Num_Rings'], errors='coerce')

# Impute missing values for numeric columns with each column's median
# NOTE(review): 'SensoryThr', 'Human', 'Chicken', 'Mouse' and 'Cat' appear as
# 0/1 values in the previewed rows; median-filling flag-like columns invents
# labels for rows that had none -- confirm this is intended.
numeric_columns = ['MW', 'AlogP', 'Num_AromaticRings', 'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'SensoryThr', 'Human', 'Chicken', 'Mouse', 'Cat', 'Num_Rings']
for col in numeric_columns:
    bitter_props[col] = bitter_props[col].fillna(bitter_props[col].median())

# Drop rows with missing values in crucial columns
crucial_columns = ['canonical_smiles', 'Isomeric_smiles', 'cFormula', 'InChiKey']
bitter_props = bitter_props.dropna(subset=crucial_columns)

# Validate SMILES
def validate_smiles(smiles) -> bool:
    """Return True when ``smiles`` is a non-blank string that RDKit can parse.

    Parameters
    ----------
    smiles : object
        Candidate SMILES value, typically one cell of a DataFrame column.

    Returns
    -------
    bool
        False for non-string values (e.g. None or NaN) and for empty or
        whitespace-only strings; otherwise whether ``Chem.MolFromSmiles``
        returned a molecule rather than None.
    """
    # Guard first: a non-string would make the RDKit call raise instead of
    # returning None, and a blank string carries no structure to validate.
    if not isinstance(smiles, str) or not smiles.strip():
        return False
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None

# Flag each row by whether its canonical SMILES parses, keep only the flagged
# rows, then discard the helper 'valid_smiles' column again.
bitter_props = (
    bitter_props
    .assign(valid_smiles=lambda frame: frame['canonical_smiles'].apply(validate_smiles))
    [lambda frame: frame['valid_smiles']]
    .drop(columns=['valid_smiles'])
)

# # Save the cleaned dataset
# bitter_props.to_csv("BitterCompoundsPropA_2024_Cleaned.csv", index=False)
# print(f"Cleaned dataset saved with {len(bitter_props)} compounds.")

# # Print summary of remaining missing values
# print(bitter_props.isnull().sum())




In [4]:
# Count the missing entries in every column of the current frame
bitter_props.isnull().sum()

cid                      0
cStruc                   0
canonical_smiles         0
Isomeric_smiles          0
pubChemID              413
IUPAC                  489
cFormula                 0
InChiKey                 0
MW                       0
AlogP                    0
Num_AromaticRings        0
Num_H_Acceptors          0
Num_H_Donors             0
Num_RotatableBonds       0
Num_Rings                0
isNatural             1886
ExpVal                1742
SensoryThr               0
toxicity_val             0
pubChemSid            2144
AminoAcidSeq          1759
Cas_Number_Final      1462
Human                    0
Chicken                  0
Mouse                    0
Cat                      0
dtype: int64

In [5]:
# Preview the first five rows of the frame
bitter_props.head(n=5)

Unnamed: 0,cid,cStruc,canonical_smiles,Isomeric_smiles,pubChemID,IUPAC,cFormula,InChiKey,MW,AlogP,...,ExpVal,SensoryThr,toxicity_val,pubChemSid,AminoAcidSeq,Cas_Number_Final,Human,Chicken,Mouse,Cat
0,7,COC1=CC2=C(C=CN=C2C=C1)C(C3CC4CCN3CC4C=C)O.OS(...,COC1=CC2=C(C=CN=C2C=C1)C(C3CC4CCN3CC4C=C)O.OS(...,COC1=CC2=C(C=CN=C2C=C1)C(C3CC4CCN3CC4C=C)O.OS(...,11069.0,(5-ethenyl-1-azabicyclo[2.2.2]octan-2-yl)-(6-m...,C20H26N2O6S,AKYHKWQPZHDOBW-UHFFFAOYSA-N,422.5,2.5204,...,,1,0.0,154368.0,,549-56-4,1,1,0,0
1,9,COC1=CC2=C(C=CN=C2C=C1)C(C3CC4CC[NH+]3CC4C=C)O,COC1=CC2=C(C=CN=C2C=C1)C(C3CC4CC[NH+]3CC4C=C)O,COC1=CC2=C(C=CN=C2C=C1)[C@H]([C@@H]3C[C@H]4CC[...,11920271.0,"(R)-[(2S,4R,5R)-5-ethenyl-1-azoniabicyclo[2.2....",C20H25N2O2+,LOUPRKONTZGTKE-FOEVPDMQSA-O,325.43,1.7561,...,,0,0.0,,,,1,0,0,0
2,11,CC(C)CC(=O)C1=C(C(C(=O)C(C1=O)CC=C(C)C)(CC=C(C...,CC(C)CC(=O)C1=C(C(C(=O)C(C1=O)CC=C(C)C)(CC=C(C...,CC(C)CC(=O)C1=C(C(C(=O)C(C1=O)CC=C(C)C)(CC=C(C...,160467.0,"5,6-dihydroxy-4-(3-methylbutanoyl)-2,6-bis(3-m...",C21H30O5,IEHWDPKFDXJDJL-UHFFFAOYSA-N,362.47,3.6254,...,,0,0.0,,,"469-02-3,54378-50-6,54378-86-8",1,0,0,0
3,14,COC1=CC2=C(C=CN=C2C=C1)C(C3CC4CCN3CC4C=C)O.COC...,COC1=CC2=C(C=CN=C2C=C1)C(C3CC4CCN3CC4C=C)O.COC...,COC1=CC2=C(C=CN=C2C=C1)C(C3CC4CCN3CC4C=C)O.COC...,23424040.0,(5-ethenyl-1-azabicyclo[2.2.2]octan-2-yl)-(6-m...,C40H50N4O8S,RONWGALEIBILOG-UHFFFAOYSA-N,746.93,5.6936,...,,0,0.0,197631.0,,6119-70-6,1,1,0,0
4,15,C1=CC=C(C=C1)C=O,C1=CC=C(C=C1)C=O,C1=CC=C(C=C1)C=O,240.0,benzaldehyde,C7H6O,HUMNYLRZRPPJDN-UHFFFAOYSA-N,106.12,1.4991,...,,0,1300.0,,,100-52-7,1,0,0,0


In [6]:
# Columns to discard from the frame; errors='ignore' below skips any name
# that is not present, so re-running this cell is harmless.
columns_to_remove = [
    'pubChemID',
    'isNatural',
    'ExpVal',
    'AminoAcidSeq',
    'Cas_Number_Final',
    'pubChemSid',
    'Human',
    'Chicken',
    'Mouse',
    'Cat',
]
bitter_props = bitter_props.drop(columns_to_remove, axis='columns', errors='ignore')

In [7]:
# Show the column labels currently held by bitter_props
bitter_props.columns

Index(['cid', 'cStruc', 'canonical_smiles', 'Isomeric_smiles', 'IUPAC',
       'cFormula', 'InChiKey', 'MW', 'AlogP', 'Num_AromaticRings',
       'Num_H_Acceptors', 'Num_H_Donors', 'Num_RotatableBonds', 'Num_Rings',
       'SensoryThr', 'toxicity_val'],
      dtype='object')

In [8]:
# Write the frame to CSV in the working directory (row index omitted) and
# report how many rows were written.
bitter_props.to_csv(path_or_buf="BitterCompoundsPropA_2024_Cleaned.csv", index=False)
print(f"Cleaned dataset saved with {len(bitter_props)} compounds.")

# Per-column tally of values that are still missing
print(bitter_props.isna().sum())

Cleaned dataset saved with 2250 compounds.
cid                     0
cStruc                  0
canonical_smiles        0
Isomeric_smiles         0
IUPAC                 489
cFormula                0
InChiKey                0
MW                      0
AlogP                   0
Num_AromaticRings       0
Num_H_Acceptors         0
Num_H_Donors            0
Num_RotatableBonds      0
Num_Rings               0
SensoryThr              0
toxicity_val            0
dtype: int64
