In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import os
from Functions import *

from rdkit import Chem
from rdkit.Chem import Descriptors
import rdkit.Chem.rdMolDescriptors as d
import rdkit.Chem.Lipinski as l
import rdkit.Chem.Fragments as f
from rdkit.Chem import AllChem



#ignore warnings
import warnings
warnings.filterwarnings('ignore')



In [2]:
# All data paths are resolved against the current working directory, so the
# notebook must be launched from the folder that contains "datasets/".
dir_path = os.getcwd()
datasets_dir = os.path.join(dir_path, "datasets")
test_path = os.path.join(datasets_dir, "test_smiles.csv")

In [3]:
# Load the raw test set; the CSV's "INDEX" column becomes the row index.
orig_test_data = pd.read_csv(
    filepath_or_buffer=test_path,
    index_col="INDEX",
)

FileNotFoundError: [Errno 2] No such file or directory: '/home/nour/Bureau/Programming for data science /Programing_for_data_science/Assignment4/datasets/test_smiles.csv'

In [None]:
# Rows that are repeated across *every* column (no subset given); keep=False
# marks all copies, and sorting by SMILES places the copies next to each other.
is_repeated_row = orig_test_data.duplicated(keep=False)
dups = orig_test_data.loc[is_repeated_row].sort_values(by="SMILES")
# dups  # uncomment to inspect the repeated rows

In [None]:
# Size of the raw frame before any filtering.
print(f"Original test data shape: {orig_test_data.shape}")

# True for every row whose SMILES has already appeared earlier in the frame
# (the first occurrence of each SMILES is not flagged).
repeated_smiles = orig_test_data.duplicated(subset='SMILES')
n_duplicates = repeated_smiles.sum()
print(f"Found {n_duplicates} duplicate SMILES in test data")

# Keep only the first row seen for each SMILES string.
test_data_clean = orig_test_data.drop_duplicates(subset=['SMILES'], keep='first')

n_removed = len(orig_test_data) - len(test_data_clean)
print(f"After removing duplicates: {test_data_clean.shape}")
print(f"Removed {n_removed} duplicate rows")

In [None]:
# Per-column count of missing values in the de-duplicated test set.
test_data_clean.isnull().sum().to_frame(name='Null Values')

## Feature selection 

In [None]:
# Quick look at the de-duplicated frame, displayed as a single tuple:
# (dimensions, column labels, first rows).
(
    test_data_clean.shape,
    test_data_clean.columns,
    test_data_clean.head(),
)

In [None]:
# (radius, fp_size) -> reusable RDKit Morgan fingerprint generator.
_MORGAN_GENERATORS = {}


def _get_morgan_generator(radius, fp_size):
    """Return a Morgan fingerprint generator, building it only once per setting."""
    key = (radius, fp_size)
    if key not in _MORGAN_GENERATORS:
        _MORGAN_GENERATORS[key] = AllChem.GetMorganGenerator(radius=radius, fpSize=fp_size)
    return _MORGAN_GENERATORS[key]


def smile_to_descriptors(smiles, radius=2, fp_size=124):
    """Turn one SMILES string into a flat numeric feature vector.

    The vector is six scalar descriptors followed by the Morgan fingerprint
    bits, in this order:
    ``[NumAtoms, ExactMolWt, fr_Al_COO, HeavyAtomCount, NumHDonors,
    NumHAcceptors, bit_0, ..., bit_{fp_size-1}]``.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    radius : int, default 2
        Radius passed to the Morgan fingerprint generator.
    fp_size : int, default 124
        Number of fingerprint bits appended after the scalar descriptors.

    Returns
    -------
    numpy.ndarray or None
        Float array of length ``6 + fp_size``, or ``None`` when ``smiles`` is
        not a string (e.g. a NaN read from a CSV) or RDKit cannot parse it.
    """
    # Missing values arrive from pandas as float NaN; RDKit would raise on
    # them instead of returning None, so treat them like an unparseable SMILES.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # NOTE(review): GetNumAtoms() on a molecule without explicit hydrogens
    # presumably equals HeavyAtomCount — confirm both columns are intended.
    scalar_features = [
        mol.GetNumAtoms(),
        d.CalcExactMolWt(mol),
        f.fr_Al_COO(mol),
        l.HeavyAtomCount(mol),
        l.NumHDonors(mol),
        l.NumHAcceptors(mol),
    ]

    # Ask RDKit for the bits as a NumPy array directly rather than letting
    # NumPy walk the bit vector element by element.
    fp_bits = _get_morgan_generator(radius, fp_size).GetFingerprintAsNumPy(mol)

    return np.concatenate([
        np.asarray(scalar_features, dtype=float),
        np.asarray(fp_bits, dtype=float),
    ])

# Number of Morgan fingerprint bits per molecule; must match the fingerprint
# size used inside smile_to_descriptors.
N_FP_BITS = 124

columns_names = [
    "NoAtoms", "CalcExactMolWt", "fr_Al_COO",
    "HeavyAtomCount", "NumHDonors", "NumHAcceptors",
] + [f"MFp_{i}" for i in range(N_FP_BITS)]

# Collect one feature vector per molecule and build the frame once at the end;
# appending to a DataFrame row by row re-allocates it on every iteration.
descriptor_rows = []
for smiles in test_data_clean["SMILES"]:
    row = smile_to_descriptors(smiles)
    if row is None:
        # Unparseable SMILES become an all-NaN row so that row positions stay
        # aligned with test_data_clean and are visible in the null check below.
        row = np.full(len(columns_names), np.nan)
    descriptor_rows.append(row)

# The frame gets a default 0..n-1 index; the INDEX labels of test_data_clean
# are intentionally not carried over.
descriptors = pd.DataFrame(descriptor_rows, columns=columns_names)
assert len(descriptors) == len(test_data_clean), "one descriptor row per molecule expected"

# Work on a copy so later cells cannot modify the computed descriptors.
df = descriptors.copy()

# Check nulls
print(pd.DataFrame({'Null Values': df.isnull().sum()}))

In [None]:
# Preview the first five rows of the descriptor table.
df.head(n=5)

In [None]:
# Sanity-check the featuriser on the first molecule of the cleaned test set.
test_smiles = test_data_clean["SMILES"].iloc[0]
print(f"Testing first SMILES: {test_smiles}")

result = smile_to_descriptors(test_smiles)
print(
    f"Result: {result}",
    f"Result type: {type(result)}",
    f"Result is None: {result is None}",
    sep="\n",
)

# Confirm that the descriptors frame was actually populated.
print(
    f"\nDescriptors shape: {descriptors.shape}",
    f"Descriptors head:\n{descriptors.head()}",
    f"Descriptors dtypes:\n{descriptors.dtypes}",
    sep="\n",
)

# Same inspection for the working copy `df`, plus its per-column null counts.
print(
    f"\ndf shape: {df.shape}",
    f"df head:\n{df.head()}",
    f"Null values:\n{df.isnull().sum()}",
    sep="\n",
)