In [1]:
# Import necessary libraries
from rdkit import Chem
from rdkit.Chem import Draw, AllChem
import pandas as pd
import sqlite3
from rdkit.DataStructs import ExplicitBitVect, TanimotoSimilarity

# Function to create the database and table
def create_database(db_name):
    """Create the SQLite database file and its ``Chemicals`` table.

    The table is created with ``IF NOT EXISTS``, so calling this function
    again on an existing database leaves the stored rows untouched.

    Args:
        db_name: Path of the SQLite database file; SQLite creates the file
            if it does not exist yet.
    """
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS Chemicals (
                id INTEGER PRIMARY KEY,
                SMILES TEXT,
                MolBlock TEXT,
                Fingerprint BLOB
            )
        """)
        conn.commit()
    finally:
        # Release the connection even when the statement or commit fails.
        conn.close()

# Function to insert chemical data into the database
def insert_chemical_data(db_name, smiles_list):
    """Parse SMILES strings and store them in the ``Chemicals`` table.

    For every entry RDKit can parse, one row is inserted holding the SMILES
    text as given, the molecule's MolBlock, and the binary serialization of
    its Morgan fingerprint (radius 2, 2048 bits).  Entries that are not
    strings, or that RDKit cannot parse, are skipped.  All rows are
    committed together at the end; if an error interrupts the loop nothing
    is committed.

    Args:
        db_name: Path of the SQLite database file.  The ``Chemicals`` table
            must already exist.
        smiles_list: Iterable of SMILES strings.
    """
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()

        for smiles in smiles_list:
            # pandas yields float NaN for empty CSV cells; MolFromSmiles
            # raises on anything that is not a string, which would abort
            # the whole load, so such entries are skipped up front.
            if not isinstance(smiles, str):
                continue

            mol = Chem.MolFromSmiles(smiles)
            if not mol:
                # Unparseable SMILES: nothing to store for this entry.
                continue

            mol_block = Chem.MolToMolBlock(mol)
            fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
            # Serialize the bit vector to bytes so it fits the BLOB column.
            fingerprint = fingerprint.ToBinary()
            cursor.execute(
                "INSERT INTO Chemicals (SMILES, MolBlock, Fingerprint) VALUES (?, ?, ?)",
                (smiles, mol_block, fingerprint),
            )

        conn.commit()
    finally:
        # Release the connection even when parsing or an INSERT fails.
        conn.close()

# Function to perform substructure search
def substructure_search(db_name, substructure_smarts):
    """Return the stored molecules that contain a given substructure.

    Every row of the ``Chemicals`` table is loaded, its MolBlock is turned
    back into an RDKit molecule, and the molecule is tested against the
    SMARTS query.  Rows whose MolBlock cannot be parsed are ignored.

    Args:
        db_name: Path of the SQLite database file.
        substructure_smarts: SMARTS pattern describing the substructure.

    Returns:
        A list of ``(id, SMILES)`` tuples, one per matching row, in the
        order the rows were returned by the query.

    Raises:
        ValueError: If ``substructure_smarts`` cannot be parsed.
    """
    # Validate the query before opening the database so that a bad pattern
    # cannot leave a connection open.
    substructure = Chem.MolFromSmarts(substructure_smarts)
    if not substructure:
        raise ValueError("Invalid substructure SMARTS.")

    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()
        # The fingerprint BLOB is not needed for matching, so it is not read.
        cursor.execute("SELECT id, SMILES, MolBlock FROM Chemicals")
        results = cursor.fetchall()
    finally:
        conn.close()

    matched_molecules = []
    for mol_id, smiles, mol_block in results:
        mol = Chem.MolFromMolBlock(mol_block)
        if mol and mol.HasSubstructMatch(substructure):
            matched_molecules.append((mol_id, smiles))

    return matched_molecules

# Load SMILES from a CSV file
csv_file = 'molecules.csv'  # Update with your CSV file path
df = pd.read_csv(csv_file)
# Empty cells in the SMILES column are read as NaN (a float), which
# Chem.MolFromSmiles rejects with an exception; drop them before building
# the list that is handed to the parser.
smiles_list = df['SMILES'].dropna().tolist()

# Create the database and insert chemical data.
# NOTE: the table is created with IF NOT EXISTS and rows are plain INSERTs,
# so running this section again against the same file appends the same
# molecules a second time.
db_name = 'chemical_substructure_database.db'
create_database(db_name)
insert_chemical_data(db_name, smiles_list)

print("Database created and chemical data inserted successfully.")

# Example substructure search
substructure_smarts = 'c1ccccc1'  # Benzene ring
matched_molecules = substructure_search(db_name, substructure_smarts)
print("Matched Molecules:", matched_molecules)

# Function to save results to CSV
def save_results_to_csv(results, output_file):
    """Write search results to a CSV file and report where they went.

    The file gets a header row with the columns ``ID`` and ``SMILES``
    followed by one line per result; no index column is written.

    Args:
        results: Sequence of two-item records, ``(id, smiles)``.
        output_file: Path of the CSV file to write.
    """
    column_names = ["ID", "SMILES"]
    # index=False keeps the DataFrame's row index out of the file.
    pd.DataFrame(results, columns=column_names).to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

# Write the matches from the substructure search to a CSV file
# (relative path, so it is resolved against the current working directory).
output_csv_file = "substructure_search_results.csv"
save_results_to_csv(results=matched_molecules, output_file=output_csv_file)


Database created and chemical data inserted successfully.
Matched Molecules: [(1, 'COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4'), (3, 'CC1COc2c(N3CCN(C)CC3)c(F)cc4C(=O)C(=CN1c24)C(=O)O'), (5, 'COc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c3ccc(Cl)cc3'), (8, 'OC(=O)C1=CN(C2CC2)c3cc(N4CCNCC4)c(F)cc3C1=O'), (9, 'CCN1C=C(C(=O)O)C(=O)c2cc(F)c(cc12)N3CCNCC3'), (10, 'CC(N)Cc1ccccc1'), (12, 'CN1CCN2C(C1)c3ccccc3Cc4ccccc24'), (13, 'CN(C)CCCN1c2ccccc2CCc3ccccc13'), (14, 'CC1Cc2ccccc2N1NC(=O)c3ccc(Cl)c(c3)S(=O)(=O)N'), (15, 'CCOC(=O)c1ncn2c1CN(C)C(=O)c3cc(F)ccc23'), (16, 'Cc1c(C)c2OC(C)(COc3ccc(CC4SC(=O)NC4=O)cc3)CCc2c(C)c1O'), (17, 'CC(O)(CS(=O)(=O)c1ccc(F)cc1)C(=O)Nc2ccc(C#N)c(c2)C(F)(F)F'), (18, 'CN1C(=O)CN=C(c2ccccc2)c3cc(Cl)ccc13'), (19, 'COCCc1ccc(OCC(O)CNC(C)C)cc1'), (20, 'CC\\C(=C(\\CC)/c1ccc(O)cc1)\\c2ccc(O)cc2'), (23, 'OC1=NC(=O)C(N1)(c2ccccc2)c3ccccc3'), (26, 'CN(C)CCCN1c2ccccc2CCc3ccc(Cl)cc13'), (27, 'COc1c2OC(=O)C=Cc2cc3ccoc13'), (28, 'NS(=O)(=O)c1cc(Cl)c(Cl)c(c1)S(=O)(=O)N'), (29, 'CCOc1ccc2