In [9]:
import pandas as pd
from rdkit import Chem

# Load the DES table (described by the author as the database without
# repetition) and swap the terse source headers for descriptive ones in a
# single read -> rename chain.
df = pd.read_csv('DES_TMELT.csv').rename(
    columns={
        # General description of the mixture
        "NÂ°C": "Number of components",
        "DES_T": "Type of DES",
        # HBA columns become component #1, HBD columns become component #2
        "NAME_HBA": "Component#1",
        "NAME_HBD": "Component#2",
        "SMILES_HBA": "Smiles#1",
        "SMILES_HBD": "Smiles#2",
        "Tmelt_HBA": "T#1",
        "Tmelt_HBD": "T#2",
        "X_HBA": "X#1 (molar fraction)",
        "X_HBD": "X#2 (molar fraction)",
        # This key ends with a space; rename() only matches exact labels,
        # so the space must stay.
        "Mixed_Tmelt ": "Tmelt, K",
        "REF#DOI": "Reference (DOI)",
    }
)
                           

def is_valid_smiles(smiles):
    """Return True when RDKit can parse ``smiles`` into a molecule.

    Parameters
    ----------
    smiles : object
        Candidate SMILES string. Any non-string value (for example ``None``
        or a float NaN standing in for a missing cell) is reported as invalid.

    Returns
    -------
    bool
        ``True`` if ``Chem.MolFromSmiles`` returns a molecule, ``False`` if it
        returns ``None``, raises, or the input is not a string.
    """
    # Reject non-string input explicitly instead of relying on the parser
    # raising for it.
    if not isinstance(smiles, str):
        return False
    try:
        return Chem.MolFromSmiles(smiles) is not None
    except Exception:
        # Keep the best-effort contract (any parser failure -> invalid) while
        # letting KeyboardInterrupt / SystemExit propagate, which a bare
        # ``except:`` would have swallowed.
        return False

# Run the validity check on both SMILES columns and store the flags on df
df['valid_smiles_1'] = df['Smiles#1'].apply(is_valid_smiles)
df['valid_smiles_2'] = df['Smiles#2'].apply(is_valid_smiles)

# A row is usable only when both of its SMILES strings are valid
both_smiles_valid = df['valid_smiles_1'] & df['valid_smiles_2']

# Rows that pass the check
df_valid = df.loc[both_smiles_valid]

# Rows that fail it, kept separately in case they need inspecting
df_invalid = df.loc[~both_smiles_valid]

[09:38:19] SMILES Parse Error: unclosed ring for input: 'C1=CC=C2C(=C1)C=CC(=O)O3'
[09:38:19] SMILES Parse Error: unclosed ring for input: 'C1=CC=C2C(=C1)C=CC(=O)O4'
[09:38:19] SMILES Parse Error: unclosed ring for input: 'C1=CC=C2C(=C1)C=CC(=O)O5'


In [10]:
# Boolean masks marking the rows whose SMILES failed validation
invalid_mask_1 = ~df['valid_smiles_1']
invalid_mask_2 = ~df['valid_smiles_2']

# Invalid SMILES count per column
num_invalid_1 = invalid_mask_1.sum()
num_invalid_2 = invalid_mask_2.sum()

# Rows with at least one invalid SMILES (not both valid == either invalid)
num_invalid_either = (invalid_mask_1 | invalid_mask_2).sum()

# Report the three counts, one per line
print(
    f"SMILES non validi in Smiles#1: {num_invalid_1}",
    f"SMILES non validi in Smiles#2: {num_invalid_2}",
    f"Righe con almeno un SMILES non valido: {num_invalid_either}",
    sep="\n",
)

SMILES non validi in Smiles#1: 0
SMILES non validi in Smiles#2: 3
Righe con almeno un SMILES non valido: 3


In [11]:
# Remove the helper validity flags from the filtered frame.
# errors="ignore" keeps this cell re-runnable: because it reassigns df_valid
# from itself, a second execution would otherwise raise KeyError on the
# already-dropped columns.
df_valid = df_valid.drop(
    columns=["valid_smiles_1", "valid_smiles_2"],
    errors="ignore",
)

In [12]:
# Write the validated rows to disk without the pandas row index.
# NOTE(review): the 'Unnamed: 0' column read from the input CSV is not dropped
# anywhere in the visible cells, so it appears to be written out as well —
# confirm that is intended.
df_valid.to_csv(path_or_buf="DES_TMELT_V.csv", index=False)

In [13]:
# List the column labels of the full frame as a plain Python list
column_labels = df.columns.tolist()
print(column_labels)

['Unnamed: 0', 'Number of components', 'Type of DES', 'Component#1', 'Smiles#1', 'T#1', 'Component#2', 'Smiles#2', 'T#2', 'X#1 (molar fraction)', 'X#2 (molar fraction)', 'Tmelt, K', 'Reference (DOI)', 'valid_smiles_1', 'valid_smiles_2']


In [16]:
# Preview the first five rows of the validated frame
df_valid.head(n=5)

Unnamed: 0.1,Unnamed: 0,Number of components,Type of DES,Component#1,Smiles#1,T#1,Component#2,Smiles#2,T#2,X#1 (molar fraction),X#2 (molar fraction),"Tmelt, K",Reference (DOI)
0,0,2,,,Cl[Fe](Cl)Cl,580.15,,C[N+](C)(C)CCO.[Cl-],575.15,0.67,0.33,294.15,
1,1,2,,,Br[Zn]Br,667.15,,CC(=O)[O-].C[N+](C)(C)CCO,324.15,0.67,0.33,321.15,
2,2,2,,,Br[Zn]Br,567.15,,C[N+](C)(C)CCO.[Cl-],575.15,0.67,0.33,311.15,
3,3,2,,,Cl[Zn]Cl,563.15,,C[N+](C)(C)CCCl.Cl,513.15,0.67,0.33,296.15,
4,4,2,,,Cl[Zn]Cl,563.15,,C[N+](C)(C)CCO.[Cl-],575.15,0.5,0.5,314.77,
