In [1]:
import pickle
import pandas as pd
import rdkit
from rdkit import Chem
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# About data set

In [2]:
# Load training data, val and test data
file_path = 'train.pkl'
with open(file_path, 'rb') as file:
    train_df = pickle.load(file)
#print(train_df)
# good fit = lower negative number
# more negative docking score = better fit 


# Load validation data
file_path = 'validation_small_enantiomers_stable_full_screen_docking_MOL_margin3_49878_10368_5184.pkl'
with open(file_path, 'rb') as file:
    val_df = pickle.load(file)
#print(val_df)


# Load test data
file_path = 'test_small_enantiomers_stable_full_screen_docking_MOL_margin3_50571_10368_5184.pkl'
with open(file_path, 'rb') as file:
    test_df = pickle.load(file)
#print(test_df)


In [3]:
train_df.head(5)

Unnamed: 0,ID,SMILES_nostereo,rdkit_mol_cistrans_stereo,score0,score1,score2,top_score,range_scores,mean_score,top_score_enantiomers_range
0,BrC1([C@@H]2CC23CC3)CC1,BrC1(C2CC23CC3)CC1,<rdkit.Chem.rdchem.Mol object at 0x1518dd827ce0>,-4.6,-4.6,-4.6,-4.6,0.0,-4.6,0.4
1,BrC1([C@@H]2CC23CC3)CC1,BrC1(C2CC23CC3)CC1,<rdkit.Chem.rdchem.Mol object at 0x1518dd827d30>,-4.6,-4.6,-4.6,-4.6,0.0,-4.6,0.4
2,BrC1([C@@H]2CC23CC3)CC1,BrC1(C2CC23CC3)CC1,<rdkit.Chem.rdchem.Mol object at 0x1518dd827d80>,-4.6,-4.6,-4.6,-4.6,0.0,-4.6,0.4
3,BrC1([C@H]2CC23CC3)CC1,BrC1(C2CC23CC3)CC1,<rdkit.Chem.rdchem.Mol object at 0x1518dd827dd0>,-5.0,-5.0,-5.0,-5.0,0.0,-5.0,0.4
4,BrC1([C@H]2CC23CC3)CC1,BrC1(C2CC23CC3)CC1,<rdkit.Chem.rdchem.Mol object at 0x1518dd827e20>,-5.0,-5.0,-5.0,-5.0,0.0,-5.0,0.4


In [6]:
# Number of enantiomers in training data 
grouped = train_df.groupby('SMILES_nostereo')
enantiomer_counts = grouped['ID'].nunique()
print(f"Total conformers in train data: {len(train_df)}")
print(f" Enantiomers in train data: {train_df['SMILES_nostereo'].nunique()}")

# Number of enantiomers in val data 
grouped = val_df.groupby('SMILES_nostereo')
enantiomer_counts = grouped['ID'].nunique()
print(f"Total conformers in val data: {len(val_df)}")
print(f" Enantiomers: {val_df['SMILES_nostereo'].nunique()}")

# Number of enantiomers in test data 
grouped = test_df.groupby('SMILES_nostereo')
enantiomer_counts = grouped['ID'].nunique()
print(f"Total conformers in test data: {len(test_df)}")
print(f" Enantiomers in test data : {test_df['SMILES_nostereo'].nunique()}")

Total conformers in train data: 234622
 Enantiomers in train data: 24192
Total conformers in val data: 49878
 Enantiomers: 5184
Total conformers in test data: 50571
 Enantiomers in test data : 5184


In [5]:
atomic_numbers_train = []
for mol in train_df['rdkit_mol_cistrans_stereo']:
    if mol is not None and isinstance(mol, Chem.Mol): 
        for atom in mol.GetAtoms():
            atomic_numbers_train.append(atom.GetAtomicNum())



atomic_numbers_val = []
for mol in val_df['rdkit_mol_cistrans_stereo']:
    if mol is not None and isinstance(mol, Chem.Mol): 
        for atom in mol.GetAtoms():
            atomic_numbers_val.append(atom.GetAtomicNum())


atomic_numbers_test = []
for mol in test_df['rdkit_mol_cistrans_stereo']:
    if mol is not None and isinstance(mol, Chem.Mol): 
        for atom in mol.GetAtoms():
            atomic_numbers_test.append(atom.GetAtomicNum())

In [7]:
frequency_distribution_train = pd.Series(Counter(atomic_numbers_train)).sort_index()
print("Frequency Distribution of Atomic Numbers in training data:")
print(frequency_distribution_train)


frequency_distribution_val = pd.Series(Counter(atomic_numbers_val)).sort_index()
print("Frequency Distribution of Atomic Numbers in val data:")
print(frequency_distribution_val)


frequency_distribution_test = pd.Series(Counter(atomic_numbers_test)).sort_index()
print("Frequency Distribution of Atomic Numbers in test data:")
print(frequency_distribution_test)
# so we have hydrogen (1), carbon (6), nitrogen (7), oxygen (8), fluorine (9), 
# Phosphorus (15), sulfur (16), chlorine (17), bromine (35), iodine (53)

Frequency Distribution of Atomic Numbers in training data:
1        6102
6     2394335
7      426711
8      329885
9       40434
15       1083
16      35972
17      14200
35        885
53         29
dtype: int64
Frequency Distribution of Atomic Numbers in val data:
1       1404
6     503777
7      90858
8      71431
9       8844
15       269
16      7521
17      3062
35       187
dtype: int64
Frequency Distribution of Atomic Numbers in test data:
1        920
6     512279
7      94420
8      72360
9       7983
15       242
16      7484
17      3044
35       189
53        10
dtype: int64


In [7]:


# Calculate statistics for each dataset
def get_stats(df, dataset_name):
    return {
        'Dataset': dataset_name,
        'Count': len(df),
        'Min': df['top_score'].min(),
        'Mean': df['top_score'].mean(),
        'Max': df['top_score'].max(),
        'Std': df['top_score'].std()
    }

# Create statistics table
stats_data = [
    get_stats(train_df, 'Train'),
    get_stats(val_df, 'Validation'), 
    get_stats(test_df, 'Test')
]

stats_df = pd.DataFrame(stats_data)
print("Statistics for Top Scores:")
print(stats_df.to_string(index=False))

Statistics for Top Scores:
   Dataset  Count  Min      Mean  Max      Std
     Train 234622 -9.6 -5.588863  1.9 0.924873
Validation  49878 -9.2 -5.555199  2.1 0.936941
      Test  50571 -9.2 -5.591371  1.0 0.911111


In [None]:

train_df_copy = train_df.copy()
val_df_copy = val_df.copy()
test_df_copy = test_df.copy()

train_df_copy['Dataset'] = 'Train'
val_df_copy['Dataset'] = 'Validation'
test_df_copy['Dataset'] = 'Test'

combined_df = pd.concat(
    [
        train_df_copy[['top_score', 'Dataset']],
        val_df_copy[['top_score', 'Dataset']],
        test_df_copy[['top_score', 'Dataset']]
    ],
    ignore_index=True
)


import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rcParams.update({
    "figure.dpi": 100,          # screen dpi; not used for PDF
    "font.size": 12,            # base font size
    "axes.labelsize": 12,
    "axes.titlesize": 13,
    "legend.fontsize": 10,
})

sns.set_theme(style="whitegrid")


fig, ax = plt.subplots(figsize=(6.5, 3.2), constrained_layout=True)

sns.kdeplot(
    data=combined_df,
    x='top_score',
    hue='Dataset',
    fill=True,
    alpha=0.6,
    ax=ax
)

ax.set_xlabel('Top Score')
ax.set_ylabel('Density')
ax.set_title('Density Distribution of Top Scores')

# If you *must* use PNG, crank up DPI:
fig.savefig("image1.png", dpi=600, bbox_inches='tight')

plt.close(fig)


In [8]:

def atomic_number_to_symbol(atomic_number):
    periodic_table = {
        1: 'H', 6: 'C', 7: 'N', 8: 'O', 9: 'F',
        15: 'P', 16: 'S', 17: 'Cl', 35: 'Br', 53: 'I'
    }
    return periodic_table.get(atomic_number, 'Unknown')

file_path = 'train.pkl'
with open(file_path, 'rb') as file:
    df = pickle.load(file)

mollist = []
for mol in df['rdkit_mol_cistrans_stereo']:
    molecule = []
    if mol is not None and isinstance(mol, Chem.Mol):
        
        if mol.GetNumConformers() > 0:
            conf = mol.GetConformer(0)  #
            for atom in mol.GetAtoms():
                atomic_number = atom.GetAtomicNum()
                atom_type = atomic_number_to_symbol(atomic_number)
                # Get 3D coordinates
                pos = conf.GetAtomPosition(atom.GetIdx())
                coordinates = np.array([pos.x, pos.y, pos.z])
                molecule.append((atom_type, coordinates))
    mollist.append(molecule)
print("Sample of mollist (first molecule):")
print(mollist[0] if mollist else "No valid molecules found.")

Sample of mollist (first molecule):
[('Br', array([-2.04649997, -1.8743    ,  0.33809999])), ('C', array([1.38030005, 0.0871    , 0.14910001])), ('C', array([ 0.0986    , -0.0115    ,  0.83929998])), ('C', array([0.90759999, 1.29069996, 0.8251    ])), ('C', array([ 2.61890006, -0.6832    ,  0.1062    ])), ('C', array([ 1.89660001, -0.2475    , -1.17390001])), ('C', array([-1.22039998, -0.1279    ,  0.1161    ])), ('C', array([-1.46070004,  0.5363    , -1.21200001])), ('C', array([-2.17440009,  1.03050005,  0.012     ]))]


In [16]:
type(mollist)

list

In [17]:
len(mollist)

234622

In [9]:
import numpy as np

def speciesmap(atom_type):
    atom_to_number = {
        'H': 1,   # Hydrogen
        'C': 6,   # Carbon
        'N': 7,   # Nitrogen
        'O': 8,   # Oxygen
        'F': 9,   # Fluorine
        'P': 15,  # Phosphorus
        'S': 16,  # Sulfur
        'Cl': 17, # Chlorine
        'Br': 35, # Bromine
        'I': 53   # Iodine
    }
    return np.array([atom_to_number.get(atom_type, 0)])  # Returns 0 if atom type is not recognized

In [12]:
%%time
import qm7_weightedviews as CP
from qm7_weightedviews import load_qm7_data
ws, vs, Natoms, Nviews = load_qm7_data(mollist, speciesmap, setNatoms=None, setNviews=None, carbonbased=False, verbose=0)

CPU times: user 20min 2s, sys: 1.81 s, total: 20min 4s
Wall time: 20min 8s


In [13]:
print(ws.shape)
print(vs.shape)

(234622, 29)
(234622, 29, 116)


In [9]:
#print(df[['score0', 'score1', 'score2','range_scores', 'top_score','mean_score', 'top_score_enantiomers_range' ]].head(n=20))