# Imports

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Data Read-In

In [2]:
df = pd.read_csv('./DATA/total_tox_data.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,mol_id,smiles,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,...,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,...,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0
2,2,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,...,,,,,,0.0,,0.0,,
3,3,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,...,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0
4,4,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
df = df.drop('Unnamed: 0', axis=1)
df.set_index('mol_id', inplace=True)

In [6]:
df.head()

Unnamed: 0_level_0,smiles,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
mol_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,...,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0
TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0
TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,...,,,,,,0.0,,0.0,,
TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0
TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Feature-Targets Split

In [7]:
df.columns

Index(['smiles', 'MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
       'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount',
       'AromaticProportion', 'LogS_ESOL', 'PositiveCharges', 'NegativeCharges',
       'FormalCharge', 'AromaticRings', 'AromaticHeterocycles',
       'AliphaticRings', 'MolecularComplexity', 'MolarRefractivity',
       'Heteroatoms', 'HalogenCount', 'PhenolicGroups', 'NR-AR', 'NR-AR-LBD',
       'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma',
       'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'],
      dtype='object')

In [8]:
features_df = df[['MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
       'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount',
       'AromaticProportion', 'LogS_ESOL', 'PositiveCharges', 'NegativeCharges',
       'FormalCharge', 'AromaticRings', 'AromaticHeterocycles',
       'AliphaticRings', 'MolecularComplexity', 'MolarRefractivity',
       'Heteroatoms', 'HalogenCount', 'PhenolicGroups']]

targets_df = df[['NR-AR', 'NR-AR-LBD',
       'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma',
       'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53']]

In [9]:
features_df.head()

Unnamed: 0_level_0,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,AromaticProportion,...,NegativeCharges,FormalCharge,AromaticRings,AromaticHeterocycles,AliphaticRings,MolecularComplexity,MolarRefractivity,Heteroatoms,HalogenCount,PhenolicGroups
mol_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TOX3021,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,0.5625,...,0.0,0.0,2.0,1.0,0.0,1.5,62.1622,7,0,0
TOX3020,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,0.4,...,0.0,0.0,1.0,0.0,1.0,1.266667,55.1017,4,0,0
TOX3024,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,0.0,...,0.0,0.0,0.0,0.0,4.0,1.142857,86.9438,1,0,0
TOX3027,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,0.3,...,0.0,0.0,1.0,0.0,0.0,1.2,86.1627,3,0,0
TOX20800,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,34.712,9,0,0


In [10]:
targets_df.head()

Unnamed: 0_level_0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
mol_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
TOX3021,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0
TOX3020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0
TOX3024,,,,,,,,0.0,,0.0,,
TOX3027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0
TOX20800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Technique 1: Class Conditional KNN Imputation

### Imports

In [11]:
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, f1_score

### Finding Best K Value and Weight using Grid Search and Partial Masking

In [20]:
def knn_impute_gridsearch(descriptor_df, toxicity_df, mask_fraction=0.1,
                          neighbors_list=[3, 5, 7, 10], weights_list=['uniform', 'distance'],
                          random_state=42):

    np.random.seed(random_state)

    X = descriptor_df.values          
    Y = toxicity_df.values            

    best_score = -1
    best_params = None
    best_imputed_df = None

    def evaluate(true, imputed, mask):
        true_masked = true[mask]
        imputed_masked = imputed[mask]
        acc = accuracy_score(true_masked, imputed_masked)
        f1 = f1_score(true_masked, imputed_masked)
        return acc, f1

    for k in neighbors_list:
        for w in weights_list:

            mask = np.full(Y.shape, False)
            known_indices = np.array(np.where(~np.isnan(Y))).T
            n_mask = int(len(known_indices) * mask_fraction)
            mask_indices = known_indices[np.random.choice(len(known_indices), n_mask, replace=False)]
            for i, j in mask_indices:
                mask[i, j] = True

            Y_masked = Y.copy()
            Y_masked[mask] = np.nan

            combined = np.hstack([X, Y_masked])

            imputer = KNNImputer(n_neighbors=k, weights=w)
            imputed = imputer.fit_transform(combined)

            imputed_toxicity = imputed[:, X.shape[1]:]
            imputed_toxicity = (imputed_toxicity >= 0.5).astype(int)

            acc, f1 = evaluate(Y, imputed_toxicity, mask)

            print(f"k={k}, weights='{w}' => Accuracy: {acc:.4f}, F1-score: {f1:.4f}")

            if f1 > best_score:
                best_score = f1
                best_params = {'n_neighbors': k, 'weights': w}
                best_imputed_df = pd.DataFrame(imputed_toxicity, columns=toxicity_df.columns, index=toxicity_df.index)

    print(f"\nBest params: {best_params}, Best F1-score: {best_score:.4f}")

    return best_imputed_df, best_params

In [21]:
imputed_df, best_params = knn_impute_gridsearch(features_df, targets_df)

k=3, weights='uniform' => Accuracy: 0.9198, F1-score: 0.2350
k=3, weights='distance' => Accuracy: 0.9174, F1-score: 0.2829
k=5, weights='uniform' => Accuracy: 0.9247, F1-score: 0.1626
k=5, weights='distance' => Accuracy: 0.9264, F1-score: 0.2878
k=7, weights='uniform' => Accuracy: 0.9222, F1-score: 0.1833
k=7, weights='distance' => Accuracy: 0.9320, F1-score: 0.2639
k=10, weights='uniform' => Accuracy: 0.9255, F1-score: 0.1567
k=10, weights='distance' => Accuracy: 0.9246, F1-score: 0.2075

Best params: {'n_neighbors': 5, 'weights': 'distance'}, Best F1-score: 0.2878


#### The Low F1 score is an indicator of poor performance on the minority class, which is very crucial in problems such as toxicity. 

# Technique 2: MICE

In [25]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import pandas as pd

def mice_impute_evaluate(descriptor_df, toxicity_df, mask_fraction=0.1, random_state=42, max_iter=10):
    """
    Perform MICE imputation on toxicity data with some masked entries,
    evaluate accuracy and weighted F1 score of imputation.
    
    Parameters:
    - descriptor_df: pd.DataFrame of molecular descriptors (no missing values)
    - toxicity_df: pd.DataFrame of toxicity labels (may contain NaNs)
    - mask_fraction: fraction of toxicity entries to mask for testing
    - random_state: random seed for reproducibility
    - max_iter: max iterations for IterativeImputer (MICE)
    
    Returns:
    - imputed_toxicity_df: DataFrame of toxicity after imputation
    - metrics: dict with accuracy and weighted F1 score
    """

    X = descriptor_df.values
    Y = toxicity_df.values.astype(float)  

    combined = np.hstack([X, Y])

    valid_indices = np.argwhere(~np.isnan(Y))

    n_mask = int(mask_fraction * valid_indices.shape[0])

    np.random.seed(random_state)
    selected_mask_indices = valid_indices[np.random.choice(valid_indices.shape[0], n_mask, replace=False)]

    mask = np.zeros_like(Y, dtype=bool)
    mask[selected_mask_indices[:, 0], selected_mask_indices[:, 1]] = True

    combined[:, X.shape[1]:][mask] = np.nan

    imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)

    imputed = imputer.fit_transform(combined)

    imputed_toxicity = imputed[:, X.shape[1]:]

    imputed_toxicity = (imputed_toxicity >= 0.5).astype(int)

    true_values = Y[mask]
    imputed_values = imputed_toxicity[mask]

    valid_mask = ~np.isnan(true_values)
    true_values = true_values[valid_mask]
    imputed_values = imputed_values[valid_mask]
    accuracy = accuracy_score(true_values, imputed_values)
    f1 = f1_score(true_values, imputed_values, average='weighted')

    imputed_toxicity_df = pd.DataFrame(imputed_toxicity, columns=toxicity_df.columns, index=toxicity_df.index)

    metrics = {'accuracy': accuracy, 'weighted_f1_score': f1}
    print(f"Mask fraction: {mask_fraction}, Accuracy: {accuracy:.4f}, Weighted F1-score: {f1:.4f}")

    return imputed_toxicity_df, metrics

In [27]:
imputed_toxicity_df, metrics = mice_impute_evaluate(features_df, targets_df)

Mask fraction: 0.1, Accuracy: 0.9400, Weighted F1-score: 0.9309


In [28]:
metrics

{'accuracy': 0.9399538106235565, 'weighted_f1_score': 0.9309445187083174}

Since the weighted F1 score is 0.93, and the accuracy also is 0.94, indicating the balance among precision and recall, the MICE method is considered reliable for imputing the missing values in the DataFrame.

In [29]:
def fill_missing_toxicity(descriptor_df, toxicity_df, max_iter=10, random_state=0):
    combined = pd.concat([descriptor_df, toxicity_df], axis=1).values

    imputer = IterativeImputer(max_iter=max_iter, random_state=random_state)

    imputed_combined = imputer.fit_transform(combined)

    imputed_toxicity = imputed_combined[:, descriptor_df.shape[1]:]

    imputed_toxicity_df = pd.DataFrame(imputed_toxicity, columns=toxicity_df.columns, index=toxicity_df.index)

    imputed_toxicity_df = imputed_toxicity_df.round().astype(int)

    return imputed_toxicity_df

In [30]:
filled_toxicity_df = fill_missing_toxicity(features_df, targets_df)

In [31]:
filled_toxicity_df.isnull().sum()

NR-AR            0
NR-AR-LBD        0
NR-AhR           0
NR-Aromatase     0
NR-ER            0
NR-ER-LBD        0
NR-PPAR-gamma    0
SR-ARE           0
SR-ATAD5         0
SR-HSE           0
SR-MMP           0
SR-p53           0
dtype: int64

In [37]:
for col in filled_toxicity_df.columns:
    print(df[col].value_counts())

NR-AR
0.0    7519
1.0     312
Name: count, dtype: int64
NR-AR-LBD
0.0    7560
1.0     271
Name: count, dtype: int64
NR-AhR
0.0    6986
1.0     845
Name: count, dtype: int64
NR-Aromatase
0.0    7420
1.0     411
Name: count, dtype: int64
NR-ER
0.0    6947
1.0     884
Name: count, dtype: int64
NR-ER-LBD
0.0    7453
1.0     378
Name: count, dtype: int64
NR-PPAR-gamma
0.0    7614
1.0     217
Name: count, dtype: int64
SR-ARE
0.0    6425
1.0    1404
2.0       2
Name: count, dtype: int64
SR-ATAD5
0.0    7560
1.0     271
Name: count, dtype: int64
SR-HSE
0.0    7392
1.0     439
Name: count, dtype: int64
SR-MMP
0.0    6560
1.0    1270
2.0       1
Name: count, dtype: int64
SR-p53
0.0    7291
1.0     540
Name: count, dtype: int64


In [38]:
df = pd.concat([features_df, filled_toxicity_df], axis=1)


In [39]:
df.head()

Unnamed: 0_level_0,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,AromaticProportion,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
mol_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TOX3021,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,0.5625,...,1,0,0,0,0,1,0,0,0,0
TOX3020,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,0.4,...,0,0,0,0,0,0,0,0,0,0
TOX3024,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TOX3027,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,0.3,...,0,0,0,0,0,0,0,0,0,0
TOX20800,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
df.to_csv('./DATA/filled_toxicity_df.csv')