### Imports

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 

### Data Read-In and Feature Engineering

In [2]:
# Load the raw table; the path is relative to the notebook's working directory.
tox21_csv_path = './DATA/tox21.csv'
df = pd.read_csv(tox21_csv_path)

In [3]:
df.head()

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O


### Engineering of Physicochemical Features from SMILES

#### Imports of Required Libraries

In [4]:
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

In [5]:
# Keep only the identifier and structure columns; descriptors are appended later.
features_df = df.loc[:, ['mol_id', 'smiles']]

In [6]:
features_df.head()

Unnamed: 0,mol_id,smiles
0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
1,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O
2,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...
3,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C
4,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O


In [7]:
def compute_descriptors(smiles):
    """Compute physicochemical descriptors for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule, parsed with
        ``Chem.MolFromSmiles``.

    Returns
    -------
    pandas.Series
        Fourteen values labelled MolecularWeight, LogP, TPSA, HBDonors,
        HBAcceptors, RotatableBonds, FractionCSP3, HeavyAtoms, RingCount,
        AromaticProportion, LogS_ESOL, PositiveCharges, NegativeCharges and
        FormalCharge. If the SMILES cannot be parsed, or any descriptor call
        raises, the Series carries the same labels filled with NaN so that
        ``Series.apply`` always expands to the same fourteen columns.
    """
    columns = [
        "MolecularWeight", "LogP", "TPSA", "HBDonors", "HBAcceptors",
        "RotatableBonds", "FractionCSP3", "HeavyAtoms", "RingCount",
        "AromaticProportion", "LogS_ESOL", "PositiveCharges",
        "NegativeCharges", "FormalCharge",
    ]
    # Labelled NaN row: an unlabelled fallback would be aligned on integer
    # positions and create extra columns next to the named ones.
    missing = pd.Series(float("nan"), index=columns)

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return missing

    try:
        total_atoms = mol.GetNumAtoms()
        aromatic_atoms = sum(1 for _ in mol.GetAromaticAtoms())
        # Guard against a molecule with no atoms to avoid dividing by zero.
        aromatic_proportion = aromatic_atoms / total_atoms if total_atoms > 0 else 0

        logP = Descriptors.MolLogP(mol)
        mw = Descriptors.MolWt(mol)
        rot_bonds = Lipinski.NumRotatableBonds(mol)
        # Linear solubility estimate from LogP, molecular weight, rotatable
        # bonds and aromatic proportion.
        logS = 0.16 - 0.63 * logP - 0.0062 * mw + 0.066 * rot_bonds - 0.74 * aromatic_proportion

        # Count atoms carrying a positive / negative formal charge; the net
        # charge of the whole molecule is reported separately.
        charges = [atom.GetFormalCharge() for atom in mol.GetAtoms()]
        pos_charge = sum(1 for charge in charges if charge > 0)
        neg_charge = sum(1 for charge in charges if charge < 0)
        formal_charge = Chem.GetFormalCharge(mol)

        return pd.Series({
            "MolecularWeight": mw,
            "LogP": logP,
            "TPSA": Descriptors.TPSA(mol),
            "HBDonors": Descriptors.NumHDonors(mol),
            "HBAcceptors": Descriptors.NumHAcceptors(mol),
            "RotatableBonds": rot_bonds,
            "FractionCSP3": Descriptors.FractionCSP3(mol),
            "HeavyAtoms": mol.GetNumHeavyAtoms(),
            "RingCount": Descriptors.RingCount(mol),
            "AromaticProportion": aromatic_proportion,
            "LogS_ESOL": logS,
            "PositiveCharges": pos_charge,
            "NegativeCharges": neg_charge,
            "FormalCharge": formal_charge
        })
    except Exception:
        # Best effort: a descriptor failure yields a NaN row instead of
        # aborting the whole apply. KeyboardInterrupt/SystemExit still propagate.
        return missing


In [10]:
# Expected descriptor columns, in the order they should appear in the table.
descriptor_names = [
    'MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
    'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount',
    'AromaticProportion', 'LogS_ESOL', 'PositiveCharges', 'NegativeCharges', 'FormalCharge'
]

# Each returned Series becomes one row of the descriptor table.
desc_df = features_df['smiles'].apply(compute_descriptors)
# Select the columns by label instead of overwriting the labels by position:
# a positional relabel would silently mislabel values if the function's key
# order changed, and raises when the column count differs from 14.
desc_df = desc_df.reindex(columns=descriptor_names)



In [11]:
desc_df.head()

Unnamed: 0,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,AromaticProportion,LogS_ESOL,PositiveCharges,NegativeCharges,FormalCharge
0,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,0.5625,-2.505571,0.0,0.0,0.0
1,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,0.4,-2.088842,0.0,0.0,0.0
2,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,0.0,-4.769434,0.0,0.0,0.0
3,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,0.3,-3.677866,0.0,0.0,0.0
4,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,0.0,-0.360281,0.0,0.0,0.0


In [12]:
# Place the identifier/SMILES columns and the descriptor columns side by side,
# aligned on the shared row index.
final_df = pd.concat(objs=[features_df, desc_df], axis='columns')

In [13]:
final_df.head()

Unnamed: 0,mol_id,smiles,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,AromaticProportion,LogS_ESOL,PositiveCharges,NegativeCharges,FormalCharge
0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,0.5625,-2.505571,0.0,0.0,0.0
1,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,0.4,-2.088842,0.0,0.0,0.0
2,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,0.0,-4.769434,0.0,0.0,0.0
3,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,0.3,-3.677866,0.0,0.0,0.0
4,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,0.0,-0.360281,0.0,0.0,0.0


#### Checking for Null Values

In [14]:
final_df.isnull().sum()

mol_id                0
smiles                0
MolecularWeight       0
LogP                  0
TPSA                  0
HBDonors              0
HBAcceptors           0
RotatableBonds        0
FractionCSP3          0
HeavyAtoms            0
RingCount             0
AromaticProportion    0
LogS_ESOL             0
PositiveCharges       0
NegativeCharges       0
FormalCharge          0
dtype: int64

### Engineering of Topological Descriptors

In [19]:
def compute_topological_descriptors(smiles):
    """Compute ring-based and bulk topological descriptors for one SMILES.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule, parsed with
        ``Chem.MolFromSmiles``.

    Returns
    -------
    pandas.Series
        Five values labelled AromaticRings, AromaticHeterocycles,
        AliphaticRings, MolecularComplexity and MolarRefractivity. When the
        SMILES cannot be parsed the same five labels are returned filled with
        NaN, so ``Series.apply`` always expands to the same columns.
    """
    columns = [
        "AromaticRings", "AromaticHeterocycles", "AliphaticRings",
        "MolecularComplexity", "MolarRefractivity",
    ]

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Labelled fallback with one slot per descriptor actually produced.
        return pd.Series(float("nan"), index=columns)

    ring_info = mol.GetRingInfo().AtomRings()

    # Rings in which every member atom is flagged aromatic; computed once and
    # reused for the heterocycle count below.
    aromatic_ring_atoms = [
        ring for ring in ring_info
        if all(mol.GetAtomWithIdx(i).GetIsAromatic() for i in ring)
    ]
    aromatic_rings = len(aromatic_ring_atoms)

    # Aromatic rings containing at least one N (7), O (8) or S (16) atom.
    aromatic_heterocycles = sum(
        1 for ring in aromatic_ring_atoms
        if any(mol.GetAtomWithIdx(i).GetAtomicNum() in (7, 8, 16) for i in ring)
    )

    # Number of aliphatic rings.
    aliphatic_rings = Descriptors.NumAliphaticRings(mol)

    # "Molecular complexity" is reported as the value of FpDensityMorgan1.
    mol_complexity = Descriptors.FpDensityMorgan1(mol)

    # Molar refractivity.
    molar_refractivity = Descriptors.MolMR(mol)

    return pd.Series({
        "AromaticRings": aromatic_rings,
        "AromaticHeterocycles": aromatic_heterocycles,
        "AliphaticRings": aliphatic_rings,
        "MolecularComplexity": mol_complexity,
        "MolarRefractivity": molar_refractivity
    })

In [20]:
# One row of topological descriptors per SMILES string.
topological_desc = features_df.loc[:, 'smiles'].apply(compute_topological_descriptors)



In [21]:
topological_desc.head()

Unnamed: 0,AromaticRings,AromaticHeterocycles,AliphaticRings,MolecularComplexity,MolarRefractivity
0,2.0,1.0,0.0,1.5,62.1622
1,1.0,0.0,1.0,1.266667,55.1017
2,0.0,0.0,4.0,1.142857,86.9438
3,1.0,0.0,0.0,1.2,86.1627
4,0.0,0.0,0.0,1.0,34.712


In [22]:
# final_df is rebuilt from itself, so re-running this cell would otherwise
# append the topological columns a second time. Dropping any that are already
# present keeps the cell idempotent; on a first run nothing is dropped.
final_df = pd.concat(
    [final_df.drop(columns=topological_desc.columns, errors='ignore'), topological_desc],
    axis=1,
)

final_df.head()

Unnamed: 0,mol_id,smiles,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,...,AromaticProportion,LogS_ESOL,PositiveCharges,NegativeCharges,FormalCharge,AromaticRings,AromaticHeterocycles,AliphaticRings,MolecularComplexity,MolarRefractivity
0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,...,0.5625,-2.505571,0.0,0.0,0.0,2.0,1.0,0.0,1.5,62.1622
1,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,...,0.4,-2.088842,0.0,0.0,0.0,1.0,0.0,1.0,1.266667,55.1017
2,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,...,0.0,-4.769434,0.0,0.0,0.0,0.0,0.0,4.0,1.142857,86.9438
3,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,...,0.3,-3.677866,0.0,0.0,0.0,1.0,0.0,0.0,1.2,86.1627
4,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,...,0.0,-0.360281,0.0,0.0,0.0,0.0,0.0,0.0,1.0,34.712


#### Checking for Null Values

In [24]:
final_df.isnull().sum()

mol_id                  0
smiles                  0
MolecularWeight         0
LogP                    0
TPSA                    0
HBDonors                0
HBAcceptors             0
RotatableBonds          0
FractionCSP3            0
HeavyAtoms              0
RingCount               0
AromaticProportion      0
LogS_ESOL               0
PositiveCharges         0
NegativeCharges         0
FormalCharge            0
AromaticRings           0
AromaticHeterocycles    0
AliphaticRings          0
MolecularComplexity     0
MolarRefractivity       0
dtype: int64

### Engineering of Constitutional Descriptors

In [25]:
def count_heteroatoms(mol):
    """Return how many atoms of ``mol`` are N, O, P or S."""
    hetero_atomic_numbers = (7, 8, 15, 16)  # N, O, P, S
    matching = [
        atom for atom in mol.GetAtoms()
        if atom.GetAtomicNum() in hetero_atomic_numbers
    ]
    return len(matching)

def count_halogens(mol):
    """Return how many atoms of ``mol`` are F, Cl, Br or I."""
    halogen_atomic_numbers = (9, 17, 35, 53)  # F, Cl, Br, I
    matching = [
        atom for atom in mol.GetAtoms()
        if atom.GetAtomicNum() in halogen_atomic_numbers
    ]
    return len(matching)

def count_phenols(mol):
    """Return the number of substructure matches of the SMARTS 'c[OH]' in ``mol``."""
    pattern = Chem.MolFromSmarts('c[OH]')
    matches = mol.GetSubstructMatches(pattern)
    return len(matches)

def compute_constitutional_descriptors(smiles):
    """Count heteroatoms, halogens and phenolic groups for one SMILES string.

    Returns a Series labelled Heteroatoms, HalogenCount and PhenolicGroups.
    When the SMILES cannot be parsed, the same three labels are returned with
    ``None`` values.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        counts = {}
        counts["Heteroatoms"] = count_heteroatoms(mol)
        counts["HalogenCount"] = count_halogens(mol)
        counts["PhenolicGroups"] = count_phenols(mol)
        return pd.Series(counts)

    # Unparsable input: keep the labels so rows still align after .apply.
    return pd.Series([None] * 3, index=["Heteroatoms", "HalogenCount", "PhenolicGroups"])

In [26]:
# One row of constitutional counts per SMILES string.
constitutional_desc = features_df.loc[:, 'smiles'].apply(compute_constitutional_descriptors)



In [27]:
# final_df is rebuilt from itself, so re-running this cell would otherwise
# append the constitutional columns a second time. Dropping any that are
# already present keeps the cell idempotent; on a first run nothing is dropped.
final_df = pd.concat(
    [final_df.drop(columns=constitutional_desc.columns, errors='ignore'), constitutional_desc],
    axis=1,
)

final_df.head()

Unnamed: 0,mol_id,smiles,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,...,NegativeCharges,FormalCharge,AromaticRings,AromaticHeterocycles,AliphaticRings,MolecularComplexity,MolarRefractivity,Heteroatoms,HalogenCount,PhenolicGroups
0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,...,0.0,0.0,2.0,1.0,0.0,1.5,62.1622,7,0,0
1,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,...,0.0,0.0,1.0,0.0,1.0,1.266667,55.1017,4,0,0
2,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,...,0.0,0.0,0.0,0.0,4.0,1.142857,86.9438,1,0,0
3,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,...,0.0,0.0,1.0,0.0,0.0,1.2,86.1627,3,0,0
4,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,...,0.0,0.0,0.0,0.0,0.0,1.0,34.712,9,0,0


#### Checking for Null Values

In [29]:
final_df.isnull().sum()

mol_id                  0
smiles                  0
MolecularWeight         0
LogP                    0
TPSA                    0
HBDonors                0
HBAcceptors             0
RotatableBonds          0
FractionCSP3            0
HeavyAtoms              0
RingCount               0
AromaticProportion      0
LogS_ESOL               0
PositiveCharges         0
NegativeCharges         0
FormalCharge            0
AromaticRings           0
AromaticHeterocycles    0
AliphaticRings          0
MolecularComplexity     0
MolarRefractivity       0
Heteroatoms             0
HalogenCount            0
PhenolicGroups          0
dtype: int64

In [30]:
final_df.head()

Unnamed: 0,mol_id,smiles,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,...,NegativeCharges,FormalCharge,AromaticRings,AromaticHeterocycles,AliphaticRings,MolecularComplexity,MolarRefractivity,Heteroatoms,HalogenCount,PhenolicGroups
0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,...,0.0,0.0,2.0,1.0,0.0,1.5,62.1622,7,0,0
1,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,...,0.0,0.0,1.0,0.0,1.0,1.266667,55.1017,4,0,0
2,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,...,0.0,0.0,0.0,0.0,4.0,1.142857,86.9438,1,0,0
3,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,...,0.0,0.0,1.0,0.0,0.0,1.2,86.1627,3,0,0
4,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,...,0.0,0.0,0.0,0.0,0.0,1.0,34.712,9,0,0


In [31]:
final_df.shape

(7831, 24)

In [32]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7831 entries, 0 to 7830
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   mol_id                7831 non-null   object 
 1   smiles                7831 non-null   object 
 2   MolecularWeight       7831 non-null   float64
 3   LogP                  7831 non-null   float64
 4   TPSA                  7831 non-null   float64
 5   HBDonors              7831 non-null   float64
 6   HBAcceptors           7831 non-null   float64
 7   RotatableBonds        7831 non-null   float64
 8   FractionCSP3          7831 non-null   float64
 9   HeavyAtoms            7831 non-null   float64
 10  RingCount             7831 non-null   float64
 11  AromaticProportion    7831 non-null   float64
 12  LogS_ESOL             7831 non-null   float64
 13  PositiveCharges       7831 non-null   float64
 14  NegativeCharges       7831 non-null   float64
 15  FormalCharge         

In [33]:
final_df.describe()

Unnamed: 0,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,AromaticProportion,...,NegativeCharges,FormalCharge,AromaticRings,AromaticHeterocycles,AliphaticRings,MolecularComplexity,MolarRefractivity,Heteroatoms,HalogenCount,PhenolicGroups
count,7831.0,7831.0,7831.0,7831.0,7831.0,7831.0,7831.0,7831.0,7831.0,7831.0,...,7831.0,7831.0,7831.0,7831.0,7831.0,7831.0,7831.0,7831.0,7831.0,7831.0
mean,276.318652,2.365648,59.616917,1.22909,3.56736,4.310177,0.458674,18.573873,1.771166,0.325973,...,0.142511,-0.032435,1.104712,0.266505,0.670796,1.12814,72.164059,4.370706,0.56404,0.142127
std,165.817631,2.365318,58.950671,1.947032,3.245829,4.485703,0.325157,11.345818,1.667047,0.281887,...,0.455835,0.371693,1.101615,0.557178,1.293925,0.327259,41.899839,3.966798,1.454532,0.570319
min,9.012,-44.162,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,-5.0,0.0,0.0,0.0,0.131579,0.0,0.0,0.0,0.0
25%,165.214,1.148,26.3,0.0,2.0,1.0,0.181818,11.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.909091,43.805,2.0,0.0,0.0
50%,240.302,2.3651,46.53,1.0,3.0,3.0,0.416667,16.0,1.0,0.357143,...,0.0,0.0,1.0,0.0,0.0,1.142857,63.6683,4.0,0.0,0.0
75%,343.044,3.65245,77.075,2.0,5.0,6.0,0.727273,23.0,3.0,0.545455,...,0.0,0.0,2.0,0.0,1.0,1.333333,91.0576,6.0,0.0,0.0
max,1999.065,22.6118,1095.85,32.0,67.0,47.0,1.0,132.0,30.0,1.0,...,5.0,3.0,12.0,5.0,30.0,2.0,453.3684,75.0,24.0,25.0


In [34]:
df.head()

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O


In [35]:
# Everything except the SMILES column: the assay labels plus mol_id.
targets_df = df.drop(columns=['smiles'])

In [36]:
targets_df

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020
2,,,,,,,,0.0,,0.0,,,TOX3024
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7826,,,,,,,,0.0,,0.0,,,TOX2725
7827,1.0,1.0,0.0,0.0,1.0,0.0,,,0.0,0.0,,0.0,TOX2370
7828,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX2371
7829,1.0,1.0,0.0,,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,TOX2377


In [46]:
# Join the engineered features with the assay labels on the molecule id.
# validate='one_to_one' makes pandas raise a MergeError if mol_id is duplicated
# on either side, instead of silently multiplying the matching rows.
total_tox_df = pd.merge(final_df, targets_df, on='mol_id', validate='one_to_one')

In [47]:
total_tox_df.head()

Unnamed: 0,mol_id,smiles,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,...,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0
2,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,...,,,,,,0.0,,0.0,,
3,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0
4,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
total_tox_df.head()

Unnamed: 0,mol_id,smiles,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,...,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0
2,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,...,,,,,,0.0,,0.0,,
3,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0
4,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# Map a suffixed 'smiles_x' column back to 'smiles' if the merge produced one.
# NOTE(review): targets_df is built without the 'smiles' column, so no suffix
# collision is expected and this rename should be a no-op; kept as a guard.
# Rebinding the name instead of using inplace=True avoids mutating a frame
# that earlier cells have already displayed.
total_tox_df = total_tox_df.rename(columns={'smiles_x': 'smiles'})

In [51]:
total_tox_df.head()

Unnamed: 0,mol_id,smiles,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,...,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0
2,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,...,,,,,,0.0,,0.0,,
3,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0
4,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
# Persist the combined feature/label table. The row index is written as the
# first (unnamed) column, which is the to_csv default made explicit here.
total_tox_csv_path = './DATA/total_tox_data.csv'
total_tox_df.to_csv(total_tox_csv_path, index=True)