### Imports

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 

### Reading in Dataset and Initial Exploration

In [2]:
# Read the Tox21 CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='./DATA/tox21.csv')

In [3]:
# Preview the first rows to see the label columns alongside mol_id and smiles.
df.head()

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O


In [4]:
# Column dtypes and non-null counts; the gaps in the counts show where labels are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7831 entries, 0 to 7830
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   NR-AR          7265 non-null   float64
 1   NR-AR-LBD      6758 non-null   float64
 2   NR-AhR         6549 non-null   float64
 3   NR-Aromatase   5821 non-null   float64
 4   NR-ER          6193 non-null   float64
 5   NR-ER-LBD      6955 non-null   float64
 6   NR-PPAR-gamma  6450 non-null   float64
 7   SR-ARE         5832 non-null   float64
 8   SR-ATAD5       7072 non-null   float64
 9   SR-HSE         6467 non-null   float64
 10  SR-MMP         5810 non-null   float64
 11  SR-p53         6774 non-null   float64
 12  mol_id         7831 non-null   object 
 13  smiles         7831 non-null   object 
dtypes: float64(12), object(2)
memory usage: 856.6+ KB


In [5]:
# Summary statistics for the numeric (label) columns only; object columns are excluded.
df.describe()

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
count,7265.0,6758.0,6549.0,5821.0,6193.0,6955.0,6450.0,5832.0,7072.0,6467.0,5810.0,6774.0
mean,0.042533,0.03507,0.11727,0.051538,0.128048,0.050324,0.028837,0.161523,0.03733,0.057523,0.158003,0.062445
std,0.201815,0.183969,0.321766,0.22111,0.33417,0.218627,0.167362,0.368044,0.189583,0.232857,0.364776,0.241979
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# (number of rows, number of columns)
df.shape

(7831, 14)

## Target Exploration

#### Finding the continuous and categorical values in the Dataset 

In [15]:
# Keep only the label columns by dropping the identifier and SMILES columns.
target_df = df.drop(columns=['smiles', 'mol_id'])

# Print the frequency of each distinct (non-null) value, one column at a time.
separator = "-----------------------------"
print("Value Counts")
for label_name in target_df:
    print(separator)
    print(target_df[label_name].value_counts())
    print(separator)
    print("\n")

Value Counts
-----------------------------
NR-AR
0.0    6956
1.0     309
Name: count, dtype: int64
-----------------------------


-----------------------------
NR-AR-LBD
0.0    6521
1.0     237
Name: count, dtype: int64
-----------------------------


-----------------------------
NR-AhR
0.0    5781
1.0     768
Name: count, dtype: int64
-----------------------------


-----------------------------
NR-Aromatase
0.0    5521
1.0     300
Name: count, dtype: int64
-----------------------------


-----------------------------
NR-ER
0.0    5400
1.0     793
Name: count, dtype: int64
-----------------------------


-----------------------------
NR-ER-LBD
0.0    6605
1.0     350
Name: count, dtype: int64
-----------------------------


-----------------------------
NR-PPAR-gamma
0.0    6264
1.0     186
Name: count, dtype: int64
-----------------------------


-----------------------------
SR-ARE
0.0    4890
1.0     942
Name: count, dtype: int64
-----------------------------


------------------

#### Null Value Analysis

In [21]:
# Number of missing entries in each column.
df.isna().sum()

NR-AR             566
NR-AR-LBD        1073
NR-AhR           1282
NR-Aromatase     2010
NR-ER            1638
NR-ER-LBD         876
NR-PPAR-gamma    1381
SR-ARE           1999
SR-ATAD5          759
SR-HSE           1364
SR-MMP           2021
SR-p53           1057
mol_id              0
smiles              0
dtype: int64

In [22]:
# Share of missing entries per column, expressed as a percentage.
percent_null = df.isna().mean().mul(100)
percent_null

NR-AR             7.227685
NR-AR-LBD        13.701954
NR-AhR           16.370834
NR-Aromatase     25.667220
NR-ER            20.916869
NR-ER-LBD        11.186311
NR-PPAR-gamma    17.635040
SR-ARE           25.526753
SR-ATAD5          9.692249
SR-HSE           17.417954
SR-MMP           25.807687
SR-p53           13.497638
mol_id            0.000000
smiles            0.000000
dtype: float64

In [25]:
# Count the missing entries in each row, then report the largest count found.
null_counts_per_row = df.isna().sum(axis="columns")
max_null_count = null_counts_per_row.max()
print(max_null_count)

11


In [26]:
# Select every row whose missing-entry count equals the maximum.
rows_with_max_nulls = df.loc[null_counts_per_row.eq(max_null_count)]
rows_with_max_nulls

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
137,,,,,,,,,,0.0,,,TOX25914,NCC(=O)CCC(=O)O
322,0.0,,,,,,,,,,,,TOX25881,CCCCC(CC)CNC(=N)NC(=N)NCCCCCCNC(=N)NC(=N)NCC(C...
343,,,,,,,,,,0.0,,,TOX25083,Cc1cc(N)c2cc(NC(=O)Nc3ccc4nc(C)cc(N)c4c3)ccc2n1
456,,,,,,,,,,1.0,,,TOX31681,Nc1cc(C(F)(F)F)ccc1S
557,,,,,,,,,,0.0,,,TOX26385,Cc1c(C)n(CCN(C)C)c2ccc(C(=O)OCCN(C)C)cc12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6977,,,,,,,,,,1.0,,,TOX25922,CCOC(=O)[C@H](CCc1ccccc1)N[C@H]1CCc2ccccc2N(CC...
7325,,,,,,,,,,0.0,,,TOX26158,Cc1cc(CC(=O)O)c(C)n1-c1ccc(Cl)cc1
7653,,,,,,,,1.0,,,,,TOX1729,O=c1oc2ccccc2c(O)c1Cc1c(O)c2ccccc2oc1=O
7722,,,,,,,,,,0.0,,,TOX26755,COc1cccc2c1C(=O)c1c(O)c3c(c(O)c1C2=O)C[C@@](O)...


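Missing labels make up roughly 7–26% of each assay column (about 17% of all label values overall), and some rows are missing 11 of their 12 labels. The null values are left in place for now; further analysis will determine whether they should be removed.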