In [182]:
import os

import numpy as np
import pandas as pd

In [183]:
def nan_filter(data, thresh=80, value='zero'):
    """
    Purpose
    -------
    1. Drop columns with all the values as NaNs.
    2. Drop columns whose share of non-NaN values is below a certain limit.
    3. Replace the remaining NaNs with a value.

    Arguments
    ---------
    data: Dataframe. It is not modified; a new dataframe is returned.
    thresh: Minimum percentage of non-NaN values a column must have in
            order to be kept; default is 80 (i.e. 80% of the rows).
    value: Value to replace NaN with; default is zero.
           Options: 'zero', 'mean', 'median'
           Any other option leaves the remaining NaNs untouched.

    Returns
    -------
    A modified Pandas dataframe.
    """

    # Columns that are entirely NaN carry no information at all.
    data = data.dropna(axis='columns', how='all')

    # `dropna(thresh=...)` expects an absolute count of non-NaN values, so the
    # percentage has to be converted into a number of rows first.
    threshold = int((data.shape[0] * thresh) / 100)
    data = data.dropna(axis='columns', thresh=threshold)

    # Fill what is left column by column so that 'mean'/'median' use each
    # column's own statistic.
    if value == 'zero':
        data = data.apply(lambda x: x.fillna(0))
    elif value == 'mean':
        data = data.apply(lambda x: x.fillna(x.mean()))
    elif value == 'median':
        data = data.apply(lambda x: x.fillna(x.median()))

    return data

In [184]:
# Read the two gzip-compressed pickles holding the descriptor tables.
# NOTE: unpickling can execute arbitrary code -- only load files from a
# trusted source.
df_2d = pd.read_pickle(
    '../data/bitter_sweet_2d_descriptors.pkl.gz',
    compression='gzip',
)
df_3d = pd.read_pickle(
    '../data/bitter_sweet_2d_plus_3d_descriptors.pkl.gz',
    compression='gzip',
)

In [185]:
# Report the row count once (taken from the 2D table) and the column count
# of each table.
print("Number of molecules:", len(df_2d))
print("Total number of features in 2D dataset:", len(df_2d.columns))
print("Total number of features in 3D dataset:", len(df_3d.columns))

Number of molecules: 2393
Total number of features in 2D dataset: 1617
Total number of features in 3D dataset: 1830


In [186]:
# Remove the non-descriptor columns (names, SMILES, Canonical SMILES and the
# Target values) so that only the feature columns remain.
df_2d_ = df_2d.drop(columns=["Name", "SMILES", "Canonical SMILES", "Target"])
df_3d_ = df_3d.drop(columns=["Name", "SMILES", "Canonical SMILES", "Target"])

In [187]:
# Inspect the column dtypes of the 2D feature frame and isolate the columns
# stored with the generic "object" dtype (i.e. not numeric).
dt = df_2d_.dtypes
dt_o = dt[dt == "object"]
dt_o_indx = dt_o.index
print("Number of columns with non-numeric datatype:", dt_o.count())
# df_2d[dt_o_indx] # Uncomment to visualize the data frame.

# According to the original author, some cells hold string annotations
# produced by Mordred; casting every column to float64 is intended to turn
# those annotations into NaNs.
df_2d_ = df_2d_.astype(np.float64)
df_3d_ = df_3d_.astype(np.float64)

Number of columns with non-numeric datatype: 914


In [188]:
# Copy the 'Canonical SMILES' and 'Target' columns from the originally loaded
# frames onto the float-cast descriptor frames, so each cleaned table carries
# its identifier and label alongside the numeric features.
# NOTE(review): this mutates df_2d_ / df_3d_ in place; presumably rows are
# matched on the shared index -- confirm.
df_2d_[['Canonical SMILES', 'Target']] = df_2d[['Canonical SMILES', 'Target']]
df_3d_[['Canonical SMILES', 'Target']] = df_3d[['Canonical SMILES', 'Target']]


In [189]:
# Clean up the NaNs that came either from the feature calculation or from the
# float cast above.

# 'zero' (the default) fills every remaining NaN with 0; see the nan_filter
# docstring for the other fill options.
df_2d_zero = nan_filter(df_2d_, value='zero')
df_3d_zero = nan_filter(df_3d_, value='zero')

In [190]:
# Save cleaned data frames for further analysis.
# Create the output directory first so that a fresh checkout (where
# '../output' may not exist yet) does not fail on the first write.
os.makedirs('../output', exist_ok=True)
df_2d_zero.to_pickle('../output/df_2d_zero.pkl.gz', compression='gzip')
df_3d_zero.to_pickle('../output/df_3d_zero.pkl.gz', compression='gzip')