## **Importing Libraries**

In [1]:
%%capture
!pip install mordred
!pip install rdkit


In [2]:
# Importing Libraries
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw, PandasTools, AllChem

import mordred
from mordred import Calculator, descriptors


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Notebook-wide settings: render up to 2000 columns when a frame is displayed,
# and suppress every warning for the rest of the session.
pd.options.display.max_columns = 2000
warnings.simplefilter("ignore")

In [4]:
# Apply seaborn's 'whitegrid' style to the plots drawn later in the notebook
sns.set(style='whitegrid')

In [13]:
# Load the RDKit 3D-descriptor table; the relative path means the CSV must sit in the working directory
df = pd.read_csv('4-3D descriptors RDKIT.csv')

## **Data Preprocessing**



1.   Remove missing values and non-numerical values
2.   Remove constant-valued features
3.   Remove highly correlated features





In [14]:
# Drop fully duplicated rows. The surviving rows keep their original index labels
# (no reset_index / ignore_index is applied here), so the index may contain gaps.
df1 = df.drop_duplicates()


In [15]:
# (rows, columns) of the de-duplicated frame
df1.shape

(17597, 12)

In [16]:
# Preview the first rows of the de-duplicated frame
df1.head()

Unnamed: 0,values,PMI1,PMI2,PMI3,NPR1,NPR2,RadiusOfGyration,InertialShapeFactor,Eccentricity,Asphericity,SpherocityIndex,PBF
0,1,135.513538,255.808138,322.030592,0.42081,0.79436,1.68246,0.005862,0.907149,0.210838,0.199398,0.395514
1,1,1012.625649,3430.880212,4079.681989,0.248212,0.840968,3.723104,0.00083,0.968706,0.431572,0.208437,0.833922
2,1,7289.793059,10527.18365,14056.60532,0.518603,0.748914,4.65984,0.000103,0.855015,0.1353,0.434389,1.641599
3,1,1465.559555,6004.298168,7318.618475,0.200251,0.820414,4.357687,0.00056,0.979745,0.517477,0.068108,0.510451
4,1,1966.818432,9982.980466,11066.41502,0.177729,0.902097,4.81412,0.000459,0.98408,0.559647,0.16112,0.954113


In [None]:
# Total number of missing cells across every column of df1
df1.isnull().sum().sum()

In [18]:
# Partition df1's columns by dtype: bool columns go to `column_bool`,
# object columns are ignored, and every other dtype is collected in `column_num`.
column_num, column_bool = [], []
for name, dtype in df1.dtypes.items():
    if dtype == 'bool':
        column_bool.append(name)
    elif dtype != 'object':
        column_num.append(name)

In [None]:
# Number of columns whose dtype is neither object nor bool
len(column_num)

In [None]:
# Names of the bool-dtype columns collected above (an empty list if there are none)
column_bool

In [21]:
# To calculate correlation and find highly correlated columns
def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    For every pair of columns (i, j) with j positioned before i, column i is
    reported when the absolute pairwise correlation (``DataFrame.corr`` default
    method) is strictly greater than ``threshold``. The first column of any
    correlated group is therefore never reported.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared. Only numeric and bool columns take
        part; other columns are ignored rather than raising.
    threshold : float
        Absolute-correlation cut-off (exclusive).

    Returns
    -------
    set
        Column labels to drop. Empty when fewer than two usable columns exist.
    """
    # Select usable columns explicitly: pandas >= 2.0 no longer drops
    # non-numeric columns silently inside ``corr`` and raises instead.
    numeric = dataset.select_dtypes(include=["number", "bool"])
    if numeric.shape[1] < 2:
        return set()  # nothing to compare against

    corr_matrix = numeric.corr()
    # We are interested in the absolute coefficient; NaN entries compare as False.
    exceeds = corr_matrix.abs().to_numpy() > threshold
    # The strict lower triangle holds exactly the (i, j) pairs with j < i,
    # so a row with any True entry is a column correlated with an earlier one.
    flagged = np.tril(exceeds, k=-1).any(axis=1)
    return set(corr_matrix.columns[flagged])

In [None]:
# Dropping highly correlated features.
# The class label ('values') is left out of the correlation scan so that a descriptor
# which merely tracks the label closely is never discarded for being predictive.
corr_features = correlation(df1.drop(columns='values'), 0.80)
print("No. of features to drop : ",len(corr_features))

# Re-bind rather than mutate in place, so re-running this cell cannot fail on
# columns that were already removed from the same object.
df1 = df1.drop(columns=list(corr_features))

In [None]:
# (rows, columns) once the highly correlated columns have been removed
df1.shape

In [24]:
# Preview the remaining columns after the correlation filter
df1.head()

Unnamed: 0,values,PMI1,PMI2,NPR1,NPR2,RadiusOfGyration,InertialShapeFactor,SpherocityIndex,PBF
0,1,135.513538,255.808138,0.42081,0.79436,1.68246,0.005862,0.199398,0.395514
1,1,1012.625649,3430.880212,0.248212,0.840968,3.723104,0.00083,0.208437,0.833922
2,1,7289.793059,10527.18365,0.518603,0.748914,4.65984,0.000103,0.434389,1.641599
3,1,1465.559555,6004.298168,0.200251,0.820414,4.357687,0.00056,0.068108,0.510451
4,1,1966.818432,9982.980466,0.177729,0.902097,4.81412,0.000459,0.16112,0.954113


In [25]:
# Persist the reduced descriptor table without the index column.
# `index` is a boolean flag, so pass False explicitly instead of relying on None being falsy.
df1.to_csv('RDKit_3D_truncated.csv', index=False)

In [None]:
# Summary of the 'values' column (dtype and non-null count)
df1['values'].info()

In [None]:
# Bar chart of how many rows carry each class in the 'values' column
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(x=df1["values"], palette="coolwarm", ax=ax)
ax.set_title("Class Distribution")
ax.set_xlabel("Class (0 or 1)")
ax.set_ylabel("Count")
plt.show()

In [31]:
from sklearn.preprocessing import StandardScaler

# Standardise every descriptor column; the class label ('values') is excluded.
# Features are selected by name instead of by position, and df1's row labels are
# carried over so scaled_DF stays index-aligned with df1['values'] even when
# duplicate rows were dropped earlier (df1's index is not necessarily 0..n-1).
features = df1.drop(columns='values')
scaled_DF = pd.DataFrame(
    StandardScaler().fit_transform(features),
    columns=features.columns,
    index=features.index,
)

In [32]:
# Preview the standardised descriptor columns
scaled_DF.head()

Unnamed: 0,PMI1,PMI2,NPR1,NPR2,RadiusOfGyration,InertialShapeFactor,SpherocityIndex,PBF
0,-0.394453,-0.253658,1.34369,-0.597623,-1.924884,0.524304,0.162323,-1.331211
1,-0.245471,-0.171661,0.0503,-0.142374,-0.548211,-0.043531,0.238432,-0.064677
2,0.820737,0.011602,2.07652,-1.041524,0.083736,-0.125664,2.140936,2.268654
3,-0.168538,-0.105202,-0.309104,-0.343132,-0.120105,-0.074081,-0.943124,-0.999163
4,-0.083397,-0.002452,-0.477879,0.454718,0.187818,-0.085495,-0.159973,0.282549


In [33]:
# Target vector: the 'values' column of df1 (it keeps df1's index labels)
y = df1['values']

In [35]:
# Preview the first target labels
y.head()

Unnamed: 0,values
0,1
1,1
2,1
3,1
4,1


In [None]:
# Number of labels; should equal the row count of scaled_DF
y.shape

In [None]:
# (rows, columns) of the standardised feature frame
scaled_DF.shape

# **SMOTE**

In [None]:
!pip install imbalanced-learn

In [39]:
from imblearn.over_sampling import SMOTE

# Oversample so that classes 0 and 1 are each requested at 11000 rows; the fixed
# random_state keeps the synthetic rows repeatable between runs.
# NOTE(review): the whole dataset is resampled here. If a train/test split follows
# later, synthetic rows may leak into the held-out portion; confirm this is intended.
target_counts = {0: 11000, 1: 11000}
smote = SMOTE(sampling_strategy=target_counts, random_state=42)

# Resample the standardised features together with their labels
X_resampled, y_resampled = smote.fit_resample(scaled_DF, y)

In [40]:
# Standardise again after oversampling: the scaler is fitted on the resampled
# features, and the result is wrapped back into a frame with the same column names.
scaler = StandardScaler()
resampled_values = scaler.fit_transform(X_resampled)
X_resampled_scaled = pd.DataFrame(resampled_values, columns=X_resampled.columns)

In [41]:
# Put the scaled features and the resampled labels side by side (column-wise concat)
final_parts = [X_resampled_scaled, y_resampled]
final_df = pd.concat(final_parts, axis="columns")

# Write the combined table to disk without the index column
final_df.to_csv('RDKit3D_scaled_data.csv', index=False)

In [42]:
# Preview the re-standardised, resampled features
X_resampled_scaled.head()

Unnamed: 0,PMI1,PMI2,NPR1,NPR2,RadiusOfGyration,InertialShapeFactor,SpherocityIndex,PBF
0,-0.416267,-0.260116,1.345511,-0.602951,-1.931896,0.585006,0.157481,-1.340883
1,-0.25662,-0.175355,0.052397,-0.147606,-0.548179,-0.046893,0.233452,-0.067527
2,0.885918,0.014085,2.078184,-1.046947,0.087002,-0.138292,2.132503,2.278375
3,-0.174179,-0.106656,-0.30693,-0.348407,-0.117882,-0.080889,-0.94596,-1.007047
4,-0.082943,-0.000443,-0.475669,0.449613,0.191616,-0.093591,-0.16423,0.28157


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Tally the rows per class in the resampled data and show the numbers
class_counts = final_df['values'].value_counts()
print(class_counts)

# Draw the same distribution as a bar chart
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(x=final_df["values"], palette="coolwarm", ax=ax)
ax.set_title("Class Distribution")
ax.set_xlabel("Class")
ax.set_ylabel("Count")
plt.show()

In [45]:
# Labels of the resampled dataset. NOTE: this rebinds `y`, which previously held df1['values'];
# re-running the SMOTE cell after this one would pair these labels with scaled_DF.
y = final_df['values']

In [46]:
# Number of labels in the resampled dataset
y.shape

(22000,)

In [47]:
# Feature matrix: every column of final_df except the 'values' label
X = final_df.drop(columns='values')

In [48]:
# (rows, columns) of the final feature matrix
X.shape

(22000, 8)

In [49]:
# Preview the final feature matrix
X.head()

Unnamed: 0,PMI1,PMI2,NPR1,NPR2,RadiusOfGyration,InertialShapeFactor,SpherocityIndex,PBF
0,-0.416267,-0.260116,1.345511,-0.602951,-1.931896,0.585006,0.157481,-1.340883
1,-0.25662,-0.175355,0.052397,-0.147606,-0.548179,-0.046893,0.233452,-0.067527
2,0.885918,0.014085,2.078184,-1.046947,0.087002,-0.138292,2.132503,2.278375
3,-0.174179,-0.106656,-0.30693,-0.348407,-0.117882,-0.080889,-0.94596,-1.007047
4,-0.082943,-0.000443,-0.475669,0.449613,0.191616,-0.093591,-0.16423,0.28157
