# Section-1: Import Libraries

In [1]:
#importing the necessary libraries
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from lightgbm import LGBMClassifier
import pickle as pkl
from boruta import BorutaPy
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Section-2: Data Preparation

In [2]:
#Reading the Original training and testing datasets
train=pd.read_csv("train_II.csv")
test=pd.read_csv("test_II.csv")

In [3]:
#Splitting the ID into Chemical_ID and Assay_ID
train[['SMILES', 'Assay_Id']] = train['Id'].str.split(';', expand=True)
test[['SMILES', 'Assay_Id']] = test['x'].str.split(';', expand=True)

#printing the columns of the testing and training datasets
print('Train Columns:', ', '.join(train.columns))
print('Test Columns:', ', '.join(test.columns))

Train Columns: Id, Expected, SMILES, Assay_Id
Test Columns: x, SMILES, Assay_Id


In [4]:
#Getting the Molecules from SMILES
train['Molecule'] = train['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))
test['Molecule'] = test['SMILES'].apply(lambda x: Chem.MolFromSmiles(x))    

[10:54:47] Explicit valence for atom # 1 Si, 8, is greater than permitted
[10:54:51] Explicit valence for atom # 1 Si, 8, is greater than permitted
[10:54:55] Explicit valence for atom # 1 Si, 8, is greater than permitted
[10:54:56] Explicit valence for atom # 1 Si, 8, is greater than permitted
[10:54:59] Explicit valence for atom # 1 Si, 8, is greater than permitted
[10:55:00] Explicit valence for atom # 1 Si, 8, is greater than permitted


In [5]:
#Removing Null Molecules
train = train[train['Molecule'].notnull()]
test = test[test['Molecule'].notnull()]

In [6]:
#Generating Features from Descriptors
method_list = [method_name for method_name in dir(Descriptors) if callable(getattr(Descriptors, method_name))]    
for method_name in tqdm.tqdm(method_list):
    try:
        method = getattr(Descriptors, method_name)
        smiles = train['SMILES'].tolist()
        molecular_weights = [method(Chem.MolFromSmiles(smile)) for smile in smiles]
        train[method_name] = molecular_weights
        smiles = test['SMILES'].tolist()
        molecular_weights = [method(Chem.MolFromSmiles(smile)) for smile in smiles]
        test[method_name] = molecular_weights
        
    except:
        continue


  train[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molec

1 items had no tests:
    __main__
0 tests in 1 items.
0 passed and 0 failed.
Test passed.


  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molecular_weights
  test[method_name] = molecular_weights
  train[method_name] = molec

In [9]:
#Storing the generated features into csv files
train.to_csv("train_400.csv",index=False)
test.to_csv("test_400.csv",index=False)

# Section-3: Load the Pre-Generated Features

In [10]:
#Loading the features that are generated
train = pd.read_csv("train_400.csv")
test = pd.read_csv("test_400.csv")

#applying a lambda function to the "_ChargeDescriptors" column to create two new columns in the "train_400" DataFrame, named "_ChargeDescriptors_1" and "_ChargeDescriptors_2"
train_400 = train.drop(["Id","SMILES","Molecule"],axis = 1)
train_400["_ChargeDescriptors_1"] = train_400._ChargeDescriptors.apply(lambda x: float(x[1:-1].split(",")[0]))
train_400["_ChargeDescriptors_2"] = train_400._ChargeDescriptors.apply(lambda x: float(x[1:-1].split(",")[1]))
train_400 = train_400.drop("_ChargeDescriptors",axis=1)


#Splitting the train_400 data:
x_train, x_test, y_train, y_test = train_test_split(train_400.drop("Expected",axis=1),train_400["Expected"], test_size=0.2,stratify=train_400["Expected"])
x_train.Assay_Id = x_train.Assay_Id.astype("float")
x_test.Assay_Id = x_test.Assay_Id.astype("float")
test.Assay_Id = test.Assay_Id.astype("float")

# Section-4: Feature Selection

In [11]:
#Using Recursive Feature Elimination with Cross Validation (RFECV) technique to select the optimal set of features for classification using LightGBM Classifier.
rfe = RFECV(LGBMClassifier(),cv=10,scoring="roc_auc",n_jobs=-1, step=50,verbose=3)
rfe.fit(x_train, y_train)

# An empty list selected_features is created to store the names of the selected features.
selected_features = []
for i,j in zip(x_train.columns, rfe.support_):
    if j == True:
        selected_features.append(i)

# This code creates a dictionary to store the mean values of columns in the x_train dataframe that have missing values. 
missing_impute_dict = {}
for col in selected_features:
    if x_train[col].isnull().mean() > 0:
        missing_impute_dict[col] = x_train[col].mean()
        x_train[col] = x_train[col].fillna(x_train[col].mean())


# These two lines of code create new dataframes x_train_rfe and y_train_rfe that contain the selected features and target data for the training set. 
x_train_rfe = x_train[selected_features].reset_index(drop=True)
y_train_rfe = y_train.reset_index(drop=True)

Fitting estimator with 404 features.
Fitting estimator with 354 features.
Fitting estimator with 304 features.
Fitting estimator with 254 features.


In [12]:
# Initializing a BorutaPy feature selector object using an LGBMClassifier as the base estimator
feat_selector = BorutaPy(LGBMClassifier(num_boost_round = 100), n_estimators='auto', verbose=0, random_state=1)
# Fit the feature selector to the training data
feat_selector.fit(x_train_rfe.values, y_train_rfe.values)

# An empty list selected_features is created to store the names of the selected features.
selected_features = []
for i,j in zip(x_train_rfe.columns, feat_selector.support_):
    if j == True:
        selected_features.append(i)



In [13]:
# Selecting the features from the training data that were determined to be important by the BorutaPy feature selector
x_train_boruta = x_train[selected_features].reset_index(drop=True)
y_train_boruta = y_train.reset_index(drop=True)

# Using the pickle module to dump the column names of the x_train_rfe DataFrame to the file
with open("rfe_features.pkl","wb") as f:
    pkl.dump(x_train_rfe.columns,f)

# Using the pickle module to dump the column names of the x_train_boruta DataFrame to the file
with open("boruta_features.pkl","wb") as f:
    pkl.dump(x_train_boruta.columns,f)
    

# Section-5: Load the Pre-Selected Features

In [14]:
#Using the pickle module to load the features
selected_features_rfe = pkl.load(open("rfe_features.pkl","rb"))
selected_features_boruta = pkl.load(open("boruta_features.pkl","rb"))

In [15]:
# This code creates a dictionary to store the mean values of columns in the x_train dataframe that have missing values. 
missing_impute_dict = {}
for col in selected_features_boruta:
    if x_train[col].isnull().mean() > 0:
        missing_impute_dict[col] = x_train[col].mean()
        x_train[col] = x_train[col].fillna(x_train[col].mean())

In [16]:
test_boruta = test[selected_features_boruta]
x_test_boruta = x_test[selected_features_boruta]
x_train_boruta = x_train[selected_features_boruta]

# Section-6: Modeling

In [17]:
#Model-1:
#Selecting the training data with only Toxicity Expected value “1” 
train_400_ones = train_400.loc[train_400.Expected==1]
#Selecting the training data with only Toxicity Expected value “2” 
train_400_twos = train_400.loc[train_400.Expected == 2][0:train_400_ones.shape[0]]
train_400_ones_twos = pd.concat([train_400_ones, train_400_twos],axis=0)

#Splitting the data for training
x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(train_400_ones_twos.drop("Expected",axis=1),train_400_ones_twos["Expected"], test_size=0.1, stratify=train_400_ones_twos["Expected"], random_state=1000)
x_train_2.Assay_Id = x_train_2.Assay_Id.astype("float")
x_test_2.Assay_Id = x_test_2.Assay_Id.astype("float")
x_train_b = x_train_2[selected_features_boruta]

#Initializing an LGBMClassifier object with the specified hyperparameters
model_1 = LGBMClassifier(boosting_type="goss", n_estimators = 10000,class_weight = "balanced",max_depth=30, min_split_gain=0.6, 
                      importance_type="shap",reg_lambda = 0.2, num_leaves = 50,subsample_for_bin=500000, min_child_samples=20,
                     min_child_weight = 0.3, random_state=10000)
print("Model Creation")
model_1.fit(x_train_b, y_train_2)
print("Training Done")
pred=model_1.predict(test_boruta)
print("Model-1 Prediction Done")


Model Creation
Training Done
Model-1 Prediction Done


In [18]:
#Model-2:
#Selecting the training data with only Toxicity Expected value “1” 
train_400_ones = train_400.loc[train_400.Expected==1]
#Selecting the training data with only some Toxicity Expected value “2” 
train_400_twos = train_400.loc[train_400.Expected == 2][0:20]
train_400_ones_twos = pd.concat([train_400_ones, train_400_twos],axis=0)

#Splitting the data for training
x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(train_400_ones_twos.drop("Expected",axis=1), train_400_ones_twos["Expected"], test_size=0.1, stratify=train_400_ones_twos["Expected"], random_state=1000)
x_train_2.Assay_Id = x_train_2.Assay_Id.astype("float")
x_test_2.Assay_Id = x_test_2.Assay_Id.astype("float")
x_train_a = x_train_2[selected_features_boruta]

#Initializing an LGBMClassifier object with the specified hyperparameters
model_2 = LGBMClassifier(boosting_type="goss", n_estimators = 10000,class_weight = "balanced",max_depth=30, min_split_gain=0.6, 
                      importance_type="shap",reg_lambda = 0.2, num_leaves = 50,subsample_for_bin=500000, min_child_samples=20,
                     min_child_weight = 0.3, random_state=10000)
print("Model Creation")
model_2.fit(x_train_a, y_train_2)
print("Training Done")
pred=model_2.predict(test_boruta)
print("Model-2 Prediction Done")

Model Creation
Training Done
Model-2 Prediction Done


In [19]:
#Model-3:
#Selecting the training data with only some Toxicity Expected value “1” 
train_400_ones = train_400.loc[train_400.Expected==1][0:100]
#Selecting the training data with only Toxicity Expected value “2” 
train_400_twos = train_400.loc[train_400.Expected == 2]
train_400_ones_twos = pd.concat([train_400_ones, train_400_twos],axis=0)

#Splitting the data for training
x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(train_400_ones_twos.drop("Expected",axis=1), train_400_ones_twos["Expected"], test_size=0.2, stratify=train_400_ones_twos["Expected"], random_state =1000)
x_train_2.Assay_Id = x_train_2.Assay_Id.astype("float")
x_test_2.Assay_Id = x_test_2.Assay_Id.astype("float")
x_train_a = x_train_2[selected_features_boruta]
x_test_a = x_test_2[selected_features_boruta]

#Initializing an LGBMClassifier object with the specified hyperparameters
model_3= LGBMClassifier(boosting_type="goss", n_estimators = 10000,class_weight = "balanced",max_depth=30, min_split_gain=0.7, 
                      importance_type="shap",reg_lambda = 0.2, num_leaves = 50,subsample_for_bin=500000, min_child_samples=20,
                     min_child_weight = 0.2, random_state=10000)
print("Model Creation")
model_3.fit(x_train_boruta, y_train)
print("Training Done")
pred=model_3.predict(test_boruta) 
print("Mode-3 Prediction Done")

Model Creation
Training Done
Mode-3 Prediction Done


# Section-7: Result

In [20]:
# Predictions are made on the x_test_boruta data using the predict_proba method of three different models: model_1, model_2, and model_3.
# The first column of the predicted probabilities ([:,0]) is stored in the variables preds_1, preds_2, and preds_3.
preds_1 = model_1.predict_proba(x_test_boruta)[:,0]
preds_2 = model_2.predict_proba(x_test_boruta)[:,0]
preds_3 = model_3.predict_proba(x_test_boruta)[:,0]


# The predictions from each model are added to the DataFrame as columns with the names "model_1", "model_2", and "model_3".
# The actual values from the y_test data are added to the DataFrame as a column with the name "original".
preds_df = pd.DataFrame()
preds_df["model_1"] = preds_1
preds_df["model_2"] = preds_2
preds_df["model_3"] = preds_3
preds_df["original"] = y_test.values
preds_df

Unnamed: 0,model_1,model_2,model_3,original
0,0.041255,0.999036,0.092928,2
1,0.076272,0.998866,0.009817,2
2,0.000440,0.999605,0.000168,2
3,0.010314,0.999447,0.009509,2
4,0.001226,0.999657,0.000965,2
...,...,...,...,...
15071,0.000464,0.999459,0.002518,2
15072,0.270722,0.999616,0.752566,2
15073,0.448186,0.999588,0.757968,2
15074,0.000007,0.997006,0.000021,2


In [21]:
# Initializing a LogisticRegression object with a random state of 1000 for reproducibility
model_new = LogisticRegression(random_state=1000)
# Fitting the LogisticRegression model to the data in preds_df, using all columns except "actual" as features and "actual" as the target variable
model_new.fit(preds_df.drop("original",axis=1), preds_df.original)
# Using the fitted LogisticRegression model to make predictions on the data in preds_df, using all columns except "actual" as features
preds = model_new.predict(preds_df.drop("original", axis=1))

In [22]:
# Predictions are made on the test_boruta data using the predict_proba method of three different models: model_1, model_2, and model_3.
# The first column of the predicted probabilities ([:,0]) is stored in the variables preds_1, preds_2, and preds_3.
preds_1 = model_1.predict_proba(test_boruta)[:,0]
preds_2 = model_2.predict_proba(test_boruta)[:,0]
preds_3 = model_3.predict_proba(test_boruta)[:,0]


# The predictions from each model are added to the DataFrame as columns with the names "model_1", "model_2", and "model_3".
# The actual values from the y_test data are added to the DataFrame as a column with the name "original".
preds_df = pd.DataFrame()
preds_df["model_1"] = preds_1
preds_df["model_2"] = preds_2
preds_df["model_3"] = preds_3
preds_df

Unnamed: 0,model_1,model_2,model_3
0,0.844062,0.999556,0.415881
1,0.003884,0.999229,0.002847
2,0.014557,0.999349,0.008643
3,0.825623,0.999481,0.066118
4,0.421920,0.999334,0.544194
...,...,...,...
10989,0.340885,0.997729,0.386947
10990,0.018078,0.999365,0.037469
10991,0.002020,0.995790,0.000528
10992,0.592985,0.998912,0.210351


In [23]:
# Using the fitted LogisticRegression model to make predictions on the data in preds_df
preds = model_new.predict(preds_df)

# Creating an empty DataFrame to store the test data
test_df = pd.DataFrame()
test_df["ID"] = test.x
test_df["pred_proba"] = preds

# Creating a submission DataFrame with the required columns
submission_val=pd.DataFrame({'Id': test.x, 'Predicted': preds.astype('int')})
# Writing the submission DataFrame to a CSV file
submission_val.to_csv("Submission.csv",index=False)  
print("Output results created in csv file")


Output results created in csv file
