In [None]:
# TODO (still to add):
# - remove correlated features
# - show that the train and test distributions are similar
# - fix the random seed (use one seed consistently)
# - permutation importance / SHAP?
# - measure diversity?

In [None]:
# Seed passed as `random_state` to the train/test split and the XGBoost model below.
RANDOM_SEED: int = 1147

In [None]:
!pip install pandas openpyxl numpy

In [None]:
import pandas as pd
import numpy as np
# !pip install openpyxl
# Load the raw dataset; the relative path resolves only when the notebook is
# run from inside the GitHub repository.
df = pd.read_excel(io='mp_dataset_initial.xlsx')

In [None]:
# (rows, columns) of the raw dataset
df.shape

In [None]:
df.columns  # column names available in the raw dataset

In [None]:
df.info()  # dtypes and non-null counts per column (17 columns at this point)

In [None]:
df.describe()  # summary statistics of the numeric columns

In [None]:
df.head(10)  # first 10 rows of the raw data

**Dropping the source-link (`DOI`) column – it has no predictive value**

In [None]:
# Drop the source-link column, which carries no predictive signal.
# errors='ignore' keeps the cell idempotent: re-running it after 'DOI' has
# already been removed no longer raises KeyError.
df = df.drop(columns='DOI', errors='ignore')
print(f"After dropping DOI column: the number of feature columns are {df.shape[1]}")

# **Null Values**

In [None]:
# Total number of missing cells in the frame; columns with nulls are removed below.
null_total = df.isna().sum().sum()
print(f"The number of nulls in df: {null_total}")

In [None]:
# Per-column null counts, largest first
df.isna().sum().sort_values(ascending=False)

In [None]:
# Drop the four columns with too many missing values.
# errors='ignore' makes the cell safe to re-run once the columns are gone
# (the original raised KeyError on a second execution).
df = df.drop(
    columns=['PDI', 'Polymer Mn', "Polymer Mw", "Polymer Molecular Weight (unit not specified)"],
    errors='ignore',
)
print(f"After dropping columns: the number of columns are {df.shape[1]}")

# Duplicates

In [None]:
# Report fully duplicated rows, then keep only the first occurrence of each.
n_duplicates = df.duplicated().sum()
print(f"Number of duplicated rows: {n_duplicates}")
df = df.drop_duplicates()  # surviving rows keep their original index labels

# **Zero variance columns**

In [None]:
# A column with exactly one distinct value cannot help a model.
unique_counts = df.nunique()
zero_var_cols = unique_counts.index[unique_counts.eq(1)].tolist()
print("Zero-variance columns:", zero_var_cols)


In [None]:
# Re-check dtypes and non-null counts after the column drops and de-duplication
df.info()

In [None]:
# Preview the first 11 rows of the cleaned frame
df.head(11)

# **Canonicalization**

In [None]:
#!pip install rdkit
# !pip install rdkit-pypi
from rdkit import Chem # module chem
from rdkit.Chem import Draw, Descriptors

In [None]:
def canonicalize_smiles(smiles, isomeric:bool=True):
    """
    Convert a SMILES string to its RDKit canonical form.

    Args:
        smiles (str): Input SMILES string. Non-string values (e.g. ``None`` or
            the float NaN that pandas uses for empty cells) are treated as
            invalid instead of being passed to RDKit.
        isomeric (bool): Forwarded to ``Chem.MolToSmiles`` as
            ``isomericSmiles``. Defaults to True.

    Returns:
        str | None: Canonical SMILES, or None when the input is not a string
        or RDKit cannot parse it.
    """
    # Guard first: Chem.MolFromSmiles raises on non-string input, which would
    # abort a whole DataFrame.apply() because of a single empty cell.
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)  # None when the SMILES is invalid
    if mol is None:
        return None
    return Chem.MolToSmiles(mol, isomericSmiles=isomeric)

In [None]:
# Canonicalize every drug SMILES so identical molecules share one representation.
drug_smiles = df['Drug SMILES']
df['Drug canonical_smiles'] = drug_smiles.apply(canonicalize_smiles)

In [None]:
# Confirm the new 'Drug canonical_smiles' column was added
df.head()

In [None]:
from rdkit.Chem import PandasTools # module from rdkit.Chem

# Build RDKit Mol objects from the canonical SMILES and store them in a new
# 'Drug molecule' column; the call modifies `df` rather than returning a frame.
PandasTools.AddMoleculeColumnToFrame(df, smilesCol='Drug canonical_smiles', molCol='Drug molecule')

df.head()

In [None]:
# Define a function to calculate key molecular descriptors
def calculate_key_descriptors(mol):
    """Calculate key molecular descriptors for a molecule.

    Args:
        mol: RDKit Mol object, or None when no molecule could be built
            (e.g. an unparsable SMILES).

    Returns:
        dict: 'Drug MW', 'Drug LogP' and 'Drug TPSA' mapped to their values.
        When ``mol`` is None every value is NaN, so one bad row does not
        abort the descriptor loop and the row stays aligned with the frame.
    """
    if mol is None:
        missing = float('nan')
        return {'Drug MW': missing, 'Drug LogP': missing, 'Drug TPSA': missing}

    return {
        'Drug MW': Descriptors.MolWt(mol),
        'Drug LogP': Descriptors.MolLogP(mol),  # lipophilicity (octanol-water partition coefficient)
        'Drug TPSA': Descriptors.TPSA(mol),  # topological polar surface area
    }

# One descriptor dict per row of the 'Drug molecule' column, in row order
descriptor_data = [calculate_key_descriptors(mol) for mol in df['Drug molecule']]

# Reuse df's index so the descriptor rows line up with the original rows
descriptor_df = pd.DataFrame(descriptor_data, index=df.index)

# Side-by-side concatenation: same rows, extra descriptor columns
df_with_descriptors = pd.concat([df, descriptor_df], axis=1)

# Sanity check: row count unchanged, column count increased
print(f"Dataset shape: {df_with_descriptors.shape}")
df_with_descriptors.head()

# Correlation analysis

The `Formulation Method` column has 4 unique values, so it will be one-hot encoded; with the first level dropped this adds 3 new features.

The `Drug` column is redundant because the descriptor columns Drug MW, Drug TPSA and Drug LogP already describe the drug.

There will be 11 numeric columns (10 features plus the target, `Release`) and 3 one-hot-encoded columns.

## Pearson

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Numeric columns used in the analysis (the target, 'Release', is included)
num_cols = [
    'LA/GA',
    'Initial Drug-to-Polymer Ratio',
    'Particle Size',
    'Drug Loading Capacity',
    'Drug Encapsulation Efficiency',
    'Solubility Enhancer Concentration',
    'Time',
    'Release',
    "Drug MW",
    "Drug LogP",
    "Drug TPSA",
]

# One-hot encode the categorical column; the first level is dropped
cat_cols = ['Formulation Method']
encoded = pd.get_dummies(df_with_descriptors[cat_cols], drop_first=True)

# Numeric columns followed by the encoded categorical columns
df_corr = pd.concat([df_with_descriptors[num_cols], encoded], axis=1)

correlation_matrix = df_corr.corr()

# Hide the upper triangle (diagonal included) for a cleaner plot
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={'label': 'Correlation Coefficient'},
    ax=ax,
)
ax.set_title('Pearson Correlation Heatmap for All Numerical Features', fontweight='bold', pad=20)
fig.tight_layout()

fig.savefig("correlation_heatmap.png", dpi=300, bbox_inches='tight')
plt.show()



Pearson correlation captures linear relationships.

The highest correlations between features are:

Drug Loading Capacity and Initial Drug-to-Polymer Ratio: 0.87

Drug TPSA and Drug MW: 0.87

Drug Encapsulation Efficiency and Drug Loading Capacity: 0.48

The feature most correlated with the target variable (Release) is Time: 0.43



## Spearman

In [None]:
# Rank-based (Spearman) correlation of the same columns
correlation_matrix = df_corr.corr(method='spearman')

# Hide the upper triangle (diagonal included) for a cleaner plot
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={'label': 'Correlation Coefficient'},
    ax=ax,
)
ax.set_title('Spearman Correlation Heatmap for All Numerical Features', fontweight='bold', pad=20)
fig.tight_layout()

Spearman correlation is rank-based, so it captures monotonic relationships, including non-linear ones.

The most highly correlated features are:

Drug Loading Capacity and Initial Drug-to-Polymer Ratio: 0.9

Drug TPSA and Drug MW: 0.6

Drug Encapsulation Efficiency and Drug Loading Capacity: 0.51

Drug LogP and Solubility Enhancer Concentration: 0.45

Release and Time: 0.67


In [None]:
# Number of rows per formulation method (the column that is one-hot encoded above)
df_with_descriptors["Formulation Method"].value_counts()

# Outliers

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Horizontal boxplots of every numeric column for visual outlier inspection
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df_with_descriptors[num_cols], orient='h', ax=ax)

fig.savefig("outlier_detection.png", dpi=300, bbox_inches='tight')
plt.show()


# Modelling

## Regression

In [None]:
df_corr  # modelling frame: numeric columns plus the one-hot encoded formulation method

In [None]:

TARGET = "Release"

# Every other df_corr column (numeric and one-hot encoded) is a model input
FEATURES = df_corr.columns[df_corr.columns != TARGET].tolist()

In [None]:
# NOTE(review): the split below uses df_corr, not df — df_corr.shape may be the more relevant check
df.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_all, y_all = df_corr[FEATURES], df_corr[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=RANDOM_SEED
)

In [None]:
def get_regression_metrics(model, X, y_true):
    """
    Get a dictionary with regression metrics.

    Args:
        model: fitted estimator exposing ``predict(X)``.
        X: feature matrix passed unchanged to ``model.predict``.
        y_true: ground-truth target values (array-like).

    Returns:
        dict: keys 'r2', 'mse', 'mae', 'max_error' and 'mape' (in percent).

    Raises:
        ValueError: if the predictions cannot be reshaped to ``y_true``'s shape.

    Note:
        ``r2_score`` (sklearn.metrics) must already be imported in the
        notebook when this function is called.
    """
    # Work on plain float arrays: this removes pandas index alignment and,
    # by reshaping the predictions to the target's shape, stops an (n, 1)
    # prediction from broadcasting against an (n,) target into an (n, n) matrix.
    y_true = np.asarray(y_true, dtype=float)
    y_predicted = np.asarray(model.predict(X), dtype=float).reshape(y_true.shape)

    residuals = y_true - y_predicted
    abs_residuals = np.abs(residuals)

    mae = np.mean(abs_residuals)
    mse = np.mean(np.square(residuals))
    maximum_error = np.max(abs_residuals)
    r2 = r2_score(y_true, y_predicted)
    # Floor the denominator at 1e-4 so targets at zero do not divide by zero
    mape = np.mean(abs_residuals / np.maximum(0.0001, np.abs(y_true))) * 100

    metrics_dict = {
        "r2": r2,
        'mse': mse,
        'mae': mae,
        'max_error': maximum_error,
        'mape': mape
    }

    return metrics_dict

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Baseline model: ordinary least squares fitted on the training split
linear_regressor = LinearRegression().fit(X_train, y_train)

# Same metric set on both splits so over- or under-fitting is visible
linear_regressor_results_train = get_regression_metrics(linear_regressor, X_train, y_train)
linear_regressor_results_test = get_regression_metrics(linear_regressor, X_test, y_test)

print("Linear Regression Results - Train Set:", linear_regressor_results_train)
print("Linear Regression Results - Test Set:", linear_regressor_results_test)

In [None]:
# Inspect the target distribution: many samples sit close to 0, which makes MAPE explode.
# display() is required because only a cell's last expression is rendered
# automatically; the bare describe() call was silently discarded before.
display(df['Release'].describe())

fig, ax = plt.subplots()
ax.hist(df['Release'], bins=30)
ax.set(xlabel='Release', ylabel='Count', title='Distribution of the target (Release)')
plt.show()


## KRR

In [None]:
from sklearn.kernel_ridge import KernelRidge

# Baseline RBF kernel ridge with scikit-learn's default hyperparameters
# (alpha defaults to 1.0, so the model is still regularized), fitted on the
# raw, unscaled features.
krr = KernelRidge(kernel='rbf').fit(X_train, y_train)

# Same metric set as the linear baseline, on both splits
krr_results_train = get_regression_metrics(krr, X_train, y_train)
krr_results_test = get_regression_metrics(krr, X_test, y_test)

print("KRR Results - Train Set:", krr_results_train)
print("KRR Results - Test Set:", krr_results_test)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Same RBF kernel ridge, but with the features standardized first
scaled_krr_steps = [
    ('scaling', StandardScaler()),
    ('krr', KernelRidge(kernel="rbf")),
]
pipe_w_scaling = Pipeline(scaled_krr_steps)
pipe_w_scaling.fit(X_train, y_train)

pipeline_results_train = get_regression_metrics(pipe_w_scaling, X_train, y_train)
pipeline_results_test = get_regression_metrics(pipe_w_scaling, X_test, y_test)

print("Pipeline Results - Train Set:", pipeline_results_train)
print("Pipeline Results - Test Set:", pipeline_results_test)

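Kernel ridge regression (KRR) with an RBF kernel performed much better than linear regression because the kernel lets the model capture non-linear relationships.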

## KRR with Regularization and Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

# Candidate scalers; None means the scaling step is skipped
scalers = [StandardScaler(), MinMaxScaler(), None]

# Pipeline template: the 'scaling' step is a placeholder that the grid overrides
pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('krr', KernelRidge(kernel='rbf'))
])

# Search over the scaler, the ridge penalty (alpha) and the RBF width (gamma)
param_grid = [
    {
        'scaling': scalers,
        'krr__alpha': [0.001, 0.01, 0.1, 1],
        'krr__gamma': [0.01, 0.1, 1, 10]
    }
]

# Exhaustive grid search with 5-fold cross-validation, scored by R²
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2
)

search.fit(X_train, y_train)

# Report the best configuration; the label previously printed as mojibake
# ("RÂ²") and is now the intended "R²"
print("Best Params:", search.best_params_)
print("Best CV R²:", search.best_score_)


In [None]:
# GridSearchCV refits the best pipeline on the full training set by default
# (refit=True), so best_estimator_ is already fitted; fitting it a second
# time was redundant and has been removed.
best_krr = search.best_estimator_

train_results = get_regression_metrics(best_krr, X_train, y_train)
test_results  = get_regression_metrics(best_krr, X_test,  y_test)

print("Tuned KRR - Train:", train_results)
print("Tuned KRR - Test :", test_results)


## XGBOOST

In [None]:
from xgboost import XGBRegressor

# Convert the splits to float NumPy arrays before handing them to XGBoost
X_tr, X_te = (frame.to_numpy(dtype=float) for frame in (X_train, X_test))
y_tr, y_te = (target.to_numpy(dtype=float) for target in (y_train, y_test))

# Hand-picked baseline configuration
xgb_baseline_params = dict(
    # boosting schedule
    n_estimators=1200,        # number of trees
    learning_rate=0.04,       # how much each tree contributes
    # tree shape
    max_depth=4,
    min_child_weight=5,
    gamma=0.6,                # split penalty
    # row / feature sampling
    subsample=0.8,
    colsample_bytree=0.8,
    # weight penalties
    reg_alpha=1.0,            # L1
    reg_lambda=6.0,           # L2
    # training setup
    objective="reg:squarederror",
    tree_method="hist",
    n_jobs=4,
    random_state=RANDOM_SEED,
    verbosity=0,
)
xgb = XGBRegressor(**xgb_baseline_params)
xgb.fit(X_tr, y_tr)

xgb_results_train = get_regression_metrics(xgb, X_tr, y_tr)
print("XGBoost Results - Train Set:", xgb_results_train)
xgb_results_test = get_regression_metrics(xgb, X_te, y_te)
print("XGBoost Results - Test Set:",  xgb_results_test)

## Hypertuned XGBOOST

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search
param_grid = {
    "n_estimators": [500, 800, 1200],
    "learning_rate": [0.01, 0.03, 0.05],
    "max_depth": [3, 4, 5],
    "min_child_weight": [3, 6, 12],
    "gamma": [0, 0.3, 0.6],
    "subsample": [0.7, 0.9],
    "colsample_bytree": [0.6, 0.8],
    "reg_alpha": [0, 1, 5],
    "reg_lambda": [1, 6, 10]
}

# Base estimator; uses the notebook-wide RANDOM_SEED instead of repeating the
# literal 1147, so the seed only has to be changed in one place.
xgb = XGBRegressor(
    objective="reg:squarederror",
    tree_method="hist",
    random_state=RANDOM_SEED,
    n_jobs=4
)

# NOTE(review): the sampler seed (42) differs from RANDOM_SEED; it is left
# unchanged here so the sampled combinations stay the same as before.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=30,               # try 30 combinations
    scoring="neg_mean_absolute_error",
    cv=3,
    verbose=1,
    random_state=42
)

random_search.fit(X_tr, y_tr)
print("Best params:", random_search.best_params_)
# The scorer is negated MAE, so flip the sign to report a positive error
print("Best score:", -random_search.best_score_)


In [None]:
best_params = random_search.best_params_

# RandomizedSearchCV refits the best configuration on the full training data
# by default (refit=True). Reusing best_estimator_ avoids rebuilding the model
# by hand with a duplicated objective / tree_method / seed and training it again.
best_xgb = random_search.best_estimator_

print("Train:", get_regression_metrics(best_xgb, X_tr, y_tr))
print("Test :", get_regression_metrics(best_xgb, X_te, y_te))
