# Model selection and hyperparameter optimisation

Zuzanna Gorczyca, zgo@kth.se

Alga Nour Elimane, nealga@kth.se

Tse An Shih, tashih@kth.se

### Imports

In [None]:
#SYSTEM STUFF
import numpy as np
import pandas as pd
import os
import sys
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

#SKLEARN TOOLS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, precision_score

## Data loading

In [2]:
# Print the path of the Python interpreter running this notebook, to confirm
# which environment is active.
print(sys.executable)

s:\KTH\Programing_for_data_science\.venv\Scripts\python.exe


In [3]:
# Use the current working directory as the base folder for the dataset paths.
# NOTE(review): this depends on where the notebook kernel was started.
dir_path = os.getcwd()
dir_path  # bare expression so the notebook displays the resolved path

's:\\KTH\\Programing_for_data_science\\Assignment4'

In [4]:
# All three training CSVs are read from the "datasets" folder under dir_path.
datasets_dir = os.path.join(dir_path, "datasets")

smoth_path = os.path.join(datasets_dir, "training_smiles_processed_smoth.csv")
adysyn_path = os.path.join(datasets_dir, "training_smiles_processed_adysyn.csv")
inbalance_path = os.path.join(datasets_dir, "training_smiles_processed_norm_bins.csv")

In [5]:
# Load the "smoth" training CSV; its 'INDEX' column becomes the DataFrame index.
# The preview shows ACTIVE (the label) as the first column, followed by the features.
smoth_data = pd.read_csv(smoth_path, index_col = 'INDEX')
smoth_data.head()  # preview the first rows

Unnamed: 0_level_0,ACTIVE,NoAtoms,CalcExactMolWt,HeavyAtomCount,NumHDonors,NumHAcceptors,MFp_0,MFp_1,MFp_2,MFp_3,...,MFp_114,MFp_115,MFp_116,MFp_117,MFp_118,MFp_119,MFp_120,MFp_121,MFp_122,MFp_123
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,9,9,9,1,2,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
1,0.0,1,2,1,0,1,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
2,0.0,9,9,9,0,3,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
3,0.0,8,8,8,1,2,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,2,2,2,1,3,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


In [8]:
# Load the "adysyn" training CSV; its 'INDEX' column becomes the DataFrame index.
# The preview shows ACTIVE (the label) as the first column, followed by the features.
adysyn_data = pd.read_csv(adysyn_path, index_col = 'INDEX')
adysyn_data.head()  # preview the first rows

Unnamed: 0_level_0,ACTIVE,NoAtoms,CalcExactMolWt,HeavyAtomCount,NumHDonors,NumHAcceptors,MFp_0,MFp_1,MFp_2,MFp_3,...,MFp_114,MFp_115,MFp_116,MFp_117,MFp_118,MFp_119,MFp_120,MFp_121,MFp_122,MFp_123
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,9,9,9,1,2,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
1,0.0,1,2,1,0,1,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
2,0.0,9,9,9,0,3,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
3,0.0,8,8,8,1,2,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,2,2,2,1,3,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


In [9]:
# Load the non-oversampled ("norm_bins") training CSV; 'INDEX' becomes the DataFrame index.
# NOTE(review): in the preview this file's INDEX starts at 1 and ACTIVE is the
# last column, unlike the two oversampled files — confirm that labels and
# feature rows are still aligned in this file.
inbalance_data = pd.read_csv(inbalance_path, index_col = 'INDEX')
inbalance_data.head()  # preview the first rows

Unnamed: 0_level_0,NoAtoms,CalcExactMolWt,HeavyAtomCount,NumHDonors,NumHAcceptors,MFp_0,MFp_1,MFp_2,MFp_3,MFp_4,...,MFp_115,MFp_116,MFp_117,MFp_118,MFp_119,MFp_120,MFp_121,MFp_122,MFp_123,ACTIVE
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,9,9,9,1,2,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
2,1,2,1,0,1,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
3,9,9,9,0,3,0.0,0.0,1.0,1.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
4,8,8,8,1,2,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,2,2,2,1,3,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


## Chosen Classifiers

In this project we use 4 different classifiers:
- Random Forest
- Decision Tree
- Multi-Layer Perceptron
- Naive Bayes

In [10]:
# Baseline models compared on each dataset. Every estimator that accepts a seed
# gets the same one so repeated runs are reproducible.
RANDOM_STATE = 42

Classifiers = {
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        random_state=RANDOM_STATE,
    ),
    "Decision Tree": DecisionTreeClassifier(
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=RANDOM_STATE,
    ),
    "MLP": MLPClassifier(
        hidden_layer_sizes=(50,),  # a single hidden layer with 50 units
        activation='relu',
        alpha=0.0001,
        learning_rate_init=0.001,
        max_iter=200,
        # Stop training automatically once there is no improvement for
        # n_iter_no_change consecutive iterations.
        early_stopping=True,
        n_iter_no_change=5,
        random_state=RANDOM_STATE,
    ),
    "Naive Bayes": GaussianNB(),
}

## Test different datasets: 

### Imbalanced data

In [None]:
# Separate the label column (ACTIVE) from the feature columns.
y_inbalance = inbalance_data["ACTIVE"]
X_inbalance = inbalance_data.drop(columns=["ACTIVE"])

# Hold out 20% of the rows for validation; stratifying on the label keeps the
# ACTIVE class ratio the same in both parts.
X_train_inb, X_val_inb, y_train_inb, y_val_inb = train_test_split(
    X_inbalance,
    y_inbalance,
    test_size=0.2,
    stratify=y_inbalance,
    random_state=42,
)

In [12]:
# Training + computing AUC
#
# Fit every baseline classifier on the imbalanced training split and score it
# with ROC AUC on the held-out validation split.
from sklearn.base import clone  # local import keeps this cell self-contained

auc_scores_inb = {}   # classifier name -> validation ROC AUC
fitted_inb = {}       # classifier name -> model fitted on this dataset

for name, clf in Classifiers.items():
    print(f"Training {name}...")
    # Fit a fresh copy instead of the shared instance: the estimators stored in
    # `Classifiers` are reused by the other dataset sections, and fitting them
    # in place would let a later section silently overwrite this model.
    model = clone(clf)
    model.fit(X_train_inb, y_train_inb)
    # Column 1 is the probability of the second class in model.classes_
    # (ACTIVE == 1, assuming the labels are 0/1 — TODO confirm).
    y_prob = model.predict_proba(X_val_inb)[:, 1]
    auc = roc_auc_score(y_val_inb, y_prob)
    auc_scores_inb[name] = auc
    fitted_inb[name] = model
    print(f"{name} AUC: {auc:.4f}\n")


# Selecting our best classifier (highest validation AUC)

best_clf_name = max(auc_scores_inb, key=auc_scores_inb.get)
print(f"Best classifier: {best_clf_name} with AUC = {auc_scores_inb[best_clf_name]:.4f}")
# Keep the model that was actually fitted on this dataset.
best_clf = fitted_inb[best_clf_name]


Training Random Forest...
Random Forest AUC: 0.4995

Training Decision Tree...
Decision Tree AUC: 0.4952

Training MLP...
MLP AUC: 0.4992

Training Naive Bayes...
Naive Bayes AUC: 0.4998

Best classifier: Naive Bayes with AUC = 0.4998


Before assuming that the top-scoring model is really the best one: the gap in AUC between the models is very small, so it is important to try other hyperparameters and see whether one of the other models can outperform it.

### SMOTH data

In [13]:
# Separate the label column (ACTIVE) from the feature columns.
y_smoth = smoth_data["ACTIVE"]
X_smoth = smoth_data.drop(columns=["ACTIVE"])

# Hold out 20% of the rows for validation, stratified on the label.
# NOTE(review): if this CSV already contains synthetic (oversampled) rows,
# splitting after oversampling lets synthetic neighbours of training rows end
# up in the validation part and inflates the scores — confirm.
X_train_smoth, X_val_smoth, y_train_smoth, y_val_smoth = train_test_split(
    X_smoth,
    y_smoth,
    test_size=0.2,
    stratify=y_smoth,
    random_state=42,
)

In [14]:
# Fit every baseline classifier on the SMOTH training split and score it with
# ROC AUC on the held-out validation split.
from sklearn.base import clone  # local import keeps this cell self-contained

auc_scores_smoth = {}   # classifier name -> validation ROC AUC
fitted_smoth = {}       # classifier name -> model fitted on this dataset

for name, clf in Classifiers.items():
    print(f"Training {name}...")
    # Fit a fresh copy instead of the shared instance: the estimators stored in
    # `Classifiers` are reused by the other dataset sections, and fitting them
    # in place would let a later section silently overwrite this model.
    model = clone(clf)
    model.fit(X_train_smoth, y_train_smoth)
    # Column 1 is the probability of the second class in model.classes_
    # (ACTIVE == 1, assuming the labels are 0/1 — TODO confirm).
    y_prob = model.predict_proba(X_val_smoth)[:, 1]
    auc = roc_auc_score(y_val_smoth, y_prob)
    auc_scores_smoth[name] = auc
    fitted_smoth[name] = model
    print(f"{name} AUC: {auc:.4f}\n")


# Selecting our best classifier (highest validation AUC)

best_clf_name = max(auc_scores_smoth, key=auc_scores_smoth.get)
print(f"Best classifier: {best_clf_name} with AUC = {auc_scores_smoth[best_clf_name]:.4f}")
# Keep the model that was actually fitted on this dataset.
best_clf = fitted_smoth[best_clf_name]


Training Random Forest...
Random Forest AUC: 0.9871

Training Decision Tree...
Decision Tree AUC: 0.9258

Training MLP...
MLP AUC: 0.8769

Training Naive Bayes...
Naive Bayes AUC: 0.7659

Best classifier: Random Forest with AUC = 0.9871


### ADYSYN data

In [15]:
# Separate the label column (ACTIVE) from the feature columns.
y_adysyn = adysyn_data["ACTIVE"]
X_adysyn = adysyn_data.drop(columns=["ACTIVE"])

# Hold out 20% of the rows for validation, stratified on the label.
# NOTE(review): if this CSV already contains synthetic (oversampled) rows,
# splitting after oversampling lets synthetic neighbours of training rows end
# up in the validation part and inflates the scores — confirm.
X_train_adysyn, X_val_adysyn, y_train_adysyn, y_val_adysyn = train_test_split(
    X_adysyn,
    y_adysyn,
    test_size=0.2,
    stratify=y_adysyn,
    random_state=42,
)

In [16]:
# Fit every baseline classifier on the ADYSYN training split and score it with
# ROC AUC on the held-out validation split.
from sklearn.base import clone  # local import keeps this cell self-contained

auc_scores_adysyn = {}   # classifier name -> validation ROC AUC
fitted_adysyn = {}       # classifier name -> model fitted on this dataset

for name, clf in Classifiers.items():
    print(f"Training {name}...")
    # Fit a fresh copy instead of the shared instance: the estimators stored in
    # `Classifiers` are reused by the other dataset sections, and fitting them
    # in place would let a later section silently overwrite this model.
    model = clone(clf)
    model.fit(X_train_adysyn, y_train_adysyn)
    # Column 1 is the probability of the second class in model.classes_
    # (ACTIVE == 1, assuming the labels are 0/1 — TODO confirm).
    y_prob = model.predict_proba(X_val_adysyn)[:, 1]
    auc = roc_auc_score(y_val_adysyn, y_prob)
    auc_scores_adysyn[name] = auc
    fitted_adysyn[name] = model
    print(f"{name} AUC: {auc:.4f}\n")


# Selecting our best classifier (highest validation AUC)

best_clf_name = max(auc_scores_adysyn, key=auc_scores_adysyn.get)
print(f"Best classifier: {best_clf_name} with AUC = {auc_scores_adysyn[best_clf_name]:.4f}")
# Keep the model that was actually fitted on this dataset.
best_clf = fitted_adysyn[best_clf_name]


Training Random Forest...
Random Forest AUC: 0.9872

Training Decision Tree...
Decision Tree AUC: 0.9278

Training MLP...
MLP AUC: 0.8890

Training Naive Bayes...
Naive Bayes AUC: 0.7693

Best classifier: Random Forest with AUC = 0.9872


For the imbalanced data the ROC AUC score is slightly below 0.5 for every model, so the results are no better (marginally worse) than random classification.

For the data oversampled with the SMOTE method (the "SMOTH" dataset) we get the best AUC score with the Random Forest algorithm, and the second best with the Decision Tree.

For the data oversampled with the ADASYN method (the "ADYSYN" dataset) we get the best AUC score with the Random Forest algorithm, and the second best with the Decision Tree.

## Model Selection + Hyperparameters Tuning on balanced Data 

We will optimise the hyperparameters of the Random Forest and Decision Tree algorithms on the SMOTH and ADYSYN datasets.


### SMOTH dataset

#### Random Forest

In [None]:
# Random Forest: cross-validated search over the number of trees (SMOTH data)

print("Starting Random Forest tuning...")

# Only n_estimators is varied; every other setting keeps its default value.
rf_params = {"n_estimators": [50, 55, 100, 150, 200, 250]}
rf = RandomForestClassifier(random_state=42)

# n_iter matches the number of listed values, so each candidate is evaluated
# once with 5-fold cross-validation and ranked by ROC AUC.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_params,
    n_iter=6,
    scoring="roc_auc",
    cv=5,
    n_jobs=1,
    verbose=2,
    random_state=42,
)
rf_random.fit(X_smoth, y_smoth)

print("Best RF params:", rf_random.best_params_)
print("Best RF AUC:", rf_random.best_score_)


Starting Random Forest tuning...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ....................................n_estimators=50; total time=  59.6s
[CV] END ....................................n_estimators=50; total time= 1.0min
[CV] END ....................................n_estimators=50; total time=  56.6s
[CV] END ....................................n_estimators=50; total time=  58.3s
[CV] END ....................................n_estimators=50; total time=  55.9s
[CV] END ....................................n_estimators=55; total time= 1.0min
[CV] END ....................................n_estimators=55; total time= 1.1min
[CV] END ....................................n_estimators=55; total time= 1.0min
[CV] END ....................................n_estimators=55; total time= 1.0min
[CV] END ....................................n_estimators=55; total time= 1.0min
[CV] END ...................................n_estimators=100; total time= 1.9min
[CV] END .......

#### Decision Tree

In [None]:
# Decision Tree: cross-validated search over the maximum tree depth (SMOTH data)
print("\nStarting Decision Tree tuning...")

# None means the depth is not limited.
dt_params = {"max_depth": [None, 5, 10, 15, 20, 25]}
dt = DecisionTreeClassifier(random_state=42)

# n_iter matches the number of listed values, so each candidate is evaluated
# once with 5-fold cross-validation and ranked by ROC AUC.
dt_random = RandomizedSearchCV(
    estimator=dt,
    param_distributions=dt_params,
    n_iter=6,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
dt_random.fit(X_smoth, y_smoth)

print("Best DT params:", dt_random.best_params_)
print("Best DT AUC:", dt_random.best_score_)


Starting Decision Tree tuning...
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best DT params: {'max_depth': 5}
Best DT AUC: 0.5123893681441928


### Maybe later

In [None]:
# MLP: randomized search over architecture, activation, L2 penalty (alpha) and
# initial learning rate, scored with 3-fold cross-validated ROC AUC.
print("\nStarting MLP tuning...")
mlp_params = {
    "hidden_layer_sizes": [(50,), (100,), (100,50)],
    "activation": ["relu", "tanh"],
    "alpha": [0.0001, 0.001],
    "learning_rate_init": [0.001, 0.01]
}

mlp = MLPClassifier(max_iter=300, random_state=42)
# n_iter=10 samples 10 of the 24 possible parameter combinations.
mlp_random = RandomizedSearchCV(
    mlp, mlp_params, n_iter=10, cv=3, scoring="roc_auc", n_jobs=-1,
    random_state=42, verbose=2
)
# The original call used X_train / y_train, which are never defined in this
# notebook (NameError). This cell sits in the SMOTH section, so it tunes on the
# same data as the other searches there.
# NOTE(review): confirm SMOTH is the intended dataset.
mlp_random.fit(X_smoth, y_smoth)
print("Best MLP params:", mlp_random.best_params_)
print("Best MLP AUC:", mlp_random.best_score_)


In [None]:
# Naive Bayes has no hyperparameters tuned here: fit once on the training split
# and report the validation ROC AUC.
print("\nNaive bayes  ")
nb = GaussianNB()
# The original cell used X_train / y_train / X_val / y_val, which are never
# defined in this notebook (NameError). This cell sits in the SMOTH section, so
# the SMOTH split is used.
# NOTE(review): confirm SMOTH is the intended dataset.
nb.fit(X_train_smoth, y_train_smoth)
y_prob = nb.predict_proba(X_val_smoth)[:, 1]  # probability of the second class in nb.classes_
print("Naive Bayes AUC:", roc_auc_score(y_val_smoth, y_prob))

### ADYSYN dataset

#### Random Forest

In [None]:
# Random Forest: cross-validated search over the number of trees (ADYSYN data)
print("Starting Random Forest tuning...")

# Only n_estimators is varied; every other setting keeps its default value.
rf_params = {"n_estimators": [50, 55, 100, 150, 200, 250]}
rf = RandomForestClassifier(random_state=42)

# n_iter matches the number of listed values, so each candidate is evaluated
# once with 5-fold cross-validation and ranked by ROC AUC.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_params,
    n_iter=6,
    scoring="roc_auc",
    cv=5,
    n_jobs=1,
    verbose=2,
    random_state=42,
)
rf_random.fit(X_adysyn, y_adysyn)

print("Best RF params:", rf_random.best_params_)
print("Best RF AUC:", rf_random.best_score_)


#### Decision Tree

In [None]:
# Decision Tree: cross-validated search over the maximum tree depth (ADYSYN data)
print("\nStarting Decision Tree tuning...")

# None means the depth is not limited.
dt_params = {"max_depth": [None, 5, 10, 15, 20, 25]}
dt = DecisionTreeClassifier(random_state=42)

# n_iter matches the number of listed values, so each candidate is evaluated
# once with 5-fold cross-validation and ranked by ROC AUC.
dt_random = RandomizedSearchCV(
    estimator=dt,
    param_distributions=dt_params,
    n_iter=6,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
dt_random.fit(X_adysyn, y_adysyn)

print("Best DT params:", dt_random.best_params_)
print("Best DT AUC:", dt_random.best_score_)

## Final cross validation of our best models

Best models:
* Random Forest
* Decision Tree

Best datasets:
* SMOTH
* ADYSYN


### SMOTH dataset

#### Random Forest

In [None]:
print("FINAL CROSS-VALIDATION WITH RANDOM FOREST")

# Precision uses a custom scorer so that a fold with no predicted positives
# scores 0 instead of triggering the zero-division handling.
precision_scorer = make_scorer(precision_score, zero_division=0)

# n_estimators is hard-coded here rather than read from the search result.
RF_smoth = RandomForestClassifier(n_estimators=50, random_state=42)

# 5-fold cross-validation on the full SMOTH set (not only the training split).
cv_scores_RF_smoth = cross_validate(
    RF_smoth,
    X_smoth,
    y_smoth,
    scoring={
        'accuracy': 'accuracy',
        'roc_auc': 'roc_auc',
        'precision': precision_scorer,
    },
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

# Report mean and standard deviation of each test metric across the folds.
print("\nCross-Validation Results (5-fold):")
for metric_label, metric_key in (
    ("Accuracy:", "test_accuracy"),
    ("ROC-AUC:", "test_roc_auc"),
    ("Precision:", "test_precision"),
):
    fold_scores = cv_scores_RF_smoth[metric_key]
    print(f"  {metric_label:<10} {fold_scores.mean():.4f} (+/- {fold_scores.std():.4f})")

#### Decision Tree

In [None]:
# Final 5-fold cross-validation of the Decision Tree on the SMOTH dataset.
# The banner previously said "RANDOM FOREST" (copied from the cell above),
# which mislabelled these results.
print("FINAL CROSS-VALIDATION WITH DECISION TREE")
# max_depth is hard-coded here rather than read from the search result.
DT_smoth = DecisionTreeClassifier(max_depth=5, random_state=42)

# Precision uses a custom scorer so that a fold with no predicted positives
# scores 0 instead of triggering the zero-division handling.
precision_scorer = make_scorer(precision_score, zero_division=0)

#  5-fold cross-validation on the FULL SMOTH set (not only the training split)
cv_scores_DT_smoth = cross_validate(
    DT_smoth,
    X_smoth,
    y_smoth,
    cv=5,  # 5-fold cross-validation
    scoring={
        'accuracy': 'accuracy',
        'roc_auc': 'roc_auc',
        'precision': precision_scorer
    },
    n_jobs=-1,
    return_train_score=True
)

# Display mean and standard deviation of each test metric across the folds
print("\nCross-Validation Results (5-fold):")
print(f"  Accuracy:  {cv_scores_DT_smoth['test_accuracy'].mean():.4f} (+/- {cv_scores_DT_smoth['test_accuracy'].std():.4f})")
print(f"  ROC-AUC:   {cv_scores_DT_smoth['test_roc_auc'].mean():.4f} (+/- {cv_scores_DT_smoth['test_roc_auc'].std():.4f})")
print(f"  Precision: {cv_scores_DT_smoth['test_precision'].mean():.4f} (+/- {cv_scores_DT_smoth['test_precision'].std():.4f})")

### ADYSYN dataset

#### Random Forest

In [None]:
print("FINAL CROSS-VALIDATION WITH RANDOM FOREST")

# Precision uses a custom scorer so that a fold with no predicted positives
# scores 0 instead of triggering the zero-division handling.
precision_scorer = make_scorer(precision_score, zero_division=0)

# n_estimators is hard-coded here rather than read from the search result.
RF_adysyn = RandomForestClassifier(n_estimators=50, random_state=42)

# 5-fold cross-validation on the full ADYSYN set (not only the training split).
cv_scores_RF_adysyn = cross_validate(
    RF_adysyn,
    X_adysyn,
    y_adysyn,
    scoring={
        'accuracy': 'accuracy',
        'roc_auc': 'roc_auc',
        'precision': precision_scorer,
    },
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

# Report mean and standard deviation of each test metric across the folds.
print("\nCross-Validation Results (5-fold):")
for metric_label, metric_key in (
    ("Accuracy:", "test_accuracy"),
    ("ROC-AUC:", "test_roc_auc"),
    ("Precision:", "test_precision"),
):
    fold_scores = cv_scores_RF_adysyn[metric_key]
    print(f"  {metric_label:<10} {fold_scores.mean():.4f} (+/- {fold_scores.std():.4f})")

#### Decision Tree

In [None]:
# Final 5-fold cross-validation of the Decision Tree on the ADYSYN dataset.
# The banner previously said "RANDOM FOREST" (copied from the cell above),
# which mislabelled these results.
print("FINAL CROSS-VALIDATION WITH DECISION TREE")
# max_depth is hard-coded here rather than read from the search result.
DT_adysyn = DecisionTreeClassifier(max_depth=5, random_state=42)

# Precision uses a custom scorer so that a fold with no predicted positives
# scores 0 instead of triggering the zero-division handling.
precision_scorer = make_scorer(precision_score, zero_division=0)

#  5-fold cross-validation on the FULL ADYSYN set (not only the training split)
cv_scores_DT_adysyn = cross_validate(
    DT_adysyn,
    X_adysyn,
    y_adysyn,
    cv=5,  # 5-fold cross-validation
    scoring={
        'accuracy': 'accuracy',
        'roc_auc': 'roc_auc',
        'precision': precision_scorer
    },
    n_jobs=-1,
    return_train_score=True
)

# Display mean and standard deviation of each test metric across the folds
print("\nCross-Validation Results (5-fold):")
print(f"  Accuracy:  {cv_scores_DT_adysyn['test_accuracy'].mean():.4f} (+/- {cv_scores_DT_adysyn['test_accuracy'].std():.4f})")
print(f"  ROC-AUC:   {cv_scores_DT_adysyn['test_roc_auc'].mean():.4f} (+/- {cv_scores_DT_adysyn['test_roc_auc'].std():.4f})")
print(f"  Precision: {cv_scores_DT_adysyn['test_precision'].mean():.4f} (+/- {cv_scores_DT_adysyn['test_precision'].std():.4f})")

## Model performance on unbalanced data