# Parameter Selection

After evaluating various models, we will now focus on enhancing their performance through parameter tuning.

In [2]:
import numpy as np
import pandas as pd

import warnings

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
warnings.filterwarnings('ignore')

As usual, we will start our tests by initializing a pandas DataFrame using the data from the `lidc_rfe50.csv` file. This file contains a combined set of radiomics and deep features after applying RFE to retain 50 columns.

In [3]:
df = pd.read_csv(r'../2_csv_manipulation/lidc_rfe50.csv')

Once again, we will be using the binary transformation of the dataset (ambiguity removed) to quickly evaluate different parameters for each model:

In [4]:
# Removes ambiguous option
df_rem = df.copy()
before = df_rem.shape[0] # n_linhas antes

mask = df_rem['is_cancer'] == 1
df_rem = df_rem[~mask]
after = df_rem.shape[0] # n_linhas depois

df_rem['is_cancer'] = df_rem['is_cancer'].replace(2, 1)
print(f"Row size changed from {before} to {after} (lost {before-after} rows).")

X = df_rem.drop('is_cancer', axis=1)
y = df_rem['is_cancer']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

Row size changed from 2626 to 1238 (lost 1388 rows).


Since we will be utilizing SMOTE in our final report, hyperparameter tuning will be conducted after applying SMOTE (helps avoid majority class bias).

In [5]:
smote_sampler = SMOTE()
X_train, y_train = smote_sampler.fit_resample(X_train, y_train)

We used Grid Search for most of our models, ocasionally applying Randomized Search in the cases where the hyperparameter space could become complex, due to the nature of the model itself (Random Forest, XGBoost).


This will help us efficiently combine all of them in the stacking method outlined in the final `report.ipynb`.

### Logistic Regression


In [6]:
logistic_model = LogisticRegression(max_iter=900)
logistic_params = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': np.logspace(-3, 3, 7),
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga']
}
logistic_search = GridSearchCV(logistic_model, logistic_params, cv=5, scoring='accuracy', n_jobs=-1)

### Random Forest


In [7]:
rf_model = RandomForestClassifier(class_weight='balanced')
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf_search = RandomizedSearchCV(rf_model, rf_params, cv=5, scoring='accuracy', n_jobs=-1, n_iter=50, random_state=42)

### XGBoost


In [8]:
xgb_model = XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)
xgb_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}
xgb_search = RandomizedSearchCV(xgb_model, xgb_params, cv=5, scoring='accuracy', n_jobs=-1, n_iter=50, random_state=42)

### SVM


In [9]:
svm_model = SVC(kernel='rbf')
svm_params = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}
svm_search = GridSearchCV(svm_model, svm_params, cv=5, scoring='accuracy', n_jobs=-1)

### Gaussian Naive Bayes


In [10]:
gnb_model = GaussianNB()
gnb_params = {
    'var_smoothing': np.logspace(-9, -1, 9)
}
gnb_search = GridSearchCV(gnb_model, gnb_params, cv=5, scoring='accuracy', n_jobs=-1)

## Performing Grid and Randomized Searches

In [11]:
# Dictionary to store best models
best_models = {}

# List of (model name, search object) tuples for easy iteration
model_searches = [
    ("Logistic Regression", logistic_search),
    ("Random Forest", rf_search),
    ("XGBoost", xgb_search),
    ("SVM", svm_search),
    ("Gaussian Naive Bayes", gnb_search)
]

# Run search for each model
for model_name, search in model_searches:
    print(f"Testing parameters for {model_name}...")
    search.fit(X_train, y_train)
    best_models[model_name] = search.best_estimator_
    print(f"Best parameters for {model_name} complete. [{search.best_params_}] model stored in dictionary.\n")

Testing parameters for Logistic Regression...
Best parameters for Logistic Regression complete. [{'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}] model stored in dictionary.

Testing parameters for Random Forest...
Best parameters for Random Forest complete. [{'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 20}] model stored in dictionary.

Testing parameters for XGBoost...
Best parameters for XGBoost complete. [{'subsample': 0.9, 'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.1, 'colsample_bytree': 0.9}] model stored in dictionary.

Testing parameters for SVM...
Best parameters for SVM complete. [{'C': 10, 'gamma': 1, 'kernel': 'rbf'}] model stored in dictionary.

Testing parameters for Gaussian Naive Bayes...
Best parameters for Gaussian Naive Bayes complete. [{'var_smoothing': 1e-09}] model stored in dictionary.



## Performance Evaluation

In [12]:
def missclass_percent(y_true, y_pred):
    
    cm = confusion_matrix(y_true, y_pred)
    misclassification_per_class = {}
    for i in range(cm.shape[0]):  # Iterate over each class
        total_in_class = cm[i].sum()
        if total_in_class > 0:  # Avoid division by zero
            misclassified = total_in_class - cm[i, i]
            misclassification_percentage_per_class = (misclassified / total_in_class) * 100
            misclassification_per_class[i] = misclassification_percentage_per_class

    return  misclassification_per_class


# Evaluate each optimized model
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    
    misclassification_per_class = missclass_percent(y_test, y_pred)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"\n{model_name} performance:\n")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # AUC only available in binary classification
    if len(np.unique(y_test)) == 2:
        roc_auc = roc_auc_score(y_test, y_pred)
        print(f"ROC-AUC: {roc_auc:.4f}")
    
    print()
    for class_label, percentage in misclassification_per_class.items():
        print(f"Misclassification Percentage for Class {class_label}: {percentage:.2f}%")
    
    print("\n--------------------------\n")



Logistic Regression performance:

Accuracy: 0.8683
F1 Score: 0.8706
ROC-AUC: 0.8772

Misclassification Percentage for Class 0: 15.10%
Misclassification Percentage for Class 1: 9.45%

--------------------------


Random Forest performance:

Accuracy: 0.8602
F1 Score: 0.8614
ROC-AUC: 0.8541

Misclassification Percentage for Class 0: 12.65%
Misclassification Percentage for Class 1: 16.54%

--------------------------


XGBoost performance:

Accuracy: 0.8522
F1 Score: 0.8535
ROC-AUC: 0.8460

Misclassification Percentage for Class 0: 13.47%
Misclassification Percentage for Class 1: 17.32%

--------------------------


SVM performance:

Accuracy: 0.8333
F1 Score: 0.8342
ROC-AUC: 0.8204

Misclassification Percentage for Class 0: 13.88%
Misclassification Percentage for Class 1: 22.05%

--------------------------


Gaussian Naive Bayes performance:

Accuracy: 0.8226
F1 Score: 0.8271
ROC-AUC: 0.8501

Misclassification Percentage for Class 0: 23.67%
Misclassification Percentage for Class 1: 6.30%

In [18]:
# Obtaining best parameters for each model
with open('parameters.txt', 'w') as file:
    for model_name, params in best_models.items():
        file.write(f"{model_name}: {params}\n\n")