In [1]:
# Mount Google Drive at /content/drive (Colab only); the dataset CSV read
# later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [3]:
# CatBoost only

import pandas as pd
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier

# Single seed shared by the shuffle and the classifier.
RANDOM_SEED = 42

# Read the raw table and shuffle every row once, reproducibly.
data = pd.read_csv('/content/drive/Othercomputers/My Laptop/Paper/06 _Thyroid_Cancer/Dataset/Thyroid_Diff.csv')
shuffled_data = data.sample(frac=1, random_state=RANDOM_SEED)

# Fold geometry: every test window holds the rows left over after reserving
# 70% for training, and successive windows start `shift` rows apart.
# NOTE: test_size (~30% of the rows) is larger than shift (~20%), so
# consecutive test windows overlap.
total_rows = len(shuffled_data)
nfolds = 5
train_size = int(0.7 * total_rows)
test_size = total_rows - train_size
shift = int(1 / nfolds * total_rows)

# Build the folds: a test window that wraps around to the top of the shuffled
# table when it runs past the last row, and the remaining rows as training data.
test_sets, train_sets = [], []
for fold_no in range(nfolds):
    first_row = fold_no * shift
    window = [(first_row + offset) % total_rows for offset in range(test_size)]
    held_out = shuffled_data.iloc[window].copy()
    test_sets.append(held_out)
    train_sets.append(shuffled_data.drop(held_out.index))


# Accumulators: one record per fold, and one mean-metrics entry per evaluated model.
combined_fold_info = []
results_summary = []

# Classifier under evaluation: CatBoost with its default hyperparameters,
# seeded for repeatability and with training logs silenced.
model_name = "CatBoost"
model = CatBoostClassifier(verbose=False, random_state=RANDOM_SEED)

print(f"\n=== Evaluating {model_name} ===")

# Train and score the model once per fold, collecting one record per fold.
for fold in range(nfolds):
    # Work on copies so the shared frames in train_sets/test_sets keep their
    # raw values; encoding them in place made this loop non-repeatable.
    trainset, testset = train_sets[fold].copy(), test_sets[fold].copy()

    # Label-encode every text column (features and the 'Recurred' target).
    categorical_columns = trainset.select_dtypes(include=['object']).columns
    for col in categorical_columns:
        le = LabelEncoder()
        # Learn the vocabulary from both splits: a category that appears only
        # in the test rows would otherwise make transform() raise. When the
        # training rows already contain every category the codes are unchanged.
        le.fit(pd.concat([trainset[col], testset[col]]))
        trainset[col] = le.transform(trainset[col])
        testset[col] = le.transform(testset[col])

    # Split features and target
    X_train, Y_train = trainset.drop(columns=['Recurred']), trainset['Recurred']
    X_test, Y_test = testset.drop(columns=['Recurred']), testset['Recurred']

    # Train (wall-clock time in milliseconds)
    start_time = time.time()
    model.fit(X_train, Y_train)
    train_time = (time.time() - start_time) * 1000

    # Predict on the held-out rows (wall-clock time in milliseconds)
    start_time = time.time()
    preds_test = model.predict(X_test)
    test_time = (time.time() - start_time) * 1000

    # Training-set predictions are computed once and reused by all four metrics.
    preds_train = model.predict(X_train)

    # Collect fold results (percentages). Precision/recall/F1 use sklearn's
    # default pos_label=1, i.e. whichever 'Recurred' value was encoded as 1.
    # The timings are stored alongside the scores; the summary below selects
    # its metric columns explicitly, so the extra keys do not affect it.
    current_fold_info = {
        'Fold': fold + 1,
        'Train Accuracy': accuracy_score(Y_train, preds_train) * 100,
        'Train F-score': f1_score(Y_train, preds_train) * 100,
        'Train Precision': precision_score(Y_train, preds_train) * 100,
        'Train Recall': recall_score(Y_train, preds_train) * 100,
        'Test Accuracy': accuracy_score(Y_test, preds_test) * 100,
        'Test F-score': f1_score(Y_test, preds_test) * 100,
        'Test Precision': precision_score(Y_test, preds_test) * 100,
        'Test Recall': recall_score(Y_test, preds_test) * 100,
        'Train Time (ms)': train_time,
        'Test Time (ms)': test_time,
    }
    combined_fold_info.append(current_fold_info)

# One row per fold.
combined_df = pd.DataFrame(combined_fold_info)

# Average the reported train/test metrics across folds.
metric_columns = [
    'Train Accuracy', 'Train F-score', 'Train Precision', 'Train Recall',
    'Test Accuracy', 'Test F-score', 'Test Precision', 'Test Recall',
]
mean_metrics = combined_df.loc[:, metric_columns].mean()

# Tag the averages with the model name and add them to the running summary.
mean_metrics["Model"] = model_name
results_summary.append(mean_metrics)

# Summary table indexed by model name; displayed as the cell's output.
final_results_df = pd.DataFrame(results_summary).set_index("Model")

print("\n=== Final Mean Performance ===")
final_results_df



=== Evaluating CatBoost ===

=== Final Mean Performance ===


Unnamed: 0_level_0,Train Accuracy,Train F-score,Train Precision,Train Recall,Test Accuracy,Test F-score,Test Precision,Test Recall
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CatBoost,99.701493,99.475257,100.0,98.957355,95.826087,92.4154,96.285435,89.269247


In [4]:
# Optimized models

import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import shap

# Per-configuration settings, keyed by a short configuration name.
#   Selected_Index - positional column indices kept with .iloc; the subset is
#                    later split into features and the 'Recurred' target, so
#                    every list must include the target's position
#                    (presumably 16, which all three lists share - TODO confirm).
#   depth, learning_rate, iterations, l2_leaf_reg - passed straight to
#                    CatBoostClassifier.
model_configs = {
    'EVO': dict(
        Selected_Index=[4, 8, 9, 10, 12, 15, 16],
        depth=5,
        learning_rate=0.0816602590945627,
        iterations=134,
        l2_leaf_reg=2.31620884051237,
    ),
    'EO': dict(
        Selected_Index=[5, 10, 12, 14, 15, 16],
        depth=3,
        learning_rate=0.475142295243478,
        iterations=109,
        l2_leaf_reg=5.43716953821191,
    ),
    'EFO': dict(
        Selected_Index=[0, 1, 2, 3, 6, 10, 12, 13, 15, 16],
        depth=3,
        learning_rate=0.389825271452862,
        iterations=143,
        l2_leaf_reg=7.9763825368893,
    ),
}

# Re-read the raw table (string labels intact) and shuffle every row once
# with a fixed seed.
data = pd.read_csv('/content/drive/Othercomputers/My Laptop/Paper/06 _Thyroid_Cancer/Dataset/Thyroid_Diff.csv')
shuffled_data = data.sample(frac=1, random_state=42)

# Fold geometry: every test window holds the rows left over after reserving
# 70% for training, and successive windows start `shift` rows apart.
# NOTE: test_size (~30% of the rows) is larger than shift (~20%), so
# consecutive test windows overlap.
total_rows = len(shuffled_data)
nfolds = 5
train_size = int(0.7 * total_rows)
test_size = total_rows - train_size
shift = int(1 / nfolds * total_rows)

# Build the folds: a test window that wraps around to the top of the shuffled
# table when it runs past the last row, and the remaining rows as training data.
test_sets, train_sets = [], []
for fold_no in range(nfolds):
    first_row = fold_no * shift
    window = [(first_row + offset) % total_rows for offset in range(test_size)]
    held_out = shuffled_data.iloc[window].copy()
    test_sets.append(held_out)
    train_sets.append(shuffled_data.drop(held_out.index))

# Names shared by every configuration and fold below.
TARGET_COLUMN = 'Recurred'
POSITIVE_LABEL = 'Yes'  # class treated as positive for precision/recall/F1
SUMMARY_COLUMNS = [
    'Train Accuracy', 'Train F1 Score', 'Train Precision', 'Train Recall',
    'Test Accuracy', 'Test F1 Score', 'Test Precision', 'Test Recall',
]


def _percent_scores(y_true, y_pred, split):
    """Return accuracy, F1, precision and recall (as percentages) for one split.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels; POSITIVE_LABEL marks the positive class.
    split : str
        Prefix for the returned keys ('Train' or 'Test').

    Returns
    -------
    dict
        Keys '<split> Accuracy', '<split> F1 Score', '<split> Precision',
        '<split> Recall' (in that order), each a score multiplied by 100.
    """
    return {
        f'{split} Accuracy': accuracy_score(y_true, y_pred) * 100,
        f'{split} F1 Score': f1_score(y_true, y_pred, pos_label=POSITIVE_LABEL) * 100,
        f'{split} Precision': precision_score(y_true, y_pred, pos_label=POSITIVE_LABEL) * 100,
        f'{split} Recall': recall_score(y_true, y_pred, pos_label=POSITIVE_LABEL) * 100,
    }


# Final results: one single-row frame of fold-averaged metrics per configuration.
all_models_means = []

# The loop variable is `config_name` rather than `model`, so this cell no
# longer rebinds the name an earlier cell uses for a classifier object.
for config_name, cfg in model_configs.items():
    Five_fold_Result = []

    # Iterate over however many folds were built instead of a hard-coded 5.
    for fold, (trainset, testset) in enumerate(zip(train_sets, test_sets)):
        # Keep only this configuration's columns (selected by position).
        train_selected = trainset.iloc[:, cfg['Selected_Index']]
        test_selected = testset.iloc[:, cfg['Selected_Index']]
        if TARGET_COLUMN not in train_selected.columns:
            raise ValueError(
                f"Selected_Index for '{config_name}' does not include the "
                f"'{TARGET_COLUMN}' column"
            )

        X_train = train_selected.drop(columns=TARGET_COLUMN)
        Y_train = train_selected[TARGET_COLUMN]
        X_test = test_selected.drop(columns=TARGET_COLUMN)
        Y_test = test_selected[TARGET_COLUMN]

        # Text columns are handed to CatBoost as categorical features.
        Categorical_features = list(X_train.select_dtypes(include=['object']).columns)

        # NOTE(review): fit() is given no eval_set, so eval_metric presumably
        # does not influence the fitted model - confirm.
        best_model = CatBoostClassifier(
            depth=cfg['depth'],
            learning_rate=cfg['learning_rate'],
            iterations=cfg['iterations'],
            l2_leaf_reg=cfg['l2_leaf_reg'],
            verbose=False,
            random_seed=42,
            eval_metric='F1',
            loss_function='Logloss',
            cat_features=Categorical_features
        )
        best_model.fit(X_train, Y_train)

        # Predictions on the held-out and on the training rows
        Y_pred = best_model.predict(X_test)
        Y_pred_train = best_model.predict(X_train)

        Five_fold_Result.append({
            'Model': config_name,
            'fold': fold + 1,
            **_percent_scores(Y_train, Y_pred_train, 'Train'),
            **_percent_scores(Y_test, Y_pred, 'Test'),
        })

    # Average the folds into a single rounded row labelled with the configuration.
    Five_fold_Result_DF = pd.DataFrame(Five_fold_Result)
    Five_fold_Result_DF_mean = (
        Five_fold_Result_DF[SUMMARY_COLUMNS].mean().to_frame().T.round(2)
    )
    Five_fold_Result_DF_mean.insert(0, "Model", config_name)

    all_models_means.append(Five_fold_Result_DF_mean)

# Concatenate all configurations' mean results; displayed as the cell's output.
Final_Five_Fold_Result_DF = pd.concat(all_models_means, ignore_index=True)
Final_Five_Fold_Result_DF


Unnamed: 0,Model,Train Accuracy,Train F1 Score,Train Precision,Train Recall,Test Accuracy,Test F1 Score,Test Precision,Test Recall
0,EVO,96.64,93.82,97.44,90.47,96.35,93.34,96.19,90.94
1,EO,97.01,94.65,96.0,93.39,96.17,93.12,94.31,92.21
2,EFO,97.61,95.63,97.78,93.59,96.17,93.09,95.78,91.15
