In [None]:

import numpy as np
import pandas as pd
import ast
import sys

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import ParameterSampler
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import AdaBoostClassifier

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score

from itertools import chain

sys.path.append(r"C:\\Users\\roeed\\OneDrive\\Documents\\Projects\\iml-hack-oncology")

from preprocessing.preprocess_data import preprocess
import preprocessing.data_completion as data_completion

import os

np.random.seed(42)
top_k = 3

# ---- 1. Import the data ----
father_folder = os.path.dirname(os.getcwd())
X = pd.read_csv(f'{father_folder}/train_test_splits/train_split.feats.csv')
y_raw = pd.read_csv(f'{father_folder}/train_test_splits/train_split.labels.0.csv')

X_test_feats = pd.read_csv(f'{father_folder}/train_test_splits/test.feats.csv')

# change the column name to 'metastasis' for consistency
y_raw.rename(columns={'אבחנה-Location of distal metastases': 'metastasis'}, inplace=True)

# Convert string to actual list
y_raw['metastasis'] = y_raw['metastasis'].apply(ast.literal_eval)

preprocess(X)

# Convert the 'metastasis' column to a list of unique labels
y_raw = y_raw['metastasis'].tolist()

# convert inner empty lists to list with the string 'None'
y_raw = [labels if labels else ['None'] for labels in y_raw]

  data[col] = pd.to_datetime(data[col], errors='coerce')
  data[col] = pd.to_datetime(data[col], errors='coerce')
  data[col] = pd.to_datetime(data[col], errors='coerce')


In [138]:
possible_labels = list(set(chain.from_iterable(y_raw)))

# Binarize labels
mlb = MultiLabelBinarizer(classes=possible_labels)
y = mlb.fit_transform(y_raw)

# ---- 2. Train/test split ----
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ---- 3. Split the data into training and validation sets ----
X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(X_train, y_train, test_size=0.8, random_state=42)

In [None]:
# Define parameter grid for AdaBoostClassifier
param_dist_ada = {
    'base_estimator__n_estimators': np.arange(25, 201, 25),
    'base_estimator__learning_rate': np.linspace(0.01, 2.0, 20),
    'base_estimator__estimator__max_depth': [1, 2, 3, 4, 5, 6]
}

# Create base estimator
base_ada = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(),
    random_state=42
)

# Create ClassifierChain
classifier_chain = ClassifierChain(base_ada, random_state=42)

# Set up random search for AdaBoost
random_search_ada = RandomizedSearchCV(
    estimator=classifier_chain,
    param_distributions=param_dist_ada,
    n_iter=20,
    cv=5,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit the model
random_search_ada.fit(X_train_val, y_train_val)
print(random_search_ada)


Fitting 5 folds for each of 15 candidates, totalling 75 fits
RandomizedSearchCV(cv=5,
                   estimator=ClassifierChain(base_estimator=AdaBoostClassifier(estimator=DecisionTreeClassifier(),
                                                                               random_state=42),
                                             random_state=42),
                   n_iter=15, n_jobs=-1,
                   param_distributions={'base_estimator__estimator__max_depth': [1,
                                                                                 2,
                                                                                 3,
                                                                                 4,
                                                                                 5,
                                                                                 6],
                                        'base_estimator__learning_rate': array([0.01      , 0

In [140]:
print(f"Best parameters: {random_search_ada.best_params_}")
print(f"Best cross-validation score: {random_search_ada.best_score_:.4f}")

Best parameters: {'base_estimator__n_estimators': np.int64(100), 'base_estimator__learning_rate': np.float64(1.1621052631578948), 'base_estimator__estimator__max_depth': 5}
Best cross-validation score: 0.4974


In [159]:
# Train the final model with best parameters on the full training set
best_params_ada = random_search_ada.best_params_
print(f"Best parameters for AdaBoost: {best_params_ada}")
print(f"Best cross-validation score for AdaBoost: {random_search_ada.best_score_:.4f}")
# Train the final model with best parameters on full training set
best_ada = AdaBoostClassifier(
    n_estimators=best_params_ada['base_estimator__n_estimators'],
    learning_rate=best_params_ada['base_estimator__learning_rate'],
    estimator=DecisionTreeClassifier(
        max_depth=best_params_ada['base_estimator__estimator__max_depth']
    ),
    random_state=42
)

classifier_chain = ClassifierChain(best_ada, random_state=42)

# Fit the final model on the full training set
classifier_chain.fit(X_train, y_train)

# Predict on test/validation split
y_pred_ada = classifier_chain.predict(X_test)
# Print classification report
print("Final AdaBoost Classification report:")
print(classification_report(y_test, y_pred_ada, target_names=mlb.classes_))

Best parameters for AdaBoost: {'base_estimator__n_estimators': np.int64(100), 'base_estimator__learning_rate': np.float64(1.1621052631578948), 'base_estimator__estimator__max_depth': 5}
Best cross-validation score for AdaBoost: 0.4974
Final AdaBoost Classification report:
                   precision    recall  f1-score   support

    HEP - Hepatic       0.98      0.97      0.97       100
      BRA - Brain       1.00      1.00      1.00        10
 PER - Peritoneum       1.00      1.00      1.00         4
  PUL - Pulmonary       1.00      0.96      0.98        72
     PLE - Pleura       1.00      1.00      1.00         3
LYM - Lymph nodes       0.98      0.96      0.97        97
      BON - Bones       1.00      0.96      0.98       248
MAR - Bone Marrow       0.00      0.00      0.00         0
       SKI - Skin       1.00      1.00      1.00        14
             None       1.00      1.00      1.00      7530
   ADR - Adrenals       0.00      0.00      0.00         0
      OTH - Other 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Convert prediction vectors to lists of metastasis sites using mlb.classes_
def vectors_to_metastasis_lists(y_pred, mlb):
    return [
        [mlb.classes_[i] for i, val in enumerate(row) if int(val) == 1]
        for row in y_pred
    ]

# Example usage:
y_pred_ada_sites = vectors_to_metastasis_lists(y_pred_ada, mlb)
print("Unique inner lists in y_pred_ada_sites:")
print(set(tuple(sorted(labels)) for labels in y_pred_ada_sites))



Unique inner lists in y_pred_ada_sites:
{('BRA - Brain', 'None'), ('HEP - Hepatic', 'PUL - Pulmonary'), ('MAR - Bone Marrow',), ('BON - Bones', 'LYM - Lymph nodes', 'PUL - Pulmonary'), ('BON - Bones', 'LYM - Lymph nodes'), ('None', 'PER - Peritoneum'), ('SKI - Skin',), ('BON - Bones', 'PER - Peritoneum'), ('BON - Bones', 'HEP - Hepatic'), ('None', 'PLE - Pleura'), ('BON - Bones', 'HEP - Hepatic', 'LYM - Lymph nodes'), ('BON - Bones', 'PLE - Pleura'), ('BON - Bones', 'BRA - Brain'), ('OTH - Other', 'PUL - Pulmonary'), ('LYM - Lymph nodes', 'PUL - Pulmonary'), ('LYM - Lymph nodes',), ('MAR - Bone Marrow', 'None'), ('LYM - Lymph nodes', 'PER - Peritoneum', 'PUL - Pulmonary'), ('BON - Bones', 'HEP - Hepatic', 'MAR - Bone Marrow', 'PUL - Pulmonary'), ('HEP - Hepatic',), ('BON - Bones', 'MAR - Bone Marrow'), ('BON - Bones', 'PUL - Pulmonary'), ('None',), ('HEP - Hepatic', 'LYM - Lymph nodes'), ('BON - Bones',), ('BON - Bones', 'HEP - Hepatic', 'PUL - Pulmonary'), ('OTH - Other', 'SKI - Skin'

In [165]:
X_test_feats = pd.read_csv(f'{father_folder}/train_test_splits/test.feats.csv')
X_test_feats = preprocess(X_test_feats)

  X_test_feats = pd.read_csv(f'{father_folder}/train_test_splits/test.feats.csv')
  row[Columns.SURGERY_DATE1],
  row[Columns.SURGERY_DATE1],
  row[Columns.SURGERY_DATE1],


In [None]:
# Predict on the test features
y_preds = classifier_chain.predict(X_test_feats)
# Convert prediction vectors to lists of metastasis sites using mlb.classes_
y_preds_sites = vectors_to_metastasis_lists(y_preds, mlb)
print("Unique inner lists in y_preds_sites:")

