In [129]:

import numpy as np
import pandas as pd
import ast
import sys

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import ParameterSampler
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import AdaBoostClassifier

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score

from itertools import chain

sys.path.append(r"C:\\Users\\roeed\\OneDrive\\Documents\\Projects\\iml-hack-oncology")

from preprocessing.preprocess_data import preprocess
import preprocessing.data_completion as data_completion

import os

# Seed NumPy's global RNG so the run is repeatable
np.random.seed(42)
top_k = 3

# ---- 1. Import the data ----
# The CSV splits live in a sibling folder of the notebook's working directory
father_folder = os.path.dirname(os.getcwd())
X = pd.read_csv(f'{father_folder}/train_test_splits/train_split.feats.csv')
labels_frame = pd.read_csv(f'{father_folder}/train_test_splits/train_split.labels.0.csv')

# Give the label column a short ASCII name
labels_frame = labels_frame.rename(
    columns={'אבחנה-Location of distal metastases': 'metastasis'}
)

# Each cell holds the text of a Python list literal; parse it into a real list
labels_frame['metastasis'] = labels_frame['metastasis'].apply(ast.literal_eval)

# NOTE(review): the return value is discarded, so this presumably mutates X
# in place -- confirm against preprocessing.preprocess_data.
preprocess(X)

# One list of labels per sample; samples without any label get ['None']
y_raw = [row or ['None'] for row in labels_frame['metastasis'].tolist()]

  data[col] = pd.to_datetime(data[col], errors='coerce')
  data[col] = pd.to_datetime(data[col], errors='coerce')
  data[col] = pd.to_datetime(data[col], errors='coerce')


In [130]:
# Collect every distinct label that appears in the data.
# Sorting pins the column order of the binarized matrix: iterating a set of
# strings is not stable across interpreter sessions, so an unsorted list would
# reshuffle `mlb.classes_` (and the label each seeded chain position refers to)
# on every kernel restart.
possible_labels = sorted(set(chain.from_iterable(y_raw)))

# Binarize labels: one 0/1 column per entry of `possible_labels`
mlb = MultiLabelBinarizer(classes=possible_labels)
y = mlb.fit_transform(y_raw)

# ---- 2. Train/test split ----
# Hold out 20% of the samples for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ---- 3. Split the data into training and validation sets ----
# NOTE(review): test_size=0.8 leaves only 20% of X_train in X_train_val and
# puts 80% in X_test_val -- confirm this ratio is intended and not inverted.
X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(X_train, y_train, test_size=0.8, random_state=42)

In [131]:
# Candidate weak learners: decision trees of depth 1 through 5
weak_learners = [DecisionTreeClassifier(max_depth=depth) for depth in range(1, 6)]

# Hyperparameter space sampled for AdaBoost
param_dist_ada = dict(
    n_estimators=np.arange(25, 401, 25),
    learning_rate=np.linspace(0.01, 2.0, 20),
    estimator=weak_learners,
)

# Randomized search: 15 sampled configurations, 5-fold CV, scored by macro F1
ada_search = RandomizedSearchCV(
    estimator=AdaBoostClassifier(random_state=42),
    param_distributions=param_dist_ada,
    n_iter=15,
    cv=5,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
    random_state=42,
)

# Use the search as the per-label estimator of a classifier chain whose
# label order is drawn at random (seeded)
random_search_ada = ClassifierChain(ada_search, order='random', random_state=42)



In [None]:
# Fit the classifier chain on the training split (the 80% kept by the
# train/test split). `random_search_ada` is a ClassifierChain wrapping a
# RandomizedSearchCV, so this call is what runs the hyperparameter search;
# no "best model" has been selected before this point.
random_search_ada.fit(X_train, y_train)
# Show the fitted chain's configuration
print(random_search_ada)

In [None]:
# Predict label indicators for the held-out test split
y_pred_ada = random_search_ada.predict(X_test)

# Per-label precision / recall / F1, with rows named after the binarizer's classes
print("Final AdaBoost Classification report:")
ada_report = classification_report(y_test, y_pred_ada, target_names=mlb.classes_)
print(ada_report)