In [None]:

import numpy as np
import pandas as pd
import ast
import sys

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import ParameterSampler
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import AdaBoostClassifier

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score

from itertools import chain

sys.path.append(r"C:\\Users\\roeed\\OneDrive\\Documents\\Projects\\iml-hack-oncology")

from preprocessing.preprocess_data import preprocess
import preprocessing.data_completion as data_completion

import os

np.random.seed(42)
top_k = 3

# ---- 1. Import the data ----
father_folder = os.path.dirname(os.getcwd())
X = pd.read_csv(f'{father_folder}/train_test_splits/train_split.feats.csv')
y_raw = pd.read_csv(f'{father_folder}/train_test_splits/train_split.labels.0.csv')

X_test_feats = pd.read_csv(f'{father_folder}/train_test_splits/test.feats.csv')

# change the column name to 'metastasis' for consistency
y_raw.rename(columns={'אבחנה-Location of distal metastases': 'metastasis'}, inplace=True)

# Convert string to actual list
y_raw['metastasis'] = y_raw['metastasis'].apply(ast.literal_eval)

preprocess(X)

# Convert the 'metastasis' column to a list of unique labels
y_raw = y_raw['metastasis'].tolist()

# convert inner empty lists to list with the string 'None'
y_raw = [labels if labels else ['None'] for labels in y_raw]

  data[col] = pd.to_datetime(data[col], errors='coerce')
  data[col] = pd.to_datetime(data[col], errors='coerce')
  data[col] = pd.to_datetime(data[col], errors='coerce')


In [138]:
possible_labels = list(set(chain.from_iterable(y_raw)))

# Binarize labels
mlb = MultiLabelBinarizer(classes=possible_labels)
y = mlb.fit_transform(y_raw)

# ---- 2. Train/test split ----
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ---- 3. Split the data into training and validation sets ----
X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(X_train, y_train, test_size=0.8, random_state=42)

In [None]:
# Define parameter grid for AdaBoostClassifier
param_dist_ada = {
    'base_estimator__n_estimators': np.arange(25, 201, 25),
    'base_estimator__learning_rate': np.linspace(0.01, 2.0, 20),
    'base_estimator__estimator__max_depth': [1, 2, 3, 4, 5, 6]
}

# Create base estimator
base_ada = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(),
    random_state=42
)

# Create ClassifierChain
classifier_chain = ClassifierChain(base_ada, random_state=42)

# Set up random search for AdaBoost
random_search_ada = RandomizedSearchCV(
    estimator=classifier_chain,
    param_distributions=param_dist_ada,
    n_iter=20,
    cv=5,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit the model
random_search_ada.fit(X_train_val, y_train_val)
print(random_search_ada)


Fitting 5 folds for each of 15 candidates, totalling 75 fits
RandomizedSearchCV(cv=5,
                   estimator=ClassifierChain(base_estimator=AdaBoostClassifier(estimator=DecisionTreeClassifier(),
                                                                               random_state=42),
                                             random_state=42),
                   n_iter=15, n_jobs=-1,
                   param_distributions={'base_estimator__estimator__max_depth': [1,
                                                                                 2,
                                                                                 3,
                                                                                 4,
                                                                                 5,
                                                                                 6],
                                        'base_estimator__learning_rate': array([0.01      , 0

In [140]:
print(f"Best parameters: {random_search_ada.best_params_}")
print(f"Best cross-validation score: {random_search_ada.best_score_:.4f}")

Best parameters: {'base_estimator__n_estimators': np.int64(100), 'base_estimator__learning_rate': np.float64(1.1621052631578948), 'base_estimator__estimator__max_depth': 5}
Best cross-validation score: 0.4974


In [159]:
# Train the final model with best parameters on the full training set
best_params_ada = random_search_ada.best_params_
print(f"Best parameters for AdaBoost: {best_params_ada}")
print(f"Best cross-validation score for AdaBoost: {random_search_ada.best_score_:.4f}")
# Train the final model with best parameters on full training set
best_ada = AdaBoostClassifier(
    n_estimators=best_params_ada['base_estimator__n_estimators'],
    learning_rate=best_params_ada['base_estimator__learning_rate'],
    estimator=DecisionTreeClassifier(
        max_depth=best_params_ada['base_estimator__estimator__max_depth']
    ),
    random_state=42
)

classifier_chain = ClassifierChain(best_ada, random_state=42)

# Fit the final model on the full training set
classifier_chain.fit(X_train, y_train)

# Predict on test/validation split
y_pred_ada = classifier_chain.predict(X_test)
# Print classification report
print("Final AdaBoost Classification report:")
print(classification_report(y_test, y_pred_ada, target_names=mlb.classes_))

Best parameters for AdaBoost: {'base_estimator__n_estimators': np.int64(100), 'base_estimator__learning_rate': np.float64(1.1621052631578948), 'base_estimator__estimator__max_depth': 5}
Best cross-validation score for AdaBoost: 0.4974
Final AdaBoost Classification report:
                   precision    recall  f1-score   support

    HEP - Hepatic       0.98      0.97      0.97       100
      BRA - Brain       1.00      1.00      1.00        10
 PER - Peritoneum       1.00      1.00      1.00         4
  PUL - Pulmonary       1.00      0.96      0.98        72
     PLE - Pleura       1.00      1.00      1.00         3
LYM - Lymph nodes       0.98      0.96      0.97        97
      BON - Bones       1.00      0.96      0.98       248
MAR - Bone Marrow       0.00      0.00      0.00         0
       SKI - Skin       1.00      1.00      1.00        14
             None       1.00      1.00      1.00      7530
   ADR - Adrenals       0.00      0.00      0.00         0
      OTH - Other 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [184]:
# Convert prediction vectors to lists of metastasis sites using mlb.classes_
def vectors_to_metastasis_lists(y_pred, mlb):
    preds = [
        [mlb.classes_[i] for i, val in enumerate(row) if int(val) == 1]
        for row in y_pred
    ]
    # remove 'None' from the lists
    preds = [[label for label in labels if label != 'None'] for labels in preds]
    # convert the inner lists to strings
    preds = [str(labels) for labels in preds]
    return preds

# Example usage:
y_pred_ada_sites = vectors_to_metastasis_lists(y_pred_ada, mlb)


In [185]:
X_test_feats = pd.read_csv(f'{father_folder}/train_test_splits/test.feats.csv')
X_test_feats = preprocess(X_test_feats)

  X_test_feats = pd.read_csv(f'{father_folder}/train_test_splits/test.feats.csv')
  row[Columns.SURGERY_DATE1],
  row[Columns.SURGERY_DATE1],
  row[Columns.SURGERY_DATE1],


In [None]:
# Predict on the test features
y_preds = classifier_chain.predict(X_test_feats)
# Convert prediction vectors to lists of metastasis sites using mlb.classes_
y_preds_sites = vectors_to_metastasis_lists(y_preds, mlb)
print("Unique inner lists in y_preds_sites:")
y_preds_sites

In [188]:
# Convert the predictions to string format for DataFrame
y_preds_str = [str(labels) for labels in y_preds_sites]
y_preds = pd.DataFrame(y_preds_str, columns=['אבחנה-Location of distal metastases'], dtype=object)
print(y_preds_str)

['[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', "['HEP - Hepatic', 'BON - Bones']", '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]

In [189]:
print(y_preds)

      אבחנה-Location of distal metastases
0                                      []
1                                      []
2                                      []
3                                      []
4                                      []
...                                   ...
16442                                  []
16443                                  []
16444                                  []
16445                                  []
16446                                  []

[16447 rows x 1 columns]


In [191]:
import numpy as np
import pandas as pd
import ast
import sys

from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score

from itertools import chain

sys.path.append(r"C:\\Users\\roeed\\OneDrive\\Documents\\Projects\\iml-hack-oncology")

from preprocessing.preprocess_data import preprocess
import preprocessing.data_completion as data_completion

import os

np.random.seed(42)
top_k = 3

# ---- 1. Import the data ----
father_folder = os.path.dirname(os.getcwd())
X = pd.read_csv(f'{father_folder}/train_test_splits/train_split.feats.csv')
y_raw = pd.read_csv(f'{father_folder}/train_test_splits/train_split.labels.0.csv')

# Load the test features for the final predictions
X_test_feats = pd.read_csv(f'{father_folder}/train_test_splits/test.feats.csv')
preprocess(X_test_feats)

# change the column name to 'metastasis' for consistency
y_raw.rename(columns={'אבחנה-Location of distal metastases': 'metastasis'}, inplace=True)

# Convert string to actual list
y_raw['metastasis'] = y_raw['metastasis'].apply(ast.literal_eval)

preprocess(X)

# Convert the 'metastasis' column to a list of unique labels
y_raw = y_raw['metastasis'].tolist()

# convert inner empty lists to list with the string 'None'
y_raw = [labels if labels else ['None'] for labels in y_raw]

possible_labels = list(set(chain.from_iterable(y_raw)))

# Binarize labels
mlb = MultiLabelBinarizer(classes=possible_labels)
y = mlb.fit_transform(y_raw)

# ---- 2. Train/test split ----
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Using a Random Forest classifier with OneVsRest strategy
model = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=5))
model.fit(X_train, y_train)

# ---- 4. Prediction with top-3 filtering ----
y_proba = model.predict_proba(X_test)

# Apply top-3 threshold per patient
y_pred_topk = np.zeros_like(y_proba)
for i in range(y_proba.shape[0]):
    top_indices = np.argsort(y_proba[i])[-top_k:]
    y_pred_topk[i, top_indices] = 1


# ---- 5. Evaluation ----
print("Classification report with top-3 filtered predictions:")
print(classification_report(y_test, y_pred_topk, target_names=mlb.classes_))
print("Micro-F1:", f1_score(y_test, y_pred_topk, average='micro'))
print("Macro-F1:", f1_score(y_test, y_pred_topk, average='macro'))


# --- 6. Randomized Search for Hyperparameter Tuning ---

# Split the training data into a smaller training and validation set for hyperparameter tuning
X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(X_train, y_train, test_size=0.8, random_state=42)

# Define parameter grid for AdaBoostClassifier
param_dist_ada = {
    'base_estimator__n_estimators': np.arange(25, 201, 25),
    'base_estimator__learning_rate': np.linspace(0.01, 2.0, 20),
    'base_estimator__estimator__max_depth': [1, 2, 3, 4, 5, 6]
}

# Create base estimator
base_ada = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(),
    random_state=42
)

# Create ClassifierChain
classifier_chain = ClassifierChain(base_ada, random_state=42)

# Set up random search for AdaBoost
random_search_ada = RandomizedSearchCV(
    estimator=classifier_chain,
    param_distributions=param_dist_ada,
    n_iter=20,
    cv=5,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit the model
random_search_ada.fit(X_train_val, y_train_val)

# Train the final model with best parameters on the full training set
best_params_ada = random_search_ada.best_params_
print(f"Best parameters for AdaBoost: {best_params_ada}")
print(f"Best cross-validation score for AdaBoost: {random_search_ada.best_score_:.4f}")
# Train the final model with best parameters on full training set
best_ada = AdaBoostClassifier(
    n_estimators=best_params_ada['base_estimator__n_estimators'],
    learning_rate=best_params_ada['base_estimator__learning_rate'],
    estimator=DecisionTreeClassifier(
        max_depth=best_params_ada['base_estimator__estimator__max_depth']
    ),
    random_state=42
)

classifier_chain_ada = ClassifierChain(best_ada, random_state=42)

# Fit the final model on the full training set
classifier_chain_ada.fit(X_train, y_train)

# Predict on test/validation split
y_pred_ada = classifier_chain_ada.predict(X_test)

# Print classification report
print("AdaBoost Classification report:")
print(classification_report(y_test, y_pred_ada, target_names=mlb.classes_))

  X_test_feats = pd.read_csv(f'{father_folder}/train_test_splits/test.feats.csv')
  row[Columns.SURGERY_DATE1],
  row[Columns.SURGERY_DATE1],
  row[Columns.SURGERY_DATE1],
  row[Columns.SURGERY_DATE1],
  row[Columns.SURGERY_DATE1],
  row[Columns.SURGERY_DATE1],


Classification report with top-3 filtered predictions:
                   precision    recall  f1-score   support

    HEP - Hepatic       0.64      0.93      0.76       100
      BRA - Brain       0.57      0.80      0.67        10
 PER - Peritoneum       0.75      0.75      0.75         4
  PUL - Pulmonary       0.65      0.90      0.76        72
     PLE - Pleura       0.23      1.00      0.38         3
LYM - Lymph nodes       0.68      0.89      0.77        97
      BON - Bones       0.77      0.91      0.84       248
MAR - Bone Marrow       0.00      0.00      0.00         0
       SKI - Skin       0.43      0.93      0.59        14
             None       0.97      1.00      0.98      7530
   ADR - Adrenals       0.00      0.00      0.00         0
      OTH - Other       0.00      1.00      0.00         3

        micro avg       0.34      0.99      0.51      8081
        macro avg       0.47      0.76      0.54      8081
     weighted avg       0.95      0.99      0.97      8081

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best parameters for AdaBoost: {'base_estimator__n_estimators': np.int64(175), 'base_estimator__learning_rate': np.float64(1.266842105263158), 'base_estimator__estimator__max_depth': 4}
Best cross-validation score for AdaBoost: 0.4881
AdaBoost Classification report:
                   precision    recall  f1-score   support

    HEP - Hepatic       0.93      0.97      0.95       100
      BRA - Brain       1.00      1.00      1.00        10
 PER - Peritoneum       1.00      1.00      1.00         4
  PUL - Pulmonary       0.99      0.96      0.97        72
     PLE - Pleura       1.00      1.00      1.00         3
LYM - Lymph nodes       0.98      0.96      0.97        97
      BON - Bones       1.00      0.96      0.98       248
MAR - Bone Marrow       0.00      0.00      0.00         0
       SKI - Skin       1.00      1.00      1.00        14
             None       1.00      1.00      1.00      7530
   ADR - Adrenals       0.00      0.00      0.00         0
      OTH - Other       1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [192]:
y_preds = classifier_chain_ada.predict(X_test_feats)

In [196]:
# Convert prediction vectors to lists of metastasis sites using mlb.classes_
def vectors_to_metastasis_lists(y_pred, mlb):
    preds = [
        [mlb.classes_[i] for i, val in enumerate(row) if int(val) == 1]
        for row in y_pred
    ]
    # remove 'None' from the lists
    preds = [[label for label in labels if label != 'None'] for labels in preds]
    # convert the inner lists to strings
    preds = [str(labels) for labels in preds]
    preds = pd.DataFrame(preds, columns=['אבחנה-Location of distal metastases'])
    return preds

# Example usage:
y_preds_df = vectors_to_metastasis_lists(y_preds, mlb)

y_preds_df

Unnamed: 0,אבחנה-Location of distal metastases
0,[]
1,[]
2,[]
3,[]
4,[]
...,...
16442,[]
16443,[]
16444,[]
16445,[]


In [199]:
# Save the predictions to a CSV file
y_preds_df.to_csv(f'{father_folder}/train_test_splits/predictions_metastasis.csv', index=False, encoding='utf-8-sig')

In [216]:
X_full = pd.read_csv(f'{father_folder}/train_test_splits/train.feats.csv')
preprocess(X_full)
y_train_preds = classifier_chain_ada.predict(X_full)
# Convert training predictions to lists of metastasis sites using mlb.classes_
y_train_preds_df = vectors_to_metastasis_lists(y_train_preds, mlb)
# Save the training predictions to a CSV file
y_train_preds_df.to_csv(f'{father_folder}/train_test_splits/train_predictions_metastasis.csv', index=False, encoding='utf-8-sig', header=True)
y_train_preds_df

  X_full = pd.read_csv(f'{father_folder}/train_test_splits/train.feats.csv')
  row[Columns.SURGERY_DATE1],
  row[Columns.SURGERY_DATE1],


Unnamed: 0,אבחנה-Location of distal metastases
0,[]
1,[]
2,[]
3,[]
4,[]
...,...
49346,[]
49347,[]
49348,[]
49349,[]


In [None]:
y_raw = pd.read_csv(f'{father_folder}/train_test_splits/train.labels.0.csv')
# len of y_raw
len(y_raw)


49351

In [214]:
y_raw

Unnamed: 0,אבחנה-Location of distal metastases
0,[]
1,[]
2,[]
3,[]
4,[]
...,...
49346,[]
49347,[]
49348,[]
49349,[]


In [217]:
y_train_preds_df = pd.read_csv(f'{father_folder}/train_test_splits/train_predictions_metastasis.csv')
# len of y_train_preds_df
len(y_train_preds_df)

49351

In [215]:
y_train_preds_df

Unnamed: 0,[]
0,[]
1,[]
2,[]
3,[]
4,[]
...,...
49345,[]
49346,[]
49347,[]
49348,[]
