In [1]:
import pandas as pd
import numpy as np
import os
import torch
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import backend as K
import xgboost as xgb

from sklearn.ensemble import StackingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier


def load_and_preprocess_data(dataset_path):
    """Load the variants and text tables, merge them and encode the columns.

    Reads ``training_variants/training_variants`` (default CSV parsing) and
    ``training_text/training_text`` (``||``-separated, first row skipped,
    columns named ``ID`` and ``Text``) from below *dataset_path*, joins them
    on ``ID``, drops ``ID`` and replaces missing values with ``"Unknown"``.

    Args:
        dataset_path: Directory that contains the two dataset sub-folders.

    Returns:
        A tuple ``(df, gene_ohe, variation_ohe, label_encoder, num_classes)``:
        the merged frame with ``Class`` replaced by label-encoded integers,
        dense one-hot arrays for the ``Gene`` and ``Variation`` columns, the
        fitted ``LabelEncoder`` and the number of distinct classes it saw.
    """
    variants_path = os.path.join(dataset_path, "training_variants/training_variants")
    text_path = os.path.join(dataset_path, "training_text/training_text")

    train_variants = pd.read_csv(variants_path)
    # Raw string: "\|" is an invalid escape sequence in a normal literal and
    # triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
    # A multi-character separator is treated as a regex, hence engine="python".
    train_text = pd.read_csv(
        text_path,
        sep=r"\|\|",
        engine="python",
        names=["ID", "Text"],
        skiprows=1,
    )

    df = pd.merge(train_variants, train_text, on="ID")
    df.drop(columns=["ID"], inplace=True)
    df.fillna("Unknown", inplace=True)

    label_encoder = LabelEncoder()
    df["Class"] = label_encoder.fit_transform(df["Class"])
    num_classes = len(label_encoder.classes_)

    # NOTE(review): both encoders are fitted on the full data set, i.e. before
    # any train/test split performed by the caller.
    gene_ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False).fit_transform(df[["Gene"]])
    variation_ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False).fit_transform(df[["Variation"]])

    return df, gene_ohe, variation_ohe, label_encoder, num_classes

def extract_tfidf(df):
    """Fit a TF-IDF vectorizer on ``df["Text"]`` and return the transformed matrix.

    The vectorizer keeps at most 5000 features and removes English stop words.
    The fitted vectorizer itself is not returned.
    """
    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    documents = df["Text"]
    tfidf_matrix = vectorizer.fit_transform(documents)
    return tfidf_matrix

def extract_biobert_embeddings(text_list):
    """Embed every text with BioBERT and return the stacked first-token vectors.

    Each text is tokenized on its own (truncated/padded to 256 tokens), run
    through ``dmis-lab/biobert-v1.1`` without gradient tracking, and the
    hidden state at sequence position 0 is kept as its embedding.

    Args:
        text_list: Iterable of texts; iterated once under a tqdm progress bar.

    Returns:
        ``np.array`` built from the per-text embedding vectors.
    """
    checkpoint = "dmis-lab/biobert-v1.1"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModel.from_pretrained(checkpoint).to(target_device)

    def _embed(text):
        # Tokenize a single document and move its tensors next to the model.
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=256)
        encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}
        with torch.no_grad():
            result = model(**encoded)
        first_token = result.last_hidden_state[:, 0, :].cpu().numpy()
        return first_token.squeeze()

    return np.array([_embed(text) for text in tqdm(text_list, desc="BioBERT")])

def apply_imbalance_strategy(X, y, strategy):
    """Optionally oversample ``(X, y)`` according to *strategy*.

    ``'a'`` resamples with SMOTE, ``'b'`` with RandomOverSampler (both seeded
    with ``random_state=42``); any other value returns the inputs unchanged.
    """
    # Anything that is not a resampling choice passes straight through.
    if strategy not in ('a', 'b'):
        return X, y

    sampler_cls = SMOTE if strategy == 'a' else RandomOverSampler
    sampler = sampler_cls(random_state=42)
    return sampler.fit_resample(X, y)

def focal_loss(gamma=2., alpha=.25):
    """Build a focal-loss function usable as a Keras ``loss``.

    Args:
        gamma: Exponent applied to ``(1 - y_pred)`` in the modulating factor.
        alpha: Constant multiplier applied to every term.

    Returns:
        A callable ``loss(y_true, y_pred)`` that clips the predictions away
        from 0 and 1, weights ``-y_true * log(y_pred)`` by
        ``alpha * (1 - y_pred) ** gamma``, sums over the last axis and
        averages the result.
    """
    def loss(y_true, y_pred):
        # Clip so the logarithm never sees exactly 0 (or 1).
        eps = K.epsilon()
        probs = K.clip(y_pred, eps, 1. - eps)
        modulating_factor = alpha * K.pow(1 - probs, gamma)
        log_term = -y_true * K.log(probs)
        per_sample = K.sum(modulating_factor * log_term, axis=-1)
        return K.mean(per_sample)
    return loss

def train_xgboost(X_train, y_train, X_test, y_test, use_weights=False, num_classes=9):
    """Train an XGBoost multi-class classifier and print its test metrics.

    Args:
        X_train, y_train: Training features and integer class labels.
        X_test, y_test: Held-out features and labels used for the report.
        use_weights: When true, fit with "balanced" per-sample weights
            computed from ``y_train``.
        num_classes: Number of target classes passed to XGBoost as
            ``num_class`` (defaults to 9, the value that used to be
            hard-coded).

    Prints the accuracy and a classification report; returns nothing.
    """
    sample_weights = (
        compute_sample_weight(class_weight="balanced", y=y_train) if use_weights else None
    )
    # `use_label_encoder` is intentionally not passed: current XGBoost releases
    # ignore it and emit a "Parameters ... are not used" warning.
    model = xgb.XGBClassifier(
        eval_metric="mlogloss",
        num_class=num_classes,
        objective="multi:softmax",
    )
    model.fit(X_train, y_train, sample_weight=sample_weights)
    preds = model.predict(X_test)
    print("XGBoost Accuracy:", accuracy_score(y_test, preds))
    # zero_division=0 reports 0.0 for classes that are never predicted without
    # raising an UndefinedMetricWarning for each affected metric.
    print(classification_report(y_test, preds, zero_division=0))

def train_keras_nn(X_train, y_train, X_test, y_test, use_focal=False, num_classes=9):
    """Train a small dense Keras network and print its test metrics.

    The network is 512 -> 256 -> ``num_classes`` units with ReLU activations,
    dropout of 0.3 after each hidden layer and a softmax output. It is
    trained for 10 epochs with Adam and batch size 32.

    Args:
        X_train, y_train: Training features and integer class labels.
        X_test, y_test: Held-out features and labels; also passed to
            ``fit`` as ``validation_data``.
        use_focal: When true, train with ``focal_loss()`` instead of
            categorical cross-entropy.
        num_classes: Number of target classes; sizes the one-hot labels and
            the output layer (defaults to 9, the value that used to be
            hard-coded).

    Prints the accuracy and a classification report; returns nothing.
    """
    y_train_cat = to_categorical(y_train, num_classes=num_classes)
    y_test_cat = to_categorical(y_test, num_classes=num_classes)

    model = Sequential([
        Dense(512, activation='relu', input_dim=X_train.shape[1]),
        Dropout(0.3),
        Dense(256, activation='relu'),
        Dropout(0.3),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(optimizer='adam',
                  loss=focal_loss() if use_focal else 'categorical_crossentropy',
                  metrics=['accuracy'])

    # NOTE(review): the test split doubles as validation data here, so the
    # per-epoch validation numbers are not an independent estimate.
    model.fit(X_train, y_train_cat, epochs=10, batch_size=32, validation_data=(X_test, y_test_cat))
    preds = np.argmax(model.predict(X_test), axis=1)
    print("Keras NN Accuracy:", accuracy_score(y_test, preds))
    # zero_division=0 reports 0.0 for classes that are never predicted without
    # raising an UndefinedMetricWarning for each affected metric.
    print(classification_report(y_test, preds, zero_division=0))

def train_stacked_ensemble(X_train, y_train, X_test, y_test):
    """Train a stacking ensemble and print its test metrics.

    Base learners are XGBoost, a random forest and extra-trees; their
    outputs are combined by a logistic-regression final estimator.

    Args:
        X_train, y_train: Training features and integer class labels.
        X_test, y_test: Held-out features and labels used for the report.

    Prints the accuracy and a classification report; returns nothing.
    """
    base_models = [
        # `use_label_encoder` is intentionally not passed: current XGBoost
        # releases ignore it and emit a "Parameters ... are not used" warning.
        ('xgb', xgb.XGBClassifier(eval_metric="mlogloss")),
        # Seeded like the split and the resamplers so runs are repeatable.
        ('rf', RandomForestClassifier(random_state=42)),
        ('et', ExtraTreesClassifier(random_state=42)),
    ]
    ensemble = StackingClassifier(estimators=base_models, final_estimator=LogisticRegression())
    ensemble.fit(X_train, y_train)
    preds = ensemble.predict(X_test)
    print("Stacked Ensemble Accuracy:", accuracy_score(y_test, preds))
    # zero_division=0 reports 0.0 for classes that are never predicted without
    # raising an UndefinedMetricWarning for each affected metric.
    print(classification_report(y_test, preds, zero_division=0))

def train_lightgbm(X_train, y_train, X_test, y_test):
    """Train a default ``LGBMClassifier`` and print its test metrics.

    Args:
        X_train, y_train: Training features and class labels.
        X_test, y_test: Held-out features and labels used for the report.

    Prints the accuracy and a classification report; returns nothing.
    """
    model = LGBMClassifier()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print("LightGBM Accuracy:", accuracy_score(y_test, preds))
    # zero_division=0 reports 0.0 for classes that are never predicted without
    # raising an UndefinedMetricWarning for each affected metric.
    print(classification_report(y_test, preds, zero_division=0))

# === Driver ===
if __name__ == "__main__":
    from scipy.sparse import hstack

    def _ask_choice(prompt, valid_choices):
        """Prompt until-exit: return a normalized answer or abort on an invalid one."""
        answer = input(prompt).strip().lower()
        if answer not in valid_choices:
            # Fail fast instead of silently falling into a default branch
            # (e.g. a typo used to start the expensive BioBERT extraction).
            raise SystemExit(
                f"Invalid choice {answer!r}; expected one of: {', '.join(valid_choices)}"
            )
        return answer

    dataset_path = "/content/drive/MyDrive/msk-redefining-cancer-treatment"
    df, gene_ohe, variation_ohe, label_encoder, num_classes = load_and_preprocess_data(dataset_path)

    text_mode = _ask_choice("Choose text feature method (a: TF-IDF, b: BioBERT): ", ("a", "b"))
    imbalance_mode = _ask_choice(
        "Choose imbalance handling (a: SMOTE, b: RandomOverSampler, c: Class Weights, d: Focal Loss): ",
        ("a", "b", "c", "d"),
    )
    model_type = _ask_choice(
        "Choose model (a: XGBoost, b: Keras NN, c: Stacked Ensemble, d: LightGBM): ",
        ("a", "b", "c", "d"),
    )

    # Class weights are only consumed by the XGBoost trainer and focal loss
    # only by the Keras trainer; say so instead of silently ignoring them.
    if imbalance_mode == 'c' and model_type != 'a':
        print("Note: class weights are only applied by the XGBoost model; no imbalance handling will be used.")
    if imbalance_mode == 'd' and model_type != 'b':
        print("Note: focal loss is only applied by the Keras NN model; no imbalance handling will be used.")

    if text_mode == 'a':
        # TF-IDF output is sparse, so stack with scipy to keep it sparse.
        text_features = extract_tfidf(df)
        X_all = hstack((gene_ohe, variation_ohe, text_features))
    else:
        text_features = extract_biobert_embeddings(df["Text"])
        X_all = np.hstack((gene_ohe, variation_ohe, text_features))
    y_all = df["Class"].values

    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, stratify=y_all, random_state=42)
    # Resample the training split only, so the test split stays untouched.
    X_train, y_train = apply_imbalance_strategy(X_train, y_train, imbalance_mode)

    if model_type == 'a':
        train_xgboost(X_train, y_train, X_test, y_test, use_weights=(imbalance_mode == 'c'))
    elif model_type == 'b':
        train_keras_nn(X_train, y_train, X_test, y_test, use_focal=(imbalance_mode == 'd'))
    elif model_type == 'c':
        train_stacked_ensemble(X_train, y_train, X_test, y_test)
    elif model_type == 'd':
        train_lightgbm(X_train, y_train, X_test, y_test)

Choose text feature method (a: TF-IDF, b: BioBERT): a
Choose imbalance handling (a: SMOTE, b: RandomOverSampler, c: Class Weights, d: Focal Loss): a
Choose model (a: XGBoost, b: Keras NN, c: Stacked Ensemble, d: LightGBM): a


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.6421052631578947
              precision    recall  f1-score   support

           0       0.56      0.64      0.60       114
           1       0.61      0.54      0.57        91
           2       0.62      0.28      0.38        18
           3       0.68      0.65      0.67       137
           4       0.29      0.25      0.27        48
           5       0.80      0.58      0.67        55
           6       0.71      0.85      0.77       191
           7       0.00      0.00      0.00         4
           8       0.83      0.71      0.77         7

    accuracy                           0.64       665
   macro avg       0.57      0.50      0.52       665
weighted avg       0.64      0.64      0.63       665



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
