In [9]:
# Import necessary libraries
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from scikeras.wrappers import KerasClassifier

In [10]:
# Load dataset
data = pd.read_csv('/kaggle/input/aml-with-target/all_patients_cell_counts_with_target.csv')

In [11]:
# Split data into features (X) and target (y)
X = data.drop(columns=['target', 'patient'])  
y = data['target']

In [12]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [14]:
# Calculate scale_pos_weight for each class for XGBoost
class_counts = Counter(y_train_smote)
scale_pos_weights = {cls: max(class_counts.values()) / count for cls, count in class_counts.items()}

In [15]:
le = LabelEncoder()
y_train_smote_encoded = le.fit_transform(y_train_smote)
y_test_encoded = le.transform(y_test)

In [16]:
xgb_model = xgb.XGBClassifier(
    scale_pos_weight=list(scale_pos_weights.values()), 
    use_label_encoder=False, 
    eval_metric='mlogloss', 
    random_state=42
)

In [17]:
num_leaves = 2 ** 8  
lgb_model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='multiclass',
    num_class=len(scale_pos_weights),
    metric='multi_logloss',
    max_depth=8,
    num_leaves=num_leaves,
    learning_rate=0.1,
    n_estimators=500
)

cat_model = CatBoostClassifier(
    iterations=500, 
    depth=6, 
    learning_rate=0.1, 
    random_seed=42, 
    verbose=0
)

In [18]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)

In [19]:
def create_nn_model():
    model = Sequential()
    model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(len(le.classes_), activation='softmax'))  # Output layer for multiclass classification
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [20]:
# Wrap the neural network model using SciKerasClassifier
nn_model = KerasClassifier(model=create_nn_model, epochs=50, batch_size=16, verbose=0)

In [21]:
# Combine the models in a Voting Ensemble with soft voting
voting_model = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('xgb', xgb_model),
        ('cat', cat_model),
        ('lgb', lgb_model),
        ('nn', nn_model)  
    ],
    voting='soft'  # Use soft voting to account for prediction probabilities
)

In [None]:
# Train the voting ensemble on SMOTE-balanced data
voting_model.fit(X_train_smote, y_train_smote)

In [23]:
# Evaluate the ensemble model
y_pred = voting_model.predict(X_test)
print("Voting Ensemble with SMOTE and Neural Network:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=le.classes_))

Voting Ensemble with SMOTE and Neural Network:
Accuracy: 0.5
               precision    recall  f1-score   support

   CBFB_MYH11       0.25      0.33      0.29         6
         NPM1       0.50      0.30      0.37        10
     PML_RARA       0.50      0.67      0.57         3
RUNX1_RUNX1T1       0.29      0.25      0.27         8
      control       0.77      0.91      0.83        11

     accuracy                           0.50        38
    macro avg       0.46      0.49      0.47        38
 weighted avg       0.49      0.50      0.49        38

