In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier

from imblearn.over_sampling import SMOTE

#Model tesets
from sklearn.linear_model import LogisticRegression
import joblib


In [2]:
X = pd.read_csv("X.csv", index_col=0)  
y = pd.read_csv("y.csv")["label"]

In [3]:
y = y.str.replace("leukemia class: ", "", regex=False)
y = y.str.strip()
print(y.value_counts())

label
CLL                                                448
AML with normal karyotype + other abnormalities    351
c-ALL/Pre-B-ALL without t(9;22)                    237
MDS                                                206
T-ALL                                              174
c-ALL/Pre-B-ALL with t(9;22)                       122
CML                                                 76
Non-leukemia and healthy bone marrow                74
Pro-B-ALL with t(11q23)/MLL                         70
ALL with t(12;21)                                   58
AML complex aberrant karyotype                      48
ALL with hyperdiploid karyotype                     40
AML with t(8;21)                                    40
AML with t(11q23)/MLL                               38
AML with t(15;17)                                   37
ALL with t(1;19)                                    36
AML with inv(16)/t(16;16)                           28
mature B-ALL with t(8;14)                           13
Name

In [4]:
def map_label(label: str) -> str:
    label = label.upper()  # normalize
    
    if "ALL" in label:
        return "ALL"
    elif "AML" in label:
        return "AML"
    elif "CML" in label:
        return "CML"
    elif "CLL" in label:
        return "CLL"
    elif "MDS" in label:
        return "MDS"
    else:
        return "Healthy"  

In [5]:
y = y.apply(map_label)
print(y.value_counts())

label
ALL        750
AML        542
CLL        448
MDS        206
CML         76
Healthy     74
Name: count, dtype: int64


In [8]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(dict(zip(le.classes_, range(len(le.classes_)))))

{'ALL': 0, 'AML': 1, 'CLL': 2, 'CML': 3, 'Healthy': 4, 'MDS': 5}


In [9]:
print("NaNs in data:", np.isnan(X.values).sum())
print("Infs in data:", np.isinf(X.values).sum())
X = X.fillna(0)

NaNs in data: 0
Infs in data: 0


In [10]:
selector = SelectKBest(score_func=f_classif, k=10000)
X = selector.fit_transform(X, y_encoded)

 54639 54640 54641 54645 54646 54647 54648 54649 54650 54651 54652 54653
 54654 54655 54656 54657 54658 54659 54660 54661 54662 54663 54664 54665
 54666 54667 54668 54669 54670 54671 54672 54673 54674] are constant.
  f = msb / msw


In [11]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=69, stratify=y_encoded
)
print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)

(1676, 10000) (1676,) (420, 10000) (420,)


In [14]:
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", np.bincount(y_train))
print("After SMOTE :", np.bincount(y_res))

# X_train_res, y_train_res = X_train, y_train

Before SMOTE: [600 433 358  61  59 165]
After SMOTE : [600 600 600 600 600 600]


In [15]:
log_reg = LogisticRegression(
    max_iter=10000, 
    solver="liblinear", 
    penalty="l2",
    C=1.0,
    random_state=69
)
log_reg.fit(X_res, y_res)




0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,69
,solver,'liblinear'
,max_iter,10000


In [16]:
y_pred = log_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.919047619047619

Classification Report:
               precision    recall  f1-score   support

         ALL       1.00      0.94      0.97       150
         AML       0.97      0.87      0.92       109
         CLL       0.99      0.98      0.98        90
         CML       0.71      1.00      0.83        15
     Healthy       0.50      0.73      0.59        15
         MDS       0.73      0.88      0.80        41

    accuracy                           0.92       420
   macro avg       0.82      0.90      0.85       420
weighted avg       0.94      0.92      0.92       420


Confusion Matrix:
 [[141   1   1   3   3   1]
 [  0  95   0   2   4   8]
 [  0   0  88   0   2   0]
 [  0   0   0  15   0   0]
 [  0   0   0   0  11   4]
 [  0   2   0   1   2  36]]


In [17]:
mlp = MLPClassifier(
    hidden_layer_sizes=(144, 72),  
    activation="logistic",         
    solver="adam",              
    max_iter=500,                   
    random_state=69,
    early_stopping=True,
    n_iter_no_change=20
)

mlp.fit(X_res, y_res)

0,1,2
,hidden_layer_sizes,"(144, ...)"
,activation,'logistic'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,500
,shuffle,True


In [18]:
y_pred = mlp.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9571428571428572

Classification Report:
               precision    recall  f1-score   support

         ALL       0.99      0.97      0.98       150
         AML       0.97      0.93      0.95       109
         CLL       0.99      1.00      0.99        90
         CML       1.00      1.00      1.00        15
     Healthy       0.91      0.67      0.77        15
         MDS       0.77      0.98      0.86        41

    accuracy                           0.96       420
   macro avg       0.94      0.92      0.93       420
weighted avg       0.96      0.96      0.96       420


Confusion Matrix:
 [[146   3   1   0   0   0]
 [  1 101   0   0   0   7]
 [  0   0  90   0   0   0]
 [  0   0   0  15   0   0]
 [  0   0   0   0  10   5]
 [  0   0   0   0   1  40]]


In [19]:
joblib.dump(log_reg, "log_reg_model.pkl")
joblib.dump(mlp, "mlp_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(le, "label_encoder.pkl")

print("Model, scaler, and label encoder saved.")

Model, scaler, and label encoder saved.
