In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler


#Model tesets
from sklearn.linear_model import LogisticRegression
import joblib


In [2]:
X = pd.read_csv("X.csv", index_col=0)  
y = pd.read_csv("y.csv")["label"]

In [3]:
y = y.str.replace("leukemia class: ", "", regex=False)
y = y.str.strip()
print(y.value_counts())

label
CLL                                                448
AML with normal karyotype + other abnormalities    351
c-ALL/Pre-B-ALL without t(9;22)                    237
MDS                                                206
T-ALL                                              174
c-ALL/Pre-B-ALL with t(9;22)                       122
CML                                                 76
Non-leukemia and healthy bone marrow                74
Pro-B-ALL with t(11q23)/MLL                         70
ALL with t(12;21)                                   58
AML complex aberrant karyotype                      48
ALL with hyperdiploid karyotype                     40
AML with t(8;21)                                    40
AML with t(11q23)/MLL                               38
AML with t(15;17)                                   37
ALL with t(1;19)                                    36
AML with inv(16)/t(16;16)                           28
mature B-ALL with t(8;14)                           13
Name

In [4]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(dict(zip(le.classes_, range(len(le.classes_)))))

{'ALL with hyperdiploid karyotype': 0, 'ALL with t(12;21)': 1, 'ALL with t(1;19)': 2, 'AML complex aberrant karyotype': 3, 'AML with inv(16)/t(16;16)': 4, 'AML with normal karyotype + other abnormalities': 5, 'AML with t(11q23)/MLL': 6, 'AML with t(15;17)': 7, 'AML with t(8;21)': 8, 'CLL': 9, 'CML': 10, 'MDS': 11, 'Non-leukemia and healthy bone marrow': 12, 'Pro-B-ALL with t(11q23)/MLL': 13, 'T-ALL': 14, 'c-ALL/Pre-B-ALL with t(9;22)': 15, 'c-ALL/Pre-B-ALL without t(9;22)': 16, 'mature B-ALL with t(8;14)': 17}


In [5]:
print("NaNs in data:", np.isnan(X.values).sum())
print("Infs in data:", np.isinf(X.values).sum())
X = X.fillna(0)


NaNs in data: 94320
Infs in data: 0


In [6]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
log_reg = LogisticRegression(
    max_iter=1000, 
    solver="liblinear", 
    penalty="l2", 
    C=1.0,
    random_state=42
)
log_reg.fit(X_train_scaled, y_train)




0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [9]:
y_pred = log_reg.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8595238095238096

Classification Report:
                                                  precision    recall  f1-score   support

                ALL with hyperdiploid karyotype       0.62      1.00      0.76         8
                              ALL with t(12;21)       0.71      1.00      0.83        12
                               ALL with t(1;19)       0.86      0.86      0.86         7
                 AML complex aberrant karyotype       0.75      0.90      0.82        10
                      AML with inv(16)/t(16;16)       0.75      1.00      0.86         6
AML with normal karyotype + other abnormalities       0.94      0.71      0.81        70
                          AML with t(11q23)/MLL       0.70      0.88      0.78         8
                              AML with t(15;17)       1.00      1.00      1.00         7
                               AML with t(8;21)       0.89      1.00      0.94         8
                                            CLL       1

In [10]:
joblib.dump(log_reg, "log_reg_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(le, "label_encoder.pkl")

print("Model, scaler, and label encoder saved.")

Model, scaler, and label encoder saved.
