In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier

from imblearn.over_sampling import SMOTE

# Model tests
from sklearn.linear_model import LogisticRegression
import joblib


In [None]:
# Feature matrix: the first CSV column becomes the row index.
X = pd.read_csv("X.csv", index_col=0)

# Targets: keep only the "label" column of y.csv.
# NOTE(review): rows of y.csv are presumably in the same order as X.csv — confirm.
y = pd.read_csv("y.csv").loc[:, "label"]

In [None]:
# Remove the literal "leukemia class: " prefix from every label (plain-text
# match, not a regex), then trim leading/trailing whitespace.
y = y.str.replace("leukemia class: ", "", regex=False).str.strip()

# Per-class sample counts after cleaning.
class_counts = y.value_counts()
print(class_counts)

In [None]:
# Map the string class names to integer codes; `le` is kept so the codes can
# be translated back to names later (e.g. in the classification reports).
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Show which integer code was assigned to each class name.
class_to_code = {name: code for code, name in enumerate(le.classes_)}
print(class_to_code)

In [None]:
# Report non-finite entries before cleaning.
# NOTE(review): np.isnan / np.isinf assume every column of X is numeric — confirm.
raw_values = X.to_numpy()
print("NaNs in data:", np.isnan(raw_values).sum())
print("Infs in data:", np.isinf(raw_values).sum())

# `fillna` only touches NaN, so +/-inf would survive it and be rejected by the
# downstream scikit-learn steps. Map infinities to NaN first, then zero-fill
# every missing value in one pass.
X = X.replace([np.inf, -np.inf], np.nan).fillna(0)


In [None]:
# Supervised feature selection must only see training labels; fitting it on
# the full dataset leaks test-set information into the chosen features and
# inflates the reported scores.
#
# The row indices below are produced with the same test_size / stratify /
# random_state as the train/test split performed later in the notebook, so
# they identify exactly the rows that end up in the training set. Keep the
# two calls in sync if either is changed.
fit_rows, _ = train_test_split(
    np.arange(len(y_encoded)),
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# Cap k at the number of available columns so the cell also works on
# datasets with fewer than 10000 features.
selector = SelectKBest(score_func=f_classif, k=min(10000, X.shape[1]))

# Fit the ANOVA F-test ranking on the training rows only (positional
# indexing works for both a DataFrame and a plain array) ...
X_fit = X.iloc[fit_rows] if hasattr(X, "iloc") else X[fit_rows]
selector.fit(X_fit, y_encoded[fit_rows])

# ... then keep the selected columns for every row.
X = selector.transform(X)

In [None]:

# Hold out 20% of the samples for testing. The split is stratified on the
# encoded labels and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

In [None]:
# Oversample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Per-class sample counts before and after resampling.
for tag, labels in (("Before SMOTE:", y_train), ("After SMOTE :", y_train_res)):
    print(tag, np.bincount(labels))

In [None]:
# Learn the scaling statistics from the resampled training data only, then
# apply that same fitted transform to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train_res)

X_train_scaled = scaler.transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

In [None]:
# L2-regularised logistic regression baseline.
# NOTE(review): recent scikit-learn releases reportedly deprecate the
# "liblinear" solver for targets with more than two classes — confirm against
# the installed version before upgrading.
log_reg_params = dict(
    penalty="l2",
    C=1.0,
    solver="liblinear",
    max_iter=10000,
    random_state=69,
)
log_reg = LogisticRegression(**log_reg_params)

# Train on the resampled, scaled training data.
log_reg.fit(X_train_scaled, y_train_res)


In [None]:
# Score the logistic-regression model on the held-out test split.
y_pred = log_reg.predict(X_test_scaled)

log_reg_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", log_reg_accuracy)

# Per-class precision/recall/F1, labelled with the original class names.
log_reg_report = classification_report(y_test, y_pred, target_names=le.classes_)
print("\nClassification Report:\n", log_reg_report)

log_reg_confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", log_reg_confusion)

In [None]:
# Two-hidden-layer perceptron (128 then 64 units, ReLU, Adam). Early stopping
# is enabled with a patience of 20 iterations; training is capped at 500
# iterations either way.
mlp_params = dict(
    hidden_layer_sizes=(128, 64),
    activation="relu",
    solver="adam",
    max_iter=500,
    early_stopping=True,
    n_iter_no_change=20,
    random_state=69,
)
mlp = MLPClassifier(**mlp_params)

# Train on the resampled, scaled training data.
mlp.fit(X_train_scaled, y_train_res)

In [None]:
# Score the MLP on the held-out test split.
y_pred = mlp.predict(X_test_scaled)

mlp_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", mlp_accuracy)

# Per-class precision/recall/F1, labelled with the original class names.
mlp_report = classification_report(y_test, y_pred, target_names=le.classes_)
print("\nClassification Report:\n", mlp_report)

mlp_confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", mlp_confusion)

In [None]:
# Persist every fitted object needed to score new raw samples. The feature
# selector is included because the scaler and both models were fitted on the
# selected columns only; without it the saved pipeline cannot be re-applied.
artifacts = {
    "log_reg_model.pkl": log_reg,
    "mlp_model.pkl": mlp,
    "feature_selector.pkl": selector,
    "scaler.pkl": scaler,
    "label_encoder.pkl": le,
}
for artifact_path, fitted_object in artifacts.items():
    joblib.dump(fitted_object, artifact_path)

print("Models, feature selector, scaler, and label encoder saved.")