#Data Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("model1.csv")
df.head()

Unnamed: 0,Patient ID,sex,Age,Height,Weight,BMI,waist,Fasting Blood sugar,Cre,Urea,...,Ferritin,Insulin,CRP,Vitamin A,OH,Steatosis stage,CAP score,E score,Fibroscan S,Fibroscan F
0,TAL01331,M,34,186,96.0,27.7,102.0,102,1.0,30,...,131.51,19.55,2.2,57.0,25.0,3,310.0,5.3,3,1
1,TAL02130,M,48,173,88.0,29.0,103.0,104,1.0,30,...,158.0,10.0,1.0,29.0,25.0,1,289.0,3.6,3,0
2,TAL03101,M,52,173,88.0,29.0,103.0,99,1.14,38,...,271.0,10.0,2.0,29.0,25.0,1,237.0,4.5,0,1
3,TAL04102,F,68,173,88.0,29.0,103.0,98,1.0,30,...,158.0,10.0,1.0,29.0,25.0,1,240.0,9.0,0,2
4,TAL05211,F,68,173,88.0,29.0,103.0,100,1.0,30,...,158.0,10.0,2.0,29.0,25.0,2,260.0,6.0,1,1


In [3]:
df.shape

(113, 62)

In [4]:
features = [
    "Age", "sex", "BMI", "waist", "Fasting Blood sugar",
    "Triglyceride", "Cholestrol", "HDL", "LDL",
    "GGT", "AST (SGOT)", "ALT (SGPT)", "HbA1c", "Insulin"
]

In [5]:
X = df[features].copy()
X.loc[:, "sex"] = X["sex"].map({"M": 1, "F": 0})

y = df["Steatosis stage"].apply(lambda x: 0 if x <= 1 else 1)

In [6]:
from sklearn.impute import SimpleImputer

In [7]:
imputer = SimpleImputer(strategy="median")
X_imputed = imputer.fit_transform(X)

In [8]:
from imblearn.over_sampling import SMOTE

In [9]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_imputed, y)

In [10]:
print(f"Original Class Distribution: {np.bincount(y)}")
print(f"Resampled Class Distribution: {np.bincount(y_resampled)}")

Original Class Distribution: [69 44]
Resampled Class Distribution: [69 69]


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully for X_train and X_test.")

Features scaled successfully for X_train and X_test.


#Building Logistic Regression Model and Finetuning with Grid Search

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define the parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

# Instantiate LogisticRegression model
log_reg = LogisticRegression(random_state=42, max_iter=1000)

# Instantiate GridSearchCV
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# Fit GridSearchCV to the scaled training data
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters and best score
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters found: {'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'}
Best cross-validation accuracy: 0.7455


In [15]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Get the best estimator from GridSearchCV
best_model = grid_search.best_estimator_

# Make predictions on the scaled test data
y_pred = best_model.predict(X_test_scaled)

# Calculate and print the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Calculate and print the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Calculate and print the confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Test Accuracy: 0.5000

Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.47      0.50        15
           1       0.47      0.54      0.50        13

    accuracy                           0.50        28
   macro avg       0.50      0.50      0.50        28
weighted avg       0.51      0.50      0.50        28


Confusion Matrix:
[[7 8]
 [6 7]]


##Saving Best Logistic Regression Model

In [16]:
import pickle


filename = 'best_logistic_reg.pkl'
pickle.dump(best_model, open(filename, 'wb'))

print(f"Best Logistic Regression model saved as {filename}")

Best Logistic Regression model saved as best_logistic_reg.pkl


#Building Voting Classifier and Comparison

##Importing Required Models

In [17]:
with open('mlrp_model_v1.pkl', 'rb') as file:
    mlrp_model = pickle.load(file)

print("Model 'mlrp_model_v1.pkl' loaded successfully.")

Model 'mlrp_model_v1.pkl' loaded successfully.


In [18]:
with open('best_logistic_reg.pkl', 'rb') as file:
    logistic_reg_model = pickle.load(file)

print("Model 'best_logistic_reg.pkl' loaded successfully.")

Model 'best_logistic_reg.pkl' loaded successfully.


##Building Voting Classifier

In [26]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [20]:
ensemble_model = VotingClassifier(estimators=[
    ('mlrp', mlrp_model),
    ('logistic', logistic_reg_model)
], voting='hard', n_jobs=-1) # 'hard' voting uses predicted class labels

ensemble_model.fit(X_train_scaled, y_train)

print("Ensemble (VotingClassifier) model trained successfully.")

Ensemble (VotingClassifier) model trained successfully.


##Comparing Models

In [21]:
y_pred_ensemble = ensemble_model.predict(X_test_scaled)

accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
print(f"Ensemble Model Test Accuracy: {accuracy_ensemble:.4f}")

print("\nEnsemble Model Classification Report:")
print(classification_report(y_test, y_pred_ensemble))

print("\nEnsemble Model Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_ensemble))

Ensemble Model Test Accuracy: 0.5714

Ensemble Model Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.60      0.60        15
           1       0.54      0.54      0.54        13

    accuracy                           0.57        28
   macro avg       0.57      0.57      0.57        28
weighted avg       0.57      0.57      0.57        28


Ensemble Model Confusion Matrix:
[[9 6]
 [6 7]]


In [22]:
y_pred_mlrp = mlrp_model.predict(X_test_scaled)

accuracy_mlrp = accuracy_score(y_test, y_pred_mlrp)
print(f"MLRP Model Test Accuracy: {accuracy_mlrp:.4f}")

print("\nMLRP Model Classification Report:")
print(classification_report(y_test, y_pred_mlrp))

print("\nMLRP Model Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_mlrp))

MLRP Model Test Accuracy: 0.5357

MLRP Model Classification Report:
              precision    recall  f1-score   support

           0       0.54      1.00      0.70        15
           1       0.00      0.00      0.00        13

    accuracy                           0.54        28
   macro avg       0.27      0.50      0.35        28
weighted avg       0.29      0.54      0.37        28


MLRP Model Confusion Matrix:
[[15  0]
 [13  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
