In [9]:
# === STEP 1: Import Required Libraries ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import pickle

# === STEP 2: Load Dataset ===
df = pd.read_csv("cirrhosis.csv")

# === STEP 3: Drop rows with missing target variable ===
df = df.dropna(subset=["Stage"]).copy()

# === STEP 4: Encode Categorical Features ===
df["Sex"] = df["Sex"].map({"M": 1, "F": 0})
df["Ascites"] = df["Ascites"].map({"Y": 1, "N": 0})
df["Hepatomegaly"] = df["Hepatomegaly"].map({"Y": 1, "N": 0})
df["Spiders"] = df["Spiders"].map({"Y": 1, "N": 0})
df["Edema"] = df["Edema"].map({
    "No edema": 0,
    "Edema no diuretics": 0.5,
    "Edema despite diuretics": 1
})

# === STEP 5: Drop Irrelevant/High-Missing Columns ===
df.drop(columns=["ID", "Status", "Drug", "N_Days"], inplace=True, errors="ignore")

# === STEP 6: Convert all columns to numeric (in case any remain non-numeric) ===
df = df.apply(pd.to_numeric, errors="coerce")

# === STEP 7: Fill Remaining NaNs with Column Means ===
df = df.fillna(df.mean(numeric_only=True))

# === STEP 8: Split Features and Labels ===
X = df.drop("Stage", axis=1)
y = df["Stage"]-1

# === STEP 9: Train-Test Split ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === STEP 10: Scaling ===
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === STEP 11: Train the Model ===
model = XGBClassifier(eval_metric='mlogloss', random_state=42)
model.fit(X_train_scaled, y_train)

# === STEP 12: Evaluate the Model ===
y_pred = model.predict(X_test_scaled)
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))

# === STEP 13: Save Model and Scaler ===
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print(" Model and scaler saved successfully.")


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


=== Classification Report ===
              precision    recall  f1-score   support

         0.0       1.00      0.50      0.67         6
         1.0       0.18      0.17      0.17        18
         2.0       0.47      0.57      0.52        28
         3.0       0.62      0.58      0.60        31

    accuracy                           0.48        83
   macro avg       0.57      0.45      0.49        83
weighted avg       0.50      0.48      0.48        83

 Model and scaler saved successfully.


In [10]:
# Check NaNs in full dataset
print("NaNs before splitting (whole df):")
print(df.isnull().sum())
print("Total NaNs:", df.isnull().sum().sum())


NaNs before splitting (whole df):
Age                0
Sex                0
Ascites            0
Hepatomegaly       0
Spiders            0
Edema            412
Bilirubin          0
Cholesterol        0
Albumin            0
Copper             0
Alk_Phos           0
SGOT               0
Tryglicerides      0
Platelets          0
Prothrombin        0
Stage              0
dtype: int64
Total NaNs: 412


In [11]:
# === Liver Cirrhosis Stage Prediction: Model Training Script ===

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import pickle

# === STEP 1: Load and Clean Dataset ===
df = pd.read_csv("cirrhosis.csv")

# Drop rows with missing target
df = df.dropna(subset=["Stage"]).copy()

# Encode categorical columns
df["Sex"] = df["Sex"].map({"M": 1, "F": 0})
df["Ascites"] = df["Ascites"].map({"Y": 1, "N": 0})
df["Hepatomegaly"] = df["Hepatomegaly"].map({"Y": 1, "N": 0})
df["Spiders"] = df["Spiders"].map({"Y": 1, "N": 0})

# Fix and map 'Edema' values safely
df["Edema"] = df["Edema"].astype(str).str.strip().str.lower()

# Print unique values for verification (optional)
print("Unique Edema values:", df["Edema"].unique())

# Map known values
edema_mapping = {
    "no edema": 0,
    "edema no diuretics": 0.5,
    "edema despite diuretics": 1,
    "0": 0,
    "0.0": 0,
    "0.5": 0.5,
    "1": 1,
    "1.0": 1
}
# Final mapping for Edema values based on your actual data
df["Edema"] = df["Edema"].astype(str).str.lower().str.strip()

# Map the actual values seen: 'y', 'n', 's'
df["Edema"] = df["Edema"].map({
    "n": 0,    # No edema
    "y": 1,    # Edema present
    "s": 0.5   # Some edema (assumed meaning)
})


# Drop unused or high-missing columns
df.drop(columns=["ID", "Status", "Drug", "N_Days"], inplace=True, errors="ignore")

# Convert all to numeric and fill any remaining NaNs
df = df.apply(pd.to_numeric, errors='coerce')
df = df.fillna(df.mean(numeric_only=True))

# Sanity check
print("NaNs in df after mapping and fill:", df.isnull().sum())
print("Total NaNs:", df.isnull().sum().sum())

# === STEP 2: Split Features and Labels ===
X = df.drop("Stage", axis=1)
y = df["Stage"] - 1  # Shift labels to start from 0 for XGBoost

# === STEP 3: Train-Test Split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Confirm NaNs in split
print("NaNs in X_train:", np.isnan(X_train).sum().sum())
print("NaNs in X_test:", np.isnan(X_test).sum().sum())

# === STEP 4: Scaling ===
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === STEP 5: Train the Model ===
model = XGBClassifier(eval_metric='mlogloss', random_state=42)
model.fit(X_train_scaled, y_train)

# === STEP 6: Evaluate the Model ===
y_pred = model.predict(X_test_scaled)
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))

# === STEP 7: Save the Model and Scaler ===
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print(" Model and Scaler saved successfully.")

Unique Edema values: ['y' 'n' 's']
NaNs in df after mapping and fill: Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
dtype: int64
Total NaNs: 0
NaNs in X_train: 0
NaNs in X_test: 0
=== Classification Report ===
              precision    recall  f1-score   support

         0.0       1.00      0.17      0.29         6
         1.0       0.21      0.22      0.22        18
         2.0       0.52      0.61      0.56        28
         3.0       0.57      0.55      0.56        31

    accuracy                           0.47        83
   macro avg       0.57      0.39      0.40        83
weighted avg       0.50      0.47      0.46        83

 Model and Scaler saved successfully.


In [12]:
print("NaNs in df after mapping and fill:", df.isnull().sum())
print("Total NaNs:", df.isnull().sum().sum())


NaNs in df after mapping and fill: Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
dtype: int64
Total NaNs: 0


In [13]:
print(df['Stage'].value_counts())

Stage
3.0    155
4.0    144
2.0     92
1.0     21
Name: count, dtype: int64


In [19]:
from imblearn.over_sampling import SMOTE

# After train-test split and scaling:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

print("Before SMOTE:", np.bincount(y_train))
print("After SMOTE: ", np.bincount(y_resampled))

# Train the model on balanced data
model = XGBClassifier(eval_metric='mlogloss', random_state=42)
model.fit(X_resampled, y_resampled)
print("Model trained successfully!")  # cleaner on GitHub



  print("Before SMOTE:", np.bincount(y_train))
  print("After SMOTE: ", np.bincount(y_resampled))


Before SMOTE: [ 15  74 127 113]
After SMOTE:  [127 127 127 127]
Model trained successfully!


In [15]:
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)


In [20]:
model.fit(X_resampled, y_resampled)
print(" Model training complete.")

 Model training complete.


In [21]:
y_pred = model.predict(X_test_scaled)
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))

=== Classification Report ===
              precision    recall  f1-score   support

         0.0       0.50      0.33      0.40         6
         1.0       0.11      0.11      0.11        18
         2.0       0.43      0.46      0.45        28
         3.0       0.63      0.61      0.62        31

    accuracy                           0.43        83
   macro avg       0.42      0.38      0.39        83
weighted avg       0.44      0.43      0.44        83

