In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Load the liver cirrhosis CSV from the working directory into `df`.
df = pd.read_csv(filepath_or_buffer='liver_cirrhosis.csv')

In [5]:
# Print each column's name, non-null count and dtype to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   N_Days         25000 non-null  int64  
 1   Status         25000 non-null  object 
 2   Drug           25000 non-null  object 
 3   Age            25000 non-null  int64  
 4   Sex            25000 non-null  object 
 5   Ascites        25000 non-null  object 
 6   Hepatomegaly   25000 non-null  object 
 7   Spiders        25000 non-null  object 
 8   Edema          25000 non-null  object 
 9   Bilirubin      25000 non-null  float64
 10  Cholesterol    25000 non-null  float64
 11  Albumin        25000 non-null  float64
 12  Copper         25000 non-null  float64
 13  Alk_Phos       25000 non-null  float64
 14  SGOT           25000 non-null  float64
 15  Tryglicerides  25000 non-null  float64
 16  Platelets      25000 non-null  float64
 17  Prothrombin    25000 non-null  float64
 18  Stage          25000 non-null  int64  
dtypes: float64(9), int64(3), object(7)
memory usage: 3.6+ MB

In [7]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,2221,C,Placebo,18499,F,N,Y,N,N,0.5,149.0,4.04,227.0,598.0,52.7,57.0,256.0,9.9,1
1,1230,C,Placebo,19724,M,Y,N,Y,N,0.5,219.0,3.93,22.0,663.0,45.0,75.0,220.0,10.8,2
2,4184,C,Placebo,11839,F,N,N,N,N,0.5,320.0,3.54,51.0,1243.0,122.45,80.0,225.0,10.0,2
3,2090,D,Placebo,16467,F,N,N,N,N,0.7,255.0,3.74,23.0,1024.0,77.5,58.0,151.0,10.2,2
4,2105,D,Placebo,21699,F,N,Y,N,N,1.9,486.0,3.54,74.0,1052.0,108.5,109.0,151.0,11.5,1


In [9]:
# Count the rows per Stage value to see how balanced the classes are.
df.loc[:, 'Stage'].value_counts()

Stage
2    8441
3    8294
1    8265
Name: count, dtype: int64

In [5]:
# Encode categorical features
# Each listed column of `df` is replaced in place by the integer codes of its
# own LabelEncoder. The fitted encoders are kept in `label_encoders` and the
# label -> code mapping of every column is recorded in `encoding_dict`.
categorical_features = ['Status', 'Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']

# This cell overwrites `df`, so running it a second time would fit the encoders
# on the integer codes and silently replace the label mapping with
# {0: 0, 1: 1, ...}. Fail loudly instead, and do so before the dictionaries
# below are reset so the encoders from the first run are not lost.
already_encoded = [col for col in categorical_features if pd.api.types.is_numeric_dtype(df[col])]
if already_encoded:
    raise RuntimeError(
        f"Columns already label-encoded: {already_encoded}. "
        "Re-run the cell that loads the CSV before running this cell again."
    )

label_encoders = {}
encoding_dict = {}
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
    # int() turns the NumPy integer codes into plain Python ints so the printed
    # dictionary reads the same under NumPy 1.x and 2.x (which would otherwise
    # show values as np.int64(...)).
    encoding_dict[col] = {
        label: int(code)
        for label, code in zip(le.classes_, le.transform(le.classes_))
    }

# Print encoding dictionary
print("Encoding Dictionary:")
print(encoding_dict)


Encoding Dictionary:
{'Status': {'C': 0, 'CL': 1, 'D': 2}, 'Drug': {'D-penicillamine': 0, 'Placebo': 1}, 'Sex': {'F': 0, 'M': 1}, 'Ascites': {'N': 0, 'Y': 1}, 'Hepatomegaly': {'N': 0, 'Y': 1}, 'Spiders': {'N': 0, 'Y': 1}, 'Edema': {'N': 0, 'S': 1, 'Y': 2}}


In [11]:
# Train and evaluate an XGBoost classifier that predicts `Stage` from every
# other column of the (already label-encoded) frame `df`.
# NOTE(review): `Status` and `N_Days` look like follow-up outcome fields --
# confirm they are legitimately available at prediction time.
X = df.drop(columns=["Stage"])  # Features
y = df["Stage"]  # Target

# Shift labels from [1, 2, 3] to [0, 1, 2] for model training
y = y - 1

# Split into training and testing sets BEFORE any fitted preprocessing, so that
# no statistic of the held-out rows can influence the training pipeline.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize numerical features
# The scaler learns mean/std from the training rows only and the same
# transformation is then applied to the test rows. Fitting it on the full
# dataset (as before) leaks test-set statistics into preprocessing.
# `X` itself is intentionally left unscaled.
scaler = StandardScaler()
numeric_features = ['N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin']
# Work on explicit copies so the column assignment below never writes into a
# view of `X`.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# Train XGBoost Classifier
xgb = XGBClassifier(objective='multi:softmax', num_class=3, eval_metric='mlogloss', random_state=42)
xgb.fit(X_train, y_train)

# Predictions
y_pred = xgb.predict(X_test)

# Map back the predictions and true labels to the original stages (1, 2, 3)
y_pred_stage = y_pred + 1
y_test_stage = y_test + 1

# Evaluate model
accuracy = accuracy_score(y_test_stage, y_pred_stage)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test_stage, y_pred_stage))
print("Confusion Matrix:\n", confusion_matrix(y_test_stage, y_pred_stage))


Accuracy: 0.9596
Classification Report:
               precision    recall  f1-score   support

           1       0.97      0.94      0.96      1653
           2       0.94      0.97      0.95      1688
           3       0.97      0.97      0.97      1659

    accuracy                           0.96      5000
   macro avg       0.96      0.96      0.96      5000
weighted avg       0.96      0.96      0.96      5000

Confusion Matrix:
 [[1561   67   25]
 [  39 1630   19]
 [  16   36 1607]]
