In [14]:
import pandas as pd

# Load the dataset
df = pd.read_csv("cirrhosis.csv")
print("Shape:", df.shape)
df.head(n=300)


Shape: (418, 20)


Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.60,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.10,55.0,151.0,12.0,4.0
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,296,1321,C,Placebo,11462,F,N,N,N,N,0.8,328.0,3.31,62.0,1105.0,137.00,95.0,293.0,10.9,4.0
296,297,533,CL,D-penicillamine,20449,M,N,Y,N,N,1.2,275.0,3.43,100.0,1142.0,75.00,91.0,217.0,11.3,4.0
297,298,1300,C,Placebo,19258,F,N,Y,N,N,1.1,340.0,3.37,73.0,289.0,97.00,93.0,243.0,10.2,3.0
298,299,1293,C,D-penicillamine,13913,F,N,N,N,N,2.4,342.0,3.76,90.0,1653.0,150.00,127.0,213.0,10.8,3.0


In [2]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             418 non-null    int64  
 1   N_Days         418 non-null    int64  
 2   Status         418 non-null    object 
 3   Drug           312 non-null    object 
 4   Age            418 non-null    int64  
 5   Sex            418 non-null    object 
 6   Ascites        312 non-null    object 
 7   Hepatomegaly   312 non-null    object 
 8   Spiders        312 non-null    object 
 9   Edema          418 non-null    object 
 10  Bilirubin      418 non-null    float64
 11  Cholesterol    284 non-null    float64
 12  Albumin        418 non-null    float64
 13  Copper         310 non-null    float64
 14  Alk_Phos       312 non-null    float64
 15  SGOT           312 non-null    float64
 16  Tryglicerides  282 non-null    float64
 17  Platelets      407 non-null    float64
 18  Prothrombi

In [3]:
df['Status'].value_counts()


Status
C     232
D     161
CL     25
Name: count, dtype: int64

In [4]:
# Convert target to binary
df['Status'] = df['Status'].replace({'D': 1, 'C': 0, 'CL': 0})

# Drop ID since it's useless for prediction
df = df.drop(columns=['ID'])

# Check conversion
df['Status'].value_counts()


  df['Status'] = df['Status'].replace({'D': 1, 'C': 0, 'CL': 0})


Status
0    257
1    161
Name: count, dtype: int64

In [5]:
df.isna().sum()


N_Days             0
Status             0
Drug             106
Age                0
Sex                0
Ascites          106
Hepatomegaly     106
Spiders          106
Edema              0
Bilirubin          0
Cholesterol      134
Albumin            0
Copper           108
Alk_Phos         106
SGOT             106
Tryglicerides    136
Platelets         11
Prothrombin        2
Stage              6
dtype: int64

In [6]:
import numpy as np

# 1. Convert Age from days to years
df['Age'] = df['Age'] / 365.0

# 2. Separate columns by type
cat_cols = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']
num_cols = [c for c in df.columns if c not in cat_cols + ['Status']]

# 3. Fill missing numeric with median
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# 4. Fill missing categorical with mode
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Verify no NaNs left
print(df.isna().sum().sum(), "missing values remain")

0 missing values remain


In [7]:
from sklearn.preprocessing import OneHotEncoder

# Columns to encode
cat_cols = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']

# One-hot encode
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Check shape and first few columns
print("New shape:", df_encoded.shape)
df_encoded.head()


New shape: (418, 20)


Unnamed: 0,N_Days,Status,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Drug_Placebo,Sex_M,Ascites_Y,Hepatomegaly_Y,Spiders_Y,Edema_S,Edema_Y
0,400,1,58.805479,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0,False,False,True,True,True,False,True
1,4500,0,56.484932,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0,False,False,False,True,True,False,False
2,1012,1,70.120548,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0,False,True,False,False,False,True,False
3,1925,1,54.778082,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0,False,False,False,True,True,True,False
4,1504,0,38.131507,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0,True,False,False,True,True,False,False


In [8]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target (y)
X = df_encoded.drop(columns=["Status"])
y = df_encoded["Status"]

# Split into 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (334, 19)
Test shape: (84, 19)


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)


Accuracy: 0.8571428571428571


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
import numpy as np

# Get feature names and coefficients
feature_importance = pd.DataFrame({
    "Feature": X_train.columns,
    "Coefficient": model.coef_[0]
})

# Sort by absolute value (strongest effects first)
feature_importance["Abs_Coeff"] = np.abs(feature_importance["Coefficient"])
feature_importance = feature_importance.sort_values("Abs_Coeff", ascending=False)

feature_importance.head(15)


Unnamed: 0,Feature,Coefficient,Abs_Coeff
4,Albumin,-0.950948,0.950948
14,Ascites_Y,0.623808,0.623808
13,Sex_M,0.623073,0.623073
16,Spiders_Y,0.402541,0.402541
18,Edema_Y,0.374077,0.374077
17,Edema_S,0.17109,0.17109
10,Prothrombin,0.168495,0.168495
12,Drug_Placebo,0.155975,0.155975
2,Bilirubin,0.153985,0.153985
15,Hepatomegaly_Y,0.042394,0.042394


In [11]:
import joblib

# X was df_encoded.drop(columns=["Status"])
feature_order = X.columns.tolist()

joblib.dump(model, "model.joblib")
joblib.dump(feature_order, "feature_order.joblib")

print("saved:", "model.joblib", "feature_order.joblib")


saved: model.joblib feature_order.joblib


In [12]:
joblib.load("feature_order.joblib")


['N_Days',
 'Age',
 'Bilirubin',
 'Cholesterol',
 'Albumin',
 'Copper',
 'Alk_Phos',
 'SGOT',
 'Tryglicerides',
 'Platelets',
 'Prothrombin',
 'Stage',
 'Drug_Placebo',
 'Sex_M',
 'Ascites_Y',
 'Hepatomegaly_Y',
 'Spiders_Y',
 'Edema_S',
 'Edema_Y']