In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the hepatitis dataset from a CSV file located next to the notebook.
# NOTE(review): the preview below shows an 'Unnamed: 0' column holding 0, 1, 2, ...,
# which looks like a row index written out with the CSV — consider
# `index_col=0` here so it is not treated as a data column. TODO confirm.
df = pd.read_csv('hepatitis_csv.csv')

In [3]:
# Preview the first rows to sanity-check column names, dtypes and missing values.
df.head()

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85.0,18.0,4.0,,False,live
1,50,female,False,False,True,False,False,False,False,False,False,False,False,0.9,135.0,42.0,3.5,,False,live
2,78,female,True,False,True,False,False,True,False,False,False,False,False,0.7,96.0,32.0,4.0,,False,live
3,31,female,,True,False,False,False,True,False,False,False,False,False,0.7,46.0,52.0,4.0,80.0,False,live
4,34,female,True,False,False,False,False,True,False,False,False,False,False,1.0,,200.0,4.0,,False,live


In [4]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(155, 20)

In [5]:
# Lab-measurement columns that must be numeric; every other column is
# boolean/categorical and is left as loaded.
numeric_cols = ['bilirubin', 'alk_phosphate', 'sgot', 'albumin', 'protime']

# Work on a new frame instead of mutating `df` in place, so the frame loaded
# from the CSV stays intact and this cell is safe to re-run.
# '?' is treated as a missing-value placeholder and normalised to NaN.
df_typed = df.replace('?', np.nan)

# Coerce only the lab columns to numeric; unparseable entries become NaN.
for col in numeric_cols:
    df_typed[col] = pd.to_numeric(df_typed[col], errors='coerce')

# Remove rows with negative values in the lab columns.
# NaN compares False against `>= 0`, so a plain `>= 0` mask would also throw
# away every row with a missing lab value. Missing values are let through
# explicitly here and handled by the dedicated step below.
lab_values = df_typed[numeric_cols]
is_non_negative = (lab_values.isna() | (lab_values >= 0)).all(axis=1)
df_valid = df_typed[is_non_negative]

# Drop or impute missing values (drop for simplicity).
# `.copy()` makes df_cleaned an independent frame rather than a slice.
df_cleaned = df_valid.dropna().copy()

In [6]:
# Check how many rows survived cleaning; dropping every row with any missing
# value is lossy, so compare this against the shape of the loaded frame.
df_cleaned.shape

(80, 20)

In [7]:
def remove_outliers_iqr(df, columns):
    """Filter out rows lying outside the 1.5 * IQR fences of the given columns.

    Columns are handled one after another, and the quartiles of each column
    are computed on the rows that survived the filters of the previous
    columns. The result therefore depends on the order of `columns`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    columns : iterable of str
        Names of the numeric columns to check, in processing order.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` whose values lie within
        [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] (bounds inclusive) for every column.
        Rows with NaN in a checked column are removed as well.
    """
    kept = df
    for name in columns:
        values = kept[name]
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        fence = 1.5 * (q3 - q1)
        # `between` is inclusive on both ends by default.
        kept = kept[values.between(q1 - fence, q3 + fence)]
    return kept

# Apply the IQR filter to the continuous lab measurements of the cleaned frame.
df_no_outliers = remove_outliers_iqr(
    df=df_cleaned,
    columns=['bilirubin', 'alk_phosphate', 'sgot', 'albumin', 'protime'],
)

In [8]:
# Check how many rows remain after outlier removal.
df_no_outliers.shape

(54, 20)

In [9]:
from sklearn.preprocessing import LabelEncoder

# Encode 'sex', 'class' and other binary/categorical columns
label_cols = ['sex', 'steroid', 'antivirals', 'fatigue', 'malaise', 'anorexia', 'liver_big', 
              'liver_firm', 'spleen_palpable', 'spiders', 'ascites', 'varices', 'histology', 'class']

# df_no_outliers is the result of boolean filtering, i.e. a slice of another
# frame. Assigning columns on such a slice raises SettingWithCopyWarning and
# has ambiguous write semantics, so take an explicit, independent copy first.
df_no_outliers = df_no_outliers.copy()

for col in label_cols:
    # Cast to str so booleans and strings go through the same path; LabelEncoder
    # assigns integer codes following the sorted order of the distinct values.
    df_no_outliers[col] = LabelEncoder().fit_transform(df_no_outliers[col].astype(str))


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Feature selection
# Besides the target, drop 'Unnamed: 0': it is the row number that came along
# with the CSV, not a clinical measurement, and must not be fed to the models.
# errors='ignore' keeps this cell working if that column is already absent.
X = df_no_outliers.drop(columns='class').drop(columns='Unnamed: 0', errors='ignore')
y = df_no_outliers['class']  # 0 = die, 1 = live after encoding

# Split (stratified so both classes appear in train and test; fixed seed for
# reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# Logistic Regression
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
logreg_acc = accuracy_score(y_test, y_pred_logreg)

# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
nb_acc = accuracy_score(y_test, y_pred_nb)

# Compare
# NOTE: the test split is very small, so these numbers are only a rough
# indication. zero_division=0 reports 0 instead of emitting a warning when a
# class receives no predictions, which is likely with so few minority samples.
print("Logistic Regression Accuracy:", logreg_acc)
print("Naive Bayes Accuracy:", nb_acc)
print("\nClassification Report (LogReg):\n", classification_report(y_test, y_pred_logreg, zero_division=0))
print("\nClassification Report (Naive Bayes):\n", classification_report(y_test, y_pred_nb, zero_division=0))


Logistic Regression Accuracy: 0.8181818181818182
Naive Bayes Accuracy: 0.8181818181818182

Classification Report (LogReg):
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.90      0.90      0.90        10

    accuracy                           0.82        11
   macro avg       0.45      0.45      0.45        11
weighted avg       0.82      0.82      0.82        11


Classification Report (Naive Bayes):
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.90      0.90      0.90        10

    accuracy                           0.82        11
   macro avg       0.45      0.45      0.45        11
weighted avg       0.82      0.82      0.82        11

