In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from scipy.stats import zscore

# Load the hepatitis dataset from the CSV file in the working directory.
df = pd.read_csv('hepatitis_csv.csv')

# Report the number of missing entries in each column.
print(df.isna().sum())

age                 0
sex                 0
steroid             1
antivirals          0
fatigue             1
malaise             1
anorexia            1
liver_big          10
liver_firm         11
spleen_palpable     5
spiders             5
ascites             5
varices             5
bilirubin           6
alk_phosphate      29
sgot                4
albumin            16
protime            67
histology           0
class               0
dtype: int64


In [3]:
# Show the (rows, columns) dimensions of the frame at this point in the notebook.
df.shape

(155, 20)

In [7]:
# --- q. Data Cleaning ---
# Strip surrounding whitespace from string cells FIRST, so that padded
# placeholders such as ' ? ' are also recognised as missing below.
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].map(lambda x: x.strip() if isinstance(x, str) else x)

# Treat '?' as missing, then drop every row that has any missing value.
# Rebinding (instead of inplace=True) plus .copy() gives an independent frame,
# so the column assignments below do not write into a slice.
df = df.replace('?', np.nan).dropna().copy()

# Convert each column to a numeric dtype where possible (single pass).
for col in df.columns:
    try:
        df[col] = pd.to_numeric(df[col])
    except (ValueError, TypeError):
        # Column holds genuinely non-numeric values; leave it unchanged so it
        # can be label-encoded later.
        pass

# Keep only rows whose numeric columns are all non-negative. An empty set of
# numeric columns yields an all-True mask, i.e. no rows are removed.
non_negative = (df[df.select_dtypes(include=[np.number]).columns] >= 0).all(axis=1)
df = df[non_negative].copy()

df.shape

(80, 20)

In [8]:
# --- r. Error Correction: Remove Outliers using Z-score ---
# A row is an outlier when any numeric feature lies at least this many
# standard deviations from that feature's mean.
Z_THRESHOLD = 3

numeric_cols = df.select_dtypes(include=[np.number]).columns
z_scores = np.abs(zscore(df[numeric_cols]))

# Flag outliers with `>=` and negate, rather than keeping rows with `< 3`:
# a zero-variance column produces NaN z-scores, and `NaN < 3` is False, which
# would silently discard every row. NaN is never `>=` the threshold, so such
# columns simply do not contribute to outlier detection.
is_outlier = np.asarray(z_scores >= Z_THRESHOLD)
df = df[~is_outlier.any(axis=1)].copy()

df.shape

(74, 20)

In [9]:
# --- s. Data Transformation ---
# Resolve the target column case-insensitively. The loaded frame names it
# `class` (lowercase), so a hard-coded 'Class' lookup raises KeyError when the
# notebook is run from a fresh kernel.
target_col = next((c for c in df.columns if str(c).lower() == 'class'), None)
if target_col is None:
    raise KeyError("Target column 'class' not found in the DataFrame")

# Work on an independent copy so the column assignments below never write
# into a slice of an earlier frame (avoids SettingWithCopyWarning).
df = df.copy()

# Encode categorical columns (if any); keep the fitted encoders so the integer
# codes can be mapped back to the original labels later.
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Separate features and target
X = df.drop(columns=target_col)
y = df[target_col]

# Split BEFORE scaling so that test-set statistics cannot leak into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training portion only, then apply the same
# transformation to the test portion.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with any later cell that reads X_scaled:
# the full feature matrix expressed in the training-set scale.
X_scaled = scaler.transform(X)


In [10]:
# --- t. Model Building ---

def _fit_and_score(estimator):
    """Fit `estimator` on the training split and score it on the test split.

    Returns a `(predictions, accuracy_percent)` tuple, where the accuracy is
    `accuracy_score` on the test split multiplied by 100.
    """
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    return predictions, accuracy_score(y_test, predictions) * 100


# Logistic Regression
log_model = LogisticRegression(max_iter=200)
log_pred, log_acc = _fit_and_score(log_model)

# Naive Bayes
nb_model = GaussianNB()
nb_pred, nb_acc = _fit_and_score(nb_model)


In [11]:
# --- Results ---
# Print each model's test accuracy as a percentage with two decimals.
print("=== Hepatitis Dataset Classification ===")
print(f"Logistic Regression Accuracy: {log_acc:.2f}%")
print(f"Naive Bayes Accuracy: {nb_acc:.2f}%")

# Pick the verdict line; comparisons are evaluated in the same order as an
# if / elif / else chain, with the tie message as the fallback.
verdict = (
    "→ Logistic Regression performed better." if log_acc > nb_acc
    else "→ Naive Bayes performed better." if nb_acc > log_acc
    else "→ Both models performed equally."
)
print(verdict)


=== Hepatitis Dataset Classification ===
Logistic Regression Accuracy: 80.00%
Naive Bayes Accuracy: 93.33%
→ Naive Bayes performed better.
