In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


In [41]:
column_names = [
    'Class', 'AGE', 'SEX', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE',
    'ANOREXIA', 'LIVER_BIG', 'LIVER_FIRM', 'SPLEEN_PALPABLE', 'SPIDERS',
    'ASCITES', 'VARICES', 'BILIRUBIN', 'ALK_PHOSPHATE', 'SGOT', 'ALBUMIN',
    'PROTIME', 'HISTOLOGY'
]

df=pd.read_csv('./DSBDALExam DataSets/Hepatitis/hepatitis.csv',header=None,names=column_names)
df

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER_BIG,LIVER_FIRM,SPLEEN_PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK_PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,2,30,2,1,2,2,2,2,1,2,2,2,2,2,1.00,85,18,4.0,?,1
1,2,50,1,1,2,1,2,2,1,2,2,2,2,2,0.90,135,42,3.5,?,1
2,2,78,1,2,2,1,2,2,2,2,2,2,2,2,0.70,96,32,4.0,?,1
3,2,31,1,?,1,2,2,2,2,2,2,2,2,2,0.70,46,52,4.0,80,1
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,1.00,?,200,4.0,?,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,1,46,1,2,2,1,1,1,2,2,2,1,1,1,7.60,?,242,3.3,50,2
151,2,44,1,2,2,1,2,2,2,1,2,2,2,2,0.90,126,142,4.3,?,2
152,2,61,1,1,2,1,1,2,1,1,2,1,2,2,0.80,75,20,4.1,?,2
153,2,53,2,1,2,1,2,2,2,2,1,1,2,1,1.50,81,19,4.1,48,2


In [42]:
#Data Cleaning
df.replace('?',np.nan,inplace=True)
df.dropna(inplace=True)
df=df.apply(pd.to_numeric)

for column in df.columns:
    df=df[df[column]>=0]
df_cleaned=df
df_cleaned


Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER_BIG,LIVER_FIRM,SPLEEN_PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK_PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
5,2,34,1,2,2,2,2,2,2,2,2,2,2,2,0.9,95,28,4.0,75,1
10,2,39,1,1,1,2,2,2,1,1,2,2,2,2,1.3,78,30,4.4,85,1
11,2,32,1,2,1,1,2,2,2,1,2,1,2,2,1.0,59,249,3.7,54,1
12,2,41,1,2,1,1,2,2,2,1,2,2,2,2,0.9,81,60,3.9,52,1
13,2,30,1,2,2,1,2,2,2,1,2,2,2,2,2.2,57,144,4.9,78,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,2,45,1,2,1,2,2,2,2,2,2,2,2,2,1.3,85,44,4.2,85,2
143,1,49,1,1,2,1,1,2,2,2,1,1,2,2,1.4,85,70,3.5,35,2
145,2,31,1,1,2,1,2,2,2,2,2,2,2,2,1.2,75,173,4.2,54,2
153,2,53,2,1,2,1,2,2,2,2,1,1,2,1,1.5,81,19,4.1,48,2


In [43]:
from scipy.stats import zscore
numerical_cols = ['AGE', 'BILIRUBIN', 'ALK_PHOSPHATE', 'SGOT', 'ALBUMIN', 'PROTIME']
z_scores = np.abs(zscore(df[numerical_cols]))
df_cleaned = df[(z_scores < 3).all(axis=1)]


In [37]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

df_transformed = df_cleaned.copy()
le = LabelEncoder()

# Encode categorical columns
categorical_cols = ['SEX', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE',
                    'ANOREXIA', 'LIVER_BIG', 'LIVER_FIRM', 'SPLEEN_PALPABLE',
                    'SPIDERS', 'ASCITES', 'VARICES', 'HISTOLOGY', 'Class']

for col in categorical_cols:
    df_transformed[col] = le.fit_transform(df_transformed[col])

# Scale numerical features
scaler = StandardScaler()
df_transformed[numerical_cols] = scaler.fit_transform(df_transformed[numerical_cols])


In [44]:
X=df_transformed.drop('Class',axis=1)
y=df_transformed['Class']


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


#Splitting the dataset into training and testing sets
X_train,X_test,y_train,y_test=train_test_split(X_scaled,y,test_size=0.2,random_state=42)

# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

#Naive Bayes
nb=GaussianNB()
nb.fit(X_train,y_train)
y_pred_nb=nb.predict(X_test)

# Accuracy and classification reports
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))

print("\n--- Logistic Regression Report ---\n", classification_report(y_test, y_pred_lr))
print("\n--- Naive Bayes Report ---\n", classification_report(y_test, y_pred_nb))


Logistic Regression Accuracy: 0.8
Naive Bayes Accuracy: 0.9333333333333333

--- Logistic Regression Report ---
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.80      1.00      0.89        12

    accuracy                           0.80        15
   macro avg       0.40      0.50      0.44        15
weighted avg       0.64      0.80      0.71        15


--- Naive Bayes Report ---
               precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       1.00      0.92      0.96        12

    accuracy                           0.93        15
   macro avg       0.88      0.96      0.91        15
weighted avg       0.95      0.93      0.94        15



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



🧠 1. When and Why to Use z-score (Outlier Removal)
✔ Use z-score when:
You're working with numerical columns that are normally distributed or close to it.
You want to remove extreme values (outliers) that could negatively impact model training.
In Heart Disease, we applied z-score to all numerical columns because:
Most columns were already numeric.
Outliers (like very high blood pressure or cholesterol) can skew predictions.
🔎 z-score applies only to numerical columns — not to categorical ones.



🧠 2. When to Use Label Encoding
✔ Use Label Encoding when:
You have categorical data (like 'male', 'female', 'yes', 'no', etc.).
You’re using models that require numerical input (like Logistic Regression, KNN, Naive Bayes).
In Heart Disease, we didn’t use Label Encoding because:
    The dataset was already numeric, even for categorical-looking columns (e.g., sex was already 0/1, cp was 1–4).
    No string values needed encoding.

In Hepatitis, many features are still strings or marked with '?', so we must:
Replace '?' with NaN
Use LabelEncoder to convert categories to numbers.
🧠 If the dataset is already encoded, skip label encoding!


🧠 3. When to Use StandardScaler (Feature Scaling)
✔ Use StandardScaler when:
You're using algorithms that are sensitive to feature scale: KNN, Logistic Regression, SVM, PCA, etc.
You want features to have mean = 0 and std = 1 for better convergence and accuracy.
In Heart Disease, we scaled all numerical features because:
KNN is distance-based and needs scaled input.
Logistic Regression also benefits from scaling.
In Hepatitis, same logic applies: if your features vary a lot (e.g., age vs. liver enzyme values), scaling is essential.


🧠 Pro Tip:
If values are already 0/1, no need for label encoding.
If you're using tree-based models (e.g., Random Forest), scaling is not necessary.
For regression/KNN, always scale numerical data