In [205]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, r2_score, confusion_matrix
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.svm import SVC, SVR
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.preprocessing import OneHotEncoder
import time

In [206]:
# Helper Functions
def load_csv(path):
    df = pd.read_csv(path)
    return df

def load_excel(path):
    df = pd.read_excel(path)
    return df

def split(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

def scale(X_train, X_test):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test

def one_hot_encode(df,column):
    encoder = OneHotEncoder(sparse_output=False)
    encoder_array = encoder.fit_transform(df[[column]])
    encoder_df = pd.DataFrame(encoder_array, columns=encoder.get_feature_names_out([column]))
    df_encoded = pd.concat([df,encoder_df], axis=1).drop(columns=column)
    return df_encoded

def logistic_regression(X_train, y_train, X_test, y_test):
    predictor = LogisticRegression()
    start_time = time.time()
    predictor.fit(X_train, y_train)
    end_time = time.time()
    runtime = end_time - start_time
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    cf = confusion_matrix(y_test, y_pred)
    return accuracy, cf, runtime

def support_vector_classifier(X_train, y_train, X_test, y_test, kernel='rbf'):
    predictor = SVC(kernel=kernel)
    start_time = time.time()
    predictor.fit(X_train, y_train)
    end_time = time.time()
    runtime = end_time - start_time
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    cf = confusion_matrix(y_test, y_pred)
    return accuracy, cf, runtime

def support_vector_regressor(X_train, y_train, X_test, y_test, kernel='rbf'):
    predictor = SVR(kernel=kernel)
    start_time = time.time()
    predictor.fit(X_train, y_train)
    end_time = time.time()
    runtime = end_time - start_time
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test,y_pred)
    return r2, runtime

def decision_tree_classifier(X_train, y_train, X_test, y_test):
    predictor = DecisionTreeClassifier()
    start_time = time.time()
    predictor.fit(X_train, y_train)
    end_time = time.time()
    runtime = end_time - start_time
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    cf = confusion_matrix(y_test, y_pred)
    return accuracy, cf, runtime

def decision_tree_regressor(X_train, y_train, X_test, y_test):
    predictor = DecisionTreeRegressor()
    start_time = time.time()
    predictor.fit(X_train, y_train)
    end_time = time.time()
    runtime = end_time - start_time
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test,y_pred)
    return r2, runtime

def linear_regression(X_train, y_train, X_test, y_test):
    predictor = LinearRegression()
    start_time = time.time()
    predictor.fit(X_train, y_train)
    end_time = time.time()
    runtime = end_time - start_time
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test,y_pred)
    return r2, runtime

def adaboost_classifier(X_train, y_train, X_test, y_test, n_estimators=200):
    predictor = AdaBoostClassifier(n_estimators=n_estimators, random_state=42)
    start_time = time.time()
    predictor.fit(X_train, y_train)
    end_time = time.time()
    runtime = end_time - start_time
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    cf = confusion_matrix(y_test, y_pred)
    return accuracy, cf, runtime

def xgboost_classifier(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    predictor = XGBClassifier(n_estimators=n_estimators, random_state=random_state)
    start_time = time.time()
    predictor.fit(X_train, y_train)
    end_time = time.time()
    runtime = end_time - start_time
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    cf = confusion_matrix(y_test, y_pred)
    return accuracy, cf, runtime

def catboost_classifier(X_train, y_train, X_test, y_test, iterations=200):
    predictor = CatBoostClassifier(iterations=iterations, verbose=0, random_state=42)
    start_time = time.time()
    predictor.fit(X_train, y_train)
    end_time = time.time()
    runtime = end_time - start_time
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    cf = confusion_matrix(y_test, y_pred)
    return accuracy, cf, runtime 

def adaboost_regressor(X_train, y_train, X_test, y_test, n_estimators=200):
    predictor = AdaBoostRegressor(n_estimators=n_estimators, random_state=42)
    start_time = time.time()
    predictor.fit(X_train, y_train)
    end_time = time.time()
    runtime = end_time - start_time
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test,y_pred)
    return r2, runtime 

def xgboost_regressor(X_train, y_train, X_test, y_test, n_estimators=200):
    predictor = XGBRegressor(n_estimators=n_estimators, random_state=42)
    start_time = time.time()
    predictor.fit(X_train, y_train)
    end_time = time.time()
    runtime = end_time - start_time
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test,y_pred)
    return r2, runtime

def catboost_regressor(X_train, y_train, X_test, y_test, iterations=200):
    predictor = CatBoostRegressor(iterations=iterations, verbose=0, random_state=42)
    start_time = time.time()
    predictor.fit(X_train, y_train)
    end_time = time.time()
    runtime = end_time - start_time
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test,y_pred)
    return r2, runtime

In [207]:
import os

path = r"G:\Study\iit kharagpur\ML Lab\ML_CLASS\datasets\hepatitis"
data_path = os.path.join(path,"hepatitis.data")
names_path = os.path.join(path, "hepatitis.names")

### Pre-Processing


In [208]:
columns = ['Class','AGE','SEX','STEROID','ANTIVIRALS','FATIGUE',"MALAISE","ANOREXIA",'LIVER BIG',"LIVER FIRM","SPLEEN PALPABLE","SPIDERS","ASCITIS","VARICES","BILURUBIN","ALK PHOSPHATE","SGOT","ALBUMIN",'PROTIME','HISTOLOGY']
data = pd.read_csv(data_path, header=None, na_values='?')
data = pd.DataFrame(data.values, columns=columns)
data

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITIS,VARICES,BILURUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,2.0,30.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,,1.0
1,2.0,50.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,,1.0
2,2.0,78.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,96.0,32.0,4.0,,1.0
3,2.0,31.0,1.0,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,46.0,52.0,4.0,80.0,1.0
4,2.0,34.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,,200.0,4.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,1.0,46.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,7.6,,242.0,3.3,50.0,2.0
151,2.0,44.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,0.9,126.0,142.0,4.3,,2.0
152,2.0,61.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,0.8,75.0,20.0,4.1,,2.0
153,2.0,53.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,1.5,81.0,19.0,4.1,48.0,2.0


In [209]:
data.isna().sum()

Class               0
AGE                 0
SEX                 0
STEROID             1
ANTIVIRALS          0
FATIGUE             1
MALAISE             1
ANOREXIA            1
LIVER BIG          10
LIVER FIRM         11
SPLEEN PALPABLE     5
SPIDERS             5
ASCITIS             5
VARICES             5
BILURUBIN           6
ALK PHOSPHATE      29
SGOT                4
ALBUMIN            16
PROTIME            67
HISTOLOGY           0
dtype: int64

In [210]:
impute_zero_columns = ['STEROID','ANTIVIRALS','FATIGUE',"MALAISE","ANOREXIA",'LIVER BIG',"LIVER FIRM","SPLEEN PALPABLE","SPIDERS","ASCITIS","VARICES"]
impute_mean_columns = ["BILURUBIN","ALK PHOSPHATE","SGOT","ALBUMIN",'PROTIME','HISTOLOGY']

for column in impute_zero_columns:
    data[column] = data[column].fillna(1.0)

for column in impute_mean_columns:
    mean = data[column].mean()
    data[column] = data[column].fillna(np.round(mean))

data

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITIS,VARICES,BILURUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,2.0,30.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,62.0,1.0
1,2.0,50.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,62.0,1.0
2,2.0,78.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,96.0,32.0,4.0,62.0,1.0
3,2.0,31.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,46.0,52.0,4.0,80.0,1.0
4,2.0,34.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,105.0,200.0,4.0,62.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,1.0,46.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,7.6,105.0,242.0,3.3,50.0,2.0
151,2.0,44.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,0.9,126.0,142.0,4.3,62.0,2.0
152,2.0,61.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,0.8,75.0,20.0,4.1,62.0,2.0
153,2.0,53.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,1.5,81.0,19.0,4.1,48.0,2.0


In [211]:
mapping_columns = ['Class','SEX','STEROID','ANTIVIRALS','FATIGUE',"MALAISE","ANOREXIA",'LIVER BIG',"LIVER FIRM","SPLEEN PALPABLE","SPIDERS","ASCITIS","VARICES","HISTOLOGY"]
mapping = {1:0, 2:1}

for column in mapping_columns:
    data[column] = data[column].map(mapping)

data

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITIS,VARICES,BILURUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,1,30.0,1,0,1,1,1,1,0,1,1,1,1,1,1.0,85.0,18.0,4.0,62.0,0
1,1,50.0,0,0,1,0,1,1,0,1,1,1,1,1,0.9,135.0,42.0,3.5,62.0,0
2,1,78.0,0,1,1,0,1,1,1,1,1,1,1,1,0.7,96.0,32.0,4.0,62.0,0
3,1,31.0,0,0,0,1,1,1,1,1,1,1,1,1,0.7,46.0,52.0,4.0,80.0,0
4,1,34.0,0,1,1,1,1,1,1,1,1,1,1,1,1.0,105.0,200.0,4.0,62.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,0,46.0,0,1,1,0,0,0,1,1,1,0,0,0,7.6,105.0,242.0,3.3,50.0,1
151,1,44.0,0,1,1,0,1,1,1,0,1,1,1,1,0.9,126.0,142.0,4.3,62.0,1
152,1,61.0,0,0,1,0,0,1,0,0,1,0,1,1,0.8,75.0,20.0,4.1,62.0,1
153,1,53.0,1,0,1,0,1,1,1,1,0,0,1,0,1.5,81.0,19.0,4.1,48.0,1


In [212]:
y = data.iloc[:,0]
X = data.iloc[:,1:]

In [213]:
# Using PCA using only top 10 features
from sklearn.decomposition import PCA

X = X.apply(pd.to_numeric, errors='coerce')
data_numeric = X.select_dtypes(include=['number']).dropna()

n_components=10
pca = PCA(n_components=n_components)
pca_features = pca.fit_transform(data_numeric)

X = pd.DataFrame(pca_features, columns=[f"PC{i+1}" for i in range(n_components)])
X

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
0,-70.132235,-10.590926,-0.262132,-10.768030,-0.284089,0.164592,-0.073675,0.132659,-0.811232,-0.196641
1,-39.295008,35.425949,-0.843336,9.415212,-0.666998,0.112084,0.379764,0.202799,-0.623386,-0.225446
2,-54.206310,-1.803050,-10.207458,36.030289,-1.122010,0.859563,0.321243,0.350930,0.530471,-0.193606
3,-42.333273,-55.068634,15.021105,-6.595584,-0.282638,0.122723,0.404363,-0.191617,-0.622263,-0.172430
4,112.842827,-16.066463,4.295127,-7.809548,-0.958841,0.973799,0.124479,-0.210950,0.181775,-0.267943
...,...,...,...,...,...,...,...,...,...,...
150,154.952182,-21.070229,-9.283650,0.868141,5.653286,1.302991,0.027016,0.340282,0.558740,0.237579
151,58.411098,12.694496,2.153383,2.711397,-0.916137,0.333276,-0.493091,-0.088384,0.216852,0.292879
152,-69.154191,-20.827880,-8.052709,19.226356,-0.302750,-1.146790,-0.562242,0.285312,-0.267863,0.651135
153,-68.997638,-13.781682,-19.414723,8.337793,0.194196,-0.119965,-0.372229,-0.196322,0.082110,0.499393


In [214]:
X_train, X_test, y_train, y_test = split(X,y)
X_train, X_test= scale(X_train, X_test)

In [215]:
X_train.shape

(124, 10)

In [216]:
# Logistic Regression
acc_lr, cf_lr, runtime = logistic_regression(X_train, y_train, X_test, y_test)
print(f"The accuracy of Logistic Regression is {acc_lr*100:.2f}")
print(f"The runtime is {runtime:.2f}")

# Decision Tree
acc_dt, cf_dt, runtime = decision_tree_classifier(X_train, y_train, X_test, y_test)
print(f"The accuracy of Decision Tree is {acc_dt*100:.2f}")
print(f"The runtime is {runtime:.2f}")

# Support Vector Machine
acc_svm, cf_svm, runtime = support_vector_classifier(X_train, y_train, X_test, y_test)
print(f"The accuracy of Support Vector Classifier is {acc_svm*100:.2f}")
print(f"The runtime is {runtime:.2f}")

# AdaBoost
acc_ada, cf_ada, runtime = adaboost_classifier(X_train, y_train, X_test, y_test)
print(f"The accuracy of AdaBoost is {acc_ada*100:.2f}")
print(f"The runtime is {runtime:.2f}")

# XGBoost
acc_xg, cf_xg, runtime = xgboost_classifier(X_train, y_train, X_test, y_test)
print(f"The accuracy of XGBoost is {acc_xg*100:.2f}")
print(f"The runtime is {runtime:.2f}")

#CatBoost
acc_cat, cf_cat, runtime = catboost_classifier(X_train, y_train, X_test, y_test)
print(f"The accuracy of CatBoost is {acc_cat*100:.2f}")
print(f"The runtime is {runtime:.2f}")

The accuracy of Logistic Regression is 83.87
The runtime is 0.00
The accuracy of Decision Tree is 80.65
The runtime is 0.00
The accuracy of Support Vector Classifier is 74.19
The runtime is 0.00
The accuracy of AdaBoost is 67.74
The runtime is 0.31
The accuracy of XGBoost is 77.42
The runtime is 0.08
The accuracy of CatBoost is 80.65
The runtime is 0.42


In [218]:
confusion_matrices = [cf_lr, cf_dt, cf_xg, cf_cat]
for i in confusion_matrices:
    print(i)
    print('\n')

[[ 3  4]
 [ 1 23]]


[[ 3  4]
 [ 2 22]]


[[ 2  5]
 [ 2 22]]


[[ 1  6]
 [ 0 24]]


