In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, r2_score, confusion_matrix
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.svm import SVC, SVR
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.preprocessing import OneHotEncoder

In [32]:
# Helper Functions
def load_csv(path):
    """Read the CSV file at ``path`` and return it as a pandas DataFrame.

    The file is parsed with the default ``pd.read_csv`` settings.
    """
    return pd.read_csv(path)

def load_excel(path):
    """Read the Excel workbook at ``path`` and return it as a pandas DataFrame.

    The file is parsed with the default ``pd.read_excel`` settings.
    """
    return pd.read_excel(path)

def split(X, y, test_size=0.2, random_state=42, stratify=None):
    """Split features and target into train and test partitions.

    Parameters
    ----------
    X, y : features and target, forwarded to ``train_test_split``.
    test_size : forwarded to ``train_test_split`` (default 0.2, the value
        this helper previously hard-coded).
    random_state : seed forwarded to ``train_test_split`` (default 42, the
        value this helper previously hard-coded).
    stratify : forwarded to ``train_test_split``; the default ``None`` keeps
        the previous, non-stratified behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    return X_train, X_test, y_train, y_test

def scale(X_train, X_test):
    """Standardize both feature sets with a ``StandardScaler``.

    The scaler is fitted on ``X_train`` only; ``X_test`` is transformed with
    the statistics learned from the training data.

    Returns the transformed training set and test set, in that order.
    """
    standardizer = StandardScaler()
    train_scaled = standardizer.fit_transform(X_train)
    test_scaled = standardizer.transform(X_test)
    return train_scaled, test_scaled

def one_hot_encode(df,column):
    """Replace ``column`` in ``df`` with its one-hot encoded indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame containing ``column``.
    column : label of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``column`` removed and one indicator column per
        category appended (named by ``OneHotEncoder.get_feature_names_out``).
        ``df`` itself is not modified.
    """
    encoder = OneHotEncoder(sparse_output=False)
    encoder_array = encoder.fit_transform(df[[column]])
    encoder_df = pd.DataFrame(
        encoder_array,
        columns=encoder.get_feature_names_out([column]),
        # Reuse the caller's index: concat(axis=1) aligns on index labels, so a
        # fresh RangeIndex would misalign rows (and introduce NaNs) whenever
        # df does not carry the default 0..n-1 index.
        index=df.index,
    )
    df_encoded = pd.concat([df.drop(columns=column), encoder_df], axis=1)
    return df_encoded

def logistic_regression(X_train, y_train, X_test, y_test):
    """Fit a default ``LogisticRegression`` and score it on the test set.

    Returns the accuracy of the test-set predictions against ``y_test``.
    """
    model = LogisticRegression()
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))

def support_vector_classifier(X_train, y_train, X_test, y_test, kernel='rbf'):
    """Fit an ``SVC`` with the given ``kernel`` and score it on the test set.

    Returns the accuracy of the test-set predictions against ``y_test``.
    """
    model = SVC(kernel=kernel)
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))

def support_vector_regressor(X_train, y_train, X_test, y_test, kernel='rbf'):
    """Fit an ``SVR`` with the given ``kernel`` and score it on the test set.

    Returns the R^2 score of the test-set predictions against ``y_test``.
    """
    model = SVR(kernel=kernel)
    model.fit(X_train, y_train)
    return r2_score(y_test, model.predict(X_test))

def decision_tree_classifier(X_train, y_train, X_test, y_test, random_state=42):
    """Fit a ``DecisionTreeClassifier`` and score it on the test set.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels used for scoring.
    random_state : seed passed to the tree (default 42). Without a seed the
        fitted tree, and therefore the reported accuracy, can change from
        one run to the next.

    Returns
    -------
    The accuracy of the test-set predictions against ``y_test``.
    """
    predictor = DecisionTreeClassifier(random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

def decision_tree_regressor(X_train, y_train, X_test, y_test, random_state=42):
    """Fit a ``DecisionTreeRegressor`` and score it on the test set.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    X_test, y_test : held-out features and targets used for scoring.
    random_state : seed passed to the tree (default 42). Without a seed the
        fitted tree, and therefore the reported score, can change from one
        run to the next.

    Returns
    -------
    The R^2 score of the test-set predictions against ``y_test``.
    """
    predictor = DecisionTreeRegressor(random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2

def linear_regression(X_train, y_train, X_test, y_test):
    """Fit a default ``LinearRegression`` and score it on the test set.

    Returns the R^2 score of the test-set predictions against ``y_test``.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    return r2_score(y_test, model.predict(X_test))

def adaboost_classifier(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``AdaBoostClassifier`` and score it on the test set.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels used for scoring.
    n_estimators : number of boosting rounds passed to the model.
    random_state : seed passed to the model. Defaults to 42, the value that
        was previously hard-coded, and matches the ``random_state`` parameter
        already exposed by ``xgboost_classifier``.

    Returns
    -------
    The accuracy of the test-set predictions against ``y_test``.
    """
    predictor = AdaBoostClassifier(n_estimators=n_estimators, random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

def xgboost_classifier(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``XGBClassifier`` and score it on the test set.

    ``n_estimators`` and ``random_state`` are forwarded to the model.
    Returns the accuracy of the test-set predictions against ``y_test``.
    """
    model = XGBClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))

def catboost_classifier(X_train, y_train, X_test, y_test, iterations=200, random_state=42):
    """Fit a ``CatBoostClassifier`` (silent training) and score it on the test set.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels used for scoring.
    iterations : number of boosting iterations passed to the model.
    random_state : seed passed to the model. Defaults to 42, the value that
        was previously hard-coded, and matches the ``random_state`` parameter
        already exposed by ``xgboost_classifier``.

    Returns
    -------
    The accuracy of the test-set predictions against ``y_test``.
    """
    # verbose=0 suppresses CatBoost's per-iteration training log.
    predictor = CatBoostClassifier(iterations=iterations, verbose=0, random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

def adaboost_regressor(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``AdaBoostRegressor`` and score it on the test set.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    X_test, y_test : held-out features and targets used for scoring.
    n_estimators : number of boosting rounds passed to the model.
    random_state : seed passed to the model. Defaults to 42, the value that
        was previously hard-coded, and matches the ``random_state`` parameter
        already exposed by ``xgboost_classifier``.

    Returns
    -------
    The R^2 score of the test-set predictions against ``y_test``.
    """
    predictor = AdaBoostRegressor(n_estimators=n_estimators, random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2

def xgboost_regressor(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``XGBRegressor`` and score it on the test set.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    X_test, y_test : held-out features and targets used for scoring.
    n_estimators : number of boosting rounds passed to the model.
    random_state : seed passed to the model. Defaults to 42, the value that
        was previously hard-coded, and matches the ``random_state`` parameter
        already exposed by ``xgboost_classifier``.

    Returns
    -------
    The R^2 score of the test-set predictions against ``y_test``.
    """
    predictor = XGBRegressor(n_estimators=n_estimators, random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2

def catboost_regressor(X_train, y_train, X_test, y_test, iterations=200, random_state=42):
    """Fit a ``CatBoostRegressor`` (silent training) and score it on the test set.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    X_test, y_test : held-out features and targets used for scoring.
    iterations : number of boosting iterations passed to the model.
    random_state : seed passed to the model. Defaults to 42, the value that
        was previously hard-coded, and matches the ``random_state`` parameter
        already exposed by ``xgboost_classifier``.

    Returns
    -------
    The R^2 score of the test-set predictions against ``y_test``.
    """
    # verbose=0 suppresses CatBoost's per-iteration training log.
    predictor = CatBoostRegressor(iterations=iterations, verbose=0, random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2

In [22]:
import os

# Directory that holds the hepatitis dataset files. Set the HEPATITIS_DATA_DIR
# environment variable to point the notebook at a local copy; the original
# absolute path is kept as the fallback so the notebook behaves exactly as
# before when the variable is not set.
path = os.environ.get(
    "HEPATITIS_DATA_DIR",
    r"G:\Study\iit kharagpur\ML Lab\Lab 1\ML_Class_Tutorial\datasets\hepatitis",
)
data_path = os.path.join(path, "hepatitis.data")
names_path = os.path.join(path, "hepatitis.names")

### Pre-Processing


In [23]:
# Column labels for the header-less data file: the target ('Class') comes
# first, followed by the 19 attributes.
columns = [
    'Class', 'AGE', 'SEX', 'STEROID', 'ANTIVIRALS', 'FATIGUE',
    "MALAISE", "ANOREXIA", 'LIVER BIG', "LIVER FIRM", "SPLEEN PALPABLE",
    "SPIDERS", "ASCITIS", "VARICES", "BILURUBIN", "ALK PHOSPHATE",
    "SGOT", "ALBUMIN", 'PROTIME', 'HISTOLOGY',
]

# '?' entries are read as NaN; the frame is then rebuilt from the raw value
# array so that the labels above become the column names.
data = pd.DataFrame(
    pd.read_csv(data_path, header=None, na_values='?').values,
    columns=columns,
)
data

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITIS,VARICES,BILURUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,2.0,30.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,,1.0
1,2.0,50.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,,1.0
2,2.0,78.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,96.0,32.0,4.0,,1.0
3,2.0,31.0,1.0,,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,46.0,52.0,4.0,80.0,1.0
4,2.0,34.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,,200.0,4.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,1.0,46.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,7.6,,242.0,3.3,50.0,2.0
151,2.0,44.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,0.9,126.0,142.0,4.3,,2.0
152,2.0,61.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,0.8,75.0,20.0,4.1,,2.0
153,2.0,53.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,1.5,81.0,19.0,4.1,48.0,2.0


In [24]:
# Count the missing (NaN) entries in every column.
data.isna().sum()

Class               0
AGE                 0
SEX                 0
STEROID             1
ANTIVIRALS          0
FATIGUE             1
MALAISE             1
ANOREXIA            1
LIVER BIG          10
LIVER FIRM         11
SPLEEN PALPABLE     5
SPIDERS             5
ASCITIS             5
VARICES             5
BILURUBIN           6
ALK PHOSPHATE      29
SGOT                4
ALBUMIN            16
PROTIME            67
HISTOLOGY           0
dtype: int64

In [25]:
# Columns whose missing entries are filled with the raw code 1.0.
impute_zero_columns = [
    'STEROID', 'ANTIVIRALS', 'FATIGUE', "MALAISE", "ANOREXIA", 'LIVER BIG',
    "LIVER FIRM", "SPLEEN PALPABLE", "SPIDERS", "ASCITIS", "VARICES",
]
# Columns whose missing entries are filled with the column mean, rounded to
# the nearest whole number.
impute_mean_columns = [
    "BILURUBIN", "ALK PHOSPHATE", "SGOT", "ALBUMIN", 'PROTIME', 'HISTOLOGY',
]

# Build one {column: fill value} table and apply it in a single fillna call.
fill_values = {name: 1.0 for name in impute_zero_columns}
fill_values.update(
    {name: np.round(data[name].mean()) for name in impute_mean_columns}
)
data = data.fillna(value=fill_values)

data

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITIS,VARICES,BILURUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,2.0,30.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,62.0,1.0
1,2.0,50.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,62.0,1.0
2,2.0,78.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,96.0,32.0,4.0,62.0,1.0
3,2.0,31.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,46.0,52.0,4.0,80.0,1.0
4,2.0,34.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,105.0,200.0,4.0,62.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,1.0,46.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,7.6,105.0,242.0,3.3,50.0,2.0
151,2.0,44.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,0.9,126.0,142.0,4.3,62.0,2.0
152,2.0,61.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,0.8,75.0,20.0,4.1,62.0,2.0
153,2.0,53.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,1.5,81.0,19.0,4.1,48.0,2.0


In [26]:
# Binary columns stored with the raw codes 1 and 2; recode them to 0 and 1.
mapping_columns = [
    'Class', 'SEX', 'STEROID', 'ANTIVIRALS', 'FATIGUE', "MALAISE", "ANOREXIA",
    'LIVER BIG', "LIVER FIRM", "SPLEEN PALPABLE", "SPIDERS", "ASCITIS",
    "VARICES", "HISTOLOGY",
]
mapping = {1:0, 2:1}

for column in mapping_columns:
    # Series.map turns every value that is not a key of `mapping` into NaN
    # without any warning. That silently corrupts the data when this cell is
    # run a second time (0 -> NaN, 1 -> 0), so refuse to proceed instead.
    unexpected = set(data[column].dropna().unique()) - set(mapping)
    if unexpected:
        raise ValueError(
            f"Column {column!r} contains values {sorted(unexpected)} that are not "
            f"raw codes {sorted(mapping)}; it may already have been recoded. "
            "Re-run the notebook from the data-loading cell."
        )
    data[column] = data[column].map(mapping)

data

Unnamed: 0,Class,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITIS,VARICES,BILURUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,1,30.0,1,0,1,1,1,1,0,1,1,1,1,1,1.0,85.0,18.0,4.0,62.0,0
1,1,50.0,0,0,1,0,1,1,0,1,1,1,1,1,0.9,135.0,42.0,3.5,62.0,0
2,1,78.0,0,1,1,0,1,1,1,1,1,1,1,1,0.7,96.0,32.0,4.0,62.0,0
3,1,31.0,0,0,0,1,1,1,1,1,1,1,1,1,0.7,46.0,52.0,4.0,80.0,0
4,1,34.0,0,1,1,1,1,1,1,1,1,1,1,1,1.0,105.0,200.0,4.0,62.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,0,46.0,0,1,1,0,0,0,1,1,1,0,0,0,7.6,105.0,242.0,3.3,50.0,1
151,1,44.0,0,1,1,0,1,1,1,0,1,1,1,1,0.9,126.0,142.0,4.3,62.0,1
152,1,61.0,0,0,1,0,0,1,0,0,1,0,1,1,0.8,75.0,20.0,4.1,62.0,1
153,1,53.0,1,0,1,0,1,1,1,1,0,0,1,0,1.5,81.0,19.0,4.1,48.0,1


In [27]:
# Column 0 holds the target; every remaining column is a feature.
X, y = data.iloc[:, 1:], data.iloc[:, 0]

In [28]:
# Continuous features to standardize; the remaining columns are left as they are.
numerical_cols = [
    "AGE",
    "BILURUBIN",
    "ALK PHOSPHATE",
    "SGOT",
    "ALBUMIN",
    "PROTIME",
]

X_train, X_test, y_train, y_test = split(X,y)

# The scaler is fitted on the training rows only and then applied to both sets.
scaled_train, scaled_test = scale(X_train[numerical_cols], X_test[numerical_cols])
X_train[numerical_cols] = scaled_train
X_test[numerical_cols] = scaled_test

In [29]:
# Display the training features after the numerical columns were scaled.
X_train

Unnamed: 0,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITIS,VARICES,BILURUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
96,-0.844602,0,0,1,0,0,1,1,0,1,0,1,1,-0.548111,0.904777,0.488650,0.089394,2.132909,1
122,0.090493,0,1,1,1,1,1,1,1,0,1,1,1,0.055646,-0.505017,-0.521609,0.250045,-0.035433,1
82,-0.844602,0,0,1,1,1,1,1,1,1,1,1,1,-0.634362,-0.163938,-0.624931,0.250045,2.132909,0
109,-0.610829,0,0,1,0,0,1,1,1,1,1,0,1,-0.634362,-1.005267,-0.062401,-1.356460,-1.804344,1
65,-1.078376,0,1,1,1,1,1,1,1,1,1,1,1,-0.548111,-0.050245,-0.544570,0.571346,-0.035433,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,-0.532904,0,0,1,0,0,1,0,0,1,0,1,1,1.176909,0.450005,1.108582,0.250045,-0.035433,0
106,-0.454979,0,0,1,0,1,1,0,0,0,0,0,1,0.055646,0.700129,-0.314965,-1.999062,-0.035433,1
14,0.480116,0,0,0,1,1,1,1,1,1,1,1,1,-0.375609,-0.050245,-0.292005,0.250045,-0.035433,0
92,-0.610829,0,1,1,1,1,1,0,0,1,1,1,1,-0.375609,-0.050245,-0.292005,0.250045,-0.035433,1


In [34]:
# Evaluate every classifier on the same train/test split. One (label, helper)
# table drives a single loop in place of six copy-pasted call/print pairs;
# the evaluation order and the printed lines are unchanged.
classifier_runs = [
    ("Logistic Regression", logistic_regression),
    ("Decision Tree", decision_tree_classifier),
    ("Support Vector Classifier", support_vector_classifier),
    ("AdaBoost", adaboost_classifier),
    ("XGBoost", xgboost_classifier),
    ("CatBoost", catboost_classifier),
]

accuracies = {}
for label, run in classifier_runs:
    accuracies[label] = run(X_train, y_train, X_test, y_test)
    print(f"The accuracy of {label} is {accuracies[label]*100:.2f}")

# Keep the individual result names defined for any cell that reads them.
acc_lr = accuracies["Logistic Regression"]
acc_dt = accuracies["Decision Tree"]
acc_svm = accuracies["Support Vector Classifier"]
acc_ada = accuracies["AdaBoost"]
acc_xg = accuracies["XGBoost"]
acc_cat = accuracies["CatBoost"]

The accuracy of Logistic Regression is 83.87
The accuracy of Decision Tree is 80.65
The accuracy of Support Vector Classifier is 74.19
The accuracy of AdaBoost is 77.42
The accuracy of XGBoost is 74.19
The accuracy of CatBoost is 80.65
