In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, r2_score, confusion_matrix
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.svm import SVC, SVR
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.preprocessing import OneHotEncoder

In [18]:
# Helper Functions
def load_csv(path):
    """Read a CSV source into a DataFrame.

    Parameters
    ----------
    path : anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    The DataFrame produced by ``pd.read_csv(path)`` with pandas' default options.
    """
    return pd.read_csv(path)

def load_excel(path):
    """Read an Excel source into a DataFrame.

    Parameters
    ----------
    path : anything ``pd.read_excel`` accepts as its first argument.

    Returns
    -------
    The DataFrame produced by ``pd.read_excel(path)`` with pandas' default options.
    """
    return pd.read_excel(path)

def split(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test partitions.

    Parameters
    ----------
    X : feature table, forwarded unchanged to ``train_test_split``.
    y : target, forwarded unchanged to ``train_test_split``.
    test_size : forwarded to ``train_test_split``. The default of 0.2 is the
        value this helper previously hard-coded.
    random_state : seed forwarded to ``train_test_split``. The default of 42 is
        the value this helper previously hard-coded.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

def scale(X_train, X_test):
    """Standardise both partitions using statistics learned from the training one.

    The scaler is fitted on ``X_train`` only and then applied to both inputs,
    so nothing from ``X_test`` influences the fitted parameters.

    Returns
    -------
    tuple
        ``(scaled_train, scaled_test)`` as returned by ``StandardScaler.transform``.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)

def one_hot_encode(df, column):
    """Replace ``column`` in ``df`` with its one-hot encoded indicator columns.

    Parameters
    ----------
    df : DataFrame containing ``column``. It is not modified.
    column : name of the single column to encode.

    Returns
    -------
    A new DataFrame with ``column`` removed and one indicator column per
    category appended, named by ``OneHotEncoder.get_feature_names_out``.
    """
    encoder = OneHotEncoder(sparse_output=False)
    encoded_values = encoder.fit_transform(df[[column]])
    encoded_df = pd.DataFrame(
        encoded_values,
        columns=encoder.get_feature_names_out([column]),
        # Reuse df's own row labels. Without this the new frame gets a fresh
        # 0..n-1 index, and concat(axis=1) aligns on labels: a filtered,
        # shuffled or otherwise non-default index in df would produce
        # mismatched rows padded with NaN.
        index=df.index,
    )
    return pd.concat([df.drop(columns=column), encoded_df], axis=1)

def logistic_regression(X_train, y_train, X_test, y_test):
    """Fit a default ``LogisticRegression`` and score it on the test partition.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)`` for the fitted model.
    """
    model = LogisticRegression().fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))

def support_vector_classifier(X_train, y_train, X_test, y_test, kernel='rbf'):
    """Fit an ``SVC`` with the given kernel and score it on the test partition.

    Parameters
    ----------
    kernel : forwarded to ``SVC(kernel=...)``; defaults to ``'rbf'``.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)`` for the fitted model.
    """
    model = SVC(kernel=kernel).fit(X_train, y_train)
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions)

def support_vector_regressor(X_train, y_train, X_test, y_test, kernel='rbf'):
    """Fit an ``SVR`` with the given kernel and score it on the test partition.

    Parameters
    ----------
    kernel : forwarded to ``SVR(kernel=...)``; defaults to ``'rbf'``.

    Returns
    -------
    The value of ``r2_score(y_test, predictions)`` for the fitted model.
    """
    model = SVR(kernel=kernel).fit(X_train, y_train)
    predictions = model.predict(X_test)
    return r2_score(y_test, predictions)

def decision_tree_classifier(X_train, y_train, X_test, y_test, random_state=42):
    """Fit a ``DecisionTreeClassifier`` and score it on the test partition.

    Parameters
    ----------
    random_state : seed forwarded to ``DecisionTreeClassifier``. The tree was
        previously built without a seed, so repeated runs on the same data
        could report different accuracies. The default of 42 matches the seed
        used by the other helpers in this notebook; pass ``None`` to restore
        the unseeded behaviour.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)`` for the fitted model.
    """
    predictor = DecisionTreeClassifier(random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    return accuracy_score(y_test, y_pred)

def decision_tree_regressor(X_train, y_train, X_test, y_test, random_state=42):
    """Fit a ``DecisionTreeRegressor`` and score it on the test partition.

    Parameters
    ----------
    random_state : seed forwarded to ``DecisionTreeRegressor``. The tree was
        previously built without a seed, so repeated runs on the same data
        could report different scores. The default of 42 matches the seed
        used by the other helpers in this notebook; pass ``None`` to restore
        the unseeded behaviour.

    Returns
    -------
    The value of ``r2_score(y_test, predictions)`` for the fitted model.
    """
    predictor = DecisionTreeRegressor(random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    return r2_score(y_test, y_pred)

def linear_regression(X_train, y_train, X_test, y_test):
    """Fit a default ``LinearRegression`` and score it on the test partition.

    Returns
    -------
    The value of ``r2_score(y_test, predictions)`` for the fitted model.
    """
    model = LinearRegression().fit(X_train, y_train)
    return r2_score(y_test, model.predict(X_test))

def adaboost_classifier(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``AdaBoostClassifier`` and score it on the test partition.

    Parameters
    ----------
    n_estimators : forwarded to ``AdaBoostClassifier``; defaults to 200.
    random_state : seed forwarded to ``AdaBoostClassifier``. The default of 42
        is the value this helper previously hard-coded; exposing it mirrors
        the signature of ``xgboost_classifier``.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)`` for the fitted model.
    """
    predictor = AdaBoostClassifier(n_estimators=n_estimators, random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    return accuracy_score(y_test, y_pred)

def xgboost_classifier(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``XGBClassifier`` and score it on the test partition.

    Parameters
    ----------
    n_estimators : forwarded to ``XGBClassifier``; defaults to 200.
    random_state : seed forwarded to ``XGBClassifier``; defaults to 42.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)`` for the fitted model.
    """
    model = XGBClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))

def catboost_classifier(X_train, y_train, X_test, y_test, iterations=200, random_state=42):
    """Fit a ``CatBoostClassifier`` and score it on the test partition.

    Parameters
    ----------
    iterations : forwarded to ``CatBoostClassifier``; defaults to 200.
    random_state : seed forwarded to ``CatBoostClassifier``. The default of 42
        is the value this helper previously hard-coded; exposing it mirrors
        the signature of ``xgboost_classifier``.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)`` for the fitted model.
    """
    # verbose=0 keeps CatBoost's per-iteration training log out of the cell output.
    predictor = CatBoostClassifier(iterations=iterations, verbose=0, random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    return accuracy_score(y_test, y_pred)

def adaboost_regressor(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``AdaBoostRegressor`` and score it on the test partition.

    Parameters
    ----------
    n_estimators : forwarded to ``AdaBoostRegressor``; defaults to 200.
    random_state : seed forwarded to ``AdaBoostRegressor``. The default of 42
        is the value this helper previously hard-coded; exposing it mirrors
        the signature of ``xgboost_classifier``.

    Returns
    -------
    The value of ``r2_score(y_test, predictions)`` for the fitted model.
    """
    predictor = AdaBoostRegressor(n_estimators=n_estimators, random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    return r2_score(y_test, y_pred)

def xgboost_regressor(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``XGBRegressor`` and score it on the test partition.

    Parameters
    ----------
    n_estimators : forwarded to ``XGBRegressor``; defaults to 200.
    random_state : seed forwarded to ``XGBRegressor``. The default of 42 is
        the value this helper previously hard-coded; exposing it mirrors the
        signature of ``xgboost_classifier``.

    Returns
    -------
    The value of ``r2_score(y_test, predictions)`` for the fitted model.
    """
    predictor = XGBRegressor(n_estimators=n_estimators, random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    return r2_score(y_test, y_pred)

def catboost_regressor(X_train, y_train, X_test, y_test, iterations=200, random_state=42):
    """Fit a ``CatBoostRegressor`` and score it on the test partition.

    Parameters
    ----------
    iterations : forwarded to ``CatBoostRegressor``; defaults to 200.
    random_state : seed forwarded to ``CatBoostRegressor``. The default of 42
        is the value this helper previously hard-coded; exposing it mirrors
        the signature of ``xgboost_classifier``.

    Returns
    -------
    The value of ``r2_score(y_test, predictions)`` for the fitted model.
    """
    # verbose=0 keeps CatBoost's per-iteration training log out of the cell output.
    predictor = CatBoostRegressor(iterations=iterations, verbose=0, random_state=random_state)
    predictor.fit(X_train, y_train)
    y_pred = predictor.predict(X_test)
    return r2_score(y_test, y_pred)

In [19]:
import os

# Directory that holds the heart dataset. The location is machine-specific, so
# it can be overridden by setting the HEART_DATA_DIR environment variable
# before starting the kernel; when the variable is unset the original
# hard-coded location is used, so existing setups keep working unchanged.
path = os.environ.get(
    "HEART_DATA_DIR",
    r"G:\Study\iit kharagpur\ML Lab\Lab 1\ML_Class_Tutorial\datasets\heart",
)
data_path = os.path.join(path, "heart.dat")

### Pre-Processing


In [20]:
# Column labels A1 .. A14; the file is read with header=None, so names are supplied here.
columns = [f'A{n}' for n in range(1, 15)]

# Read the space-delimited file, then rebuild the frame from its raw value array
# so that the labels above replace the default integer column names.
data = pd.DataFrame(
    pd.read_csv(data_path, header=None, sep=' ').to_numpy(),
    columns=columns,
)
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14
0,70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0,2.0
1,67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0,1.0
2,57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,2.0
3,64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,7.0,1.0
4,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52.0,1.0,3.0,172.0,199.0,1.0,0.0,162.0,0.0,0.5,1.0,0.0,7.0,1.0
266,44.0,1.0,2.0,120.0,263.0,0.0,0.0,173.0,0.0,0.0,1.0,0.0,7.0,1.0
267,56.0,0.0,2.0,140.0,294.0,0.0,2.0,153.0,0.0,1.3,2.0,0.0,3.0,1.0
268,57.0,1.0,4.0,140.0,192.0,0.0,0.0,148.0,0.0,0.4,2.0,0.0,6.0,1.0


In [21]:
# Re-code two columns to zero-based integer codes:
#   A14 (the last column, used as the target below): {1, 2} -> {0, 1}
#   A13: {3, 6, 7} -> {0, 1, 2}
recodings = {
    'A14': {1: 0, 2: 1},
    'A13': {3: 0, 6: 1, 7: 2},
}

# Series.map turns every value that is missing from the mapping into NaN. That
# is exactly what happens if this cell is executed a second time (the codes are
# already 0/1/2 by then), and it would silently corrupt `data`. So every column
# is re-coded and validated first, and `data` is only touched once all of them
# mapped cleanly.
recoded = {}
for column, mapping in recodings.items():
    candidate = data[column].map(mapping)
    unmapped = candidate.isna() & data[column].notna()
    if unmapped.any():
        raise ValueError(
            f"Column {column} contains values outside {sorted(mapping)}: "
            f"{sorted(data.loc[unmapped, column].unique())}. "
            "Has this cell already been run on `data`?"
        )
    recoded[column] = candidate

for column, candidate in recoded.items():
    data[column] = candidate

data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14
0,70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,0,1
1,67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,2,0
2,57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,2,1
3,64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,2,0
4,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52.0,1.0,3.0,172.0,199.0,1.0,0.0,162.0,0.0,0.5,1.0,0.0,2,0
266,44.0,1.0,2.0,120.0,263.0,0.0,0.0,173.0,0.0,0.0,1.0,0.0,2,0
267,56.0,0.0,2.0,140.0,294.0,0.0,2.0,153.0,0.0,1.3,2.0,0.0,0,0
268,57.0,1.0,4.0,140.0,192.0,0.0,0.0,148.0,0.0,0.4,2.0,0.0,1,0


In [22]:
# The last column is the target; every column before it is a feature.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [26]:
# Partition first, so the scaler below is fitted on training rows only.
X_train, X_test, y_train, y_test = split(X, y)

# Only these columns are standardised; the remaining columns are left as they are.
numerical_cols = ['A1', 'A4', 'A5', 'A8', 'A10']
scaled_train, scaled_test = scale(X_train[numerical_cols], X_test[numerical_cols])
X_train[numerical_cols] = scaled_train
X_test[numerical_cols] = scaled_test

In [30]:
# Display the training features after the numerical columns have been standardised.
X_train

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13
115,-0.632970,0.0,2.0,0.125095,0.406391,0.0,0.0,0.566128,0.0,-0.934304,2.0,0.0,0
33,0.465195,1.0,4.0,2.143222,1.478549,0.0,2.0,-0.397771,1.0,1.959634,3.0,0.0,2
184,-0.193704,1.0,3.0,-0.099142,-0.080953,1.0,2.0,1.048078,0.0,-0.934304,1.0,3.0,0
142,-0.523153,1.0,3.0,0.461449,-0.334372,0.0,0.0,0.609942,0.0,-0.423609,2.0,1.0,2
197,-0.083888,0.0,3.0,-1.220323,-0.704754,0.0,0.0,0.390874,0.0,0.427549,2.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20,1.343726,1.0,4.0,-0.659732,-0.412348,0.0,2.0,-0.879720,1.0,1.278708,2.0,2.0,2
188,0.794644,0.0,4.0,0.461449,2.804126,0.0,2.0,0.347060,0.0,0.087086,2.0,0.0,0
71,0.245562,0.0,4.0,-0.659732,2.024375,0.0,0.0,0.609942,1.0,-0.423609,1.0,0.0,0
106,-0.413337,1.0,3.0,-1.780914,-0.548804,0.0,0.0,-0.266330,1.0,0.087086,2.0,0.0,0


In [31]:
# Display label -> helper that fits the model and returns its test-set accuracy.
# Every helper shares the (X_train, y_train, X_test, y_test) call signature, so
# one loop replaces six copy-pasted fit/print stanzas. Dict order is the order
# in which the models are trained and reported.
classifier_runs = {
    "Logistic Regression": logistic_regression,
    "Decision Tree": decision_tree_classifier,
    "Support Vector Classifier": support_vector_classifier,
    "AdaBoost": adaboost_classifier,
    "XGBoost": xgboost_classifier,
    "CatBoost": catboost_classifier,
}

accuracies = {}
for label, run in classifier_runs.items():
    accuracies[label] = run(X_train, y_train, X_test, y_test)
    # Report each score as soon as it is available, as a percentage with two decimals.
    print(f"The accuracy of {label} is {accuracies[label]*100:.2f}")

# Keep the individual result names available for any later cell that uses them.
acc_lr = accuracies["Logistic Regression"]
acc_dt = accuracies["Decision Tree"]
acc_svm = accuracies["Support Vector Classifier"]
acc_ada = accuracies["AdaBoost"]
acc_xg = accuracies["XGBoost"]
acc_cat = accuracies["CatBoost"]

The accuracy of Logistic Regression is 92.59
The accuracy of Decision Tree is 74.07
The accuracy of Support Vector Classifier is 85.19
The accuracy of AdaBoost is 81.48
The accuracy of XGBoost is 83.33
The accuracy of CatBoost is 88.89
