In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, r2_score, confusion_matrix
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.svm import SVC, SVR
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.preprocessing import OneHotEncoder
import time

In [12]:
# Helper Functions
def load_csv(path):
    """Read the CSV at ``path`` with pandas' default options and return the DataFrame."""
    return pd.read_csv(path)

def load_excel(path):
    """Read the Excel workbook at ``path`` with pandas' default options and return the DataFrame."""
    return pd.read_excel(path)

def split(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test partitions.

    Parameters
    ----------
    X : features, forwarded to ``train_test_split``.
    y : target, forwarded to ``train_test_split``.
    test_size : fraction (or count) of samples held out for testing.
        Defaults to 0.2, the value this helper previously hard-coded.
    random_state : seed for the shuffle. Defaults to 42, the value this
        helper previously hard-coded, so existing calls give the same split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

def scale(X_train, X_test):
    """Standardise both splits using statistics learned from the training split only.

    A ``StandardScaler`` is fitted on ``X_train``; the same fitted scaler is
    then applied to ``X_train`` and ``X_test``.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled)`` as returned by the scaler's
        ``transform``.
    """
    fitted = StandardScaler().fit(X_train)
    return fitted.transform(X_train), fitted.transform(X_test)

def one_hot_encode(df, column):
    """Replace ``column`` in ``df`` with its one-hot encoded indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame containing ``column``.
    column : name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``column`` dropped and one indicator column per
        category appended, named by ``encoder.get_feature_names_out``.
    """
    encoder = OneHotEncoder(sparse_output=False)
    encoder_array = encoder.fit_transform(df[[column]])
    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows (and introduce NaNs) whenever df has been
    # filtered, shuffled or otherwise carries a non-default index.
    encoder_df = pd.DataFrame(
        encoder_array,
        columns=encoder.get_feature_names_out([column]),
        index=df.index,
    )
    df_encoded = pd.concat([df, encoder_df], axis=1).drop(columns=column)
    return df_encoded

def logistic_regression(X_train, y_train, X_test, y_test):
    """Fit a default ``LogisticRegression`` and score it on the test split.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : held-out features and labels used for scoring.

    Returns
    -------
    tuple
        ``(accuracy, cf, runtime)``: the ``accuracy_score`` on the test split,
        the ``confusion_matrix`` of true vs. predicted test labels, and the
        seconds spent inside ``fit`` (prediction is not timed).
    """
    predictor = LogisticRegression()
    # perf_counter is the high-resolution monotonic clock meant for measuring
    # short durations; time.time() can be too coarse to resolve a fast fit.
    start_time = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - start_time
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cf = confusion_matrix(y_test, y_pred)
    return accuracy, cf, runtime

def support_vector_classifier(X_train, y_train, X_test, y_test, kernel='rbf'):
    """Fit an ``SVC`` with the given kernel and score it on the test split.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : held-out features and labels used for scoring.
    kernel : kernel name forwarded to ``SVC`` (default ``'rbf'``).

    Returns
    -------
    tuple
        ``(accuracy, cf, runtime)``: the ``accuracy_score`` on the test split,
        the ``confusion_matrix`` of true vs. predicted test labels, and the
        seconds spent inside ``fit`` (prediction is not timed).
    """
    predictor = SVC(kernel=kernel)
    # perf_counter is the high-resolution monotonic clock meant for measuring
    # short durations; time.time() can be too coarse to resolve a fast fit.
    start_time = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - start_time
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cf = confusion_matrix(y_test, y_pred)
    return accuracy, cf, runtime

def support_vector_regressor(X_train, y_train, X_test, y_test, kernel='rbf'):
    """Fit an ``SVR`` with the given kernel and report its test-split R^2.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test, y_test : held-out features and targets used for scoring.
    kernel : kernel name forwarded to ``SVR`` (default ``'rbf'``).

    Returns
    -------
    tuple
        ``(r2, runtime)``: the ``r2_score`` on the test split and the seconds
        spent inside ``fit`` (prediction is not timed).
    """
    predictor = SVR(kernel=kernel)
    # perf_counter is the high-resolution monotonic clock meant for measuring
    # short durations; time.time() can be too coarse to resolve a fast fit.
    start_time = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - start_time
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2, runtime

def decision_tree_classifier(X_train, y_train, X_test, y_test, random_state=None):
    """Fit a ``DecisionTreeClassifier`` and score it on the test split.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : held-out features and labels used for scoring.
    random_state : optional seed forwarded to the tree. The default ``None``
        keeps the previous unseeded behaviour; pass an int for repeatable runs.

    Returns
    -------
    tuple
        ``(accuracy, cf, runtime)``: the ``accuracy_score`` on the test split,
        the ``confusion_matrix`` of true vs. predicted test labels, and the
        seconds spent inside ``fit`` (prediction is not timed).
    """
    predictor = DecisionTreeClassifier(random_state=random_state)
    # perf_counter is the high-resolution monotonic clock meant for measuring
    # short durations; time.time() can be too coarse to resolve a fast fit.
    start_time = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - start_time
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cf = confusion_matrix(y_test, y_pred)
    return accuracy, cf, runtime

def decision_tree_regressor(X_train, y_train, X_test, y_test, random_state=None):
    """Fit a ``DecisionTreeRegressor`` and report its test-split R^2.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test, y_test : held-out features and targets used for scoring.
    random_state : optional seed forwarded to the tree. The default ``None``
        keeps the previous unseeded behaviour; pass an int for repeatable runs.

    Returns
    -------
    tuple
        ``(r2, runtime)``: the ``r2_score`` on the test split and the seconds
        spent inside ``fit`` (prediction is not timed).
    """
    predictor = DecisionTreeRegressor(random_state=random_state)
    # perf_counter is the high-resolution monotonic clock meant for measuring
    # short durations; time.time() can be too coarse to resolve a fast fit.
    start_time = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - start_time
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2, runtime

def linear_regression(X_train, y_train, X_test, y_test):
    """Fit a default ``LinearRegression`` and report its test-split R^2.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test, y_test : held-out features and targets used for scoring.

    Returns
    -------
    tuple
        ``(r2, runtime)``: the ``r2_score`` on the test split and the seconds
        spent inside ``fit`` (prediction is not timed).
    """
    predictor = LinearRegression()
    # perf_counter is the high-resolution monotonic clock meant for measuring
    # short durations; time.time() can be too coarse to resolve a fast fit.
    start_time = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - start_time
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2, runtime

def adaboost_classifier(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``AdaBoostClassifier`` and score it on the test split.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : held-out features and labels used for scoring.
    n_estimators : number of boosting rounds forwarded to the model.
    random_state : seed forwarded to the model. Defaults to 42, the value
        this helper previously hard-coded.

    Returns
    -------
    tuple
        ``(accuracy, cf, runtime)``: the ``accuracy_score`` on the test split,
        the ``confusion_matrix`` of true vs. predicted test labels, and the
        seconds spent inside ``fit`` (prediction is not timed).
    """
    predictor = AdaBoostClassifier(n_estimators=n_estimators, random_state=random_state)
    # perf_counter is the high-resolution monotonic clock meant for measuring
    # short durations; time.time() can be too coarse to resolve a fast fit.
    start_time = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - start_time
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cf = confusion_matrix(y_test, y_pred)
    return accuracy, cf, runtime

def xgboost_classifier(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``XGBClassifier`` and score it on the test split.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : held-out features and labels used for scoring.
    n_estimators : number of boosting rounds forwarded to the model.
    random_state : seed forwarded to the model.

    Returns
    -------
    tuple
        ``(accuracy, cf, runtime)``: the ``accuracy_score`` on the test split,
        the ``confusion_matrix`` of true vs. predicted test labels, and the
        seconds spent inside ``fit`` (prediction is not timed).
    """
    predictor = XGBClassifier(n_estimators=n_estimators, random_state=random_state)
    # perf_counter is the high-resolution monotonic clock meant for measuring
    # short durations; time.time() can be too coarse to resolve a fast fit.
    start_time = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - start_time
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cf = confusion_matrix(y_test, y_pred)
    return accuracy, cf, runtime

def catboost_classifier(X_train, y_train, X_test, y_test, iterations=200, random_state=42):
    """Fit a ``CatBoostClassifier`` (silent training) and score it on the test split.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : held-out features and labels used for scoring.
    iterations : number of boosting iterations forwarded to the model.
    random_state : seed forwarded to the model. Defaults to 42, the value
        this helper previously hard-coded.

    Returns
    -------
    tuple
        ``(accuracy, cf, runtime)``: the ``accuracy_score`` on the test split,
        the ``confusion_matrix`` of true vs. predicted test labels, and the
        seconds spent inside ``fit`` (prediction is not timed).
    """
    predictor = CatBoostClassifier(iterations=iterations, verbose=0, random_state=random_state)
    # perf_counter is the high-resolution monotonic clock meant for measuring
    # short durations; time.time() can be too coarse to resolve a fast fit.
    start_time = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - start_time
    y_pred = predictor.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cf = confusion_matrix(y_test, y_pred)
    return accuracy, cf, runtime

def adaboost_regressor(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``AdaBoostRegressor`` and report its test-split R^2.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test, y_test : held-out features and targets used for scoring.
    n_estimators : number of boosting rounds forwarded to the model.
    random_state : seed forwarded to the model. Defaults to 42, the value
        this helper previously hard-coded.

    Returns
    -------
    tuple
        ``(r2, runtime)``: the ``r2_score`` on the test split and the seconds
        spent inside ``fit`` (prediction is not timed).
    """
    predictor = AdaBoostRegressor(n_estimators=n_estimators, random_state=random_state)
    # perf_counter is the high-resolution monotonic clock meant for measuring
    # short durations; time.time() can be too coarse to resolve a fast fit.
    start_time = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - start_time
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2, runtime

def xgboost_regressor(X_train, y_train, X_test, y_test, n_estimators=200, random_state=42):
    """Fit an ``XGBRegressor`` and report its test-split R^2.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test, y_test : held-out features and targets used for scoring.
    n_estimators : number of boosting rounds forwarded to the model.
    random_state : seed forwarded to the model. Defaults to 42, the value
        this helper previously hard-coded.

    Returns
    -------
    tuple
        ``(r2, runtime)``: the ``r2_score`` on the test split and the seconds
        spent inside ``fit`` (prediction is not timed).
    """
    predictor = XGBRegressor(n_estimators=n_estimators, random_state=random_state)
    # perf_counter is the high-resolution monotonic clock meant for measuring
    # short durations; time.time() can be too coarse to resolve a fast fit.
    start_time = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - start_time
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2, runtime

def catboost_regressor(X_train, y_train, X_test, y_test, iterations=200, random_state=42):
    """Fit a ``CatBoostRegressor`` (silent training) and report its test-split R^2.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_test, y_test : held-out features and targets used for scoring.
    iterations : number of boosting iterations forwarded to the model.
    random_state : seed forwarded to the model. Defaults to 42, the value
        this helper previously hard-coded.

    Returns
    -------
    tuple
        ``(r2, runtime)``: the ``r2_score`` on the test split and the seconds
        spent inside ``fit`` (prediction is not timed).
    """
    predictor = CatBoostRegressor(iterations=iterations, verbose=0, random_state=random_state)
    # perf_counter is the high-resolution monotonic clock meant for measuring
    # short durations; time.time() can be too coarse to resolve a fast fit.
    start_time = time.perf_counter()
    predictor.fit(X_train, y_train)
    runtime = time.perf_counter() - start_time
    y_pred = predictor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2, runtime

In [13]:
import os

# Directory holding the Statlog Australian credit-approval files. The original
# machine-specific location remains the default; set AUSTRALIAN_CREDIT_DIR to
# run the notebook on another machine without editing this cell.
path = os.environ.get(
    "AUSTRALIAN_CREDIT_DIR",
    r"G:\Study\iit kharagpur\ML Lab\ML_CLASS\datasets\statlog+australian+credit+approval",
)
data_path = os.path.join(path, "australian.dat")

### Pre-Processing


In [14]:
# Column labels A1..A15 for the header-less, space-delimited data file.
columns = [f"A{n}" for n in range(1, 16)]
# Rebuild the frame from the raw value array so the columns carry the A1..A15
# labels instead of the labels read_csv assigns when header=None.
data = pd.DataFrame(
    pd.read_csv(data_path, header=None, sep=' ').to_numpy(),
    columns=columns,
)
data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
0,1.0,22.08,11.460,2.0,4.0,4.0,1.585,0.0,0.0,0.0,1.0,2.0,100.0,1213.0,0.0
1,0.0,22.67,7.000,2.0,8.0,4.0,0.165,0.0,0.0,0.0,0.0,2.0,160.0,1.0,0.0
2,0.0,29.58,1.750,1.0,4.0,4.0,1.250,0.0,0.0,0.0,1.0,2.0,280.0,1.0,0.0
3,0.0,21.67,11.500,1.0,5.0,3.0,0.000,1.0,1.0,11.0,1.0,2.0,0.0,1.0,1.0
4,1.0,20.17,8.170,2.0,6.0,4.0,1.960,1.0,1.0,14.0,0.0,2.0,60.0,159.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1.0,31.57,10.500,2.0,14.0,4.0,6.500,1.0,0.0,0.0,0.0,2.0,0.0,1.0,1.0
686,1.0,20.67,0.415,2.0,8.0,4.0,0.125,0.0,0.0,0.0,0.0,2.0,0.0,45.0,0.0
687,0.0,18.83,9.540,2.0,6.0,4.0,0.085,1.0,0.0,0.0,0.0,2.0,100.0,1.0,1.0
688,0.0,27.42,14.500,2.0,14.0,8.0,3.085,1.0,1.0,1.0,0.0,2.0,120.0,12.0,1.0


In [15]:
# The last column is the target; every column before it is a feature.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [16]:
# Columns that get standardised; all other columns are passed to the models unchanged.
# NOTE(review): A5 and A6 may be category codes rather than continuous values
# in this dataset's documentation — confirm they belong in this list.
numerical_cols = ["A2",'A3','A5','A6','A13','A14','A7','A10']

X_train, X_test, y_train, y_test = split(X,y)
# Take explicit copies before writing the scaled columns back: assigning into
# the frames returned by the split can otherwise trigger pandas'
# SettingWithCopyWarning, and the copies make it clear X itself is not modified.
X_train, X_test = X_train.copy(), X_test.copy()
X_train[numerical_cols], X_test[numerical_cols] = scale(X_train[numerical_cols], X_test[numerical_cols])

In [17]:
# Display the training features to inspect the result of scaling the numerical columns.
X_train

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14
278,1.0,0.106376,-0.441050,2.0,1.499132,1.674707,0.167881,1.0,1.0,0.802619,0.0,2.0,-0.140272,0.267609
110,1.0,0.564015,1.114900,1.0,-0.943534,-0.347028,-0.623684,0.0,0.0,-0.567626,0.0,2.0,-0.140272,-0.254682
82,0.0,-1.096825,1.089392,2.0,-1.486348,-0.347028,-0.536235,0.0,0.0,-0.567626,0.0,2.0,-0.605529,-0.244095
51,1.0,-0.885737,-0.109454,1.0,-1.214941,1.674707,-0.649316,0.0,0.0,-0.567626,0.0,2.0,-0.256587,-0.229475
218,1.0,-1.019145,-0.892021,2.0,-0.943534,-0.347028,-0.573929,0.0,0.0,-0.567626,0.0,2.0,0.557613,-0.162928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,0.0,-0.097113,-0.670617,2.0,0.413503,1.674707,-0.649316,0.0,1.0,0.117497,0.0,2.0,-1.070786,-0.246363
106,1.0,-0.329309,-0.543080,2.0,1.770539,1.674707,-0.359829,1.0,1.0,0.345871,0.0,2.0,-0.256587,1.646939
270,0.0,-1.047008,0.153782,2.0,-1.214941,1.674707,-0.573929,0.0,0.0,-0.567626,0.0,2.0,-0.605529,-0.132680
435,1.0,-1.307913,-0.798154,2.0,0.142096,-0.347028,-0.133668,1.0,1.0,0.574245,1.0,2.0,0.976344,-0.080753


In [18]:
# Each entry pairs the label used in the printed report with the helper that
# fits and scores that model. Every helper takes the same four splits and
# returns (accuracy, confusion matrix, fit runtime), so one loop replaces six
# copy-pasted stanzas while printing exactly the same lines in the same order.
classifier_runs = [
    ("Logistic Regression", logistic_regression),
    ("Decision Tree", decision_tree_classifier),
    ("Support Vector Classifier", support_vector_classifier),
    ("AdaBoost", adaboost_classifier),
    ("XGBoost", xgboost_classifier),
    ("CatBoost", catboost_classifier),
]

classifier_results = {}
for model_label, fit_and_score in classifier_runs:
    model_acc, model_cf, runtime = fit_and_score(X_train, y_train, X_test, y_test)
    print(f"The accuracy of {model_label} is {model_acc*100:.2f}")
    print(f"The runtime is {runtime:.2f}")
    classifier_results[model_label] = (model_acc, model_cf)

# Keep the per-model variable names that the confusion-matrix cell reads.
acc_lr, cf_lr = classifier_results["Logistic Regression"]
acc_dt, cf_dt = classifier_results["Decision Tree"]
acc_svm, cf_svm = classifier_results["Support Vector Classifier"]
acc_ada, cf_ada = classifier_results["AdaBoost"]
acc_xg, cf_xg = classifier_results["XGBoost"]
acc_cat, cf_cat = classifier_results["CatBoost"]

The accuracy of Logistic Regression is 86.96
The runtime is 0.01
The accuracy of Decision Tree is 88.41
The runtime is 0.00
The accuracy of Support Vector Classifier is 84.78
The runtime is 0.00
The accuracy of AdaBoost is 86.96
The runtime is 0.32
The accuracy of XGBoost is 86.23
The runtime is 0.08
The accuracy of CatBoost is 87.68
The runtime is 0.35


In [19]:
# Print every classifier's confusion matrix, each followed by blank lines as a
# separator. Order: logistic regression, decision tree, SVC, AdaBoost, XGBoost, CatBoost.
confusion_matrices = [cf_lr, cf_dt, cf_svm, cf_ada, cf_xg, cf_cat]
for matrix in confusion_matrices:
    print(matrix, '\n', sep='\n')

[[78  9]
 [ 9 42]]


[[80  7]
 [ 9 42]]


[[74 13]
 [ 8 43]]


[[81  6]
 [12 39]]


[[78  9]
 [10 41]]


[[79  8]
 [ 9 42]]


