In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
data_folder = "./pima-5-fold"
def load_keel_file(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    data_start = next(i for i, line in enumerate(lines) if "@data" in line.lower())
    data = pd.read_csv(file_path, skiprows=data_start + 1, header=None)
    attribute_lines = [line for line in lines if line.lower().startswith("@attribute")]
    column_names = [line.split()[1] for line in attribute_lines]
    data.columns = column_names
    return data
# A. Accuracy using 5-fold cross-validation
fold_accuracies = []
combined_train_data = []
for fold in range(1, 6):
    train_file = os.path.join(data_folder, f"pima-5-{fold}tra.dat")
    test_file = os.path.join(data_folder, f"pima-5-{fold}tst.dat")

    train_data = load_keel_file(train_file)
    test_data = load_keel_file(test_file)

    combined_train_data.append(train_data)

    X_train, y_train = train_data.iloc[:, :-1], train_data.iloc[:, -1]
    X_test, y_test = test_data.iloc[:, :-1], test_data.iloc[:, -1]

    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    fold_accuracies.append(accuracy_score(y_test, y_pred))

accuracy_5cv = np.mean(fold_accuracies)

# Combine all
combined_train_data = pd.concat(combined_train_data, ignore_index=True)
X_all = combined_train_data.iloc[:, :-1]
y_all = combined_train_data.iloc[:, -1]

# B1. Accuracy for 70-30 split
X_train_70, X_test_70, y_train_70, y_test_70 = train_test_split(X_all, y_all, test_size=0.3, random_state=42)
knn_70 = KNeighborsClassifier()
knn_70.fit(X_train_70, y_train_70)
accuracy_7030 = knn_70.score(X_test_70, y_test_70)

# B2. Accuracy for 80-20 split
X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(X_all, y_all, test_size=0.2, random_state=42)
knn_80 = KNeighborsClassifier()
knn_80.fit(X_train_80, y_train_80)
accuracy_8020 = knn_80.score(X_test_80, y_test_80)

print(f"A. Accuracy with 5-fold cross-validation: {accuracy_5cv}")
print(f"B1. Accuracy with 70-30 split: {accuracy_7030}")
print(f"B2. Accuracy with 80-20 split: {accuracy_8020}")


A. Accuracy with 5-fold cross-validation: 0.7213479331126391
B1. Accuracy with 70-30 split: 0.8015184381778742
B2. Accuracy with 80-20 split: 0.8260162601626017
