In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

df = pd.read_csv("DLBCL-2.csv")

X = df.drop(columns='target').values
y = df['target'].values

n_splits = 5
indices = np.arange(len(X))
np.random.shuffle(indices)

fold_indices = np.array_split(indices, n_splits)

accuracy_scores = []

for i in range(n_splits):
    test_indices = fold_indices[i]
    train_indices = np.concatenate(fold_indices[:i] + fold_indices[i+1:])

    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    from sklearn.preprocessing import LabelEncoder

    # Encode the target variable 'y_train' into numerical values
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)

    # Then use y_train_encoded instead of y_train in the knn_predict function
    def knn_predict(X_train, y_train, x_test, k=5):
        distances = np.sqrt(np.sum((X_train - x_test) ** 2, axis=1))
        indices = np.argsort(distances)[:k]
        neighbors = y_train[indices]
        return np.bincount(neighbors).argmax()

    y_pred = [knn_predict(X_train, y_train_encoded, x_test) for x_test in X_test]

    # Decode integer labels back to string labels
    y_pred_labels = label_encoder.inverse_transform(y_pred)

    # Calculate and print accuracy
    accuracy = accuracy_score(y_test, y_pred_labels)
    accuracy_scores.append(accuracy)

# Print average accuracy across all folds
print("Average Accuracy: {:.2f}%".format(np.mean(accuracy_scores) * 100))


Average Accuracy: 99.09%


In [2]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
tp = cm[0, 0]
fp = cm[0, 1]
fn = cm[1, 0]
tn = cm[1, 1]
accuracy=(tp+tn)/(tp+fp+fn+tn)
print(accuracy)

0.6993464052287581
