In [813]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import KFold
import numpy as np

## Part 1

In [814]:
df = pd.read_csv("cleveland.csv")

In [815]:
df = df.rename({"num": "disease"}, axis=1)
df["disease"] = df["disease"].apply(lambda x: min(x, 1))
df
df.replace("?", pd.NA, inplace=True)
df = df.dropna()

In [816]:
X = df[["age", "sex", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal"]].values
y = df["disease"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)


## k-nearest neighbors function

In [817]:
def knn(n_neighbors, X_train, y_train, X_test, y_test):
    nn = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean", algorithm="auto")

    fit = nn.fit(X_train)

    distances, indices = fit.kneighbors(X_test)
    y_pred = []

    # print(list(y_test))

    # for i in range(len(X_test)):
    #     zeros = list(y_train.iloc[indices[i]]).count(0)
    #     ones = list(y_train.iloc[indices[i]]).count(1)
    #     if ones > zeros:
    #         y_pred.append(1)
    #     else:
    #         y_pred.append(0)
    for i in range(len(X_test)):
        zeros = list(y_train[indices[i]]).count(0)
        ones = list(y_train[indices[i]]).count(1)
        if ones > zeros:
            y_pred.append(1)
        # In case of equality of numbers we predict zero
        else:
            y_pred.append(0)
    
    (p,r,f,s) = precision_recall_fscore_support(y_test, y_pred, labels=[0,1])
    return [p, r, f, s]
    

In [818]:
result = knn(5, X_train, y_train, X_test, y_test)
print(f'precision={result[0]}, recall={result[1]}, f-score={result[2]}, support={result[3]}')

precision=[0.625      0.71428571], recall=[0.71428571 0.625     ], f-score=[0.66666667 0.66666667], support=[28 32]


### Implementation of K-fold cross validation

In [820]:
k_fold = 10

# Calculate the number of samples per fold
fold_size = len(X) // k_fold

# Shuffle the dataset
X = np.array(X)
y = np.array(y)

# Shuffle the indices array
indices = np.arange(len(X))
np.random.shuffle(indices)

# Use the shuffled indices to shuffle both X and y
X_shuffled = X[indices]
y_shuffled = y[indices]

precision_0 = []
precision_1 = []
recall_0 = []
recall_1 = []
f_score_0 = []
f_score_1 = []
support_0 = []
support_1 = []

for fold in range(k_fold):
    # Split the dataset into training and testing sets for this fold
    test_start = fold * fold_size
    test_end = (fold + 1) * fold_size
    X_test_fold = X_shuffled[test_start:test_end]
    y_test_fold = y_shuffled[test_start:test_end]
    
    
    # Use the remaining data as training set
    X_train_fold = np.concatenate([X_shuffled[:test_start], X_shuffled[test_end:]])
    y_train_fold = np.concatenate([y_shuffled[:test_start], y_shuffled[test_end:]])
    result = knn(5, X_train_fold, y_train_fold, X_test_fold, y_test_fold)
    # print(result)
    precision_0.append(result[0][0])
    precision_1.append(result[0][1])
    recall_0.append(result[1][0])
    recall_1.append(result[1][1])
    f_score_0.append(result[2][0])
    f_score_1.append(result[2][1])
    support_0.append(result[3][0])
    support_1.append(result[3][1])

print(f"recall_0 -> {(recall_0)}")
print(f"mean recall_0 -> {np.mean(recall_0)}")
print(f"recall_1 -> {(recall_1)}")
print(f"mean recall_1 -> {np.mean(recall_1)}")
print(f"precision_0 -> {(precision_0)}")
print(f"mean precision_0 -> {np.mean(precision_0)}")
print(f"precision_1 -> {(precision_1)}")
print(f"mean precision_1 -> {np.mean(precision_1)}")
print(f"f-score_0 -> {(f_score_0)}")
print(f"mean f-score_0 -> {np.mean(f_score_0)}")
print(f"f-score_1 -> {(f_score_1)}")
print(f"mean f-score_1 -> {np.mean(f_score_1)}")
print(f"support 0 -> {support_0}")
print(f"support 1 -> {support_1}")
    

recall_0 -> [0.6, 0.8947368421052632, 0.8, 0.8333333333333334, 0.7142857142857143, 0.7777777777777778, 0.65, 0.6428571428571429, 0.6428571428571429, 0.7368421052631579]
mean recall_0 -> 0.7292690058479532
recall_1 -> [0.7857142857142857, 0.5, 0.6666666666666666, 0.6470588235294118, 0.4666666666666667, 0.5, 0.6666666666666666, 0.4666666666666667, 0.6666666666666666, 0.6]
mean recall_1 -> 0.596610644257703
precision_0 -> [0.75, 0.7727272727272727, 0.8421052631578947, 0.625, 0.5555555555555556, 0.4117647058823529, 0.8125, 0.5294117647058824, 0.6428571428571429, 0.7777777777777778]
mean precision_0 -> 0.6719699482663879
precision_1 -> [0.6470588235294118, 0.7142857142857143, 0.6, 0.8461538461538461, 0.6363636363636364, 0.8333333333333334, 0.46153846153846156, 0.5833333333333334, 0.6666666666666666, 0.5454545454545454]
mean precision_1 -> 0.6534188360658948
f-score_0 -> [0.6666666666666665, 0.8292682926829269, 0.8205128205128205, 0.7142857142857143, 0.6250000000000001, 0.5384615384615384, 0

## Part 2