In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors

In [70]:
# Load the raw file, turn the "?" placeholders into NaN, coerce every column
# to float, and keep only the rows that have no missing values.
cleveland = (
    pd.read_csv("cleveland.csv")
    .replace('?', np.nan)
    .astype(float)
    .dropna()
)

In [71]:
# Preview the first rows of the cleaned frame as a sanity check on the load
cleveland.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0.0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2.0
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1.0
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0.0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0.0


In [72]:
# Count how many rows carry each distinct value of the target column "num"
cleveland.num.value_counts()

0.0    160
1.0     54
2.0     35
3.0     35
4.0     13
Name: num, dtype: int64

In [73]:
# Collapse the target so that heart disease presence is 0 or 1: every value
# above 1 is capped at 1, values already at or below 1 are left untouched.
cleveland["num"] = cleveland["num"].clip(upper=1)

In [74]:
# Double-check the binarisation: "num" should now only take the values 0 and 1
cleveland.num.value_counts()

0    160
1    137
Name: num, dtype: int64

In [48]:
# Cast the categorical columns to int before one-hot encoding so the
# generated dummy columns get tidy names, then expand each of them with
# pd.get_dummies.
cat_vars = ["sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal"]
cleveland = pd.get_dummies(
    cleveland.astype(dict.fromkeys(cat_vars, int)),
    columns=cat_vars,
)

In [49]:
# List the column names after one-hot encoding to confirm the dummy columns
cleveland.columns

Index(['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'num', 'sex_0',
       'sex_1', 'cp_1', 'cp_2', 'cp_3', 'cp_4', 'fbs_0', 'fbs_1', 'restecg_0',
       'restecg_1', 'restecg_2', 'exang_0', 'exang_1', 'slope_1', 'slope_2',
       'slope_3', 'ca_0', 'ca_1', 'ca_2', 'ca_3', 'thal_3', 'thal_6',
       'thal_7'],
      dtype='object')

# ChatGPT Help
This section is only an attempt to better understand how to use the `NearestNeighbors` class to build our own kNN classifier. The second block is what ChatGPT produced when I asked it to include `GridSearchCV`. It still doesn't solve the problem of feature selection, but that could likely be implemented easily by looping over different feature subsets, i.e. changing which columns make up `X_train`.

In [None]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the Cleveland dataset. The raw file marks missing values with "?", so
# parse those as NaN and drop the incomplete rows; otherwise the affected
# columns are read as strings and cannot be used in distance computations.
data = pd.read_csv("cleveland.csv", na_values="?").dropna()

# Preprocess the "num" column to be binary: 1 for any value above 0, else 0
data["num"] = (data["num"] > 0).astype(int)

# Split the data into training and testing sets (80/20, fixed seed so the
# split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["num"]), data["num"], test_size=0.2, random_state=42
)

# Define a function to compute precision, recall and F1 for a given value of k
def evaluate_k(k):
    """Score a majority-vote k-nearest-neighbours classifier on the test split.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    produced by the train/test split.

    Parameters
    ----------
    k : int
        Number of neighbours that vote on each test sample.

    Returns
    -------
    tuple
        ``(precision, recall, f1)`` of the predictions against ``y_test``.
    """
    # Train a NearestNeighbors model on the training data
    nn = NearestNeighbors(n_neighbors=k)
    nn.fit(X_train)

    # For every test sample, get the positions (within X_train) of its k
    # nearest training samples; the distances themselves are not needed.
    _, indices = nn.kneighbors(X_test)

    # `indices` is 2-D (one row of k positions per test sample). Look the
    # labels up through a plain NumPy array so the result keeps that 2-D
    # shape; indexing the pandas Series directly with a 2-D array does not
    # yield something that can be summed along axis 1.
    neighbor_labels = y_train.to_numpy()[indices]

    # Majority vote: predict 1 when more than half of the k neighbours are 1
    y_pred = (neighbor_labels.sum(axis=1) > (k / 2)).astype(int)

    # Compute precision, recall, and F1 score
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return precision, recall, f1

# Sweep the odd neighbourhood sizes 1, 3, 5, 7, 9 and print the three
# metrics returned by evaluate_k for each of them.
for k in range(1, 10, 2):
    precision, recall, f1 = evaluate_k(k)
    print(f"k={k}: precision={precision:.2f}, recall={recall:.2f}, f1={f1:.2f}")


In [None]:
# NearestNeighbors is an unsupervised neighbour lookup: it has no `weights`
# parameter and no `predict`, so it cannot be tuned with a classification
# score. KNeighborsClassifier is the supervised estimator that supports the
# parameters searched below.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import classification_report
import pandas as pd

# Load data. The raw file marks missing values with "?", so parse those as
# NaN and drop the incomplete rows to keep every feature numeric.
data = pd.read_csv("cleveland.csv", na_values="?").dropna()

# Create feature matrix and target vector. The target is binarised
# (1 for any value above 0) because scoring='f1' below expects a binary label.
X = data.drop(columns=["num"])
y = (data["num"] > 0).astype(int)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the range of values for k and the parameters to search
param_grid = {
    'n_neighbors': range(1, 20),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Define the cross-validation method
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Create the model
knn = KNeighborsClassifier()

# Use GridSearchCV to find the best parameters
grid_search = GridSearchCV(knn, param_grid=param_grid, cv=cv, scoring='f1')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Fit the model with the best parameters on the full training split
knn = KNeighborsClassifier(
    n_neighbors=best_params['n_neighbors'],
    weights=best_params['weights'],
    p=best_params['p'],
)
knn.fit(X_train, y_train)

# Get the predictions on the held-out test set
y_pred = knn.predict(X_test)

# Compute classification report
report = classification_report(y_test, y_pred)
print(report)
