In [147]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

In [77]:
cleveland = pd.read_csv("cleveland.csv").replace('?', np.nan).astype(float)
cleveland.dropna(inplace=True)

In [78]:
cleveland.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0.0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2.0
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1.0
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0.0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0.0


In [79]:
cleveland.num.value_counts()

0.0    160
1.0     54
2.0     35
3.0     35
4.0     13
Name: num, dtype: int64

In [80]:
# use this to make our y column (heart disease presence) 0 or 1
cleveland["num"] = cleveland["num"].apply(lambda x: min(x, 1))

In [81]:
# double check it worked
cleveland.num.value_counts()

0.0    160
1.0    137
Name: num, dtype: int64

In [82]:
# make categorical variables ints to look better before one-hot encoding
cat_vars = ["sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal"]
cleveland[cat_vars] = cleveland[cat_vars].astype(int)
cleveland = pd.get_dummies(cleveland, columns=cat_vars)

In [83]:
cleveland.columns

Index(['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'num', 'sex_0',
       'sex_1', 'cp_1', 'cp_2', 'cp_3', 'cp_4', 'fbs_0', 'fbs_1', 'restecg_0',
       'restecg_1', 'restecg_2', 'exang_0', 'exang_1', 'slope_1', 'slope_2',
       'slope_3', 'ca_0', 'ca_1', 'ca_2', 'ca_3', 'thal_3', 'thal_6',
       'thal_7'],
      dtype='object')

In [84]:
X = cleveland.drop(columns="num")
y = cleveland.num

In [96]:
neigh = NearestNeighbors(n_neighbors=8)
neigh.fit(X)

In [100]:
neigh.kneighbors([X.iloc[1,].values])[1]



array([[  1,  37, 168, 145,  79, 196, 190, 173]])

In [197]:
class MyKNN(NearestNeighbors):
    def __init__(self, n_neighbors):
        super().__init__(n_neighbors=n_neighbors)
        
    # specific binary prediction using 0.5 as threshold assuming already fit
    def predict(self, X):
        _ , indices = self.kneighbors(X)
        y_preds = []

        # loop through however many observations there are to get preds for each one
        for i in range(len(X)):
            avg_neigh_class = sum(y.values[indices[i]]) / len(y.values[indices[i]])
            pred = int((avg_neigh_class).astype(int).mean() > 0.5)
            y_preds.append(pred)

        return y_preds

    # override fit method so that you can use the y labels for prediction
    def fit(self, X, y):
        self.y = y
        super().fit(X)



In [154]:
# standardizing X
scaler = StandardScaler()
scaler.fit(X)
std_X = scaler.transform(X)

1.2389213165610212e-17

In [179]:
# check out top 5 vars for cov and cor matrices
cov_top = abs(cleveland.cov().loc["num",]).sort_values(ascending=False).head(10)
display(cov_top)

corr_top = abs(cleveland.corr().loc["num",]).sort_values(ascending=False).head(10)
display(corr_top)


thalach     4.855094
chol        2.084550
trestbps    1.361407
age         1.026128
num         0.249340
oldpeak     0.246922
thal_3      0.130574
cp_4        0.126684
ca_0        0.119130
thal_7      0.118084
Name: num, dtype: float64

num        1.000000
thal_3     0.524972
cp_4       0.507035
thal_7     0.484657
ca_0       0.483530
oldpeak    0.424052
thalach    0.423817
exang_0    0.421355
exang_1    0.421355
slope_1    0.380612
Name: num, dtype: float64

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,thalach,chol,trestbps,age,num,oldpeak,thal_3,cp_4,ca_0,thal_7
1,num,thal_3,cp_4,thal_7,ca_0,oldpeak,thalach,exang_0,exang_1,slope_1


In [187]:
# look at what is in the top 10 for both cov and corr as predictors
top_preds = ["thalach", "oldpeak", "thal_3", "cp_4", "ca_0", "thal_7"]

In [213]:
# checking prediction using nearest neighbors with only top_preds

# subset df then scale
X = cleveland[top_preds]
y = cleveland["num"]

scaler = StandardScaler()
scaler.fit(X)
stdzd_X = scaler.transform(X)

test_knn = MyKNN(4)
test_knn.fit(stdzd_X, y)
display(test_knn.predict(stdzd_X[0:10]))
display(y[0:10].values.astype(int))

[0, 0, 1, 0, 0, 0, 1, 0, 1, 1]

  display(y[0:10].values.astype(int))


array([0, 1, 1, 0, 0, 0, 1, 0, 1, 1])

# AI Help
This is only to try and understand how to use the NearestNeighbors class better to create our own knn classifier. The second block is what chatGPT gave when I told it to include GridSearchCV, but this still doesn't seem to solve the problem of feature selection, but that could likely be easily implemented by looping through changing what "X_train" is.

In [89]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Define a function to compute the F1 score for a given value of k
def evaluate_k(k):
    # Train a NearestNeighbors model on the training data
    nn = NearestNeighbors(n_neighbors=k)
    nn.fit(X_train)

    # Use the model to predict the labels for the test data
    _, indices = nn.kneighbors(X_test)
    y_pred = (y_train.iloc[indices,].sum(axis=1) > (k / 2)).astype(int)

    # Compute precision, recall, and F1 score
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return precision, recall, f1

# Test different values of k and report the results
for k in [1, 3, 5, 7, 9]:
    precision, recall, f1 = evaluate_k(k)
    print(f"k={k}: precision={precision:.2f}, recall={recall:.2f}, f1={f1:.2f}")


ValueError: Wrong number of dimensions. values.ndim > ndim [2 > 1]

In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import classification_report
import pandas as pd

# Load data
data = pd.read_csv("cleveland.csv")

# Create feature matrix and target vector
X = data.drop(columns=["num"])
y = data["num"]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the range of values for k and the parameters to search
param_grid = {
    'n_neighbors': range(1, 20),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Define the cross-validation method
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Create the model
knn = NearestNeighbors()

# Use GridSearchCV to find the best parameters
grid_search = GridSearchCV(knn, param_grid=param_grid, cv=cv, scoring='f1')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Fit the model with the best parameters
knn = NearestNeighbors(n_neighbors=best_params['n_neighbors'], weights=best_params['weights'], p=best_params['p'])
knn.fit(X_train, y_train)

# Get the predictions on the test set
y_pred = knn.predict(X_test)

# Compute classification report
report = classification_report(y_test, y_pred)
print(report)


# Requirements
- Standardize data to prevent inequally-weighted distance because of scale
- Iterate over # of k neighbors to fit NearestNeighbors object on
- Iterate over all possible combos of features to fit NearestNeighbors object on
- for each model with the specific neighbor and parameter combo:
    - find y_pred
    - if f1 score of model is greater than best_f1, update best_k, best_cols
    - use GridSearchCV to implement this with cross validation?
        - would have to implement predict method and maybe more