<a href="https://colab.research.google.com/github/skovz99/Zack-Skovgaard/blob/main/Minimum_Neighbors_kNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore")

# Labelled training points: (X, Y, Classification)
data = [(13, 14, 1), (11, 8, 1), (4, 7, 0), (5, 8, 0), (10, 1, 1)]
data_df = pd.DataFrame.from_records(data, columns=["X", "Y", "Classification"])
length = data_df.shape[0]  # number of training points

# Labelled testing points, same layout as the training data
new_data = [(3, 4, 0), (11, 8, 1), (5, 17, 1), (15, 8, 1), (10, 11, 1)]
new_data_df = pd.DataFrame.from_records(new_data, columns=["X", "Y", "Classification"])

# Training labels as a numpy array, so they can be indexed by neighbor index
classification = data_df["Classification"].values

# Training coordinates as a list of [X, Y] pairs
zipped = data_df[["X", "Y"]].values.tolist()

# Query the training points against themselves, asking for as many neighbors
# as there are training points, so each row of `indices` lists training-point
# indices for one query point.
nbrs = NearestNeighbors(algorithm='ball_tree', n_neighbors=length)
nbrs.fit(zipped)
distances, indices = nbrs.kneighbors(zipped)

# Replace every neighbor index with that neighbor's class label
# (result has the same shape as `indices`)
nearest_neighbors_classification = classification.take(indices)

# One vote is computed per prefix length of the neighbor-label table
num_columns = nearest_neighbors_classification.shape[1]

# For each prefix length 1..num_columns take the row-wise mode of the labels.
# mode() may return several columns when labels tie; [0] keeps the first one.
majority_votes = [
    pd.DataFrame(nearest_neighbors_classification[:, :width]).mode(axis=1)[0]
    for width in range(1, num_columns + 1)
]

# Put the per-prefix votes side by side, one column per prefix length
majority_votes_df = pd.concat(majority_votes, axis=1)

# Label the columns 'Majority Vote 0' .. 'Majority Vote <num_columns - 1>';
# column k therefore holds the vote over the first k + 1 neighbor labels.
new_columns = ['Majority Vote ' + str(k) for k in range(num_columns)]
majority_votes_df.columns = new_columns

# Discard the vote taken over the first neighbor label only.
# NOTE(review): presumably that first neighbor is the point itself, which
# would mean every remaining vote still includes the point's own label -- confirm.
majority_votes_df = majority_votes_df.drop(columns=['Majority Vote 0'])

# For every training point, find the first 'Majority Vote k' column whose vote
# equals the point's true label and record k; -1 marks "no such column".
minimum_nbrs = []
for index, row in majority_votes_df.iterrows():
    true_label = data_df.at[index, 'Classification']
    # The last vote column is left out of the search.
    # NOTE(review): reason for excluding it is not evident from the code -- confirm.
    hits = row.iloc[:-1] == true_label
    if not hits.any():
        minimum_nbrs.append(-1)
        continue
    # Label of the first matching column, e.g. 'Majority Vote 2'
    first_correct_idx = hits[hits].index[0]
    # The trailing number of the column label is the neighbor count
    minimum_nbrs.append(int(first_correct_idx.split()[-1]))

# Store the per-point neighbor counts next to the training data
data_df['Minimum Neighbors'] = minimum_nbrs
minimum_for_testing = data_df['Minimum Neighbors'].to_numpy()

# Build one DataFrame per testing point: the training data with that single
# testing point appended as the last row.
resulting_dfs = [
    pd.concat(
        [data_df, pd.DataFrame([pair], columns=["X", "Y", "Classification"])],
        ignore_index=True,
    )
    for pair in new_data
]

# For each of those DataFrames, find the two nearest neighbors of every row
# (the model is refitted per testing point because each frame differs by its
# last row).
ultra = []
for new_df in resulting_dfs:
    zipped_testing = [list(i) for i in zip(new_df['X'], new_df['Y'])]
    nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(zipped_testing)
    distances, indices = nbrs.kneighbors(zipped_testing)
    ultra.append(pd.DataFrame(indices))

# Give each testing point the 'Minimum Neighbors' value of its nearest
# *training* point.
minimum_neighbors_test = []
for ult in ultra:
    # Index-aligned assignment: the appended testing row has no training value
    # and therefore gets NaN in this column.
    ult['Minimum Neighbors'] = data_df['Minimum Neighbors']
    test_row = ult.index[-1]
    first, second = int(ult.iloc[-1, 0]), int(ult.iloc[-1, 1])
    # The testing row is normally its own first neighbor (distance 0), so the
    # second entry is the nearest training point. When the testing point has
    # the same coordinates as a training point both are at distance 0 and the
    # order is not guaranteed, so skip whichever entry is the testing row
    # itself instead of assuming it sits in column 0 (picking it would yield
    # NaN).
    finding = first if first != test_row else second
    minimum_neighbors_test.append(ult['Minimum Neighbors'][finding])

new_data_df['Min Neighbors'] = minimum_neighbors_test

# Extract the features (X, Y) and labels (Classification) from the training data
X_train = data_df[['X', 'Y']]
y_train = data_df['Classification']

# Classify each testing point with a kNN classifier that uses the point's own
# neighbor count from the 'Min Neighbors' column.
# NOTE(review): values below 1 (such as the -1 "not found" marker this script
# assigns to training points) or NaN are not handled here and will raise.
fitted_by_k = {}  # one fitted classifier per distinct neighbor count
predictions = []
for index, test_point in new_data_df.iterrows():
    # One-row DataFrame carrying the same column names used in fit(), so the
    # feature names seen at predict time match those seen at fit time.
    X_test = test_point[['X', 'Y']].to_frame().T
    k_neighbors = int(test_point['Min Neighbors'])
    # The training data never changes, so a classifier fitted for a given k
    # can be reused for every testing point that needs the same k.
    if k_neighbors not in fitted_by_k:
        fitted_by_k[k_neighbors] = KNeighborsClassifier(n_neighbors=k_neighbors).fit(X_train, y_train)
    knn = fitted_by_k[k_neighbors]
    predicted_class = knn.predict(X_test)[0]
    predictions.append(predicted_class)

new_data_df['Predictions'] = predictions

# Compare the true labels of the testing set with the predicted labels and
# derive per-row indicator columns plus percentage rates.
actual = new_data_df['Classification']
predicted = new_data_df['Predictions']
is_positive = actual.eq(1)
is_negative = actual.eq(0)

# 1 where the prediction matches the true label, 0 otherwise
new_data_df['Accuracy'] = np.where(actual.eq(predicted), 1, 0)

# Prediction for rows whose true label is 1, NaN elsewhere; the NaN-ignoring
# mean of that column, times 100, is stored as True_P
new_data_df['True Positive Logic'] = np.where(is_positive, predicted, np.nan)
True_P = np.nanmean(new_data_df['True Positive Logic']) * 100

# Prediction for rows whose true label is 0, NaN elsewhere; True_N is the
# percentage of true-0 rows that were predicted 0
new_data_df['True Negative Logic'] = np.where(is_negative, predicted, np.nan)
True_N = (new_data_df['True Negative Logic'].eq(0).sum() / is_negative.sum()) * 100

# 1 where a true-1 row was predicted 0, NaN elsewhere; False_N is the
# percentage of true-1 rows that were predicted 0
new_data_df['False Negative Logic'] = np.where(is_positive & predicted.eq(0), 1, np.nan)
False_N = (new_data_df['False Negative Logic'].eq(1).sum() / is_positive.sum()) * 100

# 1 where a true-0 row was predicted 1, NaN elsewhere; False_P is the
# percentage of true-0 rows that were predicted 1
new_data_df['False Positive Logic'] = np.where(is_negative & predicted.eq(1), 1, np.nan)
False_P = (new_data_df['False Positive Logic'].eq(1).sum() / is_negative.sum()) * 100

# Bare expression: shows the resulting table when run as a notebook cell
new_data_df