In [4]:
from numba import njit
import pandas as pd
from pandas.core.interchange.dataframe_protocol import DataFrame
import numpy as np
import scipy.optimize._minimize as minimize

In [29]:
def bin_search(arr, func, goal, epsilon):
    """Binary-search ``arr`` for an element whose ``func`` value is within ``epsilon`` of ``goal``.

    The search compares ``func(arr[middle])`` against ``goal`` and discards the
    lower half when the value is too small, so ``func`` must be non-decreasing
    along ``arr`` for the result to be meaningful.

    Parameters:
    - arr: indexable sequence (anything supporting ``len`` and integer indexing).
    - func: callable applied to a single element of ``arr``.
    - goal: target value for ``func(arr[index])``.
    - epsilon: accepted absolute deviation from ``goal`` (inclusive).

    Returns:
    - int index ``m`` with ``goal - epsilon <= func(arr[m]) <= goal + epsilon``,
      or ``None`` when no probed element falls inside the tolerance band.
    """
    lower_bound = goal - epsilon
    upper_bound = goal + epsilon

    start, end = 0, len(arr) - 1
    # Stop as soon as the window is empty instead of iterating a fixed count.
    while start <= end:
        # Integer midpoint of the *current* window; the previous
        # ``(end - start) / 2`` was a float and ignored ``start``.
        middle = start + (end - start) // 2
        value = func(arr[middle])  # evaluate once per probe
        if lower_bound <= value <= upper_bound:
            return middle
        if value < lower_bound:
            start = middle + 1
        else:
            end = middle - 1
    return None

In [1]:
def find_sigma_i(x_i, x_is, p_i, k, A):
    """Evaluate the neighbour kernel sum of row ``x_i`` over a grid of sigmas.

    For each of 100 candidate sigmas evenly spaced on [0, 10] this computes

        sum_{j < k} exp(-(||A[x_i] - A[x_is[j]]|| - p_i) / sigma)

    Parameters:
    - x_i: index into ``A`` of the reference row.
    - x_is: indexable of row indices into ``A``; entries ``0 .. k-1`` are used.
    - p_i: offset subtracted from every distance before scaling by sigma.
    - k: number of entries of ``x_is`` to sum over.
    - A: indexable of rows supporting subtraction (e.g. a 2-D numpy array).

    Returns:
    - numpy array with one kernel sum per candidate sigma (the plain int ``0``
      when ``k`` is 0, because the sum is then empty).

    Notes:
    - The first candidate sigma is exactly 0, so that entry is produced by a
      division by zero.
    - ``log2(k)`` is computed as the target value but is not used yet; the
      function returns the sums themselves rather than a selected sigma.
    """
    candidate_sigmas = np.linspace(0, 10, 100)
    target = np.log2(k)  # computed but not consumed yet

    def kernel_sum(sigmas):
        # Accumulates left-to-right starting from the int 0.
        return sum(
            np.exp(-(np.linalg.norm(A[x_i] - A[x_is[j]]) - p_i) / sigmas)
            for j in range(k)
        )

    return kernel_sum(candidate_sigmas)
    

In [2]:
import numpy as np

def construct_knn(A: np.ndarray, k=3):
    """
    Constructs a k-NN matrix with distances.

    Neighbours are ranked by Euclidean distance, nearest first. Ties are broken
    by row index (stable sort), and a sample is never reported as its own
    neighbour, even when the data contains exact duplicates of it.

    Parameters:
    - A (np.ndarray): Input data of shape (n_samples, n_features).
    - k (int): Number of nearest neighbors to find; must satisfy
      0 <= k <= n_samples - 1 (for non-empty input).

    Returns:
    - knn_indices (np.ndarray): k-NN indices matrix of shape (n_samples, k).
    - knn_dist_min (np.ndarray): Array of shape (n_samples, 1) holding, for each
      sample, the smallest strictly positive distance to another sample
      (0.0 when no other sample lies at a positive distance).

    Raises:
    - ValueError: if k is negative or larger than n_samples - 1.
    """
    A = np.asarray(A)
    n_samples = A.shape[0]
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    if n_samples == 0:
        # Nothing to search; keep returning correctly shaped empty results.
        return np.zeros((0, k), dtype=int), np.zeros((0, 1), dtype=float)
    if k > n_samples - 1:
        raise ValueError(
            f"k={k} neighbors requested but each sample has only "
            f"{n_samples - 1} other samples"
        )

    knn_indices = np.zeros((n_samples, k), dtype=int)
    knn_dist_min = np.zeros((n_samples, 1), dtype=float)

    for i in range(n_samples):
        # Compute Euclidean distances from sample i to every sample
        distances = np.linalg.norm(A - A[i], axis=1)
        # Stable sort gives deterministic tie-breaking by row index.
        order = np.argsort(distances, kind="stable")
        # Drop self by index: with duplicate rows, self is not guaranteed to be
        # the first entry of the sorted order.
        order = order[order != i]
        knn_indices[i] = order[:k]

        # `order` is ascending, so the first positive entry is the minimum
        # nonzero distance; duplicates of sample i (distance 0) are skipped.
        neighbor_distances = distances[order]
        positive = neighbor_distances[neighbor_distances > 0]
        knn_dist_min[i] = positive[0] if positive.size else 0.0

    return knn_indices, knn_dist_min

# Example usage: build the k-NN structure for a small random dataset and print it.
# The names assigned below (A, knn_indices, knn_dist_min) are module-level, so
# they stay defined after this cell runs.
if __name__ == "__main__":
    np.random.seed(42)  # fixed seed so the random data is reproducible
    A = np.random.rand(10, 5)  # 10 samples, 5 features
    knn_indices, knn_dist_min = construct_knn(A, k=3)
    
    # One row per sample: its 3 neighbour indices, then its distance value.
    print("K-NN Indices:\n", knn_indices)
    print("\nMin Nonzero Distances:\n", knn_dist_min)


K-NN Indices:
 [[2 9 3]
 [3 4 9]
 [0 3 9]
 [9 4 1]
 [9 7 3]
 [9 4 7]
 [7 4 8]
 [4 9 5]
 [3 7 9]
 [5 4 3]]

Min Nonzero Distances:
 [[0.53446386]
 [0.61667636]
 [0.53446386]
 [0.50410142]
 [0.43618648]
 [0.22107662]
 [0.63683244]
 [0.45277956]
 [0.71387867]
 [0.22107662]]


In [7]:
def umap(data: pd.DataFrame):
    """Return the k-nearest-neighbour index matrix for the rows of ``data``.

    Parameters:
    - data (pd.DataFrame): input table; it is copied and converted to a numpy
      array before the neighbour search, so the caller's frame is not modified.

    Returns:
    - The first value returned by ``construct_knn`` (the neighbour index
      matrix), using that function's default ``k``. The second value
      (per-sample distances) is currently discarded.
    """
    points = data.copy().to_numpy()
    neighbour_indices, _unused_dist_min = construct_knn(points)
    return neighbour_indices

In [8]:
# Load the diabetes dataset from the working directory (not used in this cell).
df = pd.read_csv("diabetes.csv")

# Toy frame: three identical columns A, B, C holding the integers 1..16, i.e.
# 16 points on a straight line, handy for checking the neighbour ordering by eye.
data = pd.DataFrame({column: list(range(1, 17)) for column in ("A", "B", "C")})

print(umap(data))

[[ 1  2  3]
 [ 0  2  3]
 [ 1  3  0]
 [ 2  4  1]
 [ 3  5  2]
 [ 4  6  3]
 [ 5  7  4]
 [ 6  8  5]
 [ 7  9  6]
 [ 8 10  7]
 [ 9 11  8]
 [10 12  9]
 [11 13 10]
 [12 14 11]
 [13 15 12]
 [14 13 12]]


In [9]:
import numpy as np
from collections import Counter

In [10]:
def euclidean_distance(point1, point2):
    """Return the Euclidean (L2) distance between two points.

    Parameters:
    - point1, point2: array-likes of matching (or broadcastable) shape; each
      is converted with ``np.array`` before the computation.

    Returns:
    - The square root of the sum of squared coordinate differences.
    """
    first = np.array(point1)
    second = np.array(point2)
    squared_differences = (first - second) ** 2
    return np.sqrt(np.sum(squared_differences))
