In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import DistanceMetric
import warnings

In [14]:
# Import the datasets
pp = pd.read_csv('data/Project Plot Points.csv',delimiter=';')
cp = pd.read_csv('data/Control Plot Points.csv',delimiter=';')

In [None]:
# Drop X and Y (confidential info, can't publictlt shared)
pp.drop(['X', 'Y'], axis=1, inplace=True)
cp.drop(['X', 'Y'], axis=1, inplace=True)

In [25]:
print(pp.columns)
print(cp.columns)

Index(['PP ID', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022',
       '2023', '2024', 'Euclidean Distance'],
      dtype='object')
Index(['CP ID', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022',
       '2023', '2024', 'Euclidean Distance', 'Slope'],
      dtype='object')


In [26]:
display(cp.head(), pp.head())

Unnamed: 0,CP ID,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,Euclidean Distance,Slope
0,49830,0.1289,0.0695,0.1128,0.2049,0.2838,0.3574,0.4418,0.4829,0.519,0.5438,25031.207,0.0577
1,49829,0.0924,0.0762,0.1537,0.2143,0.2982,0.3917,0.4649,0.5011,0.5141,0.5161,25040.3633,0.0573
2,49749,0.0639,0.0683,0.1097,0.1513,0.1995,0.2807,0.3669,0.4154,0.4714,0.4905,24974.9961,0.054
3,49908,0.1097,0.0933,0.1106,0.2302,0.3026,0.3634,0.4214,0.4507,0.4752,0.4824,25096.7031,0.0507
4,49750,0.0555,0.0693,0.0872,0.1391,0.1888,0.2621,0.3436,0.3914,0.4375,0.4446,24966.0,0.0502


Unnamed: 0,PP ID,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,Euclidean Distance
0,18482,0.2823,0.2644,0.2645,0.2848,0.2947,0.2964,0.3194,0.306,0.3162,0.3134,6058.0869
1,39716,0.2429,0.2958,0.3218,0.3063,0.2996,0.3215,0.3507,0.3281,0.3195,0.2899,9544.7607
2,27158,0.3623,0.3631,0.3977,0.405,0.3964,0.4171,0.4505,0.4012,0.4051,0.407,5702.917
3,39583,0.2815,0.2844,0.3425,0.31,0.2451,0.3462,0.3781,0.3883,0.4212,0.4259,5409.5381
4,25567,0.3042,0.3017,0.3583,0.3687,0.3549,0.3888,0.3858,0.4036,0.3738,0.369,6769.5317


In [17]:
# Celan NaN in cp
cp.dropna(inplace=True)

In [None]:
def find_knn_for_labels(
    df_labeled: pd.DataFrame, 
    df_unlabeled: pd.DataFrame, 
    historical_si_cols: list[str], 
    k: int = 3, 
    id_col_labeled: str = 'PP ID', 
    id_col_unlabeled: str = 'CP ID', 
    metric: str = 'mahalanobis',
    use_scaler: bool = False # Parameter to control StandardScaler
) -> pd.DataFrame:
    """
    Finds the k-nearest neighbors for each labeled data point (project plot)
    from a pool of unlabeled data points (control plots) using a specified distance metric.

    This function is designed to support matching project plots with control plots
    based on historical Stocking Index (SI) values, as required by methodologies like VM0047.

    Args:
        df_labeled (pd.DataFrame): DataFrame containing project plots.
                                   Must include an ID column and historical SI columns.
        df_unlabeled (pd.DataFrame): DataFrame containing potential control plots.
                                     Must include an ID column and historical SI columns.
        historical_si_cols (list[str]): A list of column names representing the historical
                                        SI values (e.g., ['2015', '2016', ...]).
        k (int): The number of nearest neighbors to find for each labeled data point.
                 Defaults to 3.
        id_col_labeled (str): The name of the column containing unique IDs for labeled data.
                              Defaults to 'PP ID'.
        id_col_unlabeled (str): The name of the column containing unique IDs for unlabeled data.
                                Defaults to 'CP ID'.
        metric (str): The distance metric to use for finding neighbors. Supported options:
                      'euclidean', 'mahalanobis'. Defaults to 'mahalanobis'.
        use_scaler (bool): Whether to apply StandardScaler to the features. Defaults to True.
                           Set to False if your SI features are already well-scaled and normalized.

    Returns:
        pd.DataFrame: A DataFrame where each row represents a match between a project plot
                      (identified by 'Matched_PP_ID') and its nearest control plot(s),
                      including the calculated distance ('distance_metric_value').
                      Returns an empty DataFrame if no matches are found.

    Raises:
        KeyError: If a specified ID column or SI column is not found in the DataFrames.
        ValueError: If an unsupported distance metric is provided or if Mahalanobis metric
                    cannot be computed due to singular covariance matrix (and fails to recover).
    """
    # --- 1. Input Data Validation ---
    for col in historical_si_cols:
        if col not in df_labeled.columns:
            raise KeyError(f"Column '{col}' not found in labeled data (project plots).")
        if col not in df_unlabeled.columns:
            raise KeyError(f"Column '{col}' not found in unlabeled data (control plots).")
    if id_col_labeled not in df_labeled.columns:
        raise KeyError(f"ID column '{id_col_labeled}' not found in labeled data.")
    if id_col_unlabeled not in df_unlabeled.columns:
        raise KeyError(f"ID column '{id_col_unlabeled}' not found in unlabeled data.")
        
    if df_labeled.empty or df_unlabeled.empty:
        print("Warning: One or both input DataFrames are empty. Returning empty DataFrame.")
        return pd.DataFrame()

    # --- 2. Feature Extraction and Optional Scaling ---
    X_labeled = df_labeled[historical_si_cols].values
    X_unlabeled = df_unlabeled[historical_si_cols].values

    X_unlabeled_scaled = X_unlabeled # Default to original data
    X_labeled_scaled = X_labeled   # Default to original data
    
    if use_scaler:
        print("Applying StandardScaler...")
        scaler = StandardScaler()
        X_unlabeled_scaled = scaler.fit_transform(X_unlabeled)
        X_labeled_scaled = scaler.transform(X_labeled) # Use the same scaler
    else:
        print("Skipping StandardScaler as requested.")

    # --- 3. Initialize and Fit NearestNeighbors Model ---
    knn = None
    
    if metric == 'mahalanobis':
        try:
            # Calculate covariance matrix on the (potentially scaled) unlabeled data
            cov_matrix = np.cov(X_unlabeled_scaled, rowvar=False)
            
            if np.linalg.det(cov_matrix) == 0:
                print("Warning: Covariance matrix is singular. Adding regularization.")
                cov_matrix += np.eye(cov_matrix.shape[0]) * 1e-6 
            
            knn = NearestNeighbors(n_neighbors=k, algorithm='auto', metric='mahalanobis', metric_params={'VI': cov_matrix})
            print("Mahalanobis distance configured successfully.")
        except Exception as e:
            print(f"Error configuring Mahalanobis distance: {e}. Falling back to Euclidean distance.")
            metric = 'euclidean' # Fallback

    if metric != 'mahalanobis' or knn is None:
        if metric == 'mahalanobis': # This block is for the fallback case
            print("Using Euclidean distance as fallback.")
        else:
            print(f"Using specified metric: {metric}")
        knn = NearestNeighbors(n_neighbors=k, metric=metric)

    knn.fit(X_unlabeled_scaled)

    # --- 4. Find Neighbors for Labeled Data ---
    distances, indices = knn.kneighbors(X_labeled_scaled)

    # --- 5. Construct Results DataFrame ---
    results = []
    for i in range(len(df_labeled)):
        label_id = df_labeled.iloc[i][id_col_labeled]
        neighbor_indices = indices[i]
        
        nearest_neighbors_data = df_unlabeled.iloc[neighbor_indices].copy()
        
        nearest_neighbors_data['Matched_PP_ID'] = label_id
        nearest_neighbors_data['distance_metric_value'] = distances[i]
        
        results.append(nearest_neighbors_data)

    if not results:
        return pd.DataFrame()

    matched_df = pd.concat(results).reset_index(drop=True)
    
    return matched_df

In [49]:
# Finding 3 nearest neighbors for each labeled data point
knn_results = find_knn_for_labels(
    pp, 
    cp,
    historical_si_cols=['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022',
       '2023', '2024'], 
    id_col_labeled='PP ID',
    id_col_unlabeled='CP ID',
    k=1,
    metric='mahalanobis')

# Displaying the results
# for label, neighbors in knn_results.items():
#     print(f"Label: {label}")
#     print(neighbors)
#     print()

Skipping StandardScaler as requested.
Mahalanobis distance configured successfully.


In [50]:
knn_results

Unnamed: 0,CP ID,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,Euclidean Distance,Slope,Matched_PP_ID,distance_metric_value
0,196412,0.2793,0.2654,0.2686,0.288,0.2847,0.2997,0.311,0.3134,0.316,0.3153,19578.5879,0.006,18482.0,0.000303
1,87663,0.2606,0.2878,0.3165,0.3046,0.2961,0.3319,0.3387,0.3349,0.3072,0.3029,39842.6875,0.0045,39716.0,0.000618
2,387858,0.3707,0.3567,0.3895,0.3975,0.405,0.4231,0.4397,0.4257,0.4044,0.3966,9296.6475,0.0054,27158.0,0.000674
3,301937,0.2779,0.285,0.3043,0.2882,0.3013,0.3384,0.4005,0.3784,0.4489,0.4011,72657.4922,0.0182,39583.0,0.001658
4,106456,0.3149,0.2958,0.3696,0.357,0.3547,0.3765,0.4011,0.3894,0.3763,0.375,54961.6992,0.0082,25567.0,0.000552
5,169013,0.1947,0.206,0.2559,0.2939,0.2805,0.2828,0.2908,0.3056,0.2764,0.2633,4230.8125,0.0082,7052.0,0.000859
6,67085,0.2538,0.2915,0.3193,0.3214,0.3287,0.3399,0.3526,0.3424,0.3208,0.3035,75770.8203,0.0053,19943.0,0.000883
7,94520,0.2177,0.2455,0.2536,0.2502,0.2509,0.2627,0.2715,0.2641,0.2594,0.254,51506.707,0.0034,5324.0,0.000491
8,278719,0.343,0.359,0.3713,0.3658,0.3742,0.3783,0.362,0.3848,0.4011,0.3946,913.2623,0.005,41670.0,0.000682
9,310630,0.2949,0.2734,0.3126,0.3145,0.3509,0.3114,0.315,0.373,0.3484,0.347,75410.8125,0.0076,23086.0,0.000659
