In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import os

# --- Methodology: Inverse Distance Weighting (IDW) ---
# Why IDW?
# IDW is a very common and effective method for spatial interpolation.
# It assumes that points closer to the prediction location are more influential.
# The weight of each known point is the inverse of its distance to the power of 'p'.
# A common value for 'p' (the power parameter) is 2.
def inverse_distance_weighting(x, y, values, xi, yi, p=2):
    """
    Performs Inverse Distance Weighting (IDW) interpolation.
    x, y: Coordinates of the known points.
    values: Values at the known points.
    xi, yi: Coordinates of the point to predict.
    p: Power parameter.
    """
    distances = np.sqrt((x - xi)**2 + (y - yi)**2)

    # If a prediction point is exactly at a known point, return the known value
    if np.any(distances == 0):
        return values[np.where(distances == 0)[0][0]]

    weights = 1.0 / (distances**p)
    weighted_sum = np.sum(weights * values)
    sum_of_weights = np.sum(weights)

    return weighted_sum / sum_of_weights

# Load the dataset from the uploaded CSV file
try:
    df = pd.read_csv('limestone_layers_phase_1.csv')
except FileNotFoundError:
    print("Error: The file 'limestone_layers_phase_1.csv' was not found.")
    # As a fallback for demonstration, create a dummy dataframe
    data = {'pile_number': range(40),
            'northing_coord_y': np.random.uniform(2000, 2100, 40),
            'easting_coord_x': np.random.uniform(7600, 7700, 40),
            'limestone_thickness': [12, 24, 18, np.nan, np.nan, np.nan, np.nan, 15, 10, np.nan, 12, 14, np.nan, np.nan, 20, 15, 18, 16, np.nan, np.nan, 10, 12, np.nan, 15, 13, 22, np.nan, np.nan, 15, 10, 15, np.nan, np.nan, np.nan, np.nan, np.nan, 37, np.nan, np.nan, 15],
            'sounding_beg_limestone': [27, 19, 21, np.nan, np.nan, np.nan, np.nan, 35, 40, np.nan, 28, 26, np.nan, np.nan, 25, 30, 28, 29, np.nan, np.nan, 38, 35, np.nan, 32, 34, 24, np.nan, np.nan, 28, 42, 28, np.nan, np.nan, np.nan, np.nan, np.nan, 10, np.nan, np.nan, 30]}
    df = pd.DataFrame(data)

# --- Data Cleaning and Preparation ---
for col in ['northing_coord_y', 'easting_coord_x']:
    if df[col].dtype == 'object':
        df[col] = df[col].str.replace(',', '').astype(float)

df_known = df.dropna(subset=['limestone_thickness', 'sounding_beg_limestone'])
df_unknown = df[df['limestone_thickness'].isnull() | df['sounding_beg_limestone'].isnull()]

features = ['northing_coord_y', 'easting_coord_x']
X_known = df_known[features]
y_thickness = df_known['limestone_thickness']
y_sounding = df_known['sounding_beg_limestone']
X_unknown = df_unknown[features]

# --- Applying IDW for Approximation ---
predicted_thickness = []
predicted_sounding = []

# Known data points for the IDW function
x_known_coords = X_known['easting_coord_x'].values
y_known_coords = X_known['northing_coord_y'].values
thickness_values = y_thickness.values
sounding_values = y_sounding.values

# Loop through each unknown point and apply IDW
for index, row in X_unknown.iterrows():
    xi, yi = row['easting_coord_x'], row['northing_coord_y']
    pred_thick = inverse_distance_weighting(x_known_coords, y_known_coords, thickness_values, xi, yi)
    pred_sound = inverse_distance_weighting(x_known_coords, y_known_coords, sounding_values, xi, yi)
    predicted_thickness.append(pred_thick)
    predicted_sounding.append(pred_sound)

# --- Populating the DataFrame with Approximations ---
df_results = df.copy()
df_results.loc[df_unknown.index, 'limestone_thickness'] = np.round(predicted_thickness, 2)
df_results.loc[df_unknown.index, 'sounding_beg_limestone'] = np.round(predicted_sounding, 2)
df_results['Approximation_Flag'] = df['limestone_thickness'].isnull() | df['sounding_beg_limestone'].isnull()

# --- Cross-Validation (using Random Forest as before) ---
model_rf_thickness = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf_sounding = RandomForestRegressor(n_estimators=100, random_state=42)

cv_folds = min(5, len(df_known))
scores_thickness = cross_val_score(model_rf_thickness, X_known, y_thickness, cv=cv_folds, scoring='neg_mean_absolute_error')
scores_sounding = cross_val_score(model_rf_sounding, X_known, y_sounding, cv=cv_folds, scoring='neg_mean_absolute_error')

# --- Displaying the Results ---
print("--- Geotechnical Analysis Results (using IDW) ---")
print("\nTable of Approximated Limestone Properties:")
display_cols = ['pile_number', 'northing_coord_y', 'easting_coord_x', 'limestone_thickness', 'sounding_beg_limestone', 'Approximation_Flag']
print(df_results[display_cols].to_string())

print("\n--- Cross-Validation Results ---")
print(f"Limestone Thickness - Mean Absolute Error (MAE): {-np.mean(scores_thickness):.2f} meters")
print(f"Limestone Thickness - Standard Deviation of Error: {np.std(scores_thickness):.2f} meters")
print("\nThis means that, on average, our thickness predictions are likely to be off by about " f"{-np.mean(scores_thickness):.2f} meters.")

print(f"\nSounding Beg Limestone - Mean Absolute Error (MAE): {-np.mean(scores_sounding):.2f} meters")
print(f"Sounding Beg Limestone - Standard Deviation of Error: {np.std(scores_sounding):.2f} meters")
print("\nThis means that, on average, our predictions for where the limestone begins are likely to be off by about " f"{-np.mean(scores_sounding):.2f} meters.")

Error: The file 'limestone_layers_phase_1.csv' was not found.
--- Geotechnical Analysis Results (using IDW) ---

Table of Approximated Limestone Properties:
    pile_number  northing_coord_y  easting_coord_x  limestone_thickness  sounding_beg_limestone  Approximation_Flag
0             0       2048.031756      7655.015563                12.00                   27.00               False
1             1       2078.639994      7652.329711                24.00                   19.00               False
2             2       2006.097102      7653.697460                18.00                   21.00               False
3             3       2037.349979      7633.949227                15.64                   28.50                True
4             4       2083.971146      7614.388085                15.82                   28.61                True
5             5       2024.646293      7602.842254                15.32                   27.91                True
6             6       2099.8754

In [None]:
# --- Save results to CSV ---
import os

out_dir = "output"
os.makedirs(out_dir, exist_ok=True)

# Choose a non-conflicting filename
out_path = os.path.join(out_dir, "limestone_idw_notebook_output.csv")

# Save the full results DataFrame with predictions and flags
df_results.to_csv(out_path, index=False)

print(f"Saved results to: {out_path}")

Saved results to: output/limestone_idw_notebook_output.csv
