In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

# Load the pile survey from the CSV file. If the file is missing, fall back to
# a small synthetic dataset so the rest of the analysis can still be run.
try:
    df = pd.read_csv('limestone_layers_phase_1.csv')
except FileNotFoundError:
    print("Error: The file 'limestone_layers_phase_1.csv' was not found.")
    # Make it explicit that everything printed afterwards is based on
    # made-up coordinates, not on the real survey.
    print("Falling back to synthetic demonstration data; the results below are NOT real measurements.")
    # Seeded generator: the demonstration coordinates (and therefore every
    # number derived from them) are identical from run to run.
    rng = np.random.default_rng(42)
    data = {
        # 1-based pile numbers, as in the survey output shown for this analysis.
        'pile_number': range(1, 41),
        'northing_coord_y': rng.uniform(2000, 2100, 40),
        'easting_coord_x': rng.uniform(7600, 7700, 40),
        # NaN marks a pile where the limestone layer was not measured.
        'limestone_thickness': [12, 24, 18, np.nan, np.nan, np.nan, np.nan, 15, 10, np.nan, 12, 14, np.nan, np.nan, 20, 15, 18, 16, np.nan, np.nan, 10, 12, np.nan, 15, 13, 22, np.nan, np.nan, 15, 10, 15, np.nan, np.nan, np.nan, np.nan, np.nan, 37, np.nan, np.nan, 15],
        'sounding_beg_limestone': [27, 19, 21, np.nan, np.nan, np.nan, np.nan, 35, 40, np.nan, 28, 26, np.nan, np.nan, 25, 30, 28, 29, np.nan, np.nan, 38, 35, np.nan, 32, 34, 24, np.nan, np.nan, 28, 42, 28, np.nan, np.nan, np.nan, np.nan, np.nan, 10, np.nan, np.nan, 30],
    }
    df = pd.DataFrame(data)

# --- Data Cleaning and Preparation ---
# The coordinates may be read as text containing commas, which must be removed
# before conversion to a numeric type.
# Test for "not numeric" rather than for the 'object' dtype: text columns can
# also carry a dedicated pandas string dtype, which an 'object' comparison
# would miss and leave unconverted.
for col in ['northing_coord_y', 'easting_coord_x']:
    if not pd.api.types.is_numeric_dtype(df[col]):
        # regex=False: the comma is a literal character, not a pattern.
        df[col] = df[col].str.replace(',', '', regex=False).astype(float)

# Partition the piles by whether both limestone properties were measured:
#   - df_known:   both values present (used as training data)
#   - df_unknown: at least one value missing (rows we want to predict)
target_cols = ['limestone_thickness', 'sounding_beg_limestone']
has_gap = df[target_cols].isnull().any(axis=1)
df_known = df[~has_gap]
df_unknown = df[has_gap]

# The models see only the pile coordinates as input features (X); the two
# limestone properties are the quantities to predict (y).
features = ['northing_coord_y', 'easting_coord_x']
X_known = df_known.loc[:, features]
X_unknown = df_unknown.loc[:, features]

y_thickness = df_known['limestone_thickness']
y_sounding = df_known['sounding_beg_limestone']


# --- Methodology: K-Nearest Neighbors (KNN) for Approximation ---
# Why KNN?
# Geotechnical properties, like soil layers, are often spatially correlated:
# piles close to each other are more likely to have similar limestone depths
# and thicknesses. KNN predicts the value at a new point as the average of its
# 'k' nearest neighbors, which makes it a simple and intuitive choice for this
# kind of spatial interpolation.

# KNN cannot look up more neighbors than there are training points, so cap k
# at the number of measured piles (3 is the preferred starting point).
n_known = len(X_known)
if n_known == 0:
    raise ValueError("No piles with measured limestone data are available to train on.")
n_neighbors = min(3, n_known)

# Two separate KNN models: one for thickness and one for the start of the
# limestone. The pipeline scales the features first, which is good practice
# for distance-based algorithms like KNN.
pipeline_thickness = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=n_neighbors)),
])

pipeline_sounding = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=n_neighbors)),
])

# Train the models on the piles with measured values
pipeline_thickness.fit(X_known, y_thickness)
pipeline_sounding.fit(X_known, y_sounding)

# Make predictions for the piles with missing values.
# scikit-learn rejects an input with zero rows, so when nothing is missing we
# skip the call and produce empty prediction arrays instead of crashing.
if len(X_unknown) > 0:
    predicted_thickness = pipeline_thickness.predict(X_unknown)
    predicted_sounding = pipeline_sounding.predict(X_unknown)
else:
    predicted_thickness = np.empty(0)
    predicted_sounding = np.empty(0)

# --- Populating the DataFrame with Approximations ---
# Fill in the missing values in a copy of the original dataframe.
# The predictions cover every row where *either* property is missing, so they
# are aligned to those rows by index and used only to fill cells that are
# actually NaN. A measured value is never replaced by an approximation.
thickness_estimates = pd.Series(np.round(predicted_thickness, 2), index=df_unknown.index)
sounding_estimates = pd.Series(np.round(predicted_sounding, 2), index=df_unknown.index)

df_results = df.copy()
df_results['limestone_thickness'] = df['limestone_thickness'].fillna(thickness_estimates)
df_results['sounding_beg_limestone'] = df['sounding_beg_limestone'].fillna(sounding_estimates)

# True for every row in which at least one of the two values is approximated.
df_results['Approximation_Flag'] = (
    df[['limestone_thickness', 'sounding_beg_limestone']].isnull().any(axis=1)
)


# --- Cross-Validation: Evaluating the Model's Performance ---
# How confident can we be in these approximations? Cross-validation gives us
# an answer: the model is trained on different subsets of the *known* data and
# scored on the held-out part.
# The scores are computed for the same KNN pipelines that produce the
# approximations, so the reported error describes those approximations rather
# than a different model. cross_val_score fits clones of the estimator it is
# given, so the already-fitted pipelines are left untouched.
# The metric is the Mean Absolute Error (MAE): on average, how far the
# predictions are from the measured values, in the units of the target column.

# Random Forest estimators are kept available for an optional comparison;
# they are not used for the scores reported below.
# (Tree-based models like Random Forest do not need feature scaling.)
model_rf_thickness = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf_sounding = RandomForestRegressor(n_estimators=100, random_state=42)

# Perform up to 5-fold cross-validation, using fewer folds when there are
# fewer known points. K-fold needs at least two samples.
n_known = len(df_known)
if n_known < 2:
    raise ValueError("At least two piles with measured limestone data are needed for cross-validation.")
cv_folds = min(5, n_known)

scores_thickness = cross_val_score(pipeline_thickness, X_known, y_thickness, cv=cv_folds, scoring='neg_mean_absolute_error')
scores_sounding = cross_val_score(pipeline_sounding, X_known, y_sounding, cv=cv_folds, scoring='neg_mean_absolute_error')

# --- Displaying the Results ---
print("--- Geotechnical Analysis Results ---")
print("\nTable of Approximated Limestone Properties:")
# Restrict the table to the columns that matter for the report.
display_cols = ['pile_number', 'northing_coord_y', 'easting_coord_x', 'limestone_thickness', 'sounding_beg_limestone', 'Approximation_Flag']
print(df_results.loc[:, display_cols].to_string())

print("\n--- Cross-Validation Results ---")

# The scorer returns negated MAE values, so flip the sign of the mean;
# the spread across folds is unaffected by the sign.
mae_thickness = -np.mean(scores_thickness)
spread_thickness = np.std(scores_thickness)
print(f"Limestone Thickness - Mean Absolute Error (MAE): {mae_thickness:.2f} meters")
print(f"Limestone Thickness - Standard Deviation of Error: {spread_thickness:.2f} meters")
print(f"\nThis means that, on average, our thickness predictions are likely to be off by about {mae_thickness:.2f} meters.")

mae_sounding = -np.mean(scores_sounding)
spread_sounding = np.std(scores_sounding)
print(f"\nSounding Beg Limestone - Mean Absolute Error (MAE): {mae_sounding:.2f} meters")
print(f"Sounding Beg Limestone - Standard Deviation of Error: {spread_sounding:.2f} meters")
print(f"\nThis means that, on average, our predictions for where the limestone begins are likely to be off by about {mae_sounding:.2f} meters.")

--- Geotechnical Analysis Results ---

Table of Approximated Limestone Properties:
    pile_number  northing_coord_y  easting_coord_x  limestone_thickness  sounding_beg_limestone  Approximation_Flag
0             1          2057.607         7623.010                12.00                   27.00               False
1             2          2054.639         7622.551                24.00                   19.00               False
2             3          2051.677         7622.093                18.00                   21.00               False
3             4          2046.884         7621.351                22.00                   20.33                True
4             5          2043.922         7620.893                22.00                   20.33                True
5             6          2040.954         7620.434                22.00                   20.33                True
6             7          2058.231         7627.906                22.67                   22.33          