In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from utils import transform_crs

In [2]:
PREDICTION_INCREMENT_FT = 2.5
MIN_THICKNESS_FOR_INTERNAL_POINTS_FT = PREDICTION_INCREMENT_FT

In [3]:
borehole_df = pd.read_csv('../../Documents/borehole_data/subsurface_layer_data_joined.csv')

  borehole_df = pd.read_csv('../../Documents/borehole_data/subsurface_layer_data_joined.csv')


In [4]:
lat_col = 'EPSG6599_LATITUDE'
long_col = 'EPSG6599_LONGITUDE'
depth_col = 'DEPTH_FT'
lithology_col = 'SYMBOL_LITHOLOGY'
grouped_lithology_col = 'Grouped_Lithology'

feature_cols = [lat_col, long_col, depth_col]
target_col = grouped_lithology_col

In [5]:
initial_rows = borehole_df.shape[0]
borehole_df.dropna(subset=['LATITUDE', 'LONGITUDE', 'TOP_DEPTH_FT', 'BOTTOM_DEPTH_FT', lithology_col], inplace=True)
rows_after_nan_drop = borehole_df.shape[0]
if initial_rows > rows_after_nan_drop:
        print(f"Dropped {initial_rows - rows_after_nan_drop} rows with missing essential data.")

Dropped 70801 rows with missing essential data.


In [6]:
output_csv_path = '../data/boundary_2p5_interval_ungrouped_lithology_dem_training_data.csv'

In [7]:
crs_projected_df = transform_crs(
    borehole_df,
    source_crs='EPSG:4326',
    target_crs='EPSG:6599',
    )


In [8]:
processed_training_data = []
processed_data_columns = ['EPSG6599_LATITUDE', 'EPSG6599_LONGITUDE', 'DEPTH_FT', 'SYMBOL_LITHOLOGY']
for _, original_row in crs_projected_df.iterrows():
    new_rows = []
    top_depth_ft = original_row['TOP_DEPTH_FT']
    bottom_depth_ft = original_row['BOTTOM_DEPTH_FT']
    top_depth_row = [
        original_row['EPSG6599_LATITUDE'],
        original_row['EPSG6599_LONGITUDE'],
        original_row['TOP_DEPTH_FT'], 
        original_row['SYMBOL_LITHOLOGY']
        ]
    bottom_depth_row = [
        original_row['EPSG6599_LATITUDE'],
        original_row['EPSG6599_LONGITUDE'],
        original_row['BOTTOM_DEPTH_FT'],
        original_row['SYMBOL_LITHOLOGY']
    ]
    new_rows.append(top_depth_row)
    new_rows.append(bottom_depth_row)

    layer_depth_ft = bottom_depth_ft - top_depth_ft
    if layer_depth_ft >= MIN_THICKNESS_FOR_INTERNAL_POINTS_FT:
        for ft in np.arange(top_depth_ft, bottom_depth_ft, PREDICTION_INCREMENT_FT):
            layer_row = [
                original_row['EPSG6599_LATITUDE'],
                original_row['EPSG6599_LONGITUDE'],
                ft,
                original_row['SYMBOL_LITHOLOGY']
            ]
            new_rows.append(layer_row)
    else:
        midpoint_depth_ft = (top_depth_ft + bottom_depth_ft) / 2
        midpoint_depth_row = [
            original_row['EPSG6599_LATITUDE'],
            original_row['EPSG6599_LONGITUDE'],
            midpoint_depth_ft,
            original_row['SYMBOL_LITHOLOGY']
        ]
        new_rows
    processed_training_data.extend(new_rows)

In [9]:
training_df = pd.DataFrame(processed_training_data, columns=processed_data_columns)

In [10]:
min_samples_for_stratify = 2
test_size_proportion = 0.25
random_seed = 42

In [11]:
print("\nChecking class counts before filtering...")
class_counts = training_df[lithology_col].value_counts()
print("Original Class Distribution (Top 20):")
print(class_counts.head(20)) # Print top common classes

classes_to_remove = class_counts[class_counts < min_samples_for_stratify].index.tolist()


Checking class counts before filtering...
Original Class Distribution (Top 20):
SYMBOL_LITHOLOGY
Silty sand              523796
Sand                    219004
Sandy silt              124880
Silt                     92440
Gravelly sand            77546
Clayey silt              57867
Silty clay               56574
Topsoil / vegetation     45314
Clay                     38239
Gravel                   35948
Undefined                26443
Silty gravel             26412
Peat                     22274
Fill                     21751
Clayey sand              19030
Asphalt / concrete       18868
Sedimentary bedrock      13300
Gravelly silt             9850
Sandy clay                9622
Sandy gravel              8534
Name: count, dtype: int64


In [12]:
if classes_to_remove:
    print(f"\nFound classes with less than {min_samples_for_stratify} members: {classes_to_remove}. These will be removed for stratification.")
    df_filtered = training_df[~training_df[lithology_col].isin(classes_to_remove)].copy()
    print(f"Data shape after removing rare classes: {df_filtered.shape}")
    print("Updated Class Distribution (Top 20):")
    print(df_filtered[lithology_col].value_counts().head(20))
else:
    print("\nNo classes with less than 2 members found. Proceeding with split.")
    df_filtered = training_df.copy()


No classes with less than 2 members found. Proceeding with split.


In [15]:
dem_df = pd.read_csv('../../Documents/borehole_data/borehole_dem_features_data.csv')

In [None]:
dem_df = dem_df.drop(['BOREHOLE_ID', 'LONGITUDE', 'LATITUDE'], axis=1)

Index(['LONGITUDE', 'LATITUDE', 'BOREHOLE_ID', 'EPSG6599_LATITUDE',
       'EPSG6599_LONGITUDE', 'DEM_Elevation_Feet', 'DEM_Slope_Degrees',
       'DEM_Aspect_Degrees'],
      dtype='object')

In [17]:
final_training_df = pd.merge(df_filtered, dem_df, how='left', on=['EPSG6599_LATITUDE', 'EPSG6599_LONGITUDE'])

In [None]:
initial_rows = final_training_df.shape[0]
final_training_df.dropna(inplace=True)
rows_after_nan_drop = final_training_df.shape[0]
if initial_rows > rows_after_nan_drop:
        print(f"Dropped {initial_rows - rows_after_nan_drop} rows with missing essential data.")

Dropped 347209 rows with missing essential data.


In [20]:
final_training_df.to_csv(output_csv_path, index=False)

In [21]:
lithology_group_map = {
    '(Metamorphic bedrock)': 'Bedrock',
    'Plutonic bedrock': 'Bedrock',
    'Sedimentary bedrock': 'Bedrock',
    'Volcanic bedrock': 'Bedrock',
    'Undifferentiated rock': 'Bedrock',

    'Asphalt / concrete': 'Anthropogenic',
    'Debris': 'Anthropogenic',
    'Fill': 'Anthropogenic',
    'Topsoil / vegetation': 'Anthropogenic', # Or maybe 'Near-Surface' depending on context

    'Peat': 'Peat', # Often kept separate due to unique properties

    'Clay': 'Clayey Soils',
    'Silty clay': 'Clayey Soils',
    'Sandy clay': 'Clayey Soils',
    'Gravelly clay': 'Clayey Soils',

    'Silt': 'Silty Soils',
    'Clayey silt': 'Silty Soils',
    'Sandy silt': 'Silty Soils',
    'Gravelly silt': 'Silty Soils',

    'Sand': 'Sand (with Fines)', # Naming implies potential fines, but sand is dominant
    'Silty sand': 'Sand (with Fines)',
    'Clayey sand': 'Sand (with Fines)',

    'Gravel': 'Gravel (with Fines)', # Naming implies potential fines, but gravel is dominant
    'Sandy gravel': 'Gravel (with Fines)',
    'Silty gravel': 'Gravel (with Fines)',
    'Clayey gravel': 'Gravel (with Fines)',
    'Cobbles / boulders': 'Gravel (with Fines)', # Or potentially a separate 'Coarse Aggregate' group

    # 'Gravelly sand': 'Mixed Sand & Gravel', # Keeping this potentially separate as a common mix

    'Undefined': 'Undefined/Remove', # Mark for removal
    'Volcanic ash': 'Undefined/Remove' # Mark for removal due to rarity
}

In [22]:
final_training_df[grouped_lithology_col] = final_training_df[lithology_col].map(lithology_group_map)

In [23]:
final_training_df = final_training_df[final_training_df[grouped_lithology_col] != 'Undefined/Remove'].copy()

In [24]:
print("Checking grouped class counts before filtering...")
grouped_class_counts = final_training_df[grouped_lithology_col].value_counts()
print("Grouped Class Distribution:")
print(grouped_class_counts)

Checking grouped class counts before filtering...
Grouped Class Distribution:
Grouped_Lithology
Sand (with Fines)      586810
Silty Soils            221088
Clayey Soils            81608
Anthropogenic           67969
Gravel (with Fines)     53054
Peat                    16942
Bedrock                 12879
Name: count, dtype: int64


In [28]:
final_training_df.to_csv('../data/boundary_2p5_interval_grouped_lithology_dem_training_data.csv', index=False)

In [None]:
surface_slope_col = 'Surface_Slope' # Name of the new column from DEM features file
surface_aspect_col = 'Surface_Aspect' # Name of the new column from DEM features file

# Define features for the model (updated to include DEM features)
# feature_cols = [lat_col, lon_col, midpoint_depth_col, midpoint_elev_col, surface_slope_col, surface_aspect_col]

# New target variable name (grouped lithology)
grouped_lithology_col = 'Grouped_Lithology'

# Define the proportion of data to use for testing
test_size_proportion = 0.25

# Define a random state for reproducibility
random_seed = 42

# Minimum samples required for a grouped class to be included in training/testing with stratification
min_samples_for_stratify = 10 # Increased minimum threshold as we have more data

    # --- Separate features (X) and target (y) on the filtered data ---
    X = df_filtered[feature_cols]
    y = df_filtered[grouped_lithology_col]

    print(f"\nFeatures shape (X): {X.shape}")
    print(f"Target shape (y): {y.shape}")
    print(f"Target variable unique values after cleaning: {y.unique().shape[0]} unique grouped lithology types.")


    # --- Perform the train-test split ---
    print(f"\nSplitting data into training ({1 - test_size_proportion:.0%}) and testing ({test_size_proportion:.0%})...")

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size_proportion,
        random_state=random_seed,
        stratify=y # Stratify should now work on grouped lithologies
    )

    print("Data split complete.")
    print(f"Training features shape (X_train): {X_train.shape}")
    print(f"Testing features shape (X_test): {X_test.shape}")
    print(f"Training target shape (y_train): {y_train.shape}")
    print(f"Testing target shape (y_test): {y_test.shape}")

    # Check class distribution in train/test sets
    print("\nGrouped Class distribution comparison (Training vs. Testing):")
    train_dist = y_train.value_counts(normalize=True)
    test_dist = y_test.value_counts(normalize=True)
    dist_comparison = pd.DataFrame({'Train': train_dist, 'Test': test_dist})
    print(dist_comparison)


    # --- Proceed to Model Training with X_train, X_test, y_train, y_test ---
    # The next code block will use these variables.

except FileNotFoundError as e:
    print(f"Error: Input file not found - {e}. Please check file paths.")
except KeyError as e:
    print(f"Error: Column '{e}' not found in one of the dataframes. Please check column names in configuration and input files.")
except ValueError as e:
    print(f"Data or Configuration Error: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

In [None]:
if df_filtered.shape[0] == 0:
        raise ValueError("No data remaining after cleaning and removing rare classes. Cannot perform split.")

# --- Separate features (X) and target (y) on the filtered data ---
X = df_filtered[feature_cols]
y = df_filtered[lithology_col]

print(f"\nFeatures shape (X): {X.shape}")
print(f"Target shape (y): {y.shape}")
print(f"Target variable unique values after cleaning: {y.unique().shape[0]} unique lithology types.")


# --- Perform the train-test split ---
print(f"\nSplitting data into training ({1 - test_size_proportion:.0%}) and testing ({test_size_proportion:.0%})...")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size_proportion,
    random_state=random_seed,
    stratify=y # Stratify should now work
)

print("Data split complete.")
print(f"Training features shape (X_train): {X_train.shape}")
print(f"Testing features shape (X_test): {X_test.shape}")
print(f"Training target shape (y_train): {y_train.shape}")
print(f"Testing target shape (y_test): {y_test.shape}")

# Check class distribution in train/test sets
print("\nClass distribution comparison (Training vs. Testing):")
train_dist = y_train.value_counts(normalize=True)
test_dist = y_test.value_counts(normalize=True)
dist_comparison = pd.DataFrame({'Train': train_dist, 'Test': test_dist})
print(dist_comparison.head()) # Print comparison for top classes