In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the borehole export; the later cells in this notebook all derive from this frame.
borehole_df = pd.read_csv(filepath_or_buffer='../data/boreholes_reuscs_sptn.csv')

In [18]:
# File name only (no directory) of the lithology training CSV; the export step
# prepends "../data/" when it writes the file.
output_csv_path = "lithology_training_points.csv"

In [16]:
# Column names of the borehole table, kept in one place so the processing cells
# never hard-code a header string.

# Keys that identify a layer within a borehole.
borehole_id_col, layer_num_col = "Borehole ID", "Layer #"

# Horizontal position of the borehole.
lat_col, lon_col = "Latitude", "Longitude"

# Vertical extent of a layer interval and its lithology label.
layer_top_elev_col = "Layer Top Elevation (ft)"
layer_bottom_elev_col = "Layer Bottom Elevation (ft)"
lithology_col = "Lithology"

# Name of the derived column added when the layer midpoints are computed.
midpoint_elev_col = "Midpoint Elevation (ft)"

In [12]:
# SPT-N point table: the coordinate, test-elevation and SPTN columns of borehole_df.
# The coordinate headers come from lat_col / lon_col (column-name config cell) so the
# names are defined in one place only. .copy() makes this an independent frame, so
# later edits to it cannot raise SettingWithCopyWarning or depend on borehole_df.
sptn_point_data = borehole_df[[lat_col, lon_col, 'Test Elevation (ft)', 'SPTN']].copy()

In [None]:
# Disabled export of sptn_point_data to CSV; uncomment the next line to write the file.
# sptn_point_data.to_csv('../data/sptn_point_data.csv', index=False)

In [17]:
# Remove rows that lack a layer top elevation, a layer bottom elevation, or a
# lithology label. The drop happens in place, so borehole_df itself shrinks.
required_layer_cols = [layer_top_elev_col, layer_bottom_elev_col, lithology_col]

initial_rows = len(borehole_df)
borehole_df.dropna(subset=required_layer_cols, inplace=True)
rows_after_nan_drop = len(borehole_df)

# Report only when something was actually removed.
if rows_after_nan_drop < initial_rows:
    print(f"Dropped {initial_rows - rows_after_nan_drop} rows with missing layer elevation or lithology data.")

Dropped 984 rows with missing layer elevation or lithology data.


In [19]:
# --- Build the lithology training points ---
# Rows that repeat a (borehole, layer number) pair collapse to their first occurrence,
# leaving one row per layer.
# NOTE(review): the spec note in the following cell keys a layer on its top and bottom
# elevations as well; confirm that 'Layer #' alone identifies an interval within a borehole.
print(f"Identifying unique layers per borehole using '{borehole_id_col}' and '{layer_num_col}'...")
layer_df = borehole_df.drop_duplicates(subset=[borehole_id_col, layer_num_col], keep='first').copy()
print(f"Shape after keeping unique layers: {layer_df.shape}")

# Represent each layer interval by the elevation halfway between its top and bottom.
print(f"Calculating midpoint elevation = ('{layer_top_elev_col}' + '{layer_bottom_elev_col}') / 2 ...")
layer_df[midpoint_elev_col] = (layer_df[layer_top_elev_col] + layer_df[layer_bottom_elev_col]) / 2

# A training point is just the location (lat, lon, midpoint elevation) plus the lithology label.
lithology_points_df = layer_df[[lat_col, lon_col, midpoint_elev_col, lithology_col]].copy()

# --- Save the Result ---
# output_csv_path holds only a file name; resolve the real destination once so the
# log line names the file that is actually written.
output_path = "../data/" + output_csv_path
print(f"Saving processed data to {output_path}...")
lithology_points_df.to_csv(output_path, index=False)
print("Processing complete.")
print(f"Generated Lithology training data shape: {lithology_points_df.shape}")
print(f"First 5 rows of the generated data:\n{lithology_points_df.head()}")

Identifying unique layers per borehole using 'Borehole ID' and 'Layer #'...
Shape after keeping unique layers: (9454, 14)
Calculating midpoint elevation = ('Layer Top Elevation (ft)' + 'Layer Bottom Elevation (ft)') / 2 ...
Saving processed data to lithology_training_points.csv...
Processing complete.
Generated Lithology training data shape: (9454, 4)
First 5 rows of the generated data:
    Latitude  Longitude  Midpoint Elevation (ft)   Lithology
0  47.568436 -122.40748                    76.55        Fill
1  47.568436 -122.40748                    73.05        Clay
5  47.568436 -122.40748                    69.55  Sandy silt
6  47.568436 -122.40748                    68.30        Sand
7  47.568436 -122.40748                    67.30        Sand


In [None]:
# Specification of the lithology training data produced by the preceding cell
"""
For each unique combination of Borehole ID, Layer #, Layer Top Elevation, and Layer Bottom Elevation:
    - Take the Latitude and Longitude of the borehole.
    - Calculate the Midpoint Elevation (ft) = (Layer Top Elevation (ft) + Layer Bottom Elevation (ft)) / 2.
    - Record the Lithology for that interval.
"""
