In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [51]:
# Load the grouped-lithology training table from the repository's data folder.
# NOTE(review): the saved output of this cell echoes the read_csv line, which
# looks like pandas' mixed-type DtypeWarning -- confirm on re-run.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so mixed-type columns get one consistent dtype.
lith_data = pd.read_csv(
    '../data/full_grouped_lithology_training_data.csv',
    low_memory=False,
)

  lith_data = pd.read_csv('../data/full_grouped_lithology_training_data.csv')


In [52]:
# --- Column names used when reading the lithology table ---
lat_col, lon_col = 'LATITUDE', 'LONGITUDE'
collar_elev_col = 'ELEVATION_FT'
top_depth_col, bottom_depth_col = 'TOP_DEPTH_FT', 'BOTTOM_DEPTH_FT'
lithology_col, grouped_lithology_col = 'SYMBOL_LITHOLOGY', 'Grouped_Lithology'

# Midpoint columns (presumably derived from the top/bottom depths and collar
# elevation upstream -- TODO confirm against the data-preparation step).
midpoint_depth_col, midpoint_elev_col = 'Midpoint_Depth_ft', 'Midpoint_Elevation_ft'

# --- Model inputs and label ---
feature_cols = [
    lat_col,
    lon_col,
    midpoint_depth_col,
    midpoint_elev_col,
]
target_col = grouped_lithology_col

# --- Train/test split settings ---
test_size_proportion, random_seed = 0.25, 42

In [53]:
# Fail fast with a clear error when there are no rows to split.
if len(lith_data) == 0:
    raise ValueError("No data remaining after cleaning and removing rare classes. Cannot perform split.")

# --- Separate features (X) and target (y) ---
X = lith_data[feature_cols]
# Select the label through the configured `target_col` (not the raw column-name
# variable) so this split always uses the same target as the rest of the
# notebook, even if `target_col` is pointed at a different column.
y = lith_data[target_col]

print(f"\nFeatures shape (X): {X.shape}")
print(f"Target shape (y): {y.shape}")
print(f"Target variable unique values after cleaning: {len(y.unique())} unique lithology types.")


# --- Perform the train-test split ---
print(f"\nSplitting data into training ({1 - test_size_proportion:.0%}) and testing ({test_size_proportion:.0%})...")

# stratify=y asks scikit-learn to preserve the class proportions of y in both
# subsets; random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size_proportion,
    random_state=random_seed,
    stratify=y,
)

# Sanity check: every row must land in exactly one of the two subsets.
assert len(X_train) + len(X_test) == len(X), "train/test sizes do not add up to the full data set"
assert len(X_train) == len(y_train) and len(X_test) == len(y_test), "features and targets are misaligned"

print("Data split complete.")
print(f"Training features shape (X_train): {X_train.shape}")
print(f"Testing features shape (X_test): {X_test.shape}")
print(f"Training target shape (y_train): {y_train.shape}")
print(f"Testing target shape (y_test): {y_test.shape}")

# Compare the relative class frequencies of the two subsets side by side.
print("\nClass distribution comparison (Training vs. Testing):")
train_dist = y_train.value_counts(normalize=True)
test_dist = y_test.value_counts(normalize=True)
dist_comparison = pd.DataFrame({'Train': train_dist, 'Test': test_dist})
print(dist_comparison.head())  # Only the first five rows of the comparison are shown


Features shape (X): (319114, 4)
Target shape (y): (319114,)
Target variable unique values after cleaning: 9 unique lithology types.

Splitting data into training (75%) and testing (25%)...
Data split complete.
Training features shape (X_train): (239335, 4)
Testing features shape (X_test): (79779, 4)
Training target shape (y_train): (239335,)
Testing target shape (y_test): (79779,)

Class distribution comparison (Training vs. Testing):
                         Train      Test
Grouped_Lithology                       
Sand (with Fines)     0.535024  0.535028
Silty Soils           0.204855  0.204853
Topsoil / vegetation  0.069033  0.069028
Clayey Soils          0.058241  0.058236
Gravel (with Fines)   0.057522  0.057521


In [54]:
# Rebuild the full feature matrix and label vector from the loaded table,
# using the configured feature and target column names.
X, y = lith_data[feature_cols], lith_data[target_col]

In [55]:
# Summarise the full data set: dimensions of X and y, plus every distinct label.
summary_lines = (
    f"\nFeatures shape (X): {X.shape}",
    f"Target shape (y): {y.shape}",
    f"Target variable unique values: {y.unique()}",
)
for summary_line in summary_lines:
    print(summary_line)


Features shape (X): (319114, 4)
Target shape (y): (319114,)
Target variable unique values: ['Anthropogenic Fill/Debris' 'Silty Soils' 'Topsoil / vegetation'
 'Sand (with Fines)' 'Clayey Soils' 'Gravel (with Fines)'
 'Asphalt / concrete' 'Peat' 'Bedrock']


In [56]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time

In [57]:
# Random-forest classifier with 150 trees, seeded from the shared
# `random_seed`. class_weight='balanced' and n_jobs=-1 are passed through to
# scikit-learn unchanged.
model = RandomForestClassifier(
    n_estimators=150,
    class_weight='balanced',
    n_jobs=-1,
    random_state=random_seed,
)

In [58]:
print("\nStarting Lithology Classification Model Training (Random Forest)...")
# Measure the fit duration with perf_counter(): it is a monotonic,
# high-resolution clock meant for elapsed-time measurement, whereas time.time()
# follows the wall clock and can jump if the system time is adjusted mid-fit.
# Note: start_time / end_time are therefore relative readings, not timestamps.
start_time = time.perf_counter()
model.fit(X_train, y_train)
end_time = time.perf_counter()
print(f"Training complete in {end_time - start_time:.2f} seconds.")


Starting Lithology Classification Model Training (Random Forest)...
Training complete in 24.50 seconds.


In [59]:
# Predict a class label for every row of the held-out test features.
print("\nMaking predictions on the testing data...")

y_pred = model.predict(X_test)

print("Predictions complete.")


Making predictions on the testing data...
Predictions complete.


In [60]:
# Overall accuracy of the predictions against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy on the test set: {accuracy:.4f}")

Accuracy on the test set: 0.6682


In [61]:
# Per-class precision / recall / F1 table for the test-set predictions.
print("\nClassification Report:")
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)


Classification Report:
                           precision    recall  f1-score   support

Anthropogenic Fill/Debris       0.58      0.27      0.37      1559
       Asphalt / concrete       0.76      0.76      0.76      2325
                  Bedrock       0.70      0.59      0.64       806
             Clayey Soils       0.52      0.34      0.41      4646
      Gravel (with Fines)       0.54      0.31      0.40      4589
                     Peat       0.49      0.27      0.35      1320
        Sand (with Fines)       0.71      0.85      0.77     42684
              Silty Soils       0.54      0.42      0.47     16343
     Topsoil / vegetation       0.75      0.76      0.76      5507

                 accuracy                           0.67     79779
                macro avg       0.62      0.51      0.55     79779
             weighted avg       0.65      0.67      0.65     79779



In [62]:
# Raw confusion-matrix counts for the test-set predictions (no explicit label
# order is passed, so scikit-learn's default ordering applies).
print("\nConfusion Matrix:")
confusion_counts = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confusion_counts)


Confusion Matrix:
[[  421    38     9    20    36     3   840   129    63]
 [   11  1766     0     0    86     0   168    29   265]
 [    3     0   476    19    12     1   209    83     3]
 [   17     2    15  1589    43    25  1860  1073    22]
 [   25   152    24    52  1444    11  2362   371   148]
 [    7     1     0    31    27   352   690   181    31]
 [  148   120   108   724   727   173 36219  3876   589]
 [   67    31    51   640   205   136  8125  6831   257]
 [   24   226     0    10    75    12   800   153  4207]]


In [63]:
print("\nFeature Importances:")
feature_importances = pd.Series(model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(feature_importances)


Feature Importances:
LONGITUDE                0.264755
LATITUDE                 0.261316
Midpoint_Depth_ft        0.243953
Midpoint_Elevation_ft    0.229976
dtype: float64
