In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [6]:
lith_data = pd.read_csv('../data/lithology_training_points.csv')

In [13]:
feature_cols = ['Latitude', 'Longitude', 'Midpoint Elevation (ft)']
target_col = 'Lithology'
test_size_proportion = 0.25
random_seed = 42

In [8]:
initial_rows = lith_data.shape[0]
lith_data.dropna(subset=[target_col], inplace=True)
rows_after_nan_drop = lith_data.shape[0]

In [15]:
print("\nChecking class counts before splitting...")
class_counts = lith_data[target_col].value_counts()
print("Original Class Distribution:")
print(class_counts)

classes_to_remove = class_counts[class_counts == 1].index.tolist()

if classes_to_remove:
    print(f"\nFound classes with only 1 member: {classes_to_remove}. These will be removed.")
    lith_data = lith_data[~lith_data[target_col].isin(classes_to_remove)].copy()
    print(f"Data shape after removing single-member classes: {lith_data.shape}")
    print("Updated Class Distribution:")
    print(lith_data[target_col].value_counts())
else:
    print("\nNo classes with only 1 member found. Proceeding with split.")


Checking class counts before splitting...
Original Class Distribution:
Lithology
Silty sand               2206
Sand                     2194
Fill                      985
Silt                      880
Sandy silt                701
Clayey silt               423
Gravelly sand             377
Silty clay                348
Sandy gravel              313
Clay                      231
Gravel                    192
Peat                      141
Topsoil / vegetation       81
Sandy clay                 79
Silty gravel               71
Undefined                  66
Asphalt / concrete         45
Clayey sand                44
Gravelly silt              27
Sedimentary bedrock        14
Clayey gravel               9
Gravelly clay               7
Debris                      7
Volcanic ash                5
Undifferentiated rock       3
Volcanic bedrock            2
Cobbles / boulders          2
Metamorphic bedrock         1
Name: count, dtype: int64

Found classes with only 1 member: ['Metamorphic bed

In [16]:
X = lith_data[feature_cols]
y = lith_data[target_col]

In [18]:
print(f"\nFeatures shape (X): {X.shape}")
print(f"Target shape (y): {y.shape}")
print(f"Target variable unique values: {y.unique()}")


Features shape (X): (9453, 3)
Target shape (y): (9453,)
Target variable unique values: ['Fill' 'Clay' 'Sandy silt' 'Sand' 'Clayey silt' 'Silty sand' 'Undefined'
 'Asphalt / concrete' 'Silt' 'Peat' 'Silty clay' 'Gravelly sand'
 'Clayey sand' 'Topsoil / vegetation' 'Gravel' 'Sandy clay' 'Sandy gravel'
 'Gravelly silt' 'Silty gravel' 'Debris' 'Sedimentary bedrock'
 'Gravelly clay' 'Volcanic ash' 'Volcanic bedrock' 'Clayey gravel'
 'Cobbles / boulders' 'Undifferentiated rock']


In [20]:
print(f"\nSplitting data into training ({1 - test_size_proportion:.0%}) and testing ({test_size_proportion:.0%})...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size_proportion,
    random_state=random_seed,
    stratify=y # Use stratify to ensure class proportions are similar in train and test sets
                # IMPORTANT for classification, especially if some classes are rare
)


Splitting data into training (75%) and testing (25%)...


In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time

In [22]:
model = RandomForestClassifier(n_estimators=100, random_state=random_seed, class_weight='balanced', n_jobs=-1)

In [23]:
print("\nStarting Lithology Classification Model Training (Random Forest)...")
start_time = time.time()
model.fit(X_train, y_train)
end_time = time.time()
print(f"Training complete in {end_time - start_time:.2f} seconds.")


Starting Lithology Classification Model Training (Random Forest)...
Training complete in 0.31 seconds.


In [24]:
print("\nMaking predictions on the testing data...")
y_pred = model.predict(X_test)
print("Predictions complete.")


Making predictions on the testing data...
Predictions complete.


In [25]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on the test set: {accuracy:.4f}")

Accuracy on the test set: 0.3570


In [26]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
                       precision    recall  f1-score   support

   Asphalt / concrete       0.50      0.18      0.27        11
                 Clay       0.31      0.26      0.28        58
        Clayey gravel       0.00      0.00      0.00         2
          Clayey sand       0.00      0.00      0.00        11
          Clayey silt       0.20      0.19      0.20       106
   Cobbles / boulders       0.00      0.00      0.00         0
               Debris       0.50      0.50      0.50         2
                 Fill       0.58      0.53      0.56       246
               Gravel       0.24      0.21      0.22        48
        Gravelly clay       0.00      0.00      0.00         2
        Gravelly sand       0.26      0.24      0.25        94
        Gravelly silt       0.00      0.00      0.00         7
                 Peat       0.17      0.11      0.14        35
                 Sand       0.41      0.44      0.43       549
           Sandy clay       0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Confusion Matrix:
[[  2   0   0   0   0   0   0   2   1   0   1   1   0   1   0   0   1   0
    1   0   0   1   0   0   0   0   0]
 [  0  15   0   1   1   0   0   2   1   0   1   0   0   5   1   1  11   0
    7   6   0   6   0   0   0   0   0]
 [  0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   1   0   0   0   0   0]
 [  0   0   0   0   0   0   0   1   0   0   0   0   0   0   2   2   0   0
    0   3   0   3   0   0   0   0   0]
 [  0   1   0   0  20   0   0  12   0   0   3   0   3  11   2   0   8   0
   11  12   0  23   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   1   0   0   0   0   0   0   1   0   0   0   0
    0   0   0   0   0   0   0   0   0]
 [  0   1   0   2   2   0   1 131   0   0   7   0   1  33   1   5   8   0
   12   3   0  38   0   1   0   0   0]
 [  0   1   0   0   1   0   0   2  10   0   1   1   0  20   0   0   1   0
   

In [28]:
print("\nFeature Importances:")
feature_importances = pd.Series(model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(feature_importances)


Feature Importances:
Midpoint Elevation (ft)    0.385152
Longitude                  0.308968
Latitude                   0.305880
dtype: float64
