In [10]:
# Part 1: Setup and Imports

import os
import pandas as pd
import numpy as np
import cv2
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

# Constants
IMAGE_SIZE = (224, 224)
BATCH_SIZE = 32
EPOCHS = 10


In [11]:
# Part 2: Paths
DATA_DIR = "/kaggle/input/soil-classification/soil_classification-2025"
TRAIN_DIR = os.path.join(DATA_DIR, "train")
TEST_DIR = os.path.join(DATA_DIR, "test")
LABELS_CSV = os.path.join(DATA_DIR, "train_labels.csv")
TEST_IDS_CSV = os.path.join(DATA_DIR, "test_ids.csv")
SAMPLE_SUBMISSION_CSV = os.path.join(DATA_DIR, "sample_submission.csv")


In [12]:
# Part 3: Load CSVs
df = pd.read_csv(LABELS_CSV)
test_ids = pd.read_csv(TEST_IDS_CSV)

# Encode labels into numbers
le = LabelEncoder()
df['label'] = le.fit_transform(df['soil_type'])

# One-hot encode labels
labels = to_categorical(df['label'])

# Split into train and validation sets
train_ids, val_ids, train_labels, val_labels = train_test_split(df['image_id'], labels, test_size=0.2, random_state=42)


In [13]:
# Part 4: Image loading function
def load_images(image_ids, directory):
    images = []
    for img_id in image_ids:
        path = os.path.join(directory, img_id)
        image = cv2.imread(path)
        image = cv2.resize(image, IMAGE_SIZE)
        images.append(image)
    return np.array(images)

# Load training and validation images
X_train = load_images(train_ids, TRAIN_DIR)
X_val = load_images(val_ids, TRAIN_DIR)


In [14]:
# Part 5: CNN Model
model = Sequential([
    Conv2D(32, (3,3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D(2, 2),
    Conv2D(64, (3,3), activation='relu'),
    MaxPooling2D(2,2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(4, activation='softmax')
])

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-05-22 17:25:05.662004: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [15]:
# Part 6: Training
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train, train_labels,
    validation_data=(X_val, val_labels),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stop]
)


Epoch 1/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 2s/step - accuracy: 0.4678 - loss: 737.6035 - val_accuracy: 0.7102 - val_loss: 1.5186
Epoch 2/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 2s/step - accuracy: 0.7660 - loss: 1.2839 - val_accuracy: 0.8163 - val_loss: 0.9743
Epoch 3/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 2s/step - accuracy: 0.8712 - loss: 0.8751 - val_accuracy: 0.8449 - val_loss: 0.8091
Epoch 4/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 2s/step - accuracy: 0.8758 - loss: 0.7145 - val_accuracy: 0.8612 - val_loss: 0.7941
Epoch 5/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 2s/step - accuracy: 0.9200 - loss: 0.3823 - val_accuracy: 0.8776 - val_loss: 0.6514
Epoch 6/10
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 2s/step - accuracy: 0.9225 - loss: 0.2891 - val_accuracy: 0.8776 - val_loss: 0.7380
Epoch 7/10
[1m31/31[0m [32m━━━━━━━━

In [16]:
# Part 7: Predict on Test Set
test_images = load_images(test_ids['image_id'], TEST_DIR)
predictions = model.predict(test_images)
predicted_labels = np.argmax(predictions, axis=1)
predicted_soil_types = le.inverse_transform(predicted_labels)


[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 450ms/step


In [18]:
# Corrected Part 8: Create Submission File using test_ids
submission = test_ids.copy()  # test_ids has the correct 341 image IDs
submission['soil_type'] = predicted_soil_types  # Add predictions
submission.to_csv("submission.csv", index=False)
submission.head()


Unnamed: 0,image_id,soil_type
0,img_cdf80d6f.jpeg,Alluvial soil
1,img_c0142a80.jpg,Alluvial soil
2,img_91168fb0.jpg,Alluvial soil
3,img_9822190f.jpg,Alluvial soil
4,img_e5fc436c.jpeg,Alluvial soil
