In [9]:
#importing required libraries for this project:-
import os
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

# **Tools & Libraries Used**
1. Python (main programming language)

2. TensorFlow/Keras (for building the neural network)

3. NumPy & Pandas (for data handling)

4. scikit-learn (for splitting data and evaluation)

5. OpenCV (for image processing)

In [10]:

# Constants
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
EPOCHS = 30

# Data paths (adjust to your actual paths)
TRAIN_DIR = '/kaggle/input/soil-classification/soil_classification-2025/train'
TEST_DIR = '/kaggle/input/soil-classification/soil_classification-2025/test'
TRAIN_CSV = '/kaggle/input/soil-classification/soil_classification-2025/train_labels.csv'



#  Configuration Settings

Image Settings:
  - `IMG_SIZE = (224, 224)` → All soil images will be resized to 224x224 pixels
  - `BATCH_SIZE = 32` → Processes 32 images at once during training

Training Control:
  - `EPOCHS = 30` → The model will go through the entire dataset 30 times

File Paths:
  - `TRAIN_DIR` → Location of training images
  - `TEST_DIR` → Location of test images
  - `TRAIN_CSV` → File containing image labels (what soil type each image shows)



In [11]:
# Load and prepare data
train_df = pd.read_csv(TRAIN_CSV)
test_files = [f for f in os.listdir(TEST_DIR) if f.endswith(('.jpg', '.jpeg', '.png'))]
test_df = pd.DataFrame({'image_id': test_files})

# Verifying classes 
print("Training data class distribution:")
print(train_df['soil_type'].value_counts(normalize=True).apply(lambda x: f"{x:.1%}"))

Training data class distribution:
soil_type
Alluvial soil    43.2%
Red soil         21.6%
Black Soil       18.9%
Clay soil        16.3%
Name: proportion, dtype: object


**loads soil image data from files & checks distribution of different soil types in training set:**


1. Reads the training labels from CSV
2. Finds all test images in the test folder
3. Shows what percentage of each soil type we have in our training data


In [12]:
# Data generators
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    validation_split=0.2
)

test_datagen = ImageDataGenerator(rescale=1./255)


# Create generators
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=TRAIN_DIR,
    x_col='image_id',
    y_col='soil_type',
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='training',
    shuffle=True
)

val_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=TRAIN_DIR,
    x_col='image_id',
    y_col='soil_type',
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    subset='validation',
    shuffle=False
)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=TEST_DIR,
    x_col='image_id',
    y_col=None,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode=None,
    shuffle=False
)

Found 972 validated image filenames belonging to 4 classes.
Found 242 validated image filenames belonging to 4 classes.




Found 339 validated image filenames.


#  Data Preparation

- **Image Processing**:
  - Normalizes pixel values (0-1) by dividing by 255
  - Adds variations to training images:
    - Random rotations (up to 20 degrees)
    - Small left/right and up/down shifts
    - Horizontal flipping
  - Keeps test images unchanged (only normalizes them)

- **Data Generators**:
  - `train_generator`: 
    - Takes images from training folder
    - Uses 80% of data for actual training
    - Shuffles images for better learning
  - `val_generator`: 
    - Uses remaining 20% for validation
    - Doesn't shuffle (for proper evaluation)
  - `test_generator`:
    - Only for final testing
    - No labels needed (for predictions)

In [15]:
# Model architecture
model = Sequential([
    Input(shape=(IMG_SIZE[0], IMG_SIZE[1], 3)),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(train_generator.class_indices), activation='softmax')
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Our Soil Classification Model

**Model Structure:**

- Input Layer: Takes soil images (224x224 pixels with 3 color channels)
- Convolution Blocks (Feature Extractors):
  - Each block has:
    - Conv2D layer (detects patterns like edges/textures)
    - MaxPooling (reduces image size while keeping important features)
  - Progressively increases filters (32 → 64 → 128) to catch complex patterns

**Classifier Part:**
  - Flatten layer (converts 2D features to 1D)
  - Dense layer (128 neurons) for decision making
  - Dropout (0.5) to prevent overfitting (randomly ignores half the neurons)
  - Final output layer (softmax) gives probability for each soil type


**Why This Works:**

- Simple but effective CNN architecture
- Gets progressively smarter about soil features
- Balanced to avoid memorizing (dropout helps)
- Standard settings that usually work well

In [18]:
# Callbacks
callbacks = [
    EarlyStopping(patience=5, restore_best_weights=True),
    ModelCheckpoint('best_model.h5', save_best_only=True)
]

# Train model
history = model.fit(
    train_generator,
    epochs=EPOCHS,
    validation_data=val_generator,
    callbacks=callbacks
)

Epoch 1/30
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 2s/step - accuracy: 0.8405 - loss: 0.3759 - val_accuracy: 0.8471 - val_loss: 0.3065
Epoch 2/30
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 2s/step - accuracy: 0.8367 - loss: 0.4009 - val_accuracy: 0.6777 - val_loss: 0.8216
Epoch 3/30
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 2s/step - accuracy: 0.8701 - loss: 0.3792 - val_accuracy: 0.8347 - val_loss: 0.3688
Epoch 4/30
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 2s/step - accuracy: 0.8714 - loss: 0.3229 - val_accuracy: 0.7975 - val_loss: 0.5321
Epoch 5/30
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 2s/step - accuracy: 0.9009 - loss: 0.2707 - val_accuracy: 0.8347 - val_loss: 0.3486
Epoch 6/30
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 2s/step - accuracy: 0.8848 - loss: 0.2799 - val_accuracy: 0.8306 - val_loss: 0.4692


#  Training Our Model

 Smart Training Helpers (Callbacks):
- **Early Stopping**:
  - Stops training if model doesn't improve for 5 straight rounds
  - Keeps the best version of the model
- **Model Checkpoint**:
  - Saves the best model automatically as 'best_model.h5'
  - Only keeps the best version (saves disk space)

Training Process:
- Runs for maximum 30 rounds (epochs)
- Uses:
  - Training data (to learn patterns)
  - Validation data (to check progress)
- Automatically stops if not improving 

Why This Matters:
- Prevents wasting time on useless training
- Ensures we keep the best model version


In [19]:
# Generate predictions 
test_preds = model.predict(test_generator)
predicted_classes = np.argmax(test_preds, axis=1)
class_labels = list(train_generator.class_indices.keys())

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 520ms/step


*This code generates soil type predictions on test images, converting the model's probability outputs into specific class predictions. It identifies the most likely soil class for each image and maps these predictions to the actual class names used during training. The final output tells us which soil type the model believes each test image contains.*

In [23]:

# creating submission DataFrame 
submission = pd.DataFrame({
    'image_id': test_df['image_id'],
    'soil_type': [class_labels[i] for i in predicted_classes]
})

# Add summary statistics
print("\nPrediction distribution:")
print(submission['soil_type'].value_counts(normalize=True).apply(lambda x: f"{x:.1%}"))

# Save to CSV
submission.to_csv('submission.csv', index=False)
print("\nFirst 5 predictions:")
print(submission.head())


Prediction distribution:
soil_type
Black Soil       31.0%
Red soil         28.9%
Clay soil        26.8%
Alluvial soil    13.3%
Name: proportion, dtype: object

First 5 predictions:
           image_id      soil_type
0  img_0f035b97.jpg  Alluvial soil
1  img_f13af256.jpg      Clay soil
2  img_15b41dbc.jpg  Alluvial soil
3  img_cfb4fc7a.jpg     Black Soil
4  img_683111fb.jpg     Black Soil


 *This code organizes our model's predictions into a submission file. It creates a table matching each test image with its predicted soil type, shows what percentage of each soil type was predicted, then saves everything to a CSV file while displaying a sample of the first 5 predictions.*