<a href="https://colab.research.google.com/github/riyaaaarane/KJSIT_annam.ai/blob/main/soil_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Authenticate with Kaggle. Some Kaggle data sources are private, so run this
# cell before any cell that imports your Kaggle data sources.
import kagglehub
kagglehub.login()


In [None]:
# Download the 'soil-classification' competition data through kagglehub.
# Run this cell to import the Kaggle data sources.
# Note: this notebook environment differs from Kaggle's Python environment,
# so libraries used by the notebook may be missing.
# NOTE(review): soil_classification_path is not referenced by the cells visible
# below, which read from hard-coded /kaggle/input/... paths; confirm the two
# locations agree when running outside Kaggle.

soil_classification_path = kagglehub.competition_download('soil-classification')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load.

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory.
# Summarise the test directory (file count plus a short sample) instead of
# printing every path, which floods the cell output on a large image set.

import os

test_image_paths = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input/soil-classification/soil_classification-2025/test')
    for filename in filenames
)
print(f"Found {len(test_image_paths)} files under the test directory; showing up to 5:")
for sample_path in test_image_paths[:5]:
    print(sample_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing all the libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

**DATA LOADING AND PREPROCESSING**

In [None]:
# Load the training labels CSV into a DataFrame. Later cells read its
# 'soil_type' column and derive a numeric 'label' column from it.
label_df = pd.read_csv("/kaggle/input/soil-classification/soil_classification-2025/train_labels.csv")

In [None]:
# Show which soil types occur in the labels file.
print("Unique soil types:", label_df['soil_type'].unique())

# Soil type name -> integer class id used for training.
label_mapping = {
    'Clay soil': 0,
    'Red soil': 1,
    'Alluvial soil': 2,
    'Black Soil': 3
}

# Rows whose soil type is not in the mapping cannot be trained on. Report them
# instead of discarding them silently, so spelling/case mismatches are noticed.
known_mask = label_df['soil_type'].isin(label_mapping.keys())
n_dropped = int((~known_mask).sum())
if n_dropped:
    print(
        f"Dropping {n_dropped} rows with unmapped soil types:",
        label_df.loc[~known_mask, 'soil_type'].unique()
    )

# Take an explicit copy of the filtered rows: adding a column to a filtered
# frame can otherwise trigger pandas' SettingWithCopyWarning.
label_df = label_df.loc[known_mask].copy()

# Labels are stored as strings because the Keras dataframe iterators below are
# given this column as y_col with class_mode='sparse'.
label_df['label'] = label_df['soil_type'].map(label_mapping).astype(str)

In [None]:
# Folder that holds the training images referenced by the labels frame.
train_img_dir = "/kaggle/input/soil-classification/soil_classification-2025/train/"

# Hold out 20% of the labelled rows for validation. The split is stratified on
# the class label and uses a fixed random_state so it is repeatable.
train_df, val_df = train_test_split(
    label_df,
    test_size=0.2,
    random_state=42,
    stratify=label_df['label'],
)

In [None]:
# Per-class weights that compensate for class imbalance in the training split.
# compute_class_weight returns one weight per entry of `classes`, in that order;
# the weights are then keyed by their position (0, 1, 2, ...).
train_labels = train_df['label']
class_weights_array = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)
class_weights = {position: weight for position, weight in enumerate(class_weights_array)}
print("Class Weights:", class_weights)

**IMAGE PREPROCESSING WITH DATA AUGMENTATION**

In [None]:
# Data generators
img_size = (224, 224)
batch_size = 32

# Pin the class order explicitly so that both generators assign the same index
# to each label string, regardless of which labels happen to occur in a split.
class_names = sorted(label_df['label'].unique())

# Training images are rescaled to [0, 1] and randomly augmented.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    zoom_range=0.3,
    horizontal_flip=True,
    brightness_range=[0.8, 1.2],
    fill_mode='nearest'
)

# Validation images are only rescaled (no augmentation).
val_datagen = ImageDataGenerator(rescale=1./255)

# Arguments shared by the training and validation iterators; keeping them in one
# place prevents the two calls from drifting apart.
flow_kwargs = dict(
    directory=train_img_dir,
    x_col='image_id',
    y_col='label',
    target_size=img_size,
    class_mode='sparse',
    classes=class_names,
    batch_size=batch_size,
)

# Shuffle the training data with a fixed seed so the batch order is repeatable,
# matching the fixed random_state used for the train/validation split.
train_generator = train_datagen.flow_from_dataframe(
    train_df,
    shuffle=True,
    seed=42,
    **flow_kwargs
)

# Validation order must stay fixed: predictions are later compared row by row
# with val_generator.classes.
val_generator = val_datagen.flow_from_dataframe(
    val_df,
    shuffle=False,
    **flow_kwargs
)


**CNN MODEL**

In [None]:
# CNN model
# The input shape and the width of the output layer are taken from the data
# pipeline (img_size and the training generator's class index table) rather
# than repeated as literals, so changing the image size or the set of classes
# does not leave the model out of sync with the generators.
num_classes = len(train_generator.class_indices)

model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(*img_size, 3)),
    MaxPooling2D(2, 2),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Sparse categorical cross-entropy matches the integer class ids produced by
# the generators (class_mode='sparse').
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Fit the CNN. Training stops early after 3 epochs without improvement and the
# best weights are restored; no `monitor` is passed, so the callback's default
# metric is used.
early_stop = EarlyStopping(restore_best_weights=True, patience=3)

fit_options = dict(
    validation_data=val_generator,
    epochs=10,
    class_weight=class_weights,
    callbacks=[early_stop],
)
history = model.fit(train_generator, **fit_options)

**PREDICTIONS**

In [None]:
# Score the validation set. The validation generator was created with
# shuffle=False, so prediction rows line up with val_generator.classes.
val_preds = model.predict(val_generator, verbose=1)
y_true = val_generator.classes
# Predicted class id = position of the largest softmax output in each row.
y_pred = val_preds.argmax(axis=1)

In [None]:
# Evaluation of the model
# Class names listed in class-id order (0, 1, 2, 3).
target_names = ['Clay soil', 'Red soil', 'Alluvial soil', 'Black Soil']

# Pass the class ids explicitly. Without `labels`, classification_report infers
# the classes from y_true/y_pred and raises if a class is absent from both,
# because the inferred class count no longer matches target_names.
class_ids = list(range(len(target_names)))
report = classification_report(
    y_true, y_pred,
    labels=class_ids,
    target_names=target_names,
    output_dict=True,
    zero_division=0
)

# Per-class F1 and the worst-performing class's F1.
f1_scores = [report[name]['f1-score'] for name in target_names]
print("F1 scores per class:", f1_scores)
print("Minimum F1-score:", min(f1_scores))

In [None]:
# Output analysis
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score
import matplotlib.pyplot as plt

# Class names listed in class-id order (0, 1, 2, 3).
target_names = ['Clay soil', 'Red soil', 'Alluvial soil', 'Black Soil']

# Compute the confusion matrix over all class ids. Passing `labels` keeps the
# matrix one row/column per entry of target_names even when a class never occurs
# in y_true or y_pred; otherwise the matrix would be smaller than the label list
# handed to ConfusionMatrixDisplay.
cm = confusion_matrix(y_true, y_pred, labels=list(range(len(target_names))))

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap='Blues', colorbar=False)
ax.set_title("Validation Confusion Matrix")
plt.show()


In [None]:
# Run the trained model on the test images and write the submission file.
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import pandas as pd
import numpy as np
import os

# Class id -> soil type name, used to turn predicted ids back into names.
# Note: this rebinds `label_mapping`, which an earlier cell defined in the
# opposite direction (soil type name -> class id).
label_mapping = {
    0: 'Clay soil',
    1: 'Red soil',
    2: 'Alluvial soil',
    3: 'Black Soil'
}

test_df = pd.read_csv("/kaggle/input/soil-classification/soil_classification-2025/test_ids.csv")
test_dir = "/kaggle/input/soil-classification/soil_classification-2025/test"

# Same preprocessing as validation: rescale only, no augmentation.
test_datagen = ImageDataGenerator(rescale=1./255)

# Reuse img_size and batch_size from the training setup so test images are
# resized exactly like the images the model was trained on.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=test_dir,
    x_col='image_id',
    y_col=None,
    target_size=img_size,
    class_mode=None,
    batch_size=batch_size,
    shuffle=False,
)

# Fail with a clear message if the generator found no usable test images,
# rather than erroring later inside predict() or mode().
if len(test_generator.filenames) == 0:
    raise FileNotFoundError(f"No test images found under {test_dir}")

# Predictions on test data (shuffle=False keeps them in generator file order).
test_predictions = model.predict(test_generator)
test_pred_classes = np.argmax(test_predictions, axis=1)

# Map numeric predictions to soil type names
test_pred_soil_types = [label_mapping[i] for i in test_pred_classes]
valid_image_ids = [os.path.basename(fname) for fname in test_generator.filenames]
valid_df = pd.DataFrame({
    'image_id': valid_image_ids,
    'soil_type': test_pred_soil_types
})

# Merge with original test set to preserve full order
submission_df = test_df.merge(valid_df, on='image_id', how='left')

# Rows without a prediction are filled with the most common predicted label.
# Report how many rows this affects so imputed answers do not go unnoticed.
most_common_soil = valid_df['soil_type'].mode()[0]
missing_mask = submission_df['soil_type'].isna()
n_missing = int(missing_mask.sum())
if n_missing:
    print(
        f"WARNING: {n_missing} test rows have no prediction and will be filled "
        f"with '{most_common_soil}'. First few image ids:",
        submission_df.loc[missing_mask, 'image_id'].head().tolist()
    )
submission_df['soil_type'] = submission_df['soil_type'].fillna(most_common_soil)
submission_df['soil_type'] = submission_df['soil_type'].astype(str)

submission_df.to_csv("submission.csv", index=False)
print("Submission saved, Shape:", submission_df.shape)


In [None]:
# Preview the first rows of the submission. A bare last expression lets the
# notebook render the DataFrame as a table instead of plain printed text.
submission_df.head()


In [None]:
# Write the submission to submission.csv without the index column. This uses
# the same file name and arguments as the write at the end of the
# test-prediction cell above.
submission_df.to_csv("submission.csv", index=False)

