In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# Log in before any data source is requested; the `kagglehub` module imported
# here is reused by the download cell that follows.
# NOTE(review): login() presumably asks for credentials interactively, which
# would stall an unattended "Run All" -- confirm before automating this notebook.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Request the 'soil-classification-part-2' competition data and keep whatever
# competition_download() returns.
# NOTE(review): the returned value is not read by any other cell visible in
# this notebook; the data cells use literal /kaggle/input/... paths instead --
# confirm both refer to the same files when running outside Kaggle.
soil_classification_part_2_path = kagglehub.competition_download('soil-classification-part-2')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the competition's test-image folder and echo the full path of every file.
for dirname, _, filenames in os.walk('/kaggle/input/soil-classification-part-2/soil_competition-2025/test'):
    full_paths = (os.path.join(dirname, name) for name in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import cv2
from tqdm import tqdm
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import seaborn as sns

**LOADING DATASETS AND DATA PREPROCESSING**

In [None]:
# Loading Datasets
# Directory holding the training images and the CSV whose `image_id` column
# is joined onto that directory to build each image path.
soil_img_dir = "/kaggle/input/soil-classification-part-2/soil_competition-2025/train"
soil_labels_path = "/kaggle/input/soil-classification-part-2/soil_competition-2025/train_labels.csv"
# Folder that receives the synthetic "not soil" images generated below.
generated_not_soil_dir = "/kaggle/working/generated_not_soil"
# Passed as `target_size` / `batch_size` to the image generators. The CNN's
# input_shape is written out separately as (224, 224, 3), so keep them in sync.
img_size = (224, 224)
batch_size = 32

# Generating Synthetic "Not Soil" Images
def generate_non_soil_images(save_dir, num_images=500, size=(224, 224)):
    """Write ``num_images`` synthetic "not soil" JPEG images into ``save_dir``.

    Each image is filled with one randomly chosen pattern (noise, stripes,
    grid, gradient or blobs) and saved as ``nonsoil_<index>.jpg``, where
    ``<index>`` runs from 0 to ``num_images - 1``.

    Args:
        save_dir: Output directory; created if it does not already exist.
        num_images: Number of images to write.
        size: ``(height, width)`` of every image, in pixels.

    Returns:
        None. The images are written to disk as a side effect.

    Note:
        All randomness comes from NumPy's global random state, so call
        ``np.random.seed(...)`` first if reproducible images are required.
    """
    os.makedirs(save_dir, exist_ok=True)
    height, width = size[0], size[1]

    for img_idx in tqdm(range(num_images), desc="Generating non-soil images"):
        img = np.zeros((height, width, 3), dtype=np.uint8)
        pattern = np.random.choice(["noise", "stripes", "grid", "gradient", "blobs"])

        if pattern == "noise":
            # Independent uniform value for every pixel and channel.
            img = np.random.randint(0, 256, img.shape, dtype=np.uint8)

        elif pattern == "stripes":
            # Two random colours alternating in vertical bands of width ``w``.
            c1, c2 = np.random.randint(0, 256, 3), np.random.randint(0, 256, 3)
            w = np.random.randint(5, 20)
            for col in range(0, width, 2*w):
                img[:, col:col+w] = c1
                img[:, col+w:col+2*w] = c2

        elif pattern == "grid":
            # White background with 2-pixel lines every ``spacing`` pixels.
            color = np.random.randint(0, 256, 3)
            spacing = np.random.randint(10, 30)
            img[:] = 255
            for row in range(0, height, spacing):
                img[row:row+2, :] = color
            for col in range(0, width, spacing):
                img[:, col:col+2] = color

        elif pattern == "gradient":
            # Top-to-bottom ramp from 0 to 255, identical in all three channels.
            # The channel loop uses its own variable: reusing the image index
            # here would make every gradient image save under the same file
            # name and overwrite the previous one.
            lin = np.linspace(0, 255, height)
            grad = np.tile(lin, (width, 1)).T.astype(np.uint8)
            for channel in range(3):
                img[..., channel] = grad

        elif pattern == "blobs":
            # Dark background with 5-19 bright filled circles.
            img[:] = np.random.randint(0, 50, 3)
            for _ in range(np.random.randint(5, 20)):
                # cv2 takes the centre as (x, y) = (column, row), so x is drawn
                # from the width and y from the height.
                x, y = np.random.randint(0, width), np.random.randint(0, height)
                r = np.random.randint(10, 50)
                color = np.random.randint(100, 255, 3).tolist()
                cv2.circle(img, (x, y), r, color, -1)

        Image.fromarray(img).save(os.path.join(save_dir, f"nonsoil_{img_idx}.jpg"))

# Fill the "not soil" folder with 500 synthetic images; the files found in this
# folder are later labelled 0 and used as the negative class.
generate_non_soil_images(generated_not_soil_dir, num_images=500)

In [None]:
# Load Soil Labels
# Every row of the competition CSV is treated as a positive example (label 1);
# `full_path` is the image_id joined onto the training image directory.
soil_df = pd.read_csv(soil_labels_path)
soil_df["label"] = 1
soil_df["full_path"] = soil_df["image_id"].apply(lambda x: os.path.join(soil_img_dir, x))

# Create "Not Soil" Labels
# os.listdir() returns entries in arbitrary order. Sorting keeps the row order
# stable, so the seeded train/validation split built from this frame is the
# same on every run.
non_soil_files = sorted(os.listdir(generated_not_soil_dir))
non_soil_df = pd.DataFrame({
    "image_id": non_soil_files,
    "label": 0,
    "full_path": [os.path.join(generated_not_soil_dir, fname) for fname in non_soil_files]
})

In [None]:
# Combining Dataset
# Positive (soil) rows followed by the synthetic negative rows, re-indexed 0..n-1.
full_df = pd.concat([soil_df[["image_id", "label", "full_path"]], non_soil_df], ignore_index=True)

# Splitting Data
# 80/20 split, stratified on the label so both parts keep the same class ratio.
train_df, val_df = train_test_split(full_df, test_size=0.2, stratify=full_df["label"], random_state=42)

# Class Weights
# Key each weight by the class label it was computed for, instead of by its
# position in the array, so the mapping stays right even if the label set changes.
class_labels = np.unique(train_df['label'])
class_weights_array = compute_class_weight(class_weight='balanced', classes=class_labels, y=train_df['label'])
class_weights = {int(label): float(weight) for label, weight in zip(class_labels, class_weights_array)}

# Image Data Generators
# Only the training stream is augmented (zoom + horizontal flip); both rescale to [0, 1].
train_datagen = ImageDataGenerator(rescale=1./255, zoom_range=0.2, horizontal_flip=True)
val_datagen = ImageDataGenerator(rescale=1./255)

# The generators are fed string labels. DataFrame.astype returns new frames, so
# nothing is written into the slices handed back by train_test_split (which
# would be chained assignment and can raise SettingWithCopyWarning).
train_df = train_df.astype({'label': str})
val_df = val_df.astype({'label': str})

train_gen = train_datagen.flow_from_dataframe(
    train_df,
    x_col='full_path',
    y_col='label',
    target_size=img_size,
    class_mode='binary',
    batch_size=batch_size,
    shuffle=True
)

# Validation order is kept fixed (shuffle=False) so predictions line up with
# val_gen.classes when the model is evaluated.
val_gen = val_datagen.flow_from_dataframe(
    val_df,
    x_col='full_path',
    y_col='label',
    target_size=img_size,
    class_mode='binary',
    batch_size=batch_size,
    shuffle=False
)

**CNN MODEL**

In [None]:
# Building CNN model
# Two convolution + max-pooling stages extract features from 224x224 RGB input.
conv_stages = [
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D(2, 2),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
]

# Dense head with dropout, ending in one sigmoid unit for the binary decision.
classifier_head = [
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
]

model = Sequential(conv_stages + classifier_head)

model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Training the CNN model
# Stop after 3 epochs without improvement and roll back to the best weights seen.
early_stop = EarlyStopping(restore_best_weights=True, patience=3)

# Up to 10 epochs, with the balanced class weights applied to the loss.
history = model.fit(
    train_gen,
    epochs=10,
    validation_data=val_gen,
    callbacks=[early_stop],
    class_weight=class_weights,
)

**PREDICTIONS**

In [None]:
# Evaluation with F1 Score
# Score the validation generator and turn sigmoid outputs into 0/1 labels
# using a 0.5 threshold.
val_preds = model.predict(val_gen)
y_pred = (val_preds > 0.5).astype(int).ravel()
y_true = val_gen.classes

# Per-class metrics as a nested dict keyed by the display names below.
report = classification_report(
    y_true,
    y_pred,
    target_names=['Not Soil', 'Soil'],
    output_dict=True,
)
f1_not_soil = report["Not Soil"]["f1-score"]
f1_soil = report["Soil"]["f1-score"]
min_f1 = min(f1_soil, f1_not_soil)

print("F1 Scores:")
for caption, score in (
    ("Soil       :", f1_soil),
    ("Not Soil   :", f1_not_soil),
    ("Minimum F1 :", min_f1),
):
    print(caption, round(score, 4))

In [None]:
#Confusion Matrix
from sklearn.metrics import confusion_matrix, classification_report, f1_score
conf_matrix = confusion_matrix(y_true, y_pred)

# Annotated heatmap: rows are the actual class, columns the predicted class.
class_names = ['Non-soil', 'Soil']
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
fig.tight_layout()
plt.show()

In [None]:
#Predicting on test data
test_dir = "/kaggle/input/soil-classification-part-2/soil_competition-2025/test"
test_csv = "/kaggle/input/soil-classification-part-2/soil_competition-2025/test_ids.csv"
sample_submission_path = "/kaggle/input/soil-classification-part-2/soil_competition-2025/sample_submission.csv"


# One row per test image; `full_path` is the image_id joined onto the test directory.
test_df = pd.read_csv(test_csv)
test_df['full_path'] = test_df['image_id'].apply(lambda x: os.path.join(test_dir, x))
test_df['label'] = '0'  # Dummy label column referenced by y_col below; never used as a target

#Creating Test Generator
# Same image size, batch size and rescaling as the training/validation generators.
img_size = (224, 224)
batch_size = 32

test_datagen = ImageDataGenerator(rescale=1./255)

# class_mode=None yields images only; shuffle=False keeps the CSV row order so
# predictions can be attached to test_df positionally.
test_generator = test_datagen.flow_from_dataframe(
    test_df,
    x_col='full_path',
    y_col='label',
    target_size=img_size,
    class_mode=None,
    shuffle=False,
    batch_size=batch_size
)

#Predicting Using the Current Model in Memory
test_probs = model.predict(test_generator, verbose=1)
test_preds = (test_probs > 0.5).astype(int).flatten()

# The generator can leave out rows whose image file is missing or invalid. If
# that happened, a positional assignment would fail with an opaque length
# error, so check the count and fail with an actionable message instead.
if len(test_preds) != len(test_df):
    raise ValueError(
        f"Got {len(test_preds)} predictions for {len(test_df)} test rows; "
        "check that every image listed in the test CSV exists and is readable."
    )

#Save to Submission File
submission = test_df[['image_id']].copy()
submission['label'] = test_preds
submission.to_csv("submission.csv", index=False)

In [None]:
# Quick check of what was written: (rows, columns) and the first few rows.
print("Submission saved! Shape:", submission.shape)
print(submission.head())