In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import os

import zipfile
import tensorflow as tf

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input

from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping


from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from PIL import Image

2024-03-06 20:59:20.642676: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-06 20:59:20.642814: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-06 20:59:20.802709: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Define paths to dataset files 
base_path = '/kaggle/input/bttai-nybg-2024'  
train_csv_path = os.path.join(base_path, 'BTTAIxNYBG-train.csv')
validation_csv_path = os.path.join(base_path, 'BTTAIxNYBG-validation.csv')
test_csv_path = os.path.join(base_path, 'BTTAIxNYBG-test.csv')
train_images_path = os.path.join(base_path, 'BTTAIxNYBG-train/BTTAIxNYBG-train')
validation_images_path = os.path.join(base_path, 'BTTAIxNYBG-validation/BTTAIxNYBG-validation')
test_images_path = os.path.join(base_path, 'BTTAIxNYBG-test/BTTAIxNYBG-test')

In [3]:
# Load datasets
train_df = pd.read_csv(train_csv_path)
validation_df = pd.read_csv(validation_csv_path)
test_df = pd.read_csv(test_csv_path)

In [4]:
# Convert the 'classID' column to string in both the training and validation DataFrames
train_df['classID_str'] = train_df['classID'].astype(str)
validation_df['classID_str'] = validation_df['classID'].astype(str)

In [5]:
train_df

Unnamed: 0,uniqueID,classLabel,classID,source,imageFile,classID_str
0,2,occluded-specimens,8,L,a1a8b48e8cb142b3.jpg,8
1,3,microscope-slides,6,L,79599db2ac9092b6.jpg,6
2,4,illustrations-color,2,BHL,c449696f2f0d0d92.jpg,2
3,5,illustrations-color,2,P,80a8f4a393b4e08c.jpg,2
4,6,animal-specimens,0,AK,041a1c6e73313638.jpg,0
...,...,...,...,...,...,...
81941,122874,micrographs-transmission-light,5,Tw,2424355d5438181c.jpg,5
81942,122875,illustrations-color,2,BHL,b28acccccecad04c.jpg,2
81943,122876,microscope-slides,6,L,78f6868694a6669c.jpg,6
81944,122877,live-plants,4,E,5c6162948949510a.jpg,4


In [6]:
train_df.dtypes

uniqueID        int64
classLabel     object
classID         int64
source         object
imageFile      object
classID_str    object
dtype: object

In [7]:
# Prepare ImageDataGenerator for training and validation (adjust target_size for ResNet50)
train_datagen = ImageDataGenerator(rescale=1./255, rotation_range=20, width_shift_range=0.2, height_shift_range=0.2, horizontal_flip=True, preprocessing_function=preprocess_input)
validation_datagen = ImageDataGenerator(rescale=1./255, preprocessing_function=preprocess_input)
test_datagen = ImageDataGenerator(rescale=1./255)
batch_size = 32

train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=train_images_path,
    x_col='imageFile',
    y_col='classID_str',
    target_size=(224, 224),  # Adjust for ResNet50
    batch_size=batch_size,
    class_mode='categorical'
)

validate_generator = validation_datagen.flow_from_dataframe(
    dataframe=validation_df,
    directory=validation_images_path,
    x_col='imageFile',
    y_col='classID_str',
    target_size=(224, 224),  # Adjust for ResNet50
    batch_size=batch_size,
    class_mode='categorical'
)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=test_images_path,
    x_col='imageFile',
    y_col=None,  # Test set does not have labels
    target_size=(224, 224),  # Adjusted to match the model's expected input shape
    batch_size=32,
    class_mode=None,  # Since we're not using the labels
    shuffle=False  # Important to keep the same order as the input dataframe
)

Found 81946 validated image filenames belonging to 10 classes.
Found 10244 validated image filenames belonging to 10 classes.
Found 30690 validated image filenames.


In [8]:
# Define the model architecture using ResNet50
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False  # Freeze the layers

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 0us/step


In [9]:
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D

model = Sequential([
    base_model,
    GlobalAveragePooling2D(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(10, activation='softmax')  # Adjust the number of units based on the number of classes
])

In [10]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [11]:
# Load pre-trained DenseNet model
Dense_model = DenseNet121(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m29084464/29084464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


In [12]:
# Freeze the layers
Dense_model.trainable = False

In [13]:
# Build your custom model on top of DenseNet
model = Sequential([
    Dense_model,
    GlobalAveragePooling2D(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(10, activation='softmax')  
])

In [14]:
#Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Define EarlyStopping callback, Steps Per Epoch
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='min', restore_best_weights=True)
steps_per_epoch = len(train_df) // batch_size + (len(train_df) % batch_size > 0)
validation_steps = len(validation_df) // batch_size + (len(validation_df) % batch_size > 0)


# Add it to the fit function
history = model.fit(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=2,
    validation_data=validate_generator,
    validation_steps=validation_steps,
    callbacks=[early_stopping]
)

Epoch 1/2


  self._warn_if_super_not_called()


[1m2561/2561[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9543s[0m 4s/step - accuracy: 0.8299 - loss: 0.4991 - val_accuracy: 0.9222 - val_loss: 0.2181
Epoch 2/2


  self.gen.throw(typ, value, traceback)


[1m2561/2561[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192us/step - accuracy: 0.0000e+00 - loss: 0.0000e+00 - val_accuracy: 0.0000e+00 - val_loss: 0.0000e+00
Restoring model weights from the end of the best epoch: 2.


In [15]:
tf.debugging.set_log_device_placement(True)

In [16]:
# Calculate steps per epoch and validation steps
steps_per_epoch = len(train_df) // batch_size + (len(train_df) % batch_size > 0)
validation_steps = len(validation_df) // batch_size + (len(validation_df) % batch_size > 0)

# Define EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='min', restore_best_weights=True)

# Add it to the fit function
history = model.fit(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=10,
    validation_data=validate_generator,
    validation_steps=validation_steps,
    callbacks=[early_stopping]  # Add callbacks here
)

Epoch 1/10
[1m2561/2561[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9383s[0m 4s/step - accuracy: 0.9074 - loss: 0.2597 - val_accuracy: 0.9210 - val_loss: 0.2266
Epoch 2/10
[1m2561/2561[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 190us/step - accuracy: 0.0000e+00 - loss: 0.0000e+00 - val_accuracy: 0.0000e+00 - val_loss: 0.0000e+00
Epoch 3/10
[1m2561/2561[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9490s[0m 4s/step - accuracy: 0.9165 - loss: 0.2400 - val_accuracy: 0.9384 - val_loss: 0.1784
Epoch 4/10
[1m2561/2561[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 20ms/step - accuracy: 0.0000e+00 - loss: 0.0000e+00 - val_accuracy: 0.0000e+00 - val_loss: 0.0000e+00
Epoch 5/10
[1m2561/2561[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9410s[0m 4s/step - accuracy: 0.9222 - loss: 0.2244 - val_accuracy: 0.9233 - val_loss: 0.2113
Epoch 5: early stopping
Restoring model weights from the end of the best epoch: 2.


In [17]:
# Predict the class probabilities for the test set
test_predictions_prob = model.predict(test_generator)

# Convert probabilities to class indices
test_predictions = np.argmax(test_predictions_prob, axis=1)

# Map class indices to class labels
class_labels = {v: k for k, v in train_generator.class_indices.items()}
test_predicted_labels = [class_labels[idx] for idx in test_predictions]

# Create the submission DataFrame
submission_df = pd.DataFrame({
    'uniqueID': test_df['uniqueID'],  # Ensure this column exists in your test_df
    'classID': test_predicted_labels
})

# Save the submission file
submission_filename = 'submission.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"Submission file saved as {submission_filename}")

[1m960/960[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3014s[0m 3s/step
Submission file saved as submission.csv


In [18]:
submission_df.head()

Unnamed: 0,uniqueID,classID
0,1,1
1,9,9
2,10,4
3,14,1
4,16,6
