# *Importing Essential Libraries*

In [None]:
# ==============================================================================
# CORE DATA SCIENCE LIBRARIES
# ==============================================================================
import numpy as np  # Fundamental library for high-performance numerical computing (arrays, matrices, linear algebra)
import pandas as pd  # The primary tool for data manipulation and analysis, providing DataFrames

# ==============================================================================
# FILE SYSTEM AND UTILITIES
# ==============================================================================
from glob import glob  # Utility to find file pathnames matching a specified pattern (useful for bulk file loading)
import shutil, os  # Modules for interacting with the operating system, including high-level file operations (shutil) and path manipulation (os)
from tqdm.notebook import tqdm  # Displays smart progress bars for loops, ideal for tracking long operations in Jupyter/Colab notebooks

# ==============================================================================
# DATA VISUALIZATION
# ==============================================================================
import matplotlib.pyplot as plt  # Standard plotting library for creating static, interactive, and animated visualizations
import seaborn as sns  # High-level interface for drawing attractive and informative statistical graphics, built on Matplotlib

# ==============================================================================
# MACHINE LEARNING / MODELING
# ==============================================================================
from sklearn.model_selection import GroupKFold  # Scikit-learn cross-validation: ensures that predefined 'groups' (e.g., subjects/patients) are not split across training and testing folds

# *Define Model Hyperparameters*

In [None]:
# Dataset resolution variant; interpolated into the input dataset paths used by the loading cells below.
dim = 512  
# NOTE(review): the GroupKFold cell below uses `fold` as its loop variable, which rebinds this name
# before the train/validation file lists are built, so the value assigned here is not the one in
# effect at that point. Confirm which validation fold is actually intended.
fold = 5  

# *Load Training Data and Initial Inspection*

In [None]:
# Read the training annotations (train.csv) from the dataset variant selected by `dim`
train_df = pd.read_csv(
    f'../input/vinbigdata-{dim}-image-dataset/vinbigdata/train.csv'
)

# Preview the first rows to inspect the column names and sample values
train_df.head()

# *Create Full Image File Paths*

In [None]:
# Build each image's full path from its image_id.
# The extension is '.jpg' only when `dim` is the string 'original'; every other value uses '.png'.
image_ext = '.jpg' if dim == 'original' else '.png'
image_root = f'/kaggle/input/vinbigdata-{dim}-image-dataset/vinbigdata/train/'
train_df['image_path'] = image_root + train_df['image_id'] + image_ext

# Preview the frame with the new column
train_df.head()

# *Filter Out 'No Finding' Class*

In [None]:
# Keep only rows whose class_id is not 14 (presumably the "No Finding" class, per this section's title),
# then renumber the index from 0 so positional and label indexing agree afterwards.
has_finding = train_df['class_id'] != 14
train_df = train_df.loc[has_finding].reset_index(drop=True)

#  *Normalizing and Calculating Bounding Box Coordinates (YOLO Format Preparation)*

In [None]:
# ==============================================================================
# Bounding Box Normalization and Feature Engineering
# ==============================================================================
# All steps are plain column arithmetic, so they are written as vectorized
# column operations instead of row-wise `DataFrame.apply(..., axis=1)` lambdas.
#
# NOTE: x_min/y_min/x_max/y_max are overwritten in place, so this cell is not
# idempotent: running it a second time divides by width/height again.

# Normalize the min corner by the image width/height
train_df['x_min'] = train_df['x_min'] / train_df['width']
train_df['y_min'] = train_df['y_min'] / train_df['height']

# Normalize the max corner the same way
train_df['x_max'] = train_df['x_max'] / train_df['width']
train_df['y_max'] = train_df['y_max'] / train_df['height']

# Box center: midpoint of the normalized corners
train_df['x_mid'] = (train_df['x_max'] + train_df['x_min']) / 2
train_df['y_mid'] = (train_df['y_max'] + train_df['y_min']) / 2

# Box width (w) and height (h) in normalized units
train_df['w'] = train_df['x_max'] - train_df['x_min']
train_df['h'] = train_df['y_max'] - train_df['y_min']

# Normalized box area
train_df['area'] = train_df['w'] * train_df['h']

# Preview the frame with the new normalized columns
train_df.head()

# *Feature and Target Variable Separation*

In [None]:
# Names of the normalized bounding-box columns computed in the previous section
features = [
    'x_min', 'y_min', 'x_max', 'y_max',
    'x_mid', 'y_mid',
    'w', 'h', 'area',
]

# X: the bounding-box feature columns; y: the class_id column
X = train_df.loc[:, features]
y = train_df.loc[:, 'class_id']

# Show both shapes to confirm the row counts match
X.shape, y.shape

# *Extracting and Ordering Class Names*

In [None]:
# ==============================================================================
# Extract and Prepare Class Labels
# ==============================================================================

# Collect the distinct (class_id, class_name) pairs present in the data
label_pairs = set(zip(train_df.class_id, train_df.class_name))

# Split the pairs into two parallel tuples: ids and names
class_ids, class_names = list(zip(*label_pairs))

# Order the names by ascending class id and make sure each one is a plain `str`
id_order = np.argsort(class_ids)
classes = [str(class_names[position]) for position in id_order]

# Display the final, id-ordered list of class names
classes

# *Data Splitting*

In [None]:
# ==============================================================================
# GroupKFold Cross-Validation Setup
# ==============================================================================

# Five-way grouped splitter
gkf = GroupKFold(n_splits=5)

# Every row starts unassigned (-1)
train_df['fold'] = -1

# Group by image_id so that all bounding boxes of one image land in the same fold
image_groups = train_df['image_id'].tolist()
fold_splits = gkf.split(train_df, groups=image_groups)

# NOTE: the loop variable is deliberately kept as `fold`. It rebinds the `fold`
# hyperparameter, and the next cell reads `fold` to pick the validation fold, so
# after this loop `fold` holds the index of the last split.
for fold, (train_idx, val_idx) in enumerate(fold_splits):
    # val_idx holds row positions; they are valid as labels for .loc because
    # the index was reset to 0..n-1 when the frame was filtered.
    train_df.loc[val_idx, 'fold'] = fold

# Preview the frame to verify that the 'fold' column has been populated
train_df.head()

# *Creating Train and Validation File Lists*

In [None]:
# ==============================================================================
# Separate Image Paths for Training and Validation Sets (Specific Fold)
# ==============================================================================

# Rows belonging to the validation fold. `fold` is whatever was assigned last;
# when the notebook is run top to bottom that is the final loop index of the
# GroupKFold cell, not the value set in the hyperparameter cell.
in_val_fold = train_df['fold'] == fold

# Unique image paths inside the validation fold
val_files = list(train_df.loc[in_val_fold, 'image_path'].unique())

# Unique image paths of every other fold
train_files = list(train_df.loc[~in_val_fold, 'image_path'].unique())

# Show how many images ended up in each set
len(train_files), len(val_files)

(3515, 879)

# *Setting Up Directory Structure and Copying Files for YOLO Training*

In [None]:
# ==============================================================================
# Create Output Directory Structure (YOLO Format)
# ==============================================================================

# Root of the working copy; it gets images/{train,val} and labels/{train,val}
working_root = '/kaggle/working/vinbigdata'

for split in ('train', 'val'):
    os.makedirs(f'{working_root}/labels/{split}', exist_ok=True)
    os.makedirs(f'{working_root}/images/{split}', exist_ok=True)

# Source directory holding one YOLO label file (<image_id>.txt) per image
label_dir = '/kaggle/input/vinbigdata-yolo-labels-dataset/labels'

# ==============================================================================
# Copy Files to Training and Validation Folders
# ==============================================================================

# Same procedure for both splits: copy each image, then its matching label file
for split, split_files in (('train', train_files), ('val', val_files)):
    image_dst = f'{working_root}/images/{split}'
    label_dst = f'{working_root}/labels/{split}'

    for file in tqdm(split_files):
        # Copy the image into this split's images directory
        shutil.copy(file, image_dst)

        # image_id = last path component up to its first '.'
        filename = file.split('/')[-1].split('.')[0]

        # Copy the label file that carries the same image_id
        shutil.copy(os.path.join(label_dir, filename + '.txt'), label_dst)

# *Install YOLO (Ultralytics) Library*

In [None]:
# Install the Ultralytics package; later cells import `ultralytics` and invoke the `yolo` command.
# This is an IPython shell escape (`!`), so it only works inside a notebook/IPython session.
!pip install ultralytics

# *Verify Ultralytics Installation and Environment Checks*

In [None]:
# Import the package installed in the previous cell; an ImportError here means the install did not succeed
import ultralytics

# Run Ultralytics' built-in environment check and print its report
# (presumably covers dependency versions and GPU visibility — see the Ultralytics docs for the exact contents)
ultralytics.checks()

# *Model Training and Finetuning*

In [None]:
# ==============================================================================
# Create YOLO Configuration and Path Files
# ==============================================================================
from os import listdir  # Not used in this cell
from os.path import isfile, join  # `join` builds the output paths; `isfile` is not used in this cell
import yaml  # Library for reading and writing YAML files

cwd = '/kaggle/working/'  # Directory that receives train.txt, val.txt and vinbigdata.yaml

# Create train.txt: one line per file found in the working training images directory
with open(join(cwd, 'train.txt'), 'w') as f:
    for path in glob('/kaggle/working/vinbigdata/images/train/*'):
        f.write(path + '\n')

# Create val.txt: one line per file found in the working validation images directory
with open(join(cwd, 'val.txt'), 'w') as f:
    for path in glob('/kaggle/working/vinbigdata/images/val/*'):
        f.write(path + '\n')

# Dataset description consumed by the YOLO training command
data = dict(
    train=join(cwd, 'train.txt'),  # Path to the training image list
    val=join(cwd, 'val.txt'),      # Path to the validation image list
    nc=14,                         # Number of classes (class_id 14 was filtered out earlier)
    names=classes,                 # Class names ordered by class id (built in an earlier cell)
)

# Write the dataset description to vinbigdata.yaml
with open(join(cwd, 'vinbigdata.yaml'), 'w') as outfile:
    yaml.dump(data, outfile, default_flow_style=False)

# Read the generated YAML back and print it for verification.
# A context manager is used so the file handle is closed instead of leaking.
with open(join(cwd, 'vinbigdata.yaml'), 'r') as f:
    print('\nyaml:')
    print(f.read())

# *Final Setup and Environment Verification*

In [None]:
import torch  # Deep learning framework; queried here for its version and GPU availability
from IPython.display import Image, clear_output  # Notebook helpers for showing images and clearing cell output

# Remove any earlier output of this cell before printing the summary
clear_output()

# Describe the compute device: properties of CUDA device 0 when CUDA is available, otherwise 'CPU'
if torch.cuda.is_available():
    device_summary = torch.cuda.get_device_properties(0)
else:
    device_summary = 'CPU'

# Report the installed PyTorch version together with the device description
print('Setup complete. Using torch %s %s' % (torch.__version__, device_summary))

# *YOLO Model Training Execution*

In [None]:
# ==============================================================================
# Model Training Command (YOLOv8)
# ==============================================================================

# (Optional: Commented out) Shell command to set WANDB_MODE to "dryrun" and execute a custom training script (train_dual.py)
# # !WANDB_MODE="dryrun" python train_dual.py 

# Execute the YOLOv8 training command using the Ultralytics CLI:
!yolo train \
    model=yolov8n.pt \
    workers=8 \
    device=0 \
    batch=32 \
    data = /kaggle/working/vinbigdata.yaml \
    imgsz = 640 \
    epochs = 100
    
# Arguments Explained:
# model=yolov8n.pt:     Starts with a pre-trained YOLOv8 Nano model weights.
# workers=8:            Number of data loading workers (improves speed).
# device=0:             Specifies which GPU to use (device 0).
# batch=32:             Sets the batch size for training.
# data=...yaml:         Points to the configuration file created in the previous step.
# imgsz=640:            Sets the input image size to 640x640 pixels.
# epochs=100:           Defines the total number of training epochs.

# *Visualize Training Labels and Bounding Boxes*

In [None]:
# ==============================================================================
# Post-Training Visualization
# ==============================================================================
import matplotlib.pyplot as plt  # Imported again so this cell can run on its own

# Large 20x20 inch canvas
plt.figure(figsize=(20, 20))

# Hide the axis ticks and labels so only the image is shown
plt.axis('off')

# Load the 'labels.jpg' summary image written by the training run and draw it.
# The trailing semicolon suppresses the notebook's text echo of the return value.
labels_summary = plt.imread('runs/detect/train/labels.jpg')
plt.imshow(labels_summary);

# *Visualize Initial Training Batches*


In [None]:
# ==============================================================================
# Sanity Check: Visualize Initial Training Batches
# ==============================================================================

# Show the first three training-batch mosaics saved by the training run
# (train_batch0.jpg .. train_batch2.jpg), one figure per image.
for batch_idx in range(3):
    # Read first, so a missing file fails before an empty figure is created
    batch_image = plt.imread(f'runs/detect/train/train_batch{batch_idx}.jpg')
    plt.figure(figsize=(15, 15))
    plt.imshow(batch_image)

# Render all figures explicitly instead of relying on the cell's last expression
plt.show()

# *Model Inference on Validation Data*

In [None]:
# ==============================================================================
# Model Inference (Prediction) Command
# ==============================================================================

# Execute the YOLO prediction command using the Ultralytics CLI:
!yolo predict \
    model = runs/detect/train/weights/best.pt \
    imgsz = 640 \
    conf = 0.25 \
    iou = 0.5 \
    source = /kaggle/working/vinbigdata/images/val

# Arguments Explained:
# model=.../best.pt:   Loads the weights of the best model saved during the training run.
# imgsz=640:           Sets the input image size to 640x640 pixels.
# conf=0.25:           Sets the minimum confidence threshold for a detection to be considered valid (25%).
# iou=0.5:             Sets the Non-Maximum Suppression (NMS) IoU threshold for filtering duplicate boxes (50%).
# source=.../val:      Specifies the directory containing the validation images to run predictions on.

# *Visualization of Sample Model Predictions* 

In [None]:
# ==============================================================================
# Visualize Model Prediction Results in a Grid
# ==============================================================================
import matplotlib.pyplot as plt # Core plotting library
from mpl_toolkits.axes_grid1 import ImageGrid # Specific tool for creating image grids
import numpy as np # Numerical operations
import random # For random sampling of files
import cv2 # OpenCV library for image loading and color conversion
from glob import glob # For finding file paths
from tqdm import tqdm # For displaying a progress bar

# Find all files in the YOLO prediction output directory
files = glob('runs/detect/predict/*')

# Grid dimensions are the same for every sample, so they are set once
row = 4 # Number of rows in the grid
col = 4 # Number of columns in the grid

# Never ask for more files than exist: random.sample raises ValueError otherwise
sample_size = min(row * col, len(files))

# Run the visualization process three times to show different random samples
for _ in range(3):
    # Randomly select up to row*col files for this grid
    grid_files = random.sample(files, sample_size)
    images     = [] # List to hold the loaded image arrays

    # Load each selected image and convert its color space
    for image_path in tqdm(grid_files):
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread returns None for files it cannot decode; skip them
            continue
        # OpenCV loads images as BGR; matplotlib expects RGB
        images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    # Initialize the main figure with a size proportional to the grid dimensions
    fig = plt.figure(figsize=(col*5, row*5))

    # Create the ImageGrid object for the subplot layout
    grid = ImageGrid(fig, 111,  # Standard subplot position
                     nrows_ncols=(row, col),  # (rows, columns) order
                     axes_pad=0.05,  # Minimal padding between images
                     )

    # Hide ticks on every cell, including cells left empty when fewer images were loaded
    for ax in grid:
        ax.set_xticks([])
        ax.set_yticks([])

    # Populate the grid: plot each image onto its corresponding axis
    for ax, im in zip(grid, images):
        ax.imshow(im)

    plt.show() # Display the final image grid