In [1]:
import glob
import json
import os
import shutil

In [2]:
import torch
import ultralytics
from ultralytics import YOLO

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [None]:
def _write_json(path, payload):
    """Serialize ``payload`` to ``path`` as indented JSON, overwriting any existing file."""
    with open(path, 'w') as f:
        json.dump(payload, f, indent=4)


def _resolve_run_dir(model, output_dir, run_name):
    """Return the directory the finished training run actually wrote to.

    The run is launched with Ultralytics' default ``exist_ok=False``, so when
    ``<output_dir>/<run_name>`` already exists the trainer saves into an
    incremented directory instead. The trainer's ``save_dir`` is therefore
    preferred; the nominal path is only a fallback when it is unavailable.
    """
    trainer = getattr(model, "trainer", None)
    save_dir = getattr(trainer, "save_dir", None)
    if save_dir:
        return os.fspath(save_dir)
    return os.path.join(output_dir, run_name)


def train_model(dataset_yaml, model_path="yolov8x", img_size=640, batch_size=32, epochs=50, output_dir="/kaggle/working"):
    """
    Train (or fine-tune) a YOLO model on the road damage dataset.

    Side effects, all under ``output_dir``:
      * ``training_config.json``     - the arguments this call was made with
      * ``<run dir>/``               - the Ultralytics run directory
      * ``weights/road_damage_model.pt`` - copy of the selected weights
      * ``training_checkpoint.json`` - outcome summary (success, partial, failed)

    Args:
        dataset_yaml: Path to the dataset yaml file
        model_path: Base model name (e.g., "yolov8x") or path to a .pt file
        img_size: Image size for training
        batch_size: Batch size for training
        epochs: Number of training epochs
        output_dir: Directory to save outputs

    Returns:
        Path to the copied weights file, or ``None`` when the run produced no
        ``.pt`` file at all.
    """
    run_name = "road_damage_detector"

    # Use the GPU when one is visible, otherwise fall back to CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    # Stable, run-independent locations for the outputs of this function
    weights_dir = os.path.join(output_dir, "weights")
    os.makedirs(weights_dir, exist_ok=True)
    final_model_path = os.path.join(weights_dir, "road_damage_model.pt")
    checkpoint_file = os.path.join(output_dir, "training_checkpoint.json")

    # Initialize model
    model = YOLO(model_path)

    # Save training configuration for future reference
    _write_json(
        os.path.join(output_dir, "training_config.json"),
        {
            "dataset_yaml": dataset_yaml,
            "model_path": model_path,
            "epochs": epochs,
            "img_size": img_size,
            "batch_size": batch_size,
            "output_dir": output_dir,
            "device": str(device),
            # Record what was actually selected rather than assuming a GPU
            "accelerator": "gpu" if device.type == "cuda" else "cpu",
        },
    )

    # Train the model
    model.train(
        data=dataset_yaml,
        epochs=epochs,
        imgsz=img_size,
        batch=batch_size,
        project=output_dir,
        name=run_name,
        verbose=True,
        workers=2,    # Number of dataloader worker processes
        cache=False,  # Do not cache the dataset; images are read from disk
        device=str(device),
    )

    # Look for weights in the directory this run wrote to, not in a
    # hard-coded one that may hold files from an earlier run
    run_weights_dir = os.path.join(_resolve_run_dir(model, output_dir, run_name), "weights")
    best_model_path = os.path.join(run_weights_dir, "best.pt")

    # Save a copy to a more accessible location
    if os.path.exists(best_model_path):
        shutil.copy(best_model_path, final_model_path)
        print(f"Best model saved to {final_model_path}")

        _write_json(checkpoint_file, {
            "model_path": final_model_path,
            "training_completed": True,
            "epochs_completed": epochs
        })
        return final_model_path

    print(f"Warning: Best model not found at {best_model_path}")

    # Fall back to the most recently written weights file of this run, if any
    model_files = glob.glob(os.path.join(run_weights_dir, "*.pt"))
    if model_files:
        latest_model = max(model_files, key=os.path.getmtime)
        print(f"Found alternative model file: {latest_model}")
        shutil.copy(latest_model, final_model_path)

        _write_json(checkpoint_file, {
            "model_path": final_model_path,
            "model_source": latest_model,
            "training_completed": False
        })
        return final_model_path

    # Nothing usable was produced; record the failure for later inspection
    _write_json(checkpoint_file, {
        "model_path": None,
        "training_completed": False,
        "error": "No model weights found"
    })
    return None


# Paths
# Weights to start from; kept under its own name so it is not confused with
# the trained-weights path that train_model returns below.
base_model_path = "/kaggle/input/train-rdd2022/weights/road_damage_model.pt"
dataset_yaml = "/kaggle/input/rdd-2022/RDD-2022/dataset.yaml"
work_dir = "/kaggle/working/"

# Train the model. `model_path` receives the location of the copied weights,
# or None if the run produced no weights file.
model_path = train_model(
    dataset_yaml=dataset_yaml,
    model_path=base_model_path,  # Base model name or path
    output_dir=work_dir,
    batch_size=16,
    img_size=640,
    epochs=20
)

Using device: cuda
Ultralytics 8.3.100 🚀 Python-3.10.12 torch-2.5.1+cu121 CUDA:0 (Tesla P100-PCIE-16GB, 16269MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=/kaggle/input/train-rdd2022/weights/road_damage_model.pt, data=/kaggle/input/rdd-2022/RDD-2022/dataset.yaml, epochs=20, time=None, patience=100, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=cuda, workers=2, project=/kaggle/working/, name=road_damage_detector, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, e

100%|██████████| 755k/755k [00:00<00:00, 34.9MB/s]



                   from  n    params  module                                       arguments                     
  0                  -1  1      2320  ultralytics.nn.modules.conv.Conv             [3, 80, 3, 2]                 
  1                  -1  1    115520  ultralytics.nn.modules.conv.Conv             [80, 160, 3, 2]               
  2                  -1  3    436800  ultralytics.nn.modules.block.C2f             [160, 160, 3, True]           
  3                  -1  1    461440  ultralytics.nn.modules.conv.Conv             [160, 320, 3, 2]              
  4                  -1  6   3281920  ultralytics.nn.modules.block.C2f             [320, 320, 6, True]           
  5                  -1  1   1844480  ultralytics.nn.modules.conv.Conv             [320, 640, 3, 2]              
  6                  -1  6  13117440  ultralytics.nn.modules.block.C2f             [640, 640, 6, True]           
  7                  -1  1   3687680  ultralytics.nn.modules.conv.Conv             [640

100%|██████████| 5.35M/5.35M [00:00<00:00, 164MB/s]


[34m[1mAMP: [0mchecks passed ✅


[34m[1mtrain: [0mScanning /kaggle/input/rdd-2022/RDD-2022/train/labels... 12734 images, 0 backgrounds, 0 corrupt: 100%|██████████| 12734/12734 [01:00<00:00, 210.71it/s]


[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, num_output_channels=3, method='weighted_average'), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))


  check_for_updates()
[34m[1mval: [0mScanning /kaggle/input/rdd-2022/RDD-2022/test/labels... 3194 images, 0 backgrounds, 0 corrupt: 100%|██████████| 3194/3194 [00:15<00:00, 208.57it/s]


Plotting labels to /kaggle/working/road_damage_detector/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.00125, momentum=0.9) with parameter groups 97 weight(decay=0.0), 104 weight(decay=0.0005), 103 bias(decay=0.0)
[34m[1mTensorBoard: [0mmodel graph visualization added ✅
Image sizes 640 train, 640 val
Using 2 dataloader workers
Logging results to [1m/kaggle/working/road_damage_detector[0m
Starting training for 20 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       1/20      13.6G      1.755      1.994      1.742         41        640: 100%|██████████| 796/796 [17:58<00:00,  1.35s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:33<00:00,  1.08it/s]


                   all       3194       6900      0.418      0.387      0.355      0.155

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       2/20      13.7G      1.829      2.137      1.794         54        640: 100%|██████████| 796/796 [17:48<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


                   all       3194       6900      0.402      0.378      0.333      0.142

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       3/20      13.7G       1.85      2.156      1.804         54        640: 100%|██████████| 796/796 [17:46<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


                   all       3194       6900      0.428      0.385      0.369      0.165

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       4/20      13.7G      1.836       2.14       1.79         41        640: 100%|██████████| 796/796 [17:44<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


                   all       3194       6900      0.454      0.391      0.377      0.171

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       5/20      13.7G      1.799      2.063      1.752         44        640: 100%|██████████| 796/796 [17:44<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


                   all       3194       6900      0.451      0.442      0.413      0.184

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       6/20      13.7G      1.783      2.015       1.74         47        640: 100%|██████████| 796/796 [17:44<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


                   all       3194       6900      0.496       0.44      0.433      0.202

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       7/20      13.7G      1.758       1.97      1.722         57        640: 100%|██████████| 796/796 [17:44<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


                   all       3194       6900      0.481      0.419      0.406      0.188

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       8/20      13.7G      1.725      1.917      1.693         40        640: 100%|██████████| 796/796 [17:44<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


                   all       3194       6900      0.511       0.46      0.457      0.216

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       9/20      13.7G      1.715      1.883      1.691         42        640: 100%|██████████| 796/796 [17:44<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


                   all       3194       6900      0.528      0.476      0.479      0.231

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      10/20      13.7G      1.706      1.854      1.686         49        640: 100%|██████████| 796/796 [17:45<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


                   all       3194       6900      0.533      0.487      0.492      0.233
Closing dataloader mosaic
[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, num_output_channels=3, method='weighted_average'), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      11/20      13.7G      1.707      1.809      1.734         30        640: 100%|██████████| 796/796 [17:44<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


                   all       3194       6900      0.539      0.498      0.505      0.243

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      12/20      13.7G       1.69      1.751      1.719         33        640: 100%|██████████| 796/796 [17:44<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.10it/s]


                   all       3194       6900      0.547      0.508      0.515      0.248

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      13/20      13.7G      1.673      1.714       1.71         28        640: 100%|██████████| 796/796 [17:44<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


                   all       3194       6900      0.542      0.521      0.525      0.256

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      14/20      13.7G      1.643      1.668      1.696         28        640: 100%|██████████| 796/796 [17:45<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


                   all       3194       6900       0.57      0.521      0.539      0.264

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      15/20      13.7G      1.629      1.624      1.682         36        640: 100%|██████████| 796/796 [17:44<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.10it/s]


                   all       3194       6900      0.579       0.53      0.557      0.275

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      16/20      13.7G      1.612      1.582      1.676         34        640: 100%|██████████| 796/796 [17:44<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.09it/s]

                   all       3194       6900      0.573       0.54       0.56       0.28






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      17/20      13.7G      1.592      1.546      1.658         19        640: 100%|██████████| 796/796 [17:44<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.10it/s]

                   all       3194       6900       0.59      0.547      0.573      0.284






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      18/20      13.7G      1.574      1.503      1.641         33        640: 100%|██████████| 796/796 [17:44<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.09it/s]

                   all       3194       6900      0.587      0.559      0.579      0.289






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      19/20      13.7G      1.552      1.472      1.634         37        640: 100%|██████████| 796/796 [17:44<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.10it/s]

                   all       3194       6900      0.596      0.564      0.586      0.294






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      20/20      13.7G      1.536      1.442      1.615         26        640: 100%|██████████| 796/796 [17:44<00:00,  1.34s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:31<00:00,  1.09it/s]

                   all       3194       6900      0.605       0.56      0.589      0.299






20 epochs completed in 6.443 hours.
Optimizer stripped from /kaggle/working/road_damage_detector/weights/last.pt, 136.7MB
Optimizer stripped from /kaggle/working/road_damage_detector/weights/best.pt, 136.7MB

Validating /kaggle/working/road_damage_detector/weights/best.pt...
Ultralytics 8.3.100 🚀 Python-3.10.12 torch-2.5.1+cu121 CUDA:0 (Tesla P100-PCIE-16GB, 16269MiB)
Model summary (fused): 112 layers, 68,127,420 parameters, 0 gradients, 257.4 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 100/100 [01:34<00:00,  1.06it/s]


                   all       3194       6900      0.606       0.56      0.589      0.299
                   D00       1541       2457        0.6      0.634       0.64      0.351
                   D10        964       1537      0.555      0.523      0.526      0.247
                   D20       1425       1794      0.666      0.609      0.666      0.363
                   D40        594       1112      0.602      0.473      0.524      0.234


  xa[xa < 0] = -1
  xa[xa < 0] = -1


Speed: 0.2ms preprocess, 26.0ms inference, 0.0ms loss, 0.8ms postprocess per image
Results saved to [1m/kaggle/working/road_damage_detector[0m
Best model saved to /kaggle/working/weights/road_damage_model.pt


In [4]:
# Show the weights path returned by train_model (None if no weights were found).
print(model_path)

/kaggle/working/weights/road_damage_model.pt
