# Road Scene Understanding with the KITTI Dataset
#### CS 5190
#### Team Members: 

### SETUP
1. Download dataset from https://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=2d
2. Set up Notebook environment
`> conda install matplotlib numpy opencv pandas scikit-image scikit-learn scipy ultralytics opencv-python tqdm pillow`
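3. Create the `dataset` folder for YOLOv8 (with the `train/` and `val/` sub-folders shown under FILE STRUCTURE in the Preprocessing section)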

Each line in a label file contains the following:
`<object_type> <truncation> <occlusion> <alpha> <left> <top> <right> <bottom> <height> <width> <length> <x> <y> <z> <rotation_y>`

YOLOv8 requires the format `class x_center y_center width height`, with the box values normalized by the image width and height. Each image has one `.txt` file containing one line per bounding box.
Structure for yolov8: https://roboflow.com/formats/yolov8-pytorch-txt

# Dataset

In [3]:
# --- KITTI base paths (relative, Windows-style separators) ---
base_dataset_path = "..\\dataset\\"   # root of the YOLO-formatted dataset
base_yolo_path = "..\\yolov8\\"       # no longer used
base_labels_path = "..\\data_object_label_2\\training\\label_2\\"
base_images_train_path = "..\\data_object_image_2\\training\\image_2\\"

# --- KITTI object types found in the label files, mapped to integer class ids ---
# The id is simply the position of the name in this ordering.
OBJECT_CLASSES = {
    name: class_id
    for class_id, name in enumerate(
        (
            'Car', 'Van', 'Truck', 'Pedestrian', 'Person_sitting',
            'Cyclist', 'Tram', 'Misc', 'DontCare',
        )
    )
}

# --- Column names for one line of a KITTI label file ---
# NOTE(review): 'score' is a 16th name beyond the 15 fields listed in the label
# format description near the top of this notebook -- presumably it only
# appears in detection result files; confirm.
LABEL_FILE_FIELDS = [
    'label',
    'truncated',
    'occluded',
    'alpha',
    'bbox_xmin',
    'bbox_ymin',
    'bbox_xmax',
    'bbox_ymax',
    'dim_height',
    'dim_width',
    'dim_length',
    'loc_x',
    'loc_y',
    'loc_z',
    'rotation_y',
    'score',
]

In [9]:
import os
from PIL import Image

# Build one {"img_path", "label_path"} dict per KITTI label file.
# If either source folder is missing, an error is printed and path_pairs stays empty.
path_pairs = []
if not os.path.isdir(base_labels_path):
    print(f"Error: Folder {base_labels_path} not found")
elif not os.path.isdir(base_images_train_path):
    print(f"Error: Folder {base_images_train_path} not found")
else:
    # Sorted so the pair order does not depend on the OS directory listing order.
    for full_filename in sorted(os.listdir(base_labels_path)):
        stem, ext = os.path.splitext(full_filename)
        # Only label files become pairs; anything else in the folder (hidden or
        # system files, stray extensions) would otherwise yield a bogus pair.
        if ext.lower() != ".txt":
            continue
        path_pairs.append({
            # The image shares the label file's stem, with a .png extension.
            "img_path": base_images_train_path + stem + ".png",
            "label_path": base_labels_path + full_filename,
        })

In [10]:
#TODO: Might want to implement train split later on
from sklearn.model_selection import train_test_split

# Separate into 80% training and 20% validation.
# A fixed random_state makes the shuffle repeatable for a given path_pairs
# order, so re-running this cell does not move images between the two splits
# (which would otherwise let the same image end up in both train/ and val/
# on disk after a second preprocessing run).
train, validate = train_test_split(
    path_pairs,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)


In [18]:
# Peek at the first two image/label path pairs of the training split.
train[:2]

[{'img_path': '..\\data_object_image_2\\training\\image_2\\000747.png',
  'label_path': '..\\data_object_label_2\\training\\label_2\\000747.txt'},
 {'img_path': '..\\data_object_image_2\\training\\image_2\\006950.png',
  'label_path': '..\\data_object_label_2\\training\\label_2\\006950.txt'}]

In [19]:
# Number of image/label pairs in the training split.
len(train)

374

## Preprocessing

FILE STRUCTURE

> dataset/
> |---train/
> |     |---images/
> |     |   |---000001.png   
> |     |   |---000002.png   
> |     |   |---...cont
> |     |---labels/
> |        |---000001.txt   
> |        |---000002.txt   
> |        |---...cont   
> |---val/
> |     |---images/
> |     |   |---000201.png   
> |     |   |---000202.png   
> |     |   |---...cont
> |     |---labels/
> |        |---000201.txt   
> |        |---000202.txt   
> |        |---...cont   
> |---kitti.yaml

In [19]:
def convert_bbox_yolo8(img_w, img_h, x1 , y1, x2, y2):
    """Turn a corner-format bounding box into a normalized center-format box.

    Args:
        img_w: image width, used to normalize horizontal values.
        img_h: image height, used to normalize vertical values.
        x1, y1: first corner of the box (as read from the label file).
        x2, y2: opposite corner of the box.

    Returns:
        Tuple ``(x_center, y_center, width, height)``, each divided by the
        matching image dimension.
    """
    # Box midpoint and extent, still in the label file's units.
    mid_x = (x1 + x2) / 2
    mid_y = (y1 + y2) / 2
    box_w = x2 - x1
    box_h = y2 - y1

    # Normalize horizontal values by the width and vertical ones by the height.
    return mid_x / img_w, mid_y / img_h, box_w / img_w, box_h / img_h

In [22]:
# NOTE: the earlier version of this cell, which re-encoded every PNG through
# PIL, took around 4 minutes to run; the image is now copied instead.
def preprocess(path_pairs, stage):
    """Convert KITTI labels to YOLO format and stage the images for one split.

    For every pair this writes a YOLO label file (one
    ``class x_center y_center width height`` line per kept object) into
    ``<base_dataset_path>/<stage>/labels/`` and copies the image into
    ``<base_dataset_path>/<stage>/images/``. Objects labelled ``DontCare`` or
    ``Misc`` are dropped. Pairs whose image cannot be opened are reported and
    skipped.

    Args:
        path_pairs: iterable of dicts with ``'img_path'`` and ``'label_path'`` keys.
        stage: name of the split sub-folder, e.g. ``"train"`` or ``"val"``.

    Raises:
        KeyError: if a label line names an object type missing from OBJECT_CLASSES.
        OSError: if a label file cannot be read or an output file cannot be written.
    """
    import shutil  # local import so this cell does not depend on another cell's imports

    excluded_classes = {'DontCare', 'Misc'}

    # Make sure the output folders exist before writing into them.
    labels_dir = os.path.join(base_dataset_path, stage, "labels")
    images_dir = os.path.join(base_dataset_path, stage, "images")
    os.makedirs(labels_dir, exist_ok=True)
    os.makedirs(images_dir, exist_ok=True)

    for curr_pair in path_pairs:
        yolo_label_path = os.path.join(labels_dir, os.path.basename(curr_pair['label_path']))
        insert_img_path = os.path.join(images_dir, os.path.basename(curr_pair['img_path']))

        # Open the image only to read its size; the context manager closes the
        # file handle again right away.
        try:
            with Image.open(curr_pair['img_path']) as img:
                img_width, img_height = img.size
        except Exception as e:
            print(f"Error opening image {curr_pair['img_path']}: {e}")
            continue

        # Translate the KITTI label file line by line.
        yolo_lines = []
        with open(curr_pair['label_path']) as f:
            for line in f:
                label_parts = line.split()
                if not label_parts:
                    continue  # tolerate blank lines (e.g. a trailing newline)
                object_name = label_parts[0]
                if object_name in excluded_classes:
                    continue  # excluding the DontCare & Misc bounding boxes
                class_id = OBJECT_CLASSES[object_name]
                # Fields 4..7 are the original bounding box corners from the KITTI label.
                bb_x1, bb_y1, bb_x2, bb_y2 = map(float, label_parts[4:8])
                x_center, y_center, width, height = convert_bbox_yolo8(
                    img_width, img_height, bb_x1, bb_y1, bb_x2, bb_y2
                )
                yolo_lines.append(f"{class_id} {x_center} {y_center} {width} {height}\n")  # yolo format

        # Copy the image byte-for-byte rather than decoding and re-encoding it.
        shutil.copyfile(curr_pair['img_path'], insert_img_path)
        # Create and write the YOLO normalized bounding boxes.
        with open(yolo_label_path, "w") as out:
            out.writelines(yolo_lines)

In [23]:
def clean_folder(path):
    """Delete every regular file directly inside ``path``.

    Entries that are not files (such as sub-directories) are left in place.
    A missing ``path`` or a file that cannot be removed is reported with a
    printed message rather than an exception.

    Args:
        path: folder whose files should be removed.
    """
    if os.path.exists(path):
        for entry_name in os.listdir(path):
            target = os.path.join(path, entry_name)
            # Leave anything that is not a plain file untouched.
            if not os.path.isfile(target):
                continue
            try:
                os.remove(target)
            except OSError as err:
                print(f"Error removing file {target}: {err}")
    else:
        print(f"Error: Directory {path} DNE")

In [None]:
# Clean the dataset folders so files from an earlier run are removed.
for split_dir in ("train//labels//", "train//images//", "val//labels//", "val//images//"):
    clean_folder(base_dataset_path + split_dir)

# Build the YAML path once. Reusing double quotes inside a double-quoted
# f-string expression is a SyntaxError before Python 3.12, so the path is not
# concatenated inside the message below.
kitti_yaml_path = base_dataset_path + "kitti.yaml"
try:
    os.remove(kitti_yaml_path)
except OSError as e:
    print(f"Error removing file {kitti_yaml_path}: {e}")

In [None]:

# Trial run on a small slice of each split: the first 100 training pairs and
# the first 20 validation pairs.
for split_name, split_pairs, sample_size in (("train", train, 100), ("val", validate, 20)):
    preprocess(split_pairs[:sample_size], split_name)

## Model Training

In [28]:
# Create the dataset YAML for YOLOv8.
# It lists the dataset root, the image folders of both splits (relative to the
# root) and the class id -> name mapping. Only ids 0-6 are listed because
# preprocessing drops the Misc and DontCare boxes.
kitti_yaml = f"""path: {base_dataset_path}
train: train/images
val: val/images
nc: 7
names:
    0: Car
    1: Van
    2: Truck
    3: Pedestrian
    4: Person_sitting
    5: Cyclist
    6: Tram
"""

# write() rather than writelines(): the content is one string, not a list of lines.
with open(base_dataset_path + "kitti.yaml", "w", encoding="utf-8") as out:
    out.write(kitti_yaml)

In [4]:
from ultralytics import YOLO

# Start from the pretrained yolov8n.pt weights.
model = YOLO("yolov8n.pt")

# Training settings, gathered in one place.
train_settings = {
    "data": base_dataset_path + "kitti.yaml",  # dataset description file
    "epochs": 20,
    "imgsz": 640,
    "batch": 8,
    "name": "yolo_kitti",                      # run name
}

# Train the model. Kept as the cell's last expression so the notebook still
# displays the value returned by train().
model.train(**train_settings)

New https://pypi.org/project/ultralytics/8.3.230 available  Update with 'pip install -U ultralytics'
Ultralytics 8.3.228  Python-3.14.0 torch-2.9.1+cpu CPU (AMD Ryzen 7 5800H with Radeon Graphics)
[34m[1mengine\trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=8, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=..\dataset\kitti.yaml, degrees=0.0, deterministic=True, device=cpu, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=20, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolov8n.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=yolo_kitti2, nbs=64,

ultralytics.utils.metrics.DetMetrics object with attributes:

ap_class_index: array([0, 1, 2, 3, 4, 5, 6])
box: ultralytics.utils.metrics.Metric object
confusion_matrix: <ultralytics.utils.metrics.ConfusionMatrix object at 0x00000148C33B2A30>
curves: ['Precision-Recall(B)', 'F1-Confidence(B)', 'Precision-Confidence(B)', 'Recall-Confidence(B)']
curves_results: [[array([          0,    0.001001,    0.002002,    0.003003,    0.004004,    0.005005,    0.006006,    0.007007,    0.008008,    0.009009,     0.01001,    0.011011,    0.012012,    0.013013,    0.014014,    0.015015,    0.016016,    0.017017,    0.018018,    0.019019,     0.02002,    0.021021,    0.022022,    0.023023,
          0.024024,    0.025025,    0.026026,    0.027027,    0.028028,    0.029029,     0.03003,    0.031031,    0.032032,    0.033033,    0.034034,    0.035035,    0.036036,    0.037037,    0.038038,    0.039039,     0.04004,    0.041041,    0.042042,    0.043043,    0.044044,    0.045045,    0.046046,    0.047047

## Testing

In [5]:
# Run the trained model on the sample images, keeping detections with
# confidence of at least 0.3 and saving the rendered predictions.
sample_images_dir = "..\\data_object_image_2\\training\\sample"
results = model.predict(
    source=sample_images_dir,
    save=True,
    conf=0.3,
)


image 1/5 c:\Users\Sabrina Ferras\OneDrive - csumb.edu\Documents\CPP\4Fall2025\CS_5190\finalProject\code\..\data_object_image_2\training\sample\000031.png: 224x640 6 Cars, 69.3ms
image 2/5 c:\Users\Sabrina Ferras\OneDrive - csumb.edu\Documents\CPP\4Fall2025\CS_5190\finalProject\code\..\data_object_image_2\training\sample\000032.png: 224x640 6 Cars, 45.9ms
image 3/5 c:\Users\Sabrina Ferras\OneDrive - csumb.edu\Documents\CPP\4Fall2025\CS_5190\finalProject\code\..\data_object_image_2\training\sample\000039.png: 224x640 5 Cars, 52.0ms
image 4/5 c:\Users\Sabrina Ferras\OneDrive - csumb.edu\Documents\CPP\4Fall2025\CS_5190\finalProject\code\..\data_object_image_2\training\sample\000064.png: 224x640 2 Cars, 46.1ms
image 5/5 c:\Users\Sabrina Ferras\OneDrive - csumb.edu\Documents\CPP\4Fall2025\CS_5190\finalProject\code\..\data_object_image_2\training\sample\000074.png: 224x640 1 Van, 5 Pedestrians, 2 Cyclists, 35.5ms
Speed: 3.6ms preprocess, 49.8ms inference, 1.6ms postprocess per image at shap

## Visualizations