In [1]:
!pip install ultralytics

import cv2
import os
import dlib
import torch
import torch.nn as nn
import subprocess
import numpy as np
from sklearn.preprocessing import LabelEncoder
from ultralytics import YOLO
from tqdm import tqdm

Collecting ultralytics
  Downloading ultralytics-8.3.228-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.18 (from ultralytics)
  Downloading ultralytics_thop-2.0.18-py3-none-any.whl.metadata (14 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading n

In [2]:
# Run on the GPU when torch can see one; otherwise stay on the CPU.
device_str = "cpu"
if torch.cuda.is_available():
    device_str = "cuda"

# General-purpose YOLO checkpoint; get_yolo_features() runs its leading layers
# as a feature extractor, so the wrapped network is switched to eval mode.
model = YOLO("yolo11n.pt")
model = model.to(device_str)
model.model = model.model.eval().to(device_str)

# Face detector used by get_features() to locate the face box in each frame.
model_face = YOLO("/kaggle/input/yolo-face-model/yolov11n-face.pt")
model_face = model_face.to(device_str)

# dlib shape predictor loaded from the 68-landmark model file.
predictor = dlib.shape_predictor(
    '/kaggle/input/dlib-68-face-predictor/shape_predictor_68_face_landmarks.dat'
)

[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n.pt to 'yolo11n.pt': 100% ━━━━━━━━━━━━ 5.4MB 67.5MB/s 0.1s


---

# Feature Extraction

In [4]:
def get_rotation(video_path):
    """Return the value of the `rotate` stream tag of a video, in degrees.

    The tag is read from the first video stream with ``ffprobe``. This is a
    best-effort lookup: when ffprobe cannot be started, or its output is empty
    or not an integer (e.g. the file carries no rotate tag), 0 is returned so
    callers treat the video as unrotated.

    Args:
        video_path: Path of the video file to probe.

    Returns:
        int: The rotation reported by ffprobe, or 0 when it is unavailable.
    """
    try:
        result = subprocess.run([
            'ffprobe', '-v', 'error',
            '-select_streams', 'v:0',
            '-show_entries', 'stream_tags=rotate',
            '-of', 'default=nokey=1:noprint_wrappers=1',
            video_path
        ], capture_output=True, text=True)
        return int(result.stdout.strip())
    except (OSError, ValueError, subprocess.SubprocessError):
        # OSError: ffprobe is not installed / not executable.
        # ValueError: no tag present or unparsable output.
        # Anything else (notably KeyboardInterrupt) is deliberately NOT swallowed,
        # so a long extraction run can still be interrupted from here.
        return 0

def rotate_frame(frame, degrees):
    """Rotate a frame by 90, 180 or 270 degrees; return it untouched otherwise.

    Args:
        frame: Image to rotate (passed straight to ``cv2.rotate``).
        degrees: Rotation to apply. Only 90, 180 and 270 trigger a rotation.

    Returns:
        The rotated image, or ``frame`` itself for any other ``degrees`` value.
    """
    if degrees not in (90, 180, 270):
        return frame

    # Built after the guard so unrotated frames never touch cv2.
    rotate_codes = {
        90: cv2.ROTATE_90_CLOCKWISE,
        180: cv2.ROTATE_180,
        270: cv2.ROTATE_90_COUNTERCLOCKWISE,
    }
    return cv2.rotate(frame, rotate_codes[degrees])

In [5]:
def calculate_EAR(eye_points):
    """Compute the eye aspect ratio from six eye landmarks.

    The ratio is the sum of the two distances between point pairs (1, 5) and
    (2, 4), divided by twice the distance between points 0 and 3.

    Args:
        eye_points: Indexable collection of at least six point arrays.

    Returns:
        The aspect ratio as a NumPy float.
    """
    paired_sum = sum(
        np.linalg.norm(eye_points[upper] - eye_points[lower])
        for upper, lower in ((1, 5), (2, 4))
    )
    span = np.linalg.norm(eye_points[0] - eye_points[3])
    return paired_sum / (2.0 * span)


def calculate_MAR(mouth_points):
    """Compute the mouth aspect ratio from eight mouth landmarks.

    The ratio is the sum of the distances between point pairs (1, 7), (2, 6)
    and (3, 5), divided by twice the distance between points 0 and 4.

    Args:
        mouth_points: Indexable collection of at least eight point arrays.

    Returns:
        The aspect ratio as a NumPy float.
    """
    paired_sum = sum(
        np.linalg.norm(mouth_points[upper] - mouth_points[lower])
        for upper, lower in ((1, 7), (2, 6), (3, 5))
    )
    span = np.linalg.norm(mouth_points[0] - mouth_points[4])
    return paired_sum / (2.0 * span)


def calculate_HPE(landmarks, frame_shape):
    """Estimate head pose angles from facial landmarks with ``cv2.solvePnP``.

    Six landmarks are matched against a fixed 3D reference model, the pose is
    solved with a pinhole camera approximated from the frame size, and the
    resulting rotation is decomposed into Euler angles.

    Args:
        landmarks: Indexable landmark coordinates; entries 30, 8, 36, 45, 48
            and 54 are used (presumably nose tip, chin, outer eye corners and
            mouth corners of the 68-point layout; confirm).
        frame_shape: Shape of the frame the landmarks were measured in; only
            the first two entries (height, width) are read.

    Returns:
        tuple: ``(pitch, yaw, roll)`` taken in that order from the Euler angles
        returned by ``cv2.decomposeProjectionMatrix``, or three NaNs when
        ``solvePnP`` reports failure.
    """
    height, width = frame_shape[:2]

    # 2D observations, in the same order as the 3D reference points below.
    landmark_ids = (30, 8, 36, 45, 48, 54)
    image_points = np.array([landmarks[idx] for idx in landmark_ids], dtype="double")

    # Fixed 3D reference coordinates, one row per entry of landmark_ids.
    model_points = np.array([
        (0.0, 0.0, 0.0),
        (0.0, -330.0, -65.0),
        (-225.0, 170.0, -135.0),
        (225.0, 170.0, -135.0),
        (-150.0, -150.0, -125.0),
        (150.0, -150.0, -125.0)
    ])

    # Approximate intrinsics: focal length equal to the frame width and the
    # principal point at the frame centre.
    cx, cy = width / 2, height / 2
    camera_matrix = np.array([
        [width, 0, cx],
        [0, width, cy],
        [0, 0, 1]
    ], dtype="double")

    # Zero distortion coefficients are passed to the solver.
    dist_coeffs = np.zeros((4, 1))

    solved, rvec, tvec = cv2.solvePnP(
        model_points, image_points, camera_matrix, dist_coeffs, flags=cv2.SOLVEPNP_ITERATIVE
    )
    if not solved:
        return np.nan, np.nan, np.nan

    # Rotation vector -> 3x3 matrix -> [R | t], whose decomposition's last
    # return value holds the Euler angles.
    rotation_mat, _ = cv2.Rodrigues(rvec)
    pose_mat = np.hstack((rotation_mat, tvec))
    euler_angles = cv2.decomposeProjectionMatrix(pose_mat)[6]

    pitch, yaw, roll = euler_angles.flatten().astype(float)
    return pitch, yaw, roll

In [6]:
def get_yolo_features(frame):
    """Embed an image with the leading layers of the global YOLO ``model``.

    The BGR input is converted to RGB, resized to 224x224, turned into a float
    tensor divided by 255, and passed through the first ten modules of
    ``model.model.model`` with gradients disabled. The result is average-pooled
    to 1x1 and returned as a NumPy array on the CPU.

    NOTE(review): ``[-1]`` is applied to the sliced network's output before
    pooling. If that output is a single (1, C, H, W) tensor, this selects the
    last batch item and the returned array has shape (C, 1) rather than (C,).
    Confirm against the length-256 NaN placeholder used in ``get_features``.

    Args:
        frame: BGR image array.

    Returns:
        numpy.ndarray: Pooled feature array.
    """
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    rgb = cv2.resize(rgb, (224, 224))

    # HWC -> CHW, add a batch axis, move to the device, then divide by 255.
    batch = torch.from_numpy(rgb).float().permute(2, 0, 1).unsqueeze(0).to(device_str) / 255.0

    with torch.no_grad():
        # Only the first ten modules are run, not the full detector.
        stage_out = model.model.model[:10](batch)
        pooled = torch.nn.functional.adaptive_avg_pool2d(stage_out[-1], (1, 1))
        flat = pooled.view(pooled.size(0), -1).squeeze(0)

    return flat.cpu().numpy()

In [7]:
def get_features(current_data, current_num, total_num):
    """Extract per-frame drowsiness features from one video file.

    Every readable frame is rotated according to the video's rotate tag and run
    through the face detector. For the first detected face box, dlib landmarks
    are computed and turned into EAR, MAR and head-pose angles, and the face
    crop is embedded with ``get_yolo_features``. Frames with no detection get
    NaN placeholders so both output lists stay aligned frame by frame. Frames
    that cannot be read are skipped and contribute nothing to either list.

    Args:
        current_data: Path of the video file.
        current_num: 1-based index of the video, shown in the progress bar.
        total_num: Total number of videos, shown in the progress bar.

    Returns:
        tuple: ``(features, yolo_features)`` where ``features`` is a list of
        ``[ear, mar, pitch, yaw, roll]`` rows and ``yolo_features`` is a list
        holding, per frame, either the output of ``get_yolo_features`` or a
        list of 256 NaNs.
    """
    features = []
    yolo_features = []
    vid = cv2.VideoCapture(current_data)
    try:
        rotation = get_rotation(current_data)
        num_frames = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))

        for _ in tqdm(range(num_frames), desc=f"{current_num}/{total_num}"):
            ret, frame = vid.read()
            if not ret or frame is None:
                continue
            frame = rotate_frame(frame, rotation)
            if frame is None:
                continue

            pred = model_face(frame, device=device_str, verbose=False)[0]
            if len(pred) == 0:
                # No face in this frame: keep the timeline aligned with NaNs.
                features.append([np.nan] * 5)
                yolo_features.append([np.nan] * 256)
                continue

            # Only the first detected box is used; any further boxes are ignored.
            x1, y1, x2, y2 = map(int, pred.boxes[0].xyxy[0])
            rect = dlib.rectangle(left=x1, top=y1, right=x2, bottom=y2)
            face_landmark = predictor(frame, rect)

            # The crop includes the bottom/right box edge (hence the +1).
            face_crop = frame[y1:y2 + 1, x1:x2 + 1]
            yolo_features.append(get_yolo_features(face_crop))

            landmarks = np.array(
                [[face_landmark.part(idx).x, face_landmark.part(idx).y] for idx in range(68)]
            )

            left_eye = landmarks[36:42]
            right_eye = landmarks[42:48]
            ear = (calculate_EAR(left_eye) + calculate_EAR(right_eye)) / 2.0

            mouth = landmarks[60:68]
            mar = calculate_MAR(mouth)

            pitch, yaw, roll = calculate_HPE(landmarks, frame.shape)
            features.append([ear, mar, pitch, yaw, roll])
    finally:
        # Always free the decoder handle, even if a frame fails mid-video.
        vid.release()
    return features, yolo_features

In [3]:
def _collect_videos(root):
    """Collect video paths and label stems for every file under ``root``.

    Files are visited in sorted order so the result is deterministic
    (``os.walk`` itself yields entries in arbitrary order). A recording stored
    as two files named ``10_1`` and ``10_2`` in the same directory is returned
    as ONE entry: a list ``[<10_1 path>, <10_2 path>]`` labelled ``"10"``,
    always with the parts in that order.

    Args:
        root: Directory to walk recursively.

    Returns:
        tuple: ``(paths, labels)``. ``paths`` holds a str per video, or a list
        of str for a split recording; ``labels`` holds the file name up to the
        first dot for each entry.
    """
    paths, labels = [], []
    for dirname, dirnames, filenames in os.walk(root):
        dirnames.sort()  # in-place sort makes os.walk descend in a stable order
        split_parts = []
        for filename in sorted(filenames):
            stem = filename.split(".")[0]
            full_path = os.path.join(dirname, filename)
            if stem in ("10_1", "10_2"):
                # Sorted iteration guarantees 10_1 is collected before 10_2.
                split_parts.append(full_path)
            else:
                paths.append(full_path)
                labels.append(stem)
        if split_parts:
            paths.append(split_parts)
            labels.append("10")
    return paths, labels


RLDD_ROOT = '/kaggle/input/uta-reallife-drowsiness-dataset'
RLDD_FOLD5_ROOT = '/kaggle/input/uta-rldd-fold5'

# One entry per fold: the directories whose videos make up that fold.
FOLD_DIRS = [
    [f'{RLDD_ROOT}/Fold1_part1/Fold1_part1', f'{RLDD_ROOT}/Fold1_part2/Fold1_part2'],
    [f'{RLDD_ROOT}/Fold2_part1/Fold2_part1', f'{RLDD_ROOT}/Fold2_part2/Fold2_part2'],
    [f'{RLDD_ROOT}/Fold3_part1/Fold3_part1', f'{RLDD_ROOT}/Fold3_part2/Fold3_part2'],
    [f'{RLDD_ROOT}/Fold4_part1/Fold4_part1', f'{RLDD_ROOT}/Fold4_part2/Fold4_part2'],
    [f'{RLDD_FOLD5_ROOT}/Fold5_part1/Fold5_part1', f'{RLDD_FOLD5_ROOT}/Fold5_part2/Fold5_part2'],
]

# dataX[f] lists the videos of fold f (a nested list marks a split recording);
# dataY[f] holds the matching labels, index-aligned with dataX[f].
dataX = [[] for _ in FOLD_DIRS]
dataY = [[] for _ in FOLD_DIRS]

for fold_idx, fold_dirs in enumerate(FOLD_DIRS):
    for fold_dir in fold_dirs:
        fold_paths, fold_labels = _collect_videos(fold_dir)
        dataX[fold_idx].extend(fold_paths)
        dataY[fold_idx].extend(fold_labels)

# NOTE: LabelEncoder sorts its classes, and these classes are strings, so the
# integer codes are "0" -> 0, "10" -> 1, "5" -> 2 (lexicographic, not numeric).
le = LabelEncoder().fit(["0", "5", "10"])
dataY = [le.transform(d) for d in dataY]


print(len(dataX), len(dataY), ":")
for fold_paths, fold_labels in zip(dataX, dataY):
    print(len(fold_paths), len(fold_labels))

5 5 :
36 36
36 36
36 36
36 36
36 36


In [27]:
# Extract the features of every video in fold 1 (index 0) and save them.
i = 0
total_num = len(dataX[i])
labels = np.array(dataY[i])

all_features, all_yolo_features, lengths = [], [], []

for video_num, entry in enumerate(dataX[i], start=1):
    # A list entry is one recording split over several files: process the parts
    # in order and concatenate their frames into a single sequence.
    parts = entry if isinstance(entry, list) else [entry]
    feats, yolo_feats = [], []
    for part_path in parts:
        part_feats, part_yolo_feats = get_features(part_path, video_num, total_num)
        feats.extend(part_feats)
        yolo_feats.extend(part_yolo_feats)
    all_features.append(feats)
    all_yolo_features.append(yolo_feats)
    lengths.append(len(feats))

# Sequences are stored in object arrays since frame counts differ per video.
all_features = np.array(all_features, dtype="object")
all_yolo_features = np.array(all_yolo_features, dtype="object")
lengths = np.array(lengths)

np.savez(
    f"/kaggle/working/fold_{i+1}",
    features=all_features,
    yolo_features=all_yolo_features,
    lengths=lengths,
    labels=labels,
)

1/36: 100%|██████████| 18050/18050 [08:00<00:00, 37.57it/s]
2/36: 100%|██████████| 18410/18410 [08:05<00:00, 37.91it/s]
3/36: 100%|██████████| 18062/18062 [07:45<00:00, 38.78it/s]
4/36: 100%|██████████| 13117/13117 [06:11<00:00, 35.29it/s]
5/36: 100%|██████████| 10233/10233 [04:51<00:00, 35.11it/s]
6/36: 100%|██████████| 15230/15230 [07:10<00:00, 35.35it/s]
7/36: 100%|██████████| 18124/18124 [09:32<00:00, 31.68it/s]
8/36: 100%|██████████| 18263/18263 [08:37<00:00, 35.29it/s]
9/36: 100%|██████████| 18103/18103 [09:37<00:00, 31.37it/s]
10/36: 100%|██████████| 18224/18224 [07:33<00:00, 40.18it/s]
11/36: 100%|██████████| 15394/15394 [05:56<00:00, 43.13it/s]
12/36: 100%|██████████| 15364/15364 [05:53<00:00, 43.50it/s]
13/36: 100%|██████████| 18480/18480 [07:44<00:00, 39.82it/s]
14/36: 100%|██████████| 20763/20763 [10:54<00:00, 31.72it/s]
15/36: 100%|██████████| 18514/18514 [07:45<00:00, 39.73it/s]
16/36: 100%|██████████| 18011/18011 [08:31<00:00, 35.22it/s]
17/36: 100%|██████████| 18053/180

In [8]:
# Extract the features of every video in fold 2 (index 1) and save them.
i = 1
total_num = len(dataX[i])
labels = np.array(dataY[i])

all_features, all_yolo_features, lengths = [], [], []

for video_num, entry in enumerate(dataX[i], start=1):
    # A list entry is one recording split over several files: process the parts
    # in order and concatenate their frames into a single sequence.
    parts = entry if isinstance(entry, list) else [entry]
    feats, yolo_feats = [], []
    for part_path in parts:
        part_feats, part_yolo_feats = get_features(part_path, video_num, total_num)
        feats.extend(part_feats)
        yolo_feats.extend(part_yolo_feats)
    all_features.append(feats)
    all_yolo_features.append(yolo_feats)
    lengths.append(len(feats))

# Sequences are stored in object arrays since frame counts differ per video.
all_features = np.array(all_features, dtype="object")
all_yolo_features = np.array(all_yolo_features, dtype="object")
lengths = np.array(lengths)

np.savez(
    f"/kaggle/working/fold_{i+1}",
    features=all_features,
    yolo_features=all_yolo_features,
    lengths=lengths,
    labels=labels,
)

1/36: 100%|██████████| 18245/18245 [07:52<00:00, 38.65it/s]
2/36: 100%|██████████| 18212/18212 [06:01<00:00, 50.33it/s]
3/36: 100%|██████████| 18121/18121 [05:56<00:00, 50.86it/s]
4/36: 100%|██████████| 7676/7676 [03:02<00:00, 42.13it/s]
5/36: 100%|██████████| 11672/11672 [04:34<00:00, 42.54it/s]
6/36: 100%|██████████| 8975/8975 [03:31<00:00, 42.45it/s]
7/36: 100%|██████████| 14491/14491 [05:31<00:00, 43.70it/s]
8/36: 100%|██████████| 11357/11357 [04:19<00:00, 43.72it/s]
9/36: 100%|██████████| 14488/14488 [05:29<00:00, 43.97it/s]
10/36: 100%|██████████| 20477/20477 [10:02<00:00, 34.01it/s]
11/36: 100%|██████████| 18486/18486 [08:57<00:00, 34.37it/s]
12/36: 100%|██████████| 18236/18236 [08:52<00:00, 34.23it/s]
13/36: 100%|██████████| 18541/18541 [07:06<00:00, 43.50it/s]
14/36: 100%|██████████| 18483/18483 [07:05<00:00, 43.45it/s]
15/36: 100%|██████████| 19301/19301 [07:23<00:00, 43.51it/s]
16/36: 100%|██████████| 19742/19742 [07:33<00:00, 43.51it/s]
17/36: 100%|██████████| 18692/18692 [

In [9]:
# Extract the features of every video in fold 3 (index 2) and save them.
i = 2
total_num = len(dataX[i])
labels = np.array(dataY[i])

all_features, all_yolo_features, lengths = [], [], []

for video_num, entry in enumerate(dataX[i], start=1):
    # A list entry is one recording split over several files: process the parts
    # in order and concatenate their frames into a single sequence.
    parts = entry if isinstance(entry, list) else [entry]
    feats, yolo_feats = [], []
    for part_path in parts:
        part_feats, part_yolo_feats = get_features(part_path, video_num, total_num)
        feats.extend(part_feats)
        yolo_feats.extend(part_yolo_feats)
    all_features.append(feats)
    all_yolo_features.append(yolo_feats)
    lengths.append(len(feats))

# Sequences are stored in object arrays since frame counts differ per video.
all_features = np.array(all_features, dtype="object")
all_yolo_features = np.array(all_yolo_features, dtype="object")
lengths = np.array(lengths)

np.savez(
    f"/kaggle/working/fold_{i+1}",
    features=all_features,
    yolo_features=all_yolo_features,
    lengths=lengths,
    labels=labels,
)

1/36: 100%|██████████| 18627/18627 [06:50<00:00, 45.34it/s]
2/36: 100%|██████████| 19727/19727 [07:22<00:00, 44.62it/s]
3/36: 100%|██████████| 18774/18774 [07:09<00:00, 43.66it/s]
4/36: 100%|██████████| 18016/18016 [08:51<00:00, 33.93it/s]
5/36: 100%|██████████| 17987/17987 [08:43<00:00, 34.39it/s]
6/36: 100%|██████████| 17992/17992 [08:55<00:00, 33.57it/s]
7/36: 100%|██████████| 14938/14938 [05:53<00:00, 42.23it/s]
8/36: 100%|██████████| 18061/18061 [07:00<00:00, 42.94it/s]
9/36: 100%|██████████| 14512/14512 [05:49<00:00, 41.56it/s]
10/36: 100%|██████████| 26445/26445 [11:58<00:00, 36.82it/s]
11/36: 100%|██████████| 20849/20849 [09:53<00:00, 35.11it/s]
12/36: 100%|██████████| 18489/18489 [07:18<00:00, 42.19it/s]
13/36: 100%|██████████| 20846/20846 [08:26<00:00, 41.14it/s]
14/36: 100%|██████████| 18251/18251 [08:47<00:00, 34.63it/s]
15/36: 100%|██████████| 20439/20439 [08:39<00:00, 39.35it/s]
16/36: 100%|██████████| 8173/8173 [03:08<00:00, 43.36it/s]
17/36: 100%|██████████| 10714/10714

In [8]:
# Extract the features of every video in fold 4 (index 3) and save them.
i = 3
total_num = len(dataX[i])
labels = np.array(dataY[i])

all_features, all_yolo_features, lengths = [], [], []

for video_num, entry in enumerate(dataX[i], start=1):
    # A list entry is one recording split over several files: process the parts
    # in order and concatenate their frames into a single sequence.
    parts = entry if isinstance(entry, list) else [entry]
    feats, yolo_feats = [], []
    for part_path in parts:
        part_feats, part_yolo_feats = get_features(part_path, video_num, total_num)
        feats.extend(part_feats)
        yolo_feats.extend(part_yolo_feats)
    all_features.append(feats)
    all_yolo_features.append(yolo_feats)
    lengths.append(len(feats))

# Sequences are stored in object arrays since frame counts differ per video.
all_features = np.array(all_features, dtype="object")
all_yolo_features = np.array(all_yolo_features, dtype="object")
lengths = np.array(lengths)

np.savez(
    f"/kaggle/working/fold_{i+1}",
    features=all_features,
    yolo_features=all_yolo_features,
    lengths=lengths,
    labels=labels,
)

1/36: 100%|██████████| 10777/10777 [04:45<00:00, 37.81it/s]
2/36: 100%|██████████| 28325/28325 [15:49<00:00, 29.83it/s]
3/36: 100%|██████████| 18840/18840 [07:25<00:00, 42.27it/s]
4/36: 100%|██████████| 18158/18158 [08:33<00:00, 35.34it/s]
5/36: 100%|██████████| 19376/19376 [09:14<00:00, 34.92it/s]
6/36: 100%|██████████| 18173/18173 [08:29<00:00, 35.67it/s]
7/36: 100%|██████████| 18097/18097 [07:42<00:00, 39.09it/s]
8/36: 100%|██████████| 18062/18062 [09:18<00:00, 32.32it/s]
9/36: 100%|██████████| 18056/18056 [09:34<00:00, 31.41it/s]
10/36: 100%|██████████| 18036/18036 [07:49<00:00, 38.45it/s]
11/36: 100%|██████████| 18029/18029 [07:39<00:00, 39.22it/s]
12/36: 100%|██████████| 18060/18060 [07:25<00:00, 40.58it/s]
13/36: 100%|██████████| 20396/20396 [08:40<00:00, 39.19it/s]
14/36: 100%|██████████| 20483/20483 [08:52<00:00, 38.43it/s]
15/36: 100%|██████████| 18551/18551 [07:58<00:00, 38.74it/s]
16/36: 100%|██████████| 18022/18022 [09:44<00:00, 30.82it/s]
17/36: 100%|██████████| 18340/183

In [8]:
# Extract the features of every video in fold 5 (index 4) and save them.
i = 4
total_num = len(dataX[i])
labels = np.array(dataY[i])

all_features, all_yolo_features, lengths = [], [], []

for video_num, entry in enumerate(dataX[i], start=1):
    # A list entry is one recording split over several files: process the parts
    # in order and concatenate their frames into a single sequence.
    parts = entry if isinstance(entry, list) else [entry]
    feats, yolo_feats = [], []
    for part_path in parts:
        part_feats, part_yolo_feats = get_features(part_path, video_num, total_num)
        feats.extend(part_feats)
        yolo_feats.extend(part_yolo_feats)
    all_features.append(feats)
    all_yolo_features.append(yolo_feats)
    lengths.append(len(feats))

# Sequences are stored in object arrays since frame counts differ per video.
all_features = np.array(all_features, dtype="object")
all_yolo_features = np.array(all_yolo_features, dtype="object")
lengths = np.array(lengths)

np.savez(
    f"/kaggle/working/fold_{i+1}",
    features=all_features,
    yolo_features=all_yolo_features,
    lengths=lengths,
    labels=labels,
)

1/36: 100%|██████████| 20013/20013 [08:06<00:00, 41.12it/s]
2/36: 100%|██████████| 18195/18195 [07:11<00:00, 42.20it/s]
3/36: 100%|██████████| 21683/21683 [08:42<00:00, 41.48it/s]
4/36: 100%|██████████| 18077/18077 [09:07<00:00, 32.99it/s]
5/36: 100%|██████████| 18044/18044 [07:20<00:00, 40.97it/s]
6/36: 100%|██████████| 18062/18062 [09:15<00:00, 32.53it/s]
7/36: 100%|██████████| 17916/17916 [08:57<00:00, 33.33it/s]
8/36: 100%|██████████| 18191/18191 [08:52<00:00, 34.13it/s]
9/36: 100%|██████████| 9433/9433 [03:10<00:00, 49.40it/s]
9/36: 100%|██████████| 34753/34753 [11:21<00:00, 51.01it/s]
10/36: 100%|██████████| 18767/18767 [09:50<00:00, 31.78it/s]
11/36: 100%|██████████| 18076/18076 [09:15<00:00, 32.55it/s]
12/36: 100%|██████████| 18760/18760 [10:01<00:00, 31.21it/s]
13/36: 100%|██████████| 18494/18494 [06:50<00:00, 45.10it/s]
14/36: 100%|██████████| 18179/18179 [06:58<00:00, 43.47it/s]
15/36: 100%|██████████| 18362/18362 [06:52<00:00, 44.54it/s]
16/36: 100%|██████████| 10901/10901 