# Data Preprocessing

## Import libraries

In [2]:
import os
import json
from pathlib import Path

import cv2
import numpy as np

In [3]:
# Project root: two directory levels above the current working directory.
root_dir = Path.cwd().parent.parent
root_dir

PosixPath('/mnt/d/Projects_D/Brand_Extractor')

## Load data

In [4]:
# Directory holding this project's data (videos and extracted frames).
data_dir = root_dir.joinpath('data', 'brand_extractor')
data_dir

PosixPath('/mnt/d/Projects_D/Brand_Extractor/data/brand_extractor')

In [5]:
# Input directory: searched below for .mp4 files.
video_dir = data_dir.joinpath('videos')
video_dir

PosixPath('/mnt/d/Projects_D/Brand_Extractor/data/brand_extractor/videos')

In [6]:
# Output directory: one sub-folder per video is created beneath it.
frame_dir = data_dir.joinpath('frames')
frame_dir

PosixPath('/mnt/d/Projects_D/Brand_Extractor/data/brand_extractor/frames')

In [7]:
# Collect the .mp4 files in video_dir. glob() yields entries in a
# filesystem-dependent order, so sort them: video_files[0] is used below and
# must pick the same video on every machine and every run.
video_files = sorted(video_dir.glob('*.mp4'))

In [8]:
# Show the videos that were found.
video_files

[PosixPath('/mnt/d/Projects_D/Brand_Extractor/data/brand_extractor/videos/tanishq_mishra_vlogs_lulu_mall.mp4'),
 PosixPath('/mnt/d/Projects_D/Brand_Extractor/data/brand_extractor/videos/the_explorer_raj_dlf_mall.mp4'),
 PosixPath('/mnt/d/Projects_D/Brand_Extractor/data/brand_extractor/videos/travel_with_chris_crazy_market_spree.mp4')]

In [9]:
# Process the first video found. Fail with an explicit message instead of a
# bare IndexError when the videos directory contains no .mp4 files.
if not video_files:
    raise FileNotFoundError(f'No .mp4 files found in {video_dir}')
video_file = video_files[0]

In [10]:
# File name without its extension; used below as the per-video output folder
# name and stored as the "video" field of the metadata.
video_name = video_file.stem
video_name

'tanishq_mishra_vlogs_lulu_mall'

## Load video

In [11]:
# Open the selected video for reading (the path is passed as a string).
video_capture = cv2.VideoCapture(str(video_file))
# Fail fast: without this check an unopened capture only surfaces later as a
# confusing error in the extraction loop.
if not video_capture.isOpened():
    raise IOError(f'Could not open video file: {video_file}')

In [12]:
# Confirm that the capture was opened successfully (expected: True).
video_capture.isOpened()

True

In [13]:
# Frame rate of the source video as reported by the capture (a float).
# NOTE(review): this value is used as a divisor in the extraction loop;
# OpenCV presumably reports 0 when the backend cannot provide it — confirm.
video_fps = video_capture.get(cv2.CAP_PROP_FPS)
video_fps

60.0

In [14]:
# Number of frames in the video as reported by the capture, converted to int.
# NOTE(review): for some containers this count may be an estimate — confirm.
total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
total_frames

71990

In [15]:
# Resolution of the source video, read from the capture as (width, height).
video_width, video_height = (
    int(video_capture.get(prop_id))
    for prop_id in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT)
)
video_width, video_height

(1920, 1080)

## Process Video

In [16]:
# Number of frames to extract per second of video; the extraction loop
# derives its frame step from video_fps / sample_fps.
sample_fps = 1

In [17]:
# Output folder for this video. parents=True also creates the shared
# `frames` directory, which nothing else in this notebook creates, so the
# cell works on a fresh checkout.
video_frame_dir = frame_dir / video_name
video_frame_dir.mkdir(parents=True, exist_ok=True)
video_frame_dir

PosixPath('/mnt/d/Projects_D/Brand_Extractor/data/brand_extractor/frames/tanishq_mishra_vlogs_lulu_mall')

In [18]:
# Folder that receives the extracted PNG frames. parents=True keeps this cell
# working even if the per-video folder has not been created yet.
image_dir = video_frame_dir / 'images'
image_dir.mkdir(parents=True, exist_ok=True)
image_dir

PosixPath('/mnt/d/Projects_D/Brand_Extractor/data/brand_extractor/frames/tanishq_mishra_vlogs_lulu_mall/images')

In [19]:
# Metadata describing the source video and the sampling settings. "width" and
# "height" are the dimensions reported by the capture, not the size of the
# resized images written by the extraction loop. "frames" starts empty and is
# filled with one record per extracted frame.
frame_data = dict(
    video=video_name,
    fps=video_fps,
    total_frames=total_frames,
    width=video_width,
    height=video_height,
    sample_fps=sample_fps,
    frames=[],
)

In [24]:
# Extract sample_fps frames per second of video, resize each one and save it
# as a zero-padded PNG together with a metadata record.

# Number of source frames between two consecutive samples. Clamped to 1
# because a sample_fps larger than video_fps would otherwise hand range() a
# step of 0, which raises ValueError.
frame_step = max(1, int(video_fps / sample_fps))
output_size = (1280, 720)  # (width, height) passed to cv2.resize

# Collect the records in a fresh list so that re-running this cell replaces
# the previous results instead of appending duplicates to frame_data["frames"].
extracted_frames = []
for sample_index, frame_position in enumerate(range(0, total_frames, frame_step)):
    # Seek to the wanted source frame, then decode it.
    video_capture.set(cv2.CAP_PROP_POS_FRAMES, frame_position)
    ret, frame = video_capture.read()
    if not ret:
        # The frame could not be read; skip it (its index is simply missing
        # from the output).
        continue

    # No colour conversion is needed: the frame is written with the same
    # channel order it was read in, and resizing treats channels independently.
    frame = cv2.resize(frame, output_size)
    frame_id = f'{sample_index:04d}'
    frame_file = image_dir / f'{frame_id}.png'

    # cv2.imwrite reports failure through its return value; raise so that no
    # metadata record ever points to an image that was not written.
    if not cv2.imwrite(str(frame_file), frame):
        raise OSError(f'Failed to write frame image: {frame_file}')

    extracted_frames.append({
        "frame": frame_id,
        "frame_file": str(frame_file),
        # Position of the frame in the source video, in seconds.
        "frame_timestamp": frame_position / video_fps
    })

frame_data["frames"] = extracted_frames

In [25]:
# Record how many frames were actually extracted and saved.
frame_data["sample_frames"] = len(frame_data["frames"])

In [26]:
# Inspect the top-level fields of the metadata before saving it.
frame_data.keys()

dict_keys(['video', 'fps', 'total_frames', 'width', 'height', 'sample_fps', 'frames', 'sample_frames'])

In [27]:
# Write the collected metadata as JSON into this video's output folder.
frame_data_file = video_frame_dir.joinpath('frame_data.json')
with frame_data_file.open('w') as json_out:
    json.dump(frame_data, json_out)

In [28]:
# Release the video capture now that extraction is finished.
video_capture.release()

# End