In [1]:
# Image handling
from PIL import Image, ImageFilter, ImageEnhance
import cv2
import imghdr

# Data handling
import pandas as pd
import numpy as np

# Filesystem  
import os

# Text processing
import pytesseract  
import re

# Dates
from datetime import datetime

In [2]:
def slice_video_on_profile_change(video_path, output_folder, threshold=100):

    # Open video 
    cap = cv2.VideoCapture(video_path)

    # Keep previous frame to compare with
    prev_img = None 

    # Counter for sliced segment numbers
    seg_num = 0

    # Read video frames until end
    while cap.isOpened():

        # Read next frame
        ret, frame = cap.read()

        # Break out of loop if video ended
        if not ret:  
            break

        # Convert colored frame to grayscale
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # Compare current frame vs previous  
        # If difference bigger than threshold, profile changed
        if prev_img is None or np.mean(np.abs(gray - prev_img)) > threshold:

            # Save current frame as new profile  
            prev_img = gray.copy()

            # Increment segment counter
            seg_num += 1

            # Write sliced segment frame to file
            cv2.imwrite(f"{output_folder}/segment_{seg_num}.jpg", frame)

    # Release video resource  
    cap.release()

In [3]:
# Set of allowed image formats
IMAGE_FORMATS = {'png', 'jpg', 'jpeg'} 

def ocr_image(image_path, kernel_size=1, contrast_factor=2):

    # Validate image file
    ext = os.path.splitext(image_path)[1].lower()
    if ext[1:] not in IMAGE_FORMATS and imghdr.what(image_path) not in IMAGE_FORMATS: 
        return None
    
    # Load image
    image = Image.open(image_path)

    # Convert to grayscale    
    gray = image.convert('L')
    # Increase contrast
    enhancer = ImageEnhance.Contrast(gray)
    contrast = enhancer.enhance(contrast_factor)
    
    # Apply gaussian blur to remove noise
    blur = contrast.filter(ImageFilter.GaussianBlur(radius=kernel_size))

    # Perform OCR
    text = pytesseract.image_to_string(blur)

    return text

In [4]:
slice_video_on_profile_change('Final.mov', 'slice')

In [5]:
folder = 'slice/'

for filename in os.listdir(folder):
    image_path = os.path.join(folder, filename)

    text = ocr_image(image_path)
    
    # Check if 'Relic' appears in OCR text
    if 'Relic' not in text:
        os.remove(image_path) 
        continue

In [6]:
# Get list of files in folder
files = os.listdir(folder) 

# Filter to only image files
images = [f for f in files if f.endswith(('.jpg', '.png', '.jpeg'))]

# Validate number of images
if len(images) != 30:
    raise ValueError(f"Expected 30 images, found {len(images)}")

print(f"Verified {len(images)} images in {folder}")

Verified 30 images in slice/


In [18]:
test = ocr_image('slice/segment_.jpg')

test

'107,200\n'

In [19]:
# Create empty dataframe 
df = pd.DataFrame(columns=['IGN', 'Relic Donation Points', 'Date'])

# Folder path 
folder_path = 'slice/' 

# Loop through images
for filename in os.listdir(folder_path):

    # Full image path
    image_path = os.path.join(folder_path, filename)  

    # Extract data from image 
    text = ocr_image(image_path)
    
    username = re.search(r'([\w]+)(?=\n)', text).group(1)
    # Get last number
    points = text.split(" ")[-1]
    # Remove non-digits
    points = int(re.sub(r'[^\d]', '', points))
    date = datetime.fromtimestamp(os.path.getmtime(image_path)).date()

    # Create new row
    new_row = {
        'IGN': username,
        'Relic Donation Points': points,
        'Date': date,
        'File': image_path
    }
    # Append row to dataframe
    df = df.append(new_row, ignore_index=True)

    # Drop duplicates
    df = df.drop_duplicates()

In [20]:
df

Unnamed: 0,IGN,Relic Donation Points,Date,File
0,3,45700,2023-07-13,slice/segment_82.jpg
1,Karou8,145100,2023-07-13,slice/segment_41.jpg
2,VibingFlash,83300,2023-07-13,slice/segment_69.jpg
3,MelonBunii,107200,2023-07-13,slice/segment_54.jpg
4,Nickknack,141700,2023-07-13,slice/segment_56.jpg
5,xChocobo,19300,2023-07-13,slice/segment_90.jpg
6,darkvioletfox,46800,2023-07-13,slice/segment_84.jpg
7,Glutenburgh,133300,2023-07-13,slice/segment_52.jpg
8,MarinKitagawa,104900,2023-07-13,slice/segment_50.jpg
9,Minty24,24000,2023-07-13,slice/segment_86.jpg
