# Abo16 Corpus

In [1]:
import pandas as pd

In [2]:
# Load the filename -> language table. Because `names` is given without
# `header`, pandas reads the first line of the CSV as data, not as a header.
lang_map = pd.read_csv("../data/lang_map.csv", names=["filename", "lang"])
# Disabled filter, kept for reference: it would drop every row whose filename
# contains "c5008-22".
# lang_map = lang_map.loc[~lang_map.filename.str.contains("c5008-22"), :]

In [3]:
# Number of files mapped to each language (displayed as the cell output).
lang_map["lang"].value_counts()

Amis           15
Yami           15
Sakizaya       15
Pinuyumayan    14
Bunun          14
Saysiyat       14
Paiwan         14
Truku          14
Thau           14
Seediq         14
Atayal         14
Hla'alua       13
Kavalan        13
Rukai          12
Cou            12
Kanakanavu     12
Name: lang, dtype: int64

In [4]:
# Rows in reverse file order, re-numbered from 0, as a dict of
# {row_number: {"filename": ..., "lang": ...}}.
file_items = (
    lang_map
    .iloc[::-1]
    .reset_index(drop=True)
    .to_dict(orient="index")
)

In [5]:
import imageio
from pathlib import Path
from matplotlib import pyplot as plt
from PIL import Image, ImageDraw, ImageFont
import cv2
import face_recognition
from tqdm.auto import tqdm

In [6]:
def make_video_path(video_name, video_dir="../data/videos"):
    """Build the on-disk path of a video from its file name.

    Videos are laid out as ``<video_dir>/<date_dir>/<video_name>``, where
    ``date_dir`` is the second "-"-separated field of the file name with
    ".mp4" removed (e.g. "c5008-2112101100.mp4" -> "2112101100").

    Args:
        video_name: File name such as "c5008-2112101100.mp4".
        video_dir: Root directory containing the per-date sub-directories.
            Defaults to "../data/videos".

    Returns:
        pathlib.Path of the video file. Existence is not checked.

    Raises:
        ValueError: If ``video_name`` contains no "-" separator, so the
            date directory cannot be derived.
    """
    fields = video_name.split("-")
    if len(fields) < 2:
        # Fail with a message that is meaningful in an error log instead of
        # the bare "list index out of range" an IndexError would produce.
        raise ValueError(
            f"Unexpected video file name (no '-' separator): {video_name!r}"
        )
    date_dir = fields[1].replace(".mp4", "")
    return Path(video_dir) / date_dir / video_name

In [7]:

def snapshot(video_path, at_second=23):
    """Grab a single frame from a video and return it in RGB channel order.

    Args:
        video_path: Location of the video file (str or pathlib.Path).
        at_second: Offset from the start of the video, in seconds, of the
            frame to grab. The frame index is ``int(at_second * fps)``.
            Defaults to 23.

    Returns:
        The decoded frame after conversion from OpenCV's BGR order to RGB.

    Raises:
        IOError: If no frame could be read at the requested position.
    """
    video = cv2.VideoCapture(str(video_path))
    try:
        fps = video.get(cv2.CAP_PROP_FPS)
        video.set(cv2.CAP_PROP_POS_FRAMES, int(at_second * fps))
        ok, im = video.read()
    finally:
        # Release the capture handle on every path; this function is called
        # once per video in a loop, so an unreleased handle leaks each time.
        video.release()
    if not ok:
        raise IOError("Error when reading video")
    return cv2.cvtColor(im, cv2.COLOR_BGR2RGB)

In [11]:
def annotate_image(im_arr, video_name=None, lang=None):
    """Draw the video name, language label and detected face boxes on a frame.

    Args:
        im_arr: Image array accepted by both ``PIL.Image.fromarray`` and
            ``face_recognition.face_locations``.
        video_name: Text for the first label line. When omitted, the
            notebook-level variable ``video_name`` is used, which is how
            callers passing only ``im_arr`` have always been served.
        lang: Text for the second label line. When omitted, the
            notebook-level variable ``lang`` is used.

    Returns:
        The annotated PIL image.

    Raises:
        KeyError: If a label is omitted and no notebook-level variable of
            that name exists.
        OSError: If the font file cannot be read.
    """
    # Prefer explicit arguments; fall back to notebook globals only so that
    # existing one-argument calls keep working.
    if video_name is None:
        video_name = globals()["video_name"]
    if lang is None:
        lang = globals()["lang"]

    im = Image.fromarray(im_arr)
    # Absolute system font path: must exist on the machine running the notebook.
    font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 36)
    draw = ImageDraw.Draw(im)

    # Light background panel so the two label lines stay readable on any frame.
    draw.rectangle([(40, 10), (1000, 110)], fill="lightgray")
    draw.text((50, 20), video_name, fill="blue", font=font)
    draw.text((50, 60), lang, fill="blue", font=font)

    # face_locations yields (top, right, bottom, left) boxes, whereas PIL
    # rectangles take (left, top) and (right, bottom) corners.
    for top, right, bottom, left in face_recognition.face_locations(im_arr):
        draw.rectangle([(left, top), (right, bottom)], outline="orange", width=4)
    return im

In [12]:
# Output folder for the annotated snapshots and the error log. Missing parent
# directories are created, and an already existing folder is not an error.
sandbox_dir = Path("../data/sandbox/lang_checks")
sandbox_dir.mkdir(exist_ok=True, parents=True)

In [13]:
# Render one annotated snapshot per video so the filename -> language mapping
# can be checked by eye. Videos that cannot be processed are recorded in
# error_list.txt and skipped; they do not abort the batch.
with open(sandbox_dir / "error_list.txt", "w", encoding="utf-8") as fout:
    for item_x in tqdm(file_items.values()):
        # Bind the identifiers before the try block so the error log below
        # always names the current item, never a stale one from the
        # previous iteration.
        video_name = item_x["filename"]
        lang = item_x["lang"]
        try:
            video_path = make_video_path(video_name)
            im_arr = snapshot(video_path)
            # Called with the frame only: the labels are taken from the
            # notebook-level `video_name` / `lang` assigned just above.
            im_annot = annotate_image(im_arr)
            im_annot.save(sandbox_dir / video_name.replace(".mp4", ".lang.jpg"))
        except Exception as ex:
            # Deliberately broad: this is a best-effort batch and every
            # failure is logged with its cause for later inspection.
            fout.write(f"{video_name}, {ex}\n")
            print("Error", item_x)

  0%|          | 0/219 [00:00<?, ?it/s]

Error {'filename': 'c5008-2112101100.mp4', 'lang': 'Rukai'}


[mov,mp4,m4a,3gp,3g2,mj2 @ 0x55b1b37ea100] moov atom not found


Error {'filename': 'c5008-2112082000.mp4', 'lang': 'Sakizaya'}


[mov,mp4,m4a,3gp,3g2,mj2 @ 0x55b1b37eaa80] moov atom not found


Error {'filename': 'c5008-2112061100.mp4', 'lang': 'Atayal'}


[mov,mp4,m4a,3gp,3g2,mj2 @ 0x55b1b37eaa80] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x55b1b37eaa80] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x55b1b37eaa80] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x55b1b37eaa80] moov atom not found


Error {'filename': 'c5008-2112021100.mp4', 'lang': 'Truku'}
Error {'filename': 'c5008-2112011100.mp4', 'lang': 'Cou'}
Error {'filename': 'c5008-2111302000.mp4', 'lang': 'Yami'}
Error {'filename': 'c5008-2111282000.mp4', 'lang': 'Pinuyumayan'}


[mov,mp4,m4a,3gp,3g2,mj2 @ 0x55b1b37eaa80] moov atom not found
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x55b1b37eaa80] moov atom not found


Error {'filename': 'c5008-2111242000.mp4', 'lang': 'Sakizaya'}
