<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Install-gentle" data-toc-modified-id="Install-gentle-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Install gentle</a></span></li><li><span><a href="#Test-aligning-transcript-with-local-video" data-toc-modified-id="Test-aligning-transcript-with-local-video-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Test aligning transcript with local video</a></span></li><li><span><a href="#Test-aligning-transcript-using-scanner-pipeline" data-toc-modified-id="Test-aligning-transcript-using-scanner-pipeline-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Test aligning transcript using scanner pipeline</a></span></li><li><span><a href="#Analyze-result" data-toc-modified-id="Analyze-result-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Analyze result</a></span></li><li><span><a href="#complete-transcript-loading" data-toc-modified-id="complete-transcript-loading-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>complete transcript loading</a></span></li><li><span><a href="#Check-audio-time-&gt;-frame-time" data-toc-modified-id="Check-audio-time->-frame-time-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Check audio time &gt; frame time</a></span></li></ul></div>

# Install gentle 

1. Clone the repository:

   `git clone https://github.com/scanner-research/gentle`

2. From inside the cloned `gentle` directory, run the installer (takes ~30 min):

   `bash ./install.sh`

# Test aligning transcript with local video

In [None]:
import pickle
from query.models import Video
import scannerpy
import os

# Enable IPython's autoreload extension in mode 2 so edited modules are reloaded
# automatically. `run_line_magic` replaces the deprecated `InteractiveShell.magic()`.
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

In [None]:
# Names of the videos used for the local alignment test; the cells below match
# each name against Video.path by substring and build file paths from it.
video_list = ['CNNW_20121201_120000_Weekend_Early_Start']

# Alternative: take the 'sample_100' list stored in the sample pickle instead.
# import pickle
# video_list = pickle.load(open('../app/data/tvnews_std_sample.pkl', 'rb'))['sample_100']

In [None]:
# Set srt_extension to 'word' on the first Video whose path contains each name
# and save it to the database.
# NOTE: `[0]` raises IndexError when no Video matches the name.
# NOTE(review): presumably 'word' selects a word-level transcript variant — TODO confirm.
for video_name in video_list:
    video = Video.objects.filter(path__contains=video_name)[0]
    video.srt_extension = 'word'
    video.save()

In [None]:
from scannertools.transcript_alignment import TranscriptAligner

import subprocess  # local to this cell: used for the gsutil download below

# For every test video: make sure the .mp4 exists locally (copying it from the
# bucket when missing), then run TranscriptAligner on it.
res_stats = {}
for video_name in video_list:
    print(video_name)
    # download video
    video_path = os.path.join('../data/videos/', video_name+'.mp4')
    if not os.path.exists(video_path):
        gs_path = os.path.join('gs://esper/tvnews/videos/', video_name+'.mp4')
        # Argument list (no shell) plus check=True: a failed copy raises
        # CalledProcessError here instead of being silently ignored and only
        # surfacing later when the aligner cannot open the media file.
        subprocess.run(['gsutil', 'cp', gs_path, '../data/videos/'], check=True)
    print('Downloading video done')

    # run alignment
    aligner = TranscriptAligner(win_size=300, seg_length=60, max_misalign=10, num_thread=64, estimate=True,
                                media_path=video_path,
                                transcript_path=os.path.join('/app/data/subs10/', video_name),
                                align_dir='/app/data/subs/orig/')
    res = aligner.run_all()
#     res_stats[video_name] = res
#     pickle.dump(res_stats, open('/app/result/test_align_100_hard.pkl', 'wb'))

# Test aligning transcript using scanner pipeline

In [None]:
from scannertools import audio, transcript_alignment
from tqdm import tqdm
# Segment length passed as `frame_size` to the audio sources and as `window_size`
# to the caption sources below (presumably seconds — TODO confirm).
SEG_LENGTH = 60

In [None]:
# set test video list
video_list = ['CNNW_20110606_030000_CNN_Presents']
# First Video whose path contains each name; `[0]` raises IndexError on no match.
videos = [Video.objects.filter(path__contains=video_name)[0] for video_name in video_list]

# videos = Video.objects.filter(threeyears_dataset=True).all()
# Per-video metadata keyed by video id. The context manager closes the file
# handle deterministically instead of leaving it to garbage collection.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('/app/data/addtional_field_all.pkl', 'rb') as f:
    addtional_field = pickle.load(f)
# videos = [video for video in videos if addtional_field[video.id]['valid_transcript']]
# videos = videos[:30000]

In [None]:
# Keep only the videos whose '<path>_align_transcript' table is committed in the
# scanner database; the item names of the others are printed.
db = scannerpy.Database()
videos_committed = []
for video in tqdm(videos):
    table = db.table('{}_align_transcript'.format(video.path))
    if table.committed():
        videos_committed.append(video)
    else:
        print(video.item_name())
videos = videos_committed
len(videos)

In [None]:
# load audios from videos
# One AudioSource per video, using SEG_LENGTH as frame_size and the audio
# duration recorded for that video id in addtional_field.
audios = [audio.AudioSource(video.for_scannertools(), 
                                frame_size=SEG_LENGTH, 
                                duration=addtional_field[video.id]['audio_duration']) 
              for video in videos]

# set up transcripts 
# One CaptionSource per video, in the same order as `audios`, located at
# 'tvnews/subs10/' + the video's item name; max_time is the same audio
# duration and window_size is the same SEG_LENGTH used for the audio sources.
captions = [audio.CaptionSource('tvnews/subs10/'+video.item_name(), 
                                max_time=addtional_field[video.id]['audio_duration'] , 
                                window_size=SEG_LENGTH) 
            for video in videos]

In [None]:
# set up run opts
run_opts = {'pipeline_instances_per_node': 32, 'checkpoint_frequency': 5}

# set up align opts
# 'seg_length' reuses SEG_LENGTH (the value the audio and caption sources were
# built with) rather than repeating the literal 60, so the two cannot drift apart.
align_opts = {'seg_length' : SEG_LENGTH,
              'max_misalign' : 10,
              'num_thread' : 1,
              'exhausted' : False,
#               'align_dir' : None,
#               'res_path' : None,
              'align_dir' : '/app/data/subs/orig/',
              'res_path' : '/app/result/final_align_3y.pkl',
}

In [None]:
# Run the alignment pipeline on the committed videos with the audio/caption
# sources and the run/align options defined in the preceding cells.
# NOTE(review): the effect of cache=True is defined inside scannertools — TODO confirm.
transcript_alignment.align_transcript(db, videos_committed, audios, captions, run_opts, align_opts, cache=True) 

# Analyze result

In [None]:
# Load the alignment statistics of both runs. Entries are read below as
# {video id: {'word_missing': ratio}}. Context managers close the files promptly.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('/app/result/align_stats_first.pkl', 'rb') as f:
    align_stats_first = pickle.load(f)
with open('/app/result/align_stats_second.pkl', 'rb') as f:
    align_stats_second = pickle.load(f)

In [None]:
# Write the item name of every video whose second-run 'word_missing' ratio is
# at most 0.2 to clean_subs.txt, one per line, and count them.
videos = Video.objects.all()
cnt = 0
video_list = []
# `with` guarantees the output file is flushed and closed even if the loop raises.
with open('/app/result/clean_subs.txt', 'w') as clean_file:
    for video in videos:
        if video.id in align_stats_second and align_stats_second[video.id]['word_missing'] <= 0.2:
#             if cnt < 100:
#                 print(video.id, res_stats[video.id]['word_missing'])
#                 video_list.append(video.id)
            cnt += 1
            clean_file.write(video.item_name() + '\n')
print(len(videos), cnt)
# print(video_list)

In [None]:
# merge second into first
# Start from the first-run entries with 'word_missing' < 0.2, then let every
# second-run entry override or extend them. dict.update replaces the explicit
# loop, which rebound the builtin `id` in the kernel namespace for later cells.
align_stats_final = {video_id: stats
                     for video_id, stats in align_stats_first.items()
                     if stats['word_missing'] < 0.2}
align_stats_final.update(align_stats_second)

# Sorted 'word_missing' ratios for plotting; non-positive values are clamped to 0.
res_list_first = sorted(stats['word_missing'] if stats['word_missing'] > 0 else 0
                        for stats in align_stats_first.values())
# The merged list additionally drops entries whose ratio is >= 1.
res_list_final = sorted(stats['word_missing'] if stats['word_missing'] > 0 else 0
                        for stats in align_stats_final.values()
                        if stats['word_missing'] < 1)

In [None]:
# Flag every video in addtional_field with 'aligned_transcript': True when its
# merged 'word_missing' ratio is at most 0.2, False otherwise; count the True ones.
videos = Video.objects.all()
# Context manager closes the pickle file promptly.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('/app/data/addtional_field_all.pkl', 'rb') as f:
    addtional_field = pickle.load(f)
cnt = 0
for video in videos:
    if video.id in align_stats_final and align_stats_final[video.id]['word_missing'] <= 0.2:
        addtional_field[video.id]['aligned_transcript'] = True
        cnt += 1
    else:
        addtional_field[video.id]['aligned_transcript'] = False
cnt

In [None]:
# Persist the updated metadata. The context manager flushes and closes the file,
# so the pickle is fully written before any later cell reads it back.
with open('/app/data/addtional_field_all.pkl', 'wb') as f:
    pickle.dump(addtional_field, f)

In [None]:
import matplotlib.pyplot as plt
# Sorted 'word_missing' ratios of the first run versus the merged result.
# Labels and a legend make the two curves distinguishable.
plt.plot(res_list_first, label='first run')
plt.plot(res_list_final, label='merged (first + second run)')
plt.xlabel('num of videos')
plt.ylabel('mis-aligned ratio')
plt.legend()

# complete transcript loading

In [None]:
# Read the alignment log; the context manager closes the file promptly.
with open('/app/tmp/align2_10.log', 'r') as log_file:
    updates = log_file.read().split('\n')
print(updates[:4])

# Skip the first four lines (printed above) and the last element (the text after
# the final newline), then split each remaining line on single spaces.
updates = [line.split(' ') for line in updates[4:-1]]
len(updates)

In [None]:
# Load the second-run statistics that the log entries will be merged into.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('/app/result/align_stats_second.pkl', 'rb') as f:
    res_stats = pickle.load(f)

In [None]:
# Each log entry is a (video path, ratio) pair: look the video up by exact path
# and store the ratio as its 'word_missing' statistic.
for video_path, missing_ratio in updates:
    matched_video = Video.objects.filter(path=video_path)[0]
    res_stats[matched_video.id] = {'word_missing': float(missing_ratio)}
len(res_stats)

In [None]:
# Write the merged statistics back. The context manager flushes and closes the
# file so the pickle is complete on disk.
with open('/app/result/align_stats_second.pkl', 'wb') as f:
    pickle.dump(res_stats, f)

# Check audio time > frame time

In [None]:
# set test video list
videos = Video.objects.all()
# Context manager closes the pickle file promptly.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('/app/data/addtional_field_all.pkl', 'rb') as f:
    addtional_field = pickle.load(f)
# Keep only the videos flagged with a valid transcript.
videos = [video for video in videos if addtional_field[video.id]['valid_transcript']]

In [None]:
# Count the videos whose recorded audio duration is more than 10% longer or
# shorter than the duration implied by the frame count (num_frames / fps).
cnt = 0
for video in videos:
    audio_time = addtional_field[video.id]['audio_duration']
    duration_ratio = audio_time / (video.num_frames / video.fps)
    if duration_ratio > 1.1 or duration_ratio < 0.9:
        cnt += 1
cnt

In [None]:
import tempfile
import re
def get_frame_audio_length(video):
    """Return the durations of the first two streams ffprobe reports for ``video``.

    Runs ``ffprobe -show_streams -i <video.url()>`` and parses the
    ``duration=`` lines from its standard output.

    Args:
        video: object whose ``url()`` method returns a location ffprobe can read.

    Returns:
        Tuple ``(first, second)`` of floats: the durations of the first two
        streams in ffprobe's output order. The function name suggests
        video-then-audio, but that order is not enforced here — TODO confirm
        for the files being probed.

    Raises:
        IndexError: ffprobe reported fewer than two ``duration=`` entries
            (for example because it could not open the URL).
        ValueError: a reported duration is not numeric (e.g. ``N/A``).
    """
    import subprocess  # local import keeps this cell self-contained

    # Pass an argument list (no shell) so characters in the URL are never
    # interpreted by a shell, and capture stdout directly instead of redirecting
    # into a temp-file name whose file was never cleaned up.
    completed = subprocess.run(
        ['ffprobe', '-show_streams', '-i', video.url()],
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    durations = re.findall(r'\nduration=(.*)', completed.stdout)
    if len(durations) < 2:
        raise IndexError(
            'ffprobe reported {} stream duration(s); expected at least 2'.format(len(durations)))
    return float(durations[0]), float(durations[1])

In [None]:
# Spot-check the helper on the first Video whose path contains this name;
# the cell displays the two stream durations it returns.
get_frame_audio_length(Video.objects.filter(path__contains='CNNW_20120815_230000_Erin_Burnett_OutFront')[0])