In [1]:
import numpy as np
import pandas as pd
import ffmpeg
from tqdm import tqdm

In [2]:
def join_sentence(df:pd.DataFrame) -> pd.Series:
	"""Collapse the rows of one group (one sentence) into a single row.

	Args:
		df: Rows belonging to one sentence, in order. Must contain the
			columns ``w``, ``e`` and ``a`` and at least one row. Presumably
			``w`` is the word text, ``e`` the end time and ``a`` a per-word
			score -- confirm against the transcript schema.

	Returns:
		A new Series based on the first row, where ``w`` is every ``w``
		value joined with single spaces, ``e`` is taken from the last row
		and ``a`` is the minimum ``a`` of the group. All other fields keep
		the first row's values.

	Raises:
		IndexError: If ``df`` has no rows.
	"""
	# Take an explicit copy: ``df.iloc[0]`` may be a view of ``df`` (or a
	# flagged copy), so assigning into it directly can raise
	# SettingWithCopyWarning or write through to the caller's frame.
	series = df.iloc[0].copy()
	series['w'] = ' '.join(df['w'])
	series['e'] = df.iloc[-1]['e']
	series['a'] = df['a'].min()
	return series

In [4]:
# Build one sentence-level table per lecture and write one metadata line per
# kept sentence, formatted as `Lec<lec>-<i>|<text>|<text>`.
transcripts: dict[int, pd.DataFrame] = {}

# The `with` block closes the file even if a lecture fails to load or process.
# NOTE(review): append mode is kept from the original, so re-running this cell
# adds the same lines to data/metadata.txt again -- confirm that is intended.
with open('data/metadata.txt', 'ab') as f:
	for lec in range(1,11):
		transcript = pd.read_json(
			f'Text/Lec{lec}.json',
			orient='records',
			dtype={'a': np.uint8}, # type: ignore
		)
		transcript = transcript.drop(columns=['i','t'])
		# presumably milliseconds -> seconds -- TODO confirm the source unit
		transcript[['s','e']] /= 1000

		# A new group starts on the row *after* a word ending in '.', so each
		# group is one sentence including its closing word.
		sentence_id = transcript['w'].str.endswith('.').shift(fill_value=False).cumsum()
		sentences = transcript.groupby(sentence_id, as_index=True).apply(join_sentence)

		# The group key inherits the name 'w'; rename it so reset_index() does
		# not clash with the existing 'w' column.
		sentences.index.name = 'i'
		# Keep sentences whose lowest 'a' value is at least 70 ...
		sentences = sentences[sentences['a'] >= 70]
		# ... and that contain at least three spaces (i.e. four or more words).
		sentences = sentences[sentences['w'].str.count(' ') >= 3]
		sentences = sentences.reset_index()

		# The text is written twice, separated by '|', after the clip id.
		clip_ids = f'Lec{lec}-' + sentences['i'].astype(str)
		lines = clip_ids + '|' + sentences['w'] + '|' + sentences['w']
		np.savetxt(f, lines, fmt='%s')

		transcripts[lec] = sentences

In [None]:
# Upper bound on the number of ffmpeg clip processes running at the same time.
# Starting one process per sentence all at once can exhaust process and
# file-descriptor limits on long lectures.
MAX_PARALLEL_CLIPS = 8

for lec in range(1,11):
	full_wav = f'data/wavs/Lec{lec}-all.wav'
	# mp4 (AAC) -> wav (PCM), one audio channel at 22050 Hz
	ffmpeg.input(f'Video/Lec{lec}.mp4').output(full_wav, ac=1, ar=22050).overwrite_output().run()

	# clips from the whole wav, cut in bounded batches
	rows = list(transcripts[lec].itertuples())
	failed = []
	with tqdm(total=len(rows)) as progress:
		for start in range(0, len(rows), MAX_PARALLEL_CLIPS):
			batch = []
			for t in rows[start:start + MAX_PARALLEL_CLIPS]:
				clip_path = f'data/wavs/Lec{lec}-{t.i}.wav'
				process = (
					ffmpeg.input(full_wav)
					.output(clip_path, ss=t.s, to=t.e)
					.overwrite_output()
					.run_async()
				)
				batch.append((clip_path, process))
			# Wait for the whole batch before starting the next one.
			for clip_path, process in batch:
				# wait() returns the process exit code; non-zero means ffmpeg failed.
				if process.wait() != 0:
					failed.append(clip_path)
				progress.update(1)

	# Report failures instead of dropping them silently; the run continues.
	if failed:
		print(f'Lec{lec}: ffmpeg failed for {len(failed)} clip(s): {failed}')

In [5]:
# from scipy.io import wavfile
# wavfile.read('wavs/Lec1-1.wav')

Post processing

In [None]:
import os
import glob
from pathlib import Path

In [9]:
# Rebuild data/metadata.txt from the `clip|text` lines of the Text/Lec*.txt
# files of lectures 1-3, one `Lec<n>-<clip stem>|<text>|<text>` record per line.

# glob returns paths in arbitrary order; sort by lecture number so the output
# is the same on every run.
transcript_paths = sorted(
	glob.glob('Text/Lec*.txt'),
	key=lambda path: int(Path(path).stem[3:]),
)

with open('data/metadata.txt', 'w') as o:
	for fp in transcript_paths:
		lec = Path(fp).stem
		lec_num = int(lec[3:])
		if lec_num > 3:
			continue
		with open(fp, 'r') as f:
			for line in f:
				# Skip blank lines (e.g. a trailing empty line), which would
				# otherwise fail the two-field unpacking below.
				if not line.strip():
					continue
				# Still strict: a line without exactly one '|' raises ValueError.
				clip, txt = line.split('|')
				clip_id = Path(clip).stem
				text = txt.strip()
				# Write the newline explicitly. The original relied on the
				# unstripped `txt` for it, so a final line without a trailing
				# newline ran into the next record.
				o.write('{}-{}|{}|{}\n'.format(lec, clip_id, text, text))