In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import ffmpeg
from tqdm import tqdm

In [2]:
def join_sentence(df:pd.DataFrame) -> pd.Series:
	"""Collapse the word rows of one sentence into a single row.

	Parameters
	----------
	df : pd.DataFrame
		Rows belonging to one sentence, in order. Must contain the columns
		'w', 'e' and 'a'; any other columns are carried over from the first row.
		(Presumably 'w' is the word text, 'e' the end time and 'a' a per-word
		confidence score -- confirm against the transcript JSON.)

	Returns
	-------
	pd.Series
		A copy of the first row in which 'w' is the space-joined 'w' values of
		all rows, 'e' is the 'e' value of the last row and 'a' is the minimum
		'a' over all rows.
	"""
	# Work on an explicit copy: `df.iloc[0]` may be a view of the group frame,
	# in which case the assignments below would raise SettingWithCopyWarning
	# or silently write into the caller's data.
	series = df.iloc[0].copy()
	series['w'] = ' '.join(df['w'])
	series['e'] = df.iloc[-1]['e']
	series['a'] = df['a'].min()
	return series

In [3]:
# Sentence-level transcript tables, keyed by lecture number.
transcripts: dict[int, pd.DataFrame] = {}

for lec in range(1,11):
	# Word-level transcript of this lecture, without the unused 'i' and 't' columns.
	words = pd.read_json(
		f'Data/Text/Lec{lec}.json',
		orient='records',
		dtype={'a': np.uint8}, # type: ignore
	).drop(columns=['i','t'])
	# Scale 's' and 'e' down by 1000 (presumably milliseconds -> seconds; confirm).
	words[['s','e']] = words[['s','e']] / 1000

	# A row opens a new sentence when the previous word ended with '.';
	# the running count of such rows is the sentence number used as group key.
	opens_sentence = words['w'].str.endswith('.').shift(fill_value=False)
	sentence_no = opens_sentence.cumsum()
	merged = (
		words
		.groupby(sentence_no, as_index=True)
		.apply(join_sentence)
		.rename_axis('i')
	)

	# Keep sentences whose minimum 'a' is at least 70 and that contain
	# at least three spaces (i.e. four or more space-separated words).
	keep = (merged['a'] >= 70) & (merged['w'].str.count(' ') >= 3)
	# Optional export of "<clip path>|<sentence>" lines, currently disabled:
	# np.savetxt(f'Data/Text/Lec{lec}.txt',(f'Data/Audio/Lec{lec}/'+sentences['i'].apply(str)+'.wav|'+sentences['w']),fmt='%s')
	transcripts[lec] = merged[keep].reset_index()

In [None]:
# Upper bound on simultaneously running ffmpeg processes. Starting one process
# per sentence all at once can exhaust CPU, memory and file descriptors on long
# lectures, so clips are cut through a sliding window of this size.
MAX_PARALLEL_CLIPS = 8

for lec in range(3,4): # NOTE(review): only lecture 3 is processed here; widen the range for the others
	Path(f'Data/Audio/Lec{lec}').mkdir(parents=True,exist_ok=True)
	full_wav = f'Data/Audio/Lec{lec}/.wav'
	# mp4 (AAC) -> wav (PCM)
	ffmpeg.input(f'Data/Video/Lec{lec}.mp4').output(full_wav).overwrite_output().run()

	# clips from the whole wav
	running = [] # (clip index, process) pairs, oldest first
	failed = []  # clip indices whose ffmpeg process exited with a non-zero code
	for t in tqdm(transcripts[lec].itertuples(), total=len(transcripts[lec])):
		# Window is full: wait for the oldest process before starting another one.
		if len(running) >= MAX_PARALLEL_CLIPS:
			clip_id, proc = running.pop(0)
			if proc.wait() != 0:
				failed.append(clip_id)
		running.append((
			t.i,
			ffmpeg.input(full_wav)
				.output(f'Data/Audio/Lec{lec}/{t.i}.wav', ss=t.s, to=t.e)
				.overwrite_output()
				.run_async(),
		))

	# wait until all is done
	for clip_id, proc in running:
		if proc.wait() != 0:
			failed.append(clip_id)

	# Report failures instead of ignoring the exit codes; the run itself continues.
	if failed:
		print(f'Lec{lec}: ffmpeg failed for {len(failed)} clip(s): {failed}')

In [5]:
# from scipy.io import wavfile
# wavfile.read('Data/Audio/Lec1-1.wav')