In [21]:
import numpy as np
import pandas as pd
import os
import ffmpeg
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
from itertools import chain

In [23]:
def join_sentence(df:pd.DataFrame) -> pd.Series:
	"""Collapse the word-level rows of one sentence into a single row.

	Args:
		df: Non-empty frame holding the rows of one sentence in order, with at
			least the columns 'w', 'e' and 'a'.

	Returns:
		A new Series based on the first row of `df` in which 'w' is every 'w'
		value joined by single spaces, 'e' is taken from the last row and 'a'
		is the mean of the group's 'a' values. All other fields (e.g. 's')
		keep the first row's value.
	"""
	# Copy the first row: assigning into the bare `df.iloc[0]` result triggers
	# pandas' SettingWithCopy warning and may write through to `df` when iloc
	# hands back a view.
	series = df.iloc[0].copy()
	series['w'] = ' '.join(df['w'])
	series['e'] = df.iloc[-1]['e']
	series['a'] = df['a'].mean()
	return series

In [25]:
# Build a sentence-level transcript for every selected lecture and write one
# `Lec<lec>-<row>|<text>|<text>` line per kept sentence to data/metadata.txt.
os.makedirs('data', exist_ok=True) # the output directory may not exist yet

transcripts = {} # lecture id -> DataFrame with one row per kept sentence
with open('data/metadata.txt', 'wb') as f: # 'wb' truncates, so each run starts from an empty file
	for lec in [1]: # chain(range(1,21),[10.5,18.5]): # [1]:
		data = pd.read_json(
			f'Text/Lec{lec}.json',
			orient='records',
		)
		data = data.drop(columns=['i','t'])
		data[['s','e']] /= 1000 # presumably ms -> s; these values are later handed to ffmpeg as ss/to

		sents = sent_tokenize(' '.join(data['w']))
		indices = np.fromiter((sent.count(' ')+1 for sent in sents), int, len(sents)).cumsum() # end of sentence indices
		# The labelling below is only valid when the sentences cover the words
		# one-to-one; fail loudly instead of silently mislabelling rows.
		if len(indices) and indices[-1] != len(data):
			raise ValueError(f'Lec{lec}: sentences cover {indices[-1]} words but the transcript has {len(data)}')
		labels = np.zeros(len(data), int)
		labels[indices[:-1]] = 1 # mark the first word of every sentence but the first (the last index is the length)
		labels = labels.cumsum() # sentence labels [0,1,...] for each word

		transcript = data.groupby(labels).apply(join_sentence)
		sent_lens = transcript['w'].str.count(' ')+1 # count by space +1 (ignores contractions, numerals, etc.)
		duration = (transcript['e']-transcript['s']).astype(float)
		keep = (
			(duration > 0) # a zero-length sentence would otherwise pass as "infinitely fast"
			& (sent_lens > duration) # > 1 word per second, i.e. > 60 words per minute (avoiding long pauses)
			& (transcript['a'] >= 70) # 'a' >= 70 (avoiding low accuracy sentences)
		)
		# keep &= sent_lens >= 4 # >=4 words (avoiding short sentences)
		transcript = transcript[keep].reset_index() # row number 0..n-1 becomes the clip id
		transcripts[lec] = transcript

		lines = f'Lec{lec}-'+transcript.index.astype(str)+('|'+transcript['w'])*2
		# savetxt encodes as latin-1 on binary handles unless told otherwise,
		# which fails on any character outside that charset.
		np.savetxt(f, lines, fmt='%s', encoding='utf-8')

In [None]:
# Convert the audio of every transcribed lecture to wav and cut one clip per sentence.
os.makedirs('data/Video', exist_ok=True) # ffmpeg does not create missing output directories
os.makedirs('data/wavs', exist_ok=True)
max_parallel = os.cpu_count() or 1 # upper bound on simultaneously running ffmpeg processes

# Iterate over the lectures that actually have a transcript instead of a
# hard-coded range, which raised KeyError for any lecture missing from `transcripts`.
for lec, transcript in transcripts.items():
	# mp4 (AAC) -> wav (PCM)
	ffmpeg.input(f'Video/Lec{lec}.mp4').output(f'data/Video/Lec{lec}-all.wav', ar=22050).overwrite_output().run()

	# clips from the whole wav, started in bounded batches so that a long
	# lecture does not spawn one ffmpeg process per sentence all at once
	rows = list(transcript.itertuples())
	failed = []
	with tqdm(total=len(rows), desc=f'Lec{lec}', leave=False) as progress:
		for start in range(0, len(rows), max_parallel):
			batch = [
				(
					t.Index,
					ffmpeg.input(f'data/Video/Lec{lec}-all.wav')
						.output(f'data/wavs/Lec{lec}-{t.Index}.wav', ss=t.s, to=t.e)
						.overwrite_output()
						.run_async(),
				)
				for t in rows[start:start+max_parallel]
			]
			for clip, process in batch:
				if process.wait() != 0: # wait until the whole batch is done
					failed.append(clip)
				progress.update()
	if failed:
		# Report rather than abort: the remaining clips are still usable.
		print(f'Lec{lec}: ffmpeg failed for clips {failed}')

In [5]:
# from scipy.io import wavfile
# wavfile.read('wavs/Lec1-1.wav')

In [4]:
# Shell command: recursively archive the data/ directory (metadata.txt plus the
# generated wav files written by the cells above) into data.zip.
!zip -r data.zip data