In [None]:
import json 
import random
import itertools
import pandas as pd
from langchain.chat_models import ChatOpenAI
from langchain.chains import QAGenerationChain

In [None]:
def get_ep_text(ep_number):
    """Load one episode transcript and return it as a single string.

    Reads ``audio_transcription/0<ep_number>.txt`` as a headerless,
    tab-separated table with three columns (links, time, chunks), trims
    surrounding whitespace from every chunk, and joins the chunks with a
    single space between them.
    """
    # File names carry a literal "0" prefix in front of the episode number.
    path = f"audio_transcription/0{ep_number!s}.txt"
    frame = pd.read_csv(path, sep='\t', header=None)
    frame.columns = ['links', 'time', 'chunks']
    # Cast first so non-string cells are stringified before trimming.
    cleaned = frame['chunks'].astype(str).str.strip()
    return cleaned.str.cat(sep=' ')

def gen_questions(txt: str, N: int, chunk: int) -> list:
    """Generate question/answer sets from N random windows of a text.

    Args:
        txt: Source text to sample context windows from.
        N: Number of windows to sample; one ``chain.run`` call is made
            per window.
        chunk: Window length in characters.

    Returns:
        A list holding the result of each successful ``chain.run`` call,
        in sampling order. Windows whose call raised are reported on
        stdout and skipped. The list is empty when ``txt`` is empty or
        ``N`` is not positive.
    """
    if not txt or N <= 0:
        # Nothing to sample from, so skip building the LLM chain entirely.
        return []

    # Clamp the last valid start so a text shorter than ``chunk`` yields a
    # window starting at 0 instead of randint failing on an empty range.
    last_start = max(0, len(txt) - chunk)
    starting_indices = [random.randint(0, last_start) for _ in range(N)]
    sub_sequences = [txt[i:i + chunk] for i in starting_indices]
    chain = QAGenerationChain.from_llm(ChatOpenAI(temperature=0))
    eval_set = []
    for count, context in enumerate(sub_sequences, start=1):
        # Progress indicator: 1-based index of the window being processed.
        print("%s" % count)
        try:
            qa = chain.run(context)
        except Exception as exc:
            # Best effort: one failed window must not abort the whole run,
            # but Ctrl-C / SystemExit are no longer swallowed.
            print("Error: %s" % (exc,))
        else:
            eval_set.append(qa)
    return eval_set

# Draw 20 episode numbers uniformly from 1..121 (the same episode may repeat)
random_eps = [random.randint(1, 121) for _ in range(20)]

# For each drawn episode, generate 5 question sets from 5000-char contexts
all_eval = []
for ep_number in random_eps:
    print(f"EPISODE: {ep_number}")
    txt = get_ep_text(ep_number)
    qus = gen_questions(txt, 5, 5000)
    all_eval.append(qus)

# Flatten the per-episode lists by one level and report the total count
eval_set_full = [entry for per_episode in all_eval for entry in per_episode]
print(len(eval_set_full))

# Persist the flattened evaluation set as JSON
with open('eval/eval_set.json', mode='w') as fout:
    json.dump(eval_set_full, fout)