## Samsum dataset
- Easy access
- Decent quality
- Some commercial limitations noted: https://huggingface.co/datasets/samsum

In [None]:
!pip install py7zr
!pip list | grep 'py7zr'

In [None]:
import pandas as pd
from datasets import load_dataset

# Fetch the SAMSum corpus and view its test split as a DataFrame.
dataset = load_dataset("samsum")
samsum = pd.DataFrame(dataset["test"])

# Preview the first dialogue, one utterance per list element.
samsum["dialogue"].iloc[0].split("\n")


## GPT generative conversation
- Using microsoft/DialoGPT-medium and large
- PyTorch weight-loading issues noted with the BlenderBot large/small variants: https://huggingface.co/transformers/model_doc/blenderbot.html

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda:0" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")


In [None]:
# Let's chat for 5 lines
for step in range(5):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(
        input(">> User:") + tokenizer.eos_token, return_tensors="pt"
    )

    # append the new user input tokens to the chat history
    bot_input_ids = (
        torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
        if step > 0
        else new_user_input_ids
    )

    # generated a response while limiting the total chat history to 1000 tokens,
    chat_history_ids = model.generate(
        bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id
    )

    # pretty print last ouput tokens from bot
    print(
        "DialoGPT: {}".format(
            tokenizer.decode(
                chat_history_ids[:, bot_input_ids.shape[-1] :][0],
                skip_special_tokens=True,
            )
        )
    )


from transformers import BlenderbotSmallTokenizer, BlenderbotSmallForConditionalGeneration

# Single-turn reply from the 90M BlenderBot-small checkpoint.
mname = "facebook/blenderbot_small-90M"
tokenizer = BlenderbotSmallTokenizer.from_pretrained(mname)
model = BlenderbotSmallForConditionalGeneration.from_pretrained(mname)

UTTERANCE = "My friends are cool but they eat too many carbs."
print("Human: ", UTTERANCE)

# Tokenise the single utterance as a batch of one and let the model answer it.
inputs = tokenizer([UTTERANCE], return_tensors="pt")
reply_ids = model.generate(**inputs)
bot_reply = tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0]
print("Bot: ", bot_reply)
# example output: what kind of carbs do they eat? i don't know much about carbs.

## ParlAI
- Options for notebooks/source as well as CLI scripts configured across a variety of tasks
- Able to create CLI chat interface, with human in the loop (chit-chat task)
- Able to loop model back onto itself (model-model), useful for programmatically generating large conversational datasets?
- Grammar/sentence formation is an issue, though this does not matter for downstream TTS

In [None]:
!pip install parlai

In [None]:
# interactive prompts
# This command reads from stdin, which a notebook shell escape cannot provide,
# so run it from a real terminal rather than from this cell:
#   parlai interactive --model-file zoo:blender/blender_90M/model --task convai2

In [None]:
# tasks - 
# models - zoo:blender/blender_90M/model

parlai self_chat --model-file zoo:blender/blender_90M/model --task convai2 --inference topk --num-self-chats 1 --display-examples True --datatype valid --outfile 'self_chat_test.jsonl'
parlai self_chat --model-file zoo:blender/blender_400Mdistill/model --task blended_skill_talk --inference topk --num-self-chats 1 --display-examples True --datatype valid --outfile 'self_chat_test.jsonl'

In [None]:
# self chat
parlai self_chat --model-file zoo:blender/blender_90M/model --task convai2 --inference topk --num-self-chats 10 --display-examples True --datatype valid

In [None]:
# seed with below conversation_starters text file
/home/samhardyhey/.local/bin/parlai self_chat --model-file zoo:blender/blender_90M/model --task convai2 --inference topk --num-self-chats 2 --selfchat-max-turns 6 --display-examples True --datatype valid --seed-messages-from-file /home/samhardyhey/conversation_starters.txt

In [None]:
%%writefile /home/samhardyhey/conversation_starters.txt
Ponzi scheme insurer
Time wasters and SUPER expensive
Bramdon was an excellent customer service provider rather than an agent.

## Invoke ParlAI via script
- Intended for inclusion in the final `synthesize_transcripts.py` script

In [None]:
import subprocess

# Values the final script is expected to expose as parameters:
# model
# task
# num self chats
# outfile

# check=True raises CalledProcessError when ParlAI exits non-zero, so later cells
# never go on to read a stale or missing outfile.
subprocess.run(
    [
        "parlai", "self_chat",
        "--model-file", "zoo:blender/blender_90M/model",
        "--task", "convai2",
        "--inference", "topk",
        "--num-self-chats", "10",
        "--display-examples", "True",
        "--datatype", "valid",
        "--outfile", "self_chat_test.jsonl",
    ],
    check=True,
)

In [None]:
import srsly
import pandas as pd
import shortuuid

# list(srsly.read_jsonl('./self_chat_test.jsonl'))[0]
# list(output_dir.glob("./*.mp3"))

def format_episode(episode_raw):
    """Flatten one raw self-chat episode into a DataFrame with one row per utterance.

    Args:
        episode_raw: mapping with a ``"dialog"`` entry holding a list of exchanges,
            each exchange being a list of utterance dicts with ``"id"`` and
            ``"text"`` keys. Any other keys are ignored.

    Returns:
        DataFrame with columns ``exchange_index`` (running utterance position),
        ``id``, ``text``, ``speaker`` (integer suffix of ``id``) and ``episode_id``
        (one short uuid shared by every row of the episode). An empty dialog
        yields an empty frame with the same columns.

    Raises:
        ValueError: if an ``id`` does not end in ``_<integer>``.
    """
    utterances = [
        {"id": turn["id"], "text": turn["text"]}
        for exchange in episode_raw["dialog"]
        for turn in exchange
    ]

    # One identifier for the whole episode, so its rows can be grouped together later.
    episode_id = shortuuid.uuid()

    return (
        # Explicit columns keep the schema stable when the dialog is empty.
        pd.DataFrame(utterances, columns=["id", "text"])
        .reset_index()
        .rename(columns={"index": "exchange_index"})
        .assign(
            # The speaker number is the last "_"-separated field, so agent names
            # that contain underscores themselves are still parsed correctly.
            speaker=lambda df: [
                int(agent_id.rsplit("_", 1)[-1]) for agent_id in df["id"]
            ],
            episode_id=episode_id,
        )
    )


def synthesize_tts_episode(episode_df, output_dir, delay_seconds=1):
    """Synthesize one MP3 per utterance of an episode with gTTS.

    Args:
        episode_df: DataFrame with ``exchange_index``, ``speaker`` and ``text``
            columns; ``speaker`` must be 1 or 2.
        output_dir: directory object supporting the ``/`` operator (e.g. a
            ``pathlib.Path``). It is not created here.
        delay_seconds: pause before each request. The default of 1 keeps the
            original throttle (original note: "prevent IP banning?").

    Raises:
        ValueError: if a row has a speaker other than 1 or 2. This is raised before
            any audio is requested for that row, so an utterance is never saved
            with another utterance's audio.
    """
    # Imported locally because neither module is imported at the top of the notebook.
    import time

    import gtts

    # alternative voices per speaker, useful for debugging, could be improved with more variance
    tld_by_speaker = {1: "com", 2: "ca"}

    for row in episode_df.itertuples(index=False):
        if row.speaker not in tld_by_speaker:
            raise ValueError(
                f"unsupported speaker {row.speaker!r} at exchange {row.exchange_index}; "
                f"expected one of {sorted(tld_by_speaker)}"
            )

        time.sleep(delay_seconds)

        # File name encodes the utterance position and the speaker.
        save_path = output_dir / f"{row.exchange_index}_speaker_{row.speaker}.mp3"
        tts = gtts.gTTS(
            row.text, lang="en", tld=tld_by_speaker[row.speaker], slow=True
        )
        tts.save(save_path)

# Read every self-chat episode from the ParlAI outfile.
self_chat = [*srsly.read_jsonl("./self_chat_test.jsonl")]

# Only the first episode is processed for now, trimmed to its first 8 utterances.
# TODO: loop over all of `self_chat` once audio synthesis is wired in.
episode = format_episode(self_chat[0]).head(8)

# Sketch of the remaining steps (needs `Path` and `uuid4` imported and a real base dir):
# base_output_dir = Path("/path/to/synthesis")
#
# # create unique output dir
# episode_id = uuid4().hex
# output_dir = base_output_dir / episode_id
# output_dir.mkdir(exist_ok=True, parents=True)
#
# # synthesize, save audio
# synthesize_tts_episode(episode, output_dir)

In [None]:
!pip install shortuuid
!pip list | grep 'shortuuid'