In [None]:
# requirements (pinned package versions for this notebook)
# Kept as comments: a bare pip specifier such as `transformers==4.21.0` is a
# SyntaxError when the cell is executed as Python.
# To install: %pip install transformers==4.21.0 datasets==2.4.0 torch==1.10.0 pandas==1.4.3
# transformers==4.21.0
# datasets==2.4.0
# torch==1.10.0
# pandas==1.4.3

## Samsum dataset
- Loads without problems via `datasets.load_dataset`
- The licence places some limits on commercial use; see the dataset card: https://huggingface.co/datasets/samsum

In [2]:
import pandas as pd
from datasets import load_dataset

# Fetch the SAMSum corpus through the Hugging Face `datasets` loader.
dataset = load_dataset("samsum")

# Wrap the test split in a DataFrame so individual rows are easy to inspect.
test_split = dataset['test']
samsum = pd.DataFrame(test_split)

  from .autonotebook import tqdm as notebook_tqdm
Reusing dataset samsum (/Users/samhardy/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)
100%|██████████| 3/3 [00:00<00:00, 1102.60it/s]


In [4]:
# Preview the first conversation in the frame, broken into a list on newlines.
first_dialogue = samsum.iloc[0].dialogue
first_dialogue.split('\n')

["Hannah: Hey, do you have Betty's number?",
 'Amanda: Lemme check',
 'Hannah: <file_gif>',
 "Amanda: Sorry, can't find it.",
 'Amanda: Ask Larry',
 'Amanda: He called her last time we were at the park together',
 "Hannah: I don't know him well",
 'Hannah: <file_gif>',
 "Amanda: Don't be shy, he's very nice",
 'Hannah: If you say so..',
 "Hannah: I'd rather you texted him",
 'Amanda: Just text him 🙂',
 'Hannah: Urgh.. Alright',
 'Hannah: Bye',
 'Amanda: Bye bye']

## GPT generative conversation
- Using `microsoft/DialoGPT-medium` and `microsoft/DialoGPT-large`
- PyTorch weight-loading issues noted with the BlenderBot large/small variants: https://huggingface.co/transformers/model_doc/blenderbot.html
- EleutherAI GPT-Neo 2.7B is a download of roughly 10 GB

In [None]:
from transformers import pipeline

# Text-generation pipeline backed by the GPT-Neo 2.7B checkpoint.
generator = pipeline(task='text-generation', model='EleutherAI/gpt-neo-2.7B')

# Sample a continuation of the prompt.
generator("EleutherAI has", min_length=50, do_sample=True)

# Example output (sampling is enabled, so the text differs between runs):
# [{'generated_text': 'EleutherAI has made a commitment to create new software packages for each of its major clients and has'}]

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# The tokenizer and the causal-LM weights are both loaded from the same checkpoint name.
checkpoint = "microsoft/DialoGPT-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

In [None]:
# Hold a five-turn conversation with the loaded model.
# Relies on `tokenizer`, `model` and `torch` already being defined in the kernel.
for step in range(5):
    # read one line from the user and terminate it with the EOS token before encoding
    user_text = input(">> User:")
    new_user_input_ids = tokenizer.encode(user_text + tokenizer.eos_token, return_tensors='pt')

    # first turn: the prompt is just the user's line;
    # later turns: append the new line to the running chat history
    if step > 0:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
    else:
        bot_input_ids = new_user_input_ids

    # generate a response; max_length caps the total sequence (history + reply) at 1000 tokens
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

    # print only the tokens that follow the prompt, i.e. the bot's newest reply
    prompt_len = bot_input_ids.shape[-1]
    bot_reply_ids = chat_history_ids[0, prompt_len:]
    print("DialoGPT: {}".format(tokenizer.decode(bot_reply_ids, skip_special_tokens=True)))

In [None]:
from transformers import BlenderbotSmallTokenizer, BlenderbotSmallForConditionalGeneration

# NOTE: this cell rebinds `model` and `tokenizer`, replacing the DialoGPT objects
# loaded earlier in the notebook. Re-run the DialoGPT loading cell before going
# back to the DialoGPT chat loop.
mname = 'facebook/blenderbot_small-90M'
model = BlenderbotSmallForConditionalGeneration.from_pretrained(mname)
tokenizer = BlenderbotSmallTokenizer.from_pretrained(mname)

UTTERANCE = "My friends are cool but they eat too many carbs."
print("Human: ", UTTERANCE)

inputs = tokenizer([UTTERANCE], return_tensors='pt')
# The encoding is unpacked straight into generate(), so remove token_type_ids
# if the tokenizer produced it. The `None` default keeps the cell from raising
# KeyError when the tokenizer does not emit that key.
inputs.pop("token_type_ids", None)
reply_ids = model.generate(**inputs)
print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0])
# Example reply:
# what kind of carbs do they eat? i don't know much about carbs.

## ParlAI
- Options for notebooks/source as well as CLI scripts configured across a variety of tasks
- Able to create CLI chat interface, with human in the loop (chit-chat task)
- Able to loop model back onto itself (model-model), useful for programmatically generating large conversational datasets?
- Grammar/sentence formation in the generated text is an issue, though this is not a problem for downstream TTS

In [None]:
# interactive prompts
# The leading `!` hands the line to the system shell; without it the cell is not runnable notebook code.
# NOTE(review): this command waits for typed input - if the notebook front-end cannot
# forward stdin to shell commands, run it from a terminal instead.
!/home/samhardyhey/.local/bin/parlai interactive --model-file zoo:blender/blender_90M/model --task convai2

In [None]:
# self chat: the model converses with itself for 10 chats, with examples printed to the output
# The leading `!` hands the line to the system shell; without it the cell is not runnable notebook code.
!/home/samhardyhey/.local/bin/parlai self_chat --model-file zoo:blender/blender_90M/model --task convai2 --inference topk --num-self-chats 10 --display-examples True --datatype valid

In [None]:
# seed with below conversation_starters text file
# Run the %%writefile cell further down first: it creates the file passed to --seed-messages-from-file.
# The leading `!` hands the line to the system shell; without it the cell is not runnable notebook code.
!/home/samhardyhey/.local/bin/parlai self_chat --model-file zoo:blender/blender_90M/model --task convai2 --inference topk --num-self-chats 2 --selfchat-max-turns 6 --display-examples True --datatype valid --seed-messages-from-file /home/samhardyhey/conversation_starters.txt

In [3]:
%%writefile /home/samhardyhey/conversation_starters.txt
Ponzi scheme insurer
Time wasters and SUPER expensive
Bramdon was an excellent customer service provider rather than an agent.

Overwriting /home/samhardyhey/conversation_starters.txt
