Load the OpenAssistant (oasst1) Conversation Dataset

In [1]:
from datasets import load_dataset
# LAION OpenAssistant (oasst1) conversations; only the validation split is loaded.
chat_dataset = load_dataset(
    "OpenAssistant/oasst1",
    split="validation",
)

In [2]:
# From LAION notebook on how to work with the dataset
import pandas as pd
from treelib import Tree

# Convert the loaded split to a pandas DataFrame; later cells filter and sample from it.
chat_df = chat_dataset.to_pandas()

In [3]:
# helper function from LAION notebook
def add_tree_level(df):
    """Return ``df`` with a ``tree_level`` column giving each message's depth.

    A message whose ``parent_id`` is missing (``None`` or NaN) is a root at
    level 0, a message whose ``parent_id`` equals its ``message_tree_id`` is a
    direct reply to the root (level 1), and any other message sits one level
    below its parent. Levels are resolved by walking up the parent links, so
    the rows may be in any order.

    Args:
        df: DataFrame with ``message_id``, ``parent_id`` and
            ``message_tree_id`` columns.

    Returns:
        ``df`` itself if it already has a ``tree_level`` column; otherwise the
        result of merging the computed levels into ``df`` on ``message_id``.

    Raises:
        KeyError: if a message's parent is not present in ``df`` (and is not
            the tree id), or if the parent links form a cycle.
    """

    # if tree level already exists, return df
    if "tree_level" in df.columns:
        return df

    # Collect the parent link and tree id of every message up front so that
    # levels can be resolved regardless of row order.
    parent_of = {}
    tree_of = {}
    for message_id, parent_id, tree_id in zip(
        df["message_id"], df["parent_id"], df["message_tree_id"]
    ):
        # A missing parent can be None or NaN depending on how the frame was built.
        parent_of[message_id] = None if pd.isna(parent_id) else parent_id
        tree_of[message_id] = tree_id

    tree_level_map = {}
    for message_id in parent_of:
        chain = []        # messages waiting for an ancestor's level
        on_chain = set()  # ids visited on this walk, used to detect cycles
        current = message_id
        while current not in tree_level_map:
            on_chain.add(current)
            parent_id = parent_of[current]
            if parent_id is None:
                # no parent: root message
                tree_level_map[current] = 0
            elif parent_id == tree_of[current]:
                # parent is the tree id: direct reply to the root message
                tree_level_map[current] = 1
            elif parent_id in tree_level_map:
                tree_level_map[current] = tree_level_map[parent_id] + 1
            elif parent_id in parent_of and parent_id not in on_chain:
                # parent exists but is unresolved: resolve it first
                chain.append(current)
                current = parent_id
            else:
                raise KeyError(
                    f"parent {parent_id!r} of message {current!r} is missing "
                    "from df or is part of a cycle"
                )
        # Unwind: each pending message is one level below its resolved parent.
        for child in reversed(chain):
            tree_level_map[child] = tree_level_map[parent_of[child]] + 1

    # create a df from the tree_level_map and merge it with the original df
    df_tree_level_map = (
        pd.DataFrame.from_dict(tree_level_map, orient="index", columns=["tree_level"])
        .reset_index()
        .rename(columns={"index": "message_id"})
    )

    return df.merge(df_tree_level_map, on="message_id")

In [4]:
# Pick the tree id of one randomly sampled message.
tree_id = chat_df["message_tree_id"].sample(1).values[0]
# Gather every entry of that tree, oldest first, then annotate each with its depth.
message_tree = add_tree_level(
    chat_df.query(f"message_tree_id == '{tree_id}'").sort_values("created_date")
)

In [5]:
# Display the sampled tree, including the tree_level column added in the previous cell.
message_tree

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels,tree_level
0,b3a054b8-81f9-447e-94a4-0241b785b073,,9da1e24f-58ff-404c-a71c-827aaef7d73e,2023-02-10T04:03:43.939027+00:00,如果我需要运行一个参数量为176B的语言模型，请问我分别大致需要多少GB显存来进行训练和推断？,prompter,zh,3,True,False,,False,,,b3a054b8-81f9-447e-94a4-0241b785b073,ready_for_export,"{'name': ['+1', '_skip_reply'], 'count': [1, 8]}","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",0
1,7378c219-30f1-46e2-8ec1-75dcaf134039,b3a054b8-81f9-447e-94a4-0241b785b073,1b1f9abf-a35d-49dc-a71d-49bdfdd79111,2023-02-20T12:39:33.184210+00:00,运行一个参数量为176B的语言模型需要的显存大小取决于所使用的硬件和训练和推断的批处理大小等...,assistant,zh,3,True,False,0.0,False,,,b3a054b8-81f9-447e-94a4-0241b785b073,ready_for_export,"{'name': ['_skip_labeling'], 'count': [1]}","{'name': ['spam', 'fails_task', 'lang_mismatch...",1
2,2a5ae932-4b7b-499c-a01d-cf14aa0a6b7d,b3a054b8-81f9-447e-94a4-0241b785b073,a7ee8fe3-b13e-4629-b8c0-281fb3035f64,2023-02-21T06:36:00.289792+00:00,对不起呢，我不知道要怎么回答,assistant,zh,3,True,False,2.0,False,,,b3a054b8-81f9-447e-94a4-0241b785b073,ready_for_export,"{'name': ['-1', '_skip_labeling'], 'count': [1...","{'name': ['spam', 'fails_task', 'lang_mismatch...",1
3,bc069a65-8717-496e-90ce-d5657a8e5449,b3a054b8-81f9-447e-94a4-0241b785b073,0be7fae6-101e-4f37-a4ea-94605b507b8e,2023-03-03T02:15:34.652904+00:00,一般来说，对于大型的语言模型，需要使用高端的GPU硬件来进行训练和推断。具体到需要多少GB的...,assistant,zh,3,True,False,1.0,False,,,b3a054b8-81f9-447e-94a4-0241b785b073,ready_for_export,"{'name': ['+1'], 'count': [1]}","{'name': ['spam', 'fails_task', 'lang_mismatch...",1


Some notes:
- Message tree has many repeat messages
- This is because there are multiple AI responses to the same message, and they are ranked by users
- Best way to extract a coherent conversation is:
- Randomly pick one of the last prompter messages and follow its parent_id up the tree to create a list of the text associated with those rows
- Reverse the list to have a conversation

In [6]:
def sample_conversation_from_tree(message_tree, random_state=None):
    """Sample one root-to-leaf conversation from a single message tree.

    One message is drawn at random from those at the deepest ``tree_level``;
    its ``parent_id`` links are then followed up to the root and the collected
    texts are returned oldest-first.

    Args:
        message_tree: DataFrame for one tree with ``message_id``,
            ``parent_id``, ``text`` and ``tree_level`` columns.
        random_state: Optional seed or generator forwarded to
            ``DataFrame.sample`` so the draw can be reproduced. Defaults to
            ``None`` (unseeded).

    Returns:
        List of message texts ordered from the root message to the sampled one.

    Raises:
        IndexError: if a ``parent_id`` refers to a message that is not in
            ``message_tree``.
    """
    # Randomly pick one of the messages at the deepest level of the tree
    deepest_level = message_tree["tree_level"].max()
    candidates = message_tree[message_tree["tree_level"] == deepest_level]
    selected_message = candidates.sample(1, random_state=random_state)

    # Follow its parent_id up the tree to create a list of the text associated with those rows
    conversation = []
    current_id = selected_message["message_id"].values[0]

    # The root's missing parent may be None or NaN; pd.isna stops on either.
    while not pd.isna(current_id):
        current_row = message_tree[message_tree["message_id"] == current_id]
        conversation.append(current_row["text"].values[0])
        current_id = current_row["parent_id"].values[0]

    # Reverse the list to have a conversation
    conversation.reverse()

    return conversation

def sample_conversations(batch_size):
    """Draw ``batch_size`` conversations from the global ``chat_df``.

    For each conversation, the tree id of one randomly sampled message row is
    taken, that tree's rows are sorted by ``created_date`` and annotated with
    ``add_tree_level``, and a single path is sampled from it with
    ``sample_conversation_from_tree``.

    Returns:
        List of ``batch_size`` conversations, each a list of message texts.
    """
    def _draw_one():
        picked_tree_id = chat_df["message_tree_id"].sample(1).values[0]
        tree_rows = chat_df.query(f"message_tree_id == '{picked_tree_id}'")
        leveled_tree = add_tree_level(tree_rows.sort_values("created_date"))
        return sample_conversation_from_tree(leveled_tree)

    return [_draw_one() for _ in range(batch_size)]

In [8]:
# Sample four conversations; convos[0] is fed to the augmentation cell further down.
convos = sample_conversations(4)


# SD Pipeline

SD pipeline to quickly generate images (this makes it easier to generate random things and experiment)

In [5]:
from diffusers import AutoPipelineForText2Image
import torch

# Load SDXL-Turbo with float16 weights and move the pipeline to the CUDA device.
pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/sdxl-turbo",
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe.to("cuda")

def generate(prompt, num_inference_steps=1, guidance_scale=0.0):
    """Generate a single image for ``prompt`` with the module-level ``pipe``.

    Args:
        prompt: Text prompt passed to the pipeline.
        num_inference_steps: Number of inference steps passed to the pipeline.
            Defaults to 1, the value that was previously hard-coded.
        guidance_scale: Guidance scale passed to the pipeline. Defaults to
            0.0, the value that was previously hard-coded.

    Returns:
        The first entry of the pipeline result's ``images`` list.
    """
    result = pipe(
        prompt=prompt,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
    )
    return result.images[0]

A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'


model_index.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Fetching 18 files:   0%|          | 0/18 [00:00<?, ?it/s]

scheduler/scheduler_config.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

text_encoder/config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

text_encoder_2/config.json:   0%|          | 0.00/575 [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

model.fp16.safetensors:   0%|          | 0.00/1.39G [00:00<?, ?B/s]

model.fp16.safetensors:   0%|          | 0.00/246M [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

tokenizer_2/special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

tokenizer_2/tokenizer_config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

tokenizer_2/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/5.14G [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

tokenizer_2/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [6]:
# Smoke-test the pipeline: generate one image and call .show() on the result.
generate("A cinematic shot of a baby racoon wearing an intricate italian priest robe.").show()

  0%|          | 0/1 [00:00<?, ?it/s]

# Prompt Engineering

In [9]:
import openai
# API_KEY is read from a separate `secret` module, so the key itself is not written in this notebook.
from secret import API_KEY
openai.api_key = API_KEY

In [47]:
class ChatHarness:
    """Minimal stateful wrapper around ``openai.chat.completions.create``.

    The whole message history (system prompt, user turns, assistant replies)
    is kept in ``self.messages`` and sent with every request.
    """

    def __init__(self, init_prompt="You are a helpful assistant", model="gpt-4-1106-preview"):
        """Start a conversation whose only message is the system prompt.

        Args:
            init_prompt: Content of the initial "system" message.
            model: Model name passed to the completions endpoint on each call.
        """
        self.messages = [{"role": "system", "content": init_prompt}]
        self.model = model

    def __call__(self, msg):
        """Send ``msg`` as the next user turn and return the assistant's reply text.

        The user turn and the reply are appended to ``self.messages`` only
        after the request has succeeded, so a request that raises leaves the
        history unchanged and the same message can simply be sent again.
        """
        user_turn = {"role": "user", "content": msg}
        response = openai.chat.completions.create(
            model=self.model,
            # Send history plus the new turn without committing it yet.
            messages=self.messages + [user_turn],
        )
        res = response.choices[0].message.content

        self.messages.extend([user_turn, {"role": "assistant", "content": res}])

        return res

In [31]:
# Quick check of the harness using its default system prompt.
chat = ChatHarness()
print(chat("Hello, I'm looking for places to visit in paris during my trip there. Any ideas?"))

Of course! Paris is a beautiful city filled with incredible attractions. Here are some must-visit places to consider during your trip:

1. Eiffel Tower: No trip to Paris is complete without a visit to the iconic Eiffel Tower. You can go up to the top for magnificent views of the city.

2. Louvre Museum: Explore the world's largest art museum and admire its vast collection, including the famous Mona Lisa.

3. Notre-Dame Cathedral: Marvel at the stunning Gothic architecture of this historic cathedral located on the Île de la Cité.

4. Montmartre: Visit this charming neighborhood and explore Sacré-Cœur Basilica, stroll through its charming streets, and enjoy the bohemian atmosphere.

5. Champs-Élysées: Walk along this famous avenue, lined with shops, cafes, and landmarks such as the Arc de Triomphe.

6. The Seine River: Take a relaxing cruise along the river to see many iconic landmarks like the Louvre, Notre-Dame, and the Eiffel Tower.

7. Palace of Versailles: Just outside of Paris, the

In [32]:
# Follow-up on the same `chat` object, whose message history still holds the previous exchange.
print(chat("Is the park near the eiffel tower dog friendly?"))

Yes! The park near the Eiffel Tower, called Champ de Mars, is indeed dog-friendly. It's a pleasant green space where you can take your furry friend for a walk or have a picnic. However, do keep in mind that there might be certain areas or sections within the park where dogs might need to be kept on a leash, so it's always a good idea to check the local regulations to ensure a positive experience for everyone.


In [33]:
# System prompt for the data-synthesis harness. It describes the task (insert
# user-sent media tags such as [image1] into an existing conversation) and
# gives one worked example: a plain conversation followed by the same
# conversation with all three described media items inserted.
data_synth_prompt = \
"""You will be presented with two things: 1. a conversation between a human and an assistant.
and 2. An unrelated assortment of pieces of non-text media. You will not be shown the non-text media, but
will be given comprehensive textual descriptions of the media (i.e. 
[[image1] a picture of a golden retreiver playing fetch with his owner who is in a red t shirt. they are both in a park),
[[image2] a dining room with a vase in the center. the vase is full of flowers but the flowers are wilting and should be watered,
[[audio1] the sound of a dog barking. it is humorous and features a human laughing at the odd barking]])
You must insert the media into the conversation as if the conversation brought up the media. Consider this simple
example conversation:
["User: Hello!", 
"Assistant: Hi, how can I help you?",
"User: I was looking for places to explore in Paris, any ideas?",
"Assistant: To start, I'd reccomend the Eiffel Tower, the Louvre, and the Notre Dame. Let me know if you need any more suggestions!",
"User: Thank you! That's a good place to leave off I think",
"Assistant: You're welcome! Let me know if there's anything else you need my help with"]
You must insert the media in a way that makes sense and naturally adds to the conversation. Only the user can send non-text media.
To show that the user sent non-text media, simply insert [image1], [image2] etc. to represent the media. Use the
descriptions on the media to dictate how they contribute to the conversation. Try to make inferences about the photo from the description,
rather than just stating the description itself. Remember, ONLY THE USER can send non-text media,
but the assistant can discuss the media the user sent (and see it). Return a conversation as a list of strings. Here's
an example of how the previous media could be added to the conversation:
["User: Hello!", 
"Assistant: Hi, how can I help you?",
"User: I was looking for places to explore in Paris, any ideas?",
"Assistant: To start, I'd reccomend the Eiffel Tower, the Louvre, and the Notre Dame. Let me know if you need any more suggestions!",
"User: Thank you! I'm actually also travelling there with my dog.",
"Assistant: Oh I love dogs! Could I see a picture of your dog?",
"User: Sure! Here you go [image1]",
"Assistant: Oh wow! A golden retreiver! So cute!!! I like your red shirt. What park was this in?",
"User: It's actually a local park near my house. He gets super excited whenever I go there",
"Assistant: That's adorable! If you're taking him to Paris I'm sure he'd love the park near the Eiffen Tower. It's called Champ de Mars and is in fact dog friendly.",
"User: Oh wow, that's great news! We'll be sure to check it out. This is unrelated but hes making a really weird noise right now. Check this out [audio1]",
"Assistant: Haha! He sounds so silly. Is there anything else involving your trip you need help with?",
"User: Not really. I would like some help with home decor though if that's possible.",
"Assistant: Sure! What do you need help with?",
"User: Well here's my dining room [image2]. Any way to make it more colorful?",
"Assistant: Oh no! It looks like those flowers are wilting. You should water them as soon as possible.",
"User: Oops! I'm going to go do that ASAP. Thanks for the help.",
"Assistant: No problem. Let me know if you need any further help with home decor."]
"""
# Harness whose system prompt is the data-synthesis instructions in data_synth_prompt.
chat = ChatHarness(data_synth_prompt)

You will be presented with two things: 1. a conversation between a human and an assistant.
and 2. An unrelated assortment of pieces of non-text media. You will not be shown the non-text media, but
will be given comprehensive textual descriptions of the media (i.e. 
[[image1] a picture of a golden retreiver playing fetch with his owner who is in a red t shirt. they are both in a park),
[[image2] a dining room with a vase in the center. the vase is full of flowers but the flowers are wilting and should be watered,
[[audio1] the sound of a dog barking. it is humorous and features a human laughing at the odd barking]])
You must insert the media into the conversation as if the conversation brought up the media. Consider this simple
example conversation:
["User: Hello!", 
"Assistant: Hi, how can I help you?",
"User: I was looking for places to explore in Paris, any ideas?",
"Assistant: To start, I'd reccomend the Eiffel Tower, the Louvre, and the Notre Dame. Let me know if you need any more 

In [48]:
def augment_convo(conversation, media_list):
    """
    Ask the model to weave the given media into a conversation and print its answer.

    Args:
        conversation: List of message strings. Even positions are labelled as
            user turns and odd positions as assistant turns. The list itself
            is not modified.
        media_list: List of ``(label, caption)`` tuples, where ``label`` is the
            media type (e.g. "image") and ``caption`` describes the item.

    Media items are tagged per type in order of appearance (``[image1]``,
    ``[image2]``, ``[audio1]``, ...), the same tag form that
    ``data_synth_prompt`` tells the model to insert.

    Returns:
        None. The model's second reply is printed.
    """
    # Label the turns on a new list so re-running on the same conversation
    # does not stack "User: " / "Assistant: " prefixes.
    labelled = [
        ("User: " if i % 2 == 0 else "Assistant: ") + turn
        for i, turn in enumerate(conversation)
    ]

    res = "MEDIA INFO:\n"
    accums = {}  # running count per media type, used to number the tags
    for (label, caption) in media_list:
        accums[label] = accums.get(label, 0) + 1
        res += f"[{label}{accums[label]}] {caption}\n"
    res += f"CONVERSATION:\n{labelled}"
    res += "\nNow please generate the augmented conversation as a list of strings"

    chat = ChatHarness(data_synth_prompt)
    # The first reply is discarded: the model is always told the attempt was
    # rejected, and only its second attempt is printed.
    chat(res)
    msg = chat("Rejected. Assistant cannot send images. Please try again.")
    print(msg)

In [49]:
# Try the augmentation on the first sampled conversation with two image captions.
augment_convo(convos[0], [("image", "A cinematic shot of a baby racoon wearing an intricate italian priest robe."), ("image", "a reindeer pulls santas sled by itself, digital artwork")])

["User: If I found a dollar bill, would it be illegal to keep it ?",
"Assistant: No, it would not be illegal to keep a dollar that you found unattended in public. However, if you \"found\" a dollar in a tip-jar, on a table at a restaurant, or in someone's home, that would be considered stealing.",
"User: Should someone who finds lost property be compensated for their efforts in trying to return it to its owner?",
"Assistant: Morally speaking, it’s nice to offer a reward to someone who went out of their way to return something lost. Legally, there is generally no obligation to compensate someone for returning your property, but some people do anyway as a token of gratitude.",
"User: Ok, What would be the best way to contact the original owner?",
"Assistant: If you find a lost item like a dollar bill with no identification, there isn't much you can do to find the original owner. With items that can be traced back to the owner, it's best to use any available contact information, or turn i