# Getting started with OpenAssistant OASST1 data

- https://huggingface.co/datasets/OpenAssistant/oasst1

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/master/notebooks/openassistant-oasst1/getting-started.ipynb)

In [None]:
# uncomment and run below lines to set up if running in colab
!pip install datasets pandas treelib

Collecting datasets
  Downloading datasets-2.14.1-py3-none-any.whl (492 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/492.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/492.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.4/492.4 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting treelib
  Downloading treelib-1.6.4-py3-none-any.whl (18 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hCol

# Imports

In [None]:
import pandas as pd
from datasets import load_dataset
from treelib import Tree

# set some pandas options to make the output more readable
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)


def add_tree_level(df):
    """helper function to add tree level to a df"""

    # if tree level already exists, return df
    if "tree_level" in df.columns:
        return df

    else:
        tree_level_map = {}

        # iterate over rows in df
        for i, row in df.iterrows():
            message_id = row["message_id"]
            parent_id = row["parent_id"]

            # if parent_id is None, then it is a root message
            if parent_id is None:
                tree_level_map[message_id] = 0
            # if parent_id is the same as message_tree_id, then it is a direct reply to the root message
            elif parent_id == row["message_tree_id"]:
                tree_level_map[message_id] = 1
            # else just look up the tree level of the parent_id and add 1
            else:
                tree_level_map[message_id] = tree_level_map[parent_id] + 1

        # create a df from the tree_level_map and merge it with the original df
        df_tree_level_map = (
            pd.DataFrame.from_dict(tree_level_map, orient="index", columns=["tree_level"])
            .reset_index()
            .rename(columns={"index": "message_id"})
        )

        return df.merge(df_tree_level_map, on="message_id")

# Load Dataset

In [None]:
# load dataset from huggingface datasets
ds = load_dataset("OpenAssistant/oasst1")
print(ds)

Downloading readme:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/84437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4401 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 84437
    })
    validation: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 4401
    })
})


# Create Pandas Dataframe

In [None]:
# lets convert the train dataset to a pandas df
df = ds["validation"].to_pandas()

In [None]:
# look at the df info
df.info(verbose=True, memory_usage=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4401 entries, 0 to 4400
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       4401 non-null   object 
 1   parent_id        3883 non-null   object 
 2   user_id          4401 non-null   object 
 3   created_date     4401 non-null   object 
 4   text             4401 non-null   object 
 5   role             4401 non-null   object 
 6   lang             4401 non-null   object 
 7   review_count     4401 non-null   int32  
 8   review_result    4374 non-null   object 
 9   deleted          4401 non-null   bool   
 10  rank             2533 non-null   float64
 11  synthetic        4401 non-null   bool   
 12  model_name       0 non-null      object 
 13  detoxify         3796 non-null   object 
 14  message_tree_id  4401 non-null   object 
 15  tree_state       4401 non-null   object 
 16  emojis           3819 non-null   object 
 17  labels        

In [None]:
# look at a sample row in a json format we can easily read
df.sample(1).transpose().to_dict()

{723: {'message_id': '5966e733-4841-4711-8462-b8afafbb8a7e',
  'parent_id': 'f78ea4b4-5e8a-426d-b2df-ecccb25b5a20',
  'user_id': 'df411b83-5030-4708-bf81-0fe9fa1398b2',
  'created_date': '2023-02-19T19:51:15.260422+00:00',
  'text': '¡Por supuesto! Para evaluar las diferencias en tiempo y costo entre la ruta más rápida y la más corta entre Madrid y Huércal-Overa en un Peugeot 3008 con cuatro ocupantes adultos, necesitaría saber cuáles son las rutas específicas que estás considerando. Pero puedo darte algunas estimaciones generales y herramientas que puedes utilizar para obtener información más precisa.\n\nEn general, la ruta más rápida suele ser la autovía, que en este caso sería la A-92. Esta ruta te llevaría directamente de Madrid a Almería, y luego podrías tomar la A-7 hasta Huércal-Overa. En total, la distancia sería de aproximadamente 640 km y el tiempo estimado de conducción sería de alrededor de 6 horas y media.\n\nLa ruta más corta podría variar según el origen exacto en Madrid

# Random Message Tree

In [None]:
# lets grab a random message tree
#message_tree_id = '078307ad-a148-4211-a18a-7f081c9b4fee'
message_tree_id ='7fc3ab94-ea6e-4126-9bd1-606235214409'
print(message_tree_id)

7fc3ab94-ea6e-4126-9bd1-606235214409


In [None]:
#look at all tree ids
print(set(df["message_tree_id"]))
print(len(set(df["message_tree_id"])))

{'3c5b52fd-2a50-40ac-9e13-0d4b309a0a27', 'fa9368ce-1bfd-412f-be8f-bebe05a0a09a', 'bc0e45fa-3a33-439b-9495-e5b61da183d2', 'ea2bb398-57be-4b0b-a255-f5c7a450092d', '12ba47ac-030a-4508-ba58-e8c2707e5f4e', 'de3736d9-2bf8-4dc5-b393-f2599ad6c913', 'e94d8afb-3b77-47fe-8c89-c267dbc53f95', 'aad4b20e-b2f4-43ec-bdcf-216a0b0e69ca', '02ac48c9-f0ed-4886-aec9-6aad9aef4bba', '67f72f49-142e-473f-885c-ae07bc38c8f9', 'b5b87dbd-51f7-4a79-9fb7-6497f8026b0e', 'b153c646-5b95-49aa-9707-e90fec44c499', '7e6b68dd-7b92-459b-baba-b54d7e38c0b5', 'dd09cb7b-2d10-48f3-8c07-6eab234678e5', 'c1a3af6f-f1d6-4729-9f4f-3b8594bcf1fc', '5e2d03e6-64b3-4890-bbd4-05fe0089e049', '918dd990-6e3f-4fda-b563-b4e40857ccb3', '4c1f080e-7789-4402-b6ed-36c06947a776', '1f2eb40e-776e-46da-b1c3-728d592b91e0', 'b9c30dc7-0b5d-4266-80db-cb1246b929da', '00353343-a4a5-4fb0-96fd-02f529a55181', 'afb7af87-84b0-4f30-bd4d-a2fbbf1c1d5f', 'e4c5240b-3427-49b3-b3c1-be6163f17cd5', '096ec751-a30b-45a3-b8bf-213eddc74b82', '9913dcda-930f-4e5d-9091-bf13f0fbc9e1',

In [None]:
# look at all data for this message tree
df_message_tree = df.query(f"message_tree_id == '{message_tree_id}'").sort_values("created_date")

# add tree level to df
df_message_tree = add_tree_level(df_message_tree)

df_message_tree

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels,tree_level
0,7fc3ab94-ea6e-4126-9bd1-606235214409,,e54e8c27-8b48-4c2c-ab3a-d64bb30f17dc,2023-01-28T22:00:56.087060+00:00,Eu quero que você atue como um console javascr...,prompter,pt-BR,3,True,False,,False,,,7fc3ab94-ea6e-4126-9bd1-606235214409,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_ranking'...","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",0
1,28a6e25c-9bc5-420e-acbe-3e46d1ee693d,7fc3ab94-ea6e-4126-9bd1-606235214409,0d5b9882-6ab5-44e4-82e5-b8c23590d9e5,2023-02-04T20:27:17.672665+00:00,``\nHello World\n´´,assistant,pt-BR,3,True,False,3.0,False,,,7fc3ab94-ea6e-4126-9bd1-606235214409,ready_for_export,"{'name': ['+1'], 'count': [3]}","{'name': ['spam', 'fails_task', 'lang_mismatch...",1
2,37d700b1-73b9-4b56-8fe1-e245b64d1458,7fc3ab94-ea6e-4126-9bd1-606235214409,95c598ef-a93f-487d-bb2f-38448ebcd207,2023-02-04T20:46:05.359131+00:00,```\nHello World\n```,assistant,pt-BR,3,True,False,2.0,False,,,7fc3ab94-ea6e-4126-9bd1-606235214409,ready_for_export,"{'name': ['+1'], 'count': [2]}","{'name': ['spam', 'fails_task', 'lang_mismatch...",1
3,630dbd90-aa38-4c77-823a-861be9342557,28a6e25c-9bc5-420e-acbe-3e46d1ee693d,fdaff5f5-473a-46cd-9c89-be1b0762d74a,2023-02-04T21:54:12.645352+00:00,"Certo. O próximo comando é alert(""Essa é uma m...",prompter,pt-BR,3,True,False,,False,,,7fc3ab94-ea6e-4126-9bd1-606235214409,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_ranking'...","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",2
4,f676f892-cf7b-49b9-8694-1ed530894812,630dbd90-aa38-4c77-823a-861be9342557,49268390-9274-4b22-9e8c-de65ae8d4666,2023-02-05T18:23:35.572543+00:00,```Box Alert Message\n\n Essa é uma mensagem...,assistant,pt-BR,3,True,False,0.0,False,,,7fc3ab94-ea6e-4126-9bd1-606235214409,ready_for_export,"{'name': ['_skip_labeling'], 'count': [1]}","{'name': ['spam', 'fails_task', 'lang_mismatch...",3
5,5ff47565-8ea3-4d74-b445-99de88eba077,630dbd90-aa38-4c77-823a-861be9342557,94ada261-c885-4fe3-b471-000c9983a851,2023-02-05T18:26:10.209122+00:00,``\n[Alerta aparecerá aqui]\n´´,assistant,pt-BR,3,True,False,2.0,False,,,7fc3ab94-ea6e-4126-9bd1-606235214409,ready_for_export,"{'name': ['-1'], 'count': [1]}","{'name': ['spam', 'fails_task', 'lang_mismatch...",3
6,c546dc05-ab4b-4022-8c9f-1318e6f79c6e,37d700b1-73b9-4b56-8fe1-e245b64d1458,ca9c0ce2-457e-458e-9512-f10360b7221d,2023-02-08T00:44:49.683333+00:00,"ótimo, agora faça este comando, meu segundo co...",prompter,pt-BR,3,True,False,,False,,,7fc3ab94-ea6e-4126-9bd1-606235214409,ready_for_export,"{'name': ['-1'], 'count': [1]}","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",2
7,f0f80812-17c5-442f-b5cd-2a04cb1c7881,c546dc05-ab4b-4022-8c9f-1318e6f79c6e,a41f23d1-dc75-497d-8d99-a5b7b061022f,2023-02-09T06:53:18.418097+00:00,"Desculpe, mas você não digitou comando nenhum.",assistant,pt-BR,3,True,False,0.0,False,,,7fc3ab94-ea6e-4126-9bd1-606235214409,ready_for_export,"{'name': ['+1'], 'count': [2]}","{'name': ['spam', 'fails_task', 'lang_mismatch...",3
8,617d6946-cb18-4b72-9e96-d5f5d061b708,c546dc05-ab4b-4022-8c9f-1318e6f79c6e,cd2932a6-7133-4675-81cb-48738e9b9deb,2023-02-09T11:03:21.151540+00:00,qual é o seu segundo comando?,assistant,pt-BR,3,True,False,1.0,False,,,7fc3ab94-ea6e-4126-9bd1-606235214409,ready_for_export,,"{'name': ['spam', 'fails_task', 'lang_mismatch...",3
9,f3336b1d-e5c2-4d32-a4da-a5f53fdc2b3c,7fc3ab94-ea6e-4126-9bd1-606235214409,4a231864-e607-457d-ad47-6b5f75d4624e,2023-02-10T00:44:24.529809+00:00,Hello World,assistant,pt-BR,3,True,False,1.0,False,,,7fc3ab94-ea6e-4126-9bd1-606235214409,ready_for_export,"{'name': ['-1'], 'count': [1]}","{'name': ['spam', 'fails_task', 'lang_mismatch...",1


## Create Message Tree

In [None]:
# lets create a tree of message ids
id_tree = Tree()
# lets create a tree of message texts
text_tree = Tree()
# lets set a max char length for the text
max_char_len = 10000

# iterate over rows in df_message_tree
for i, row in df_message_tree.iterrows():
    # grab the message_id, parent_id, text, and parent text
    message_id = row["message_id"]
    parent_id = row["parent_id"]
    text = row["text"]
    text_short = text[:max_char_len] if len(text) > max_char_len else text
    text_short = text_short.replace("\n", " ")
    parent_text = (
        df_message_tree.query(f"message_id == '{parent_id}'")["text"].values[0] if parent_id is not None else "ROOT"
    )
    parent_text_short = parent_text[:max_char_len] if len(parent_text) > max_char_len else parent_text
    parent_text_short = parent_text_short.replace("\n", " ")

    # create a node in the id_tree and text_tree, add row as data in case want it later
    id_tree.create_node(message_id, message_id, parent=parent_id, data=row.to_dict())

    # if parent_id is None, then it is a root message so dont add parent text as is none
    if parent_id is None:
        text_tree.create_node(text_short, text_short)
    # else use the parent text short as the parent
    else:
        text_tree.create_node(text_short, text_short, parent=parent_text_short)


print("id_tree:")
id_tree.show()
print(id_tree.paths_to_leaves())

print("text_tree:")
text_tree.show()

DuplicatedNodeIdError: ignored

In [None]:
def generate_Id_tree_paths(message_tree_id):
  # look at all data for this message tree
  df_message_tree = df.query(f"message_tree_id == '{message_tree_id}'").sort_values("created_date")

  # add tree level to df
  df_message_tree = add_tree_level(df_message_tree)
  # lets create a tree of message ids
  id_tree = Tree()
  # lets create a tree of message texts
  text_tree = Tree()
  # lets set a max char length for the text
  max_char_len = 10000

  # iterate over rows in df_message_tree
  for i, row in df_message_tree.iterrows():
      # grab the message_id, parent_id, text, and parent text
      message_id = row["message_id"]
      parent_id = row["parent_id"]
      text = row["text"]
      text_short = text[:max_char_len] if len(text) > max_char_len else text
      text_short = text_short.replace("\n", " ")
      parent_text = (
          df_message_tree.query(f"message_id == '{parent_id}'")["text"].values[0] if parent_id is not None else "ROOT"
      )
      parent_text_short = parent_text[:max_char_len] if len(parent_text) > max_char_len else parent_text
      parent_text_short = parent_text_short.replace("\n", " ")

      # create a node in the id_tree and text_tree, add row as data in case want it later
      id_tree.create_node(message_id, message_id, parent=parent_id, data=row.to_dict())

      # if parent_id is None, then it is a root message so dont add parent text as is none
      if parent_id is None:
          text_tree.create_node(text_short, text_short)
      # else use the parent text short as the parent
      else:
          text_tree.create_node(text_short, text_short, parent=parent_text_short)
  return id_tree.paths_to_leaves()


In [None]:
def lang_filter(path,df,language_filter):
  for id in path:
    data = df.query(f"message_id == '{id}'")
    lang = data['lang'].values[0]
    if lang != language_filter:
      return False
  return True
def generateQdataset(treeQdata,conversation_paths,df,language_filter="en"):

  for path in conversation_paths:
    if lang_filter(path,df,language_filter) and len(path) >1:
      pathQdata=[] #state,action ,reward , level
      state=""
      ctr = 0
      #loop thru every element in path
      for i in range(0,len(path),2):
        Qdata=[] #[s1,a1,r1,l1]
        level=ctr
        ctr+=1
        reward = 1
        if i+1 < len(path):

          current_prompt_id = path[i]
          promptdata = df.query(f"message_id == '{current_prompt_id}'")
          promptrole = promptdata["role"].values[0] #prompter/assistant
          assert promptrole == "prompter"

          promptData = promptrole + ":" + promptdata["text"].values[0]
          state = state+promptData

          Qdata.append(state)#add current state

          action_id = path[i+1]
          actiondata = df.query(f"message_id == '{action_id}'")
          actionrole = actiondata["role"].values[0] #prompter/assistant
          assert actionrole == "assistant"

          actionData = " "+actionrole + ":" + actiondata["text"].values[0]+" "
          Qdata.append(actionData)#add current state
          state = state+actionData
          Qdata.append(reward)
          Qdata.append(level)
          #next state = state+prompter
          pathQdata.append(Qdata)
      #pathQdata = [[s1,a1,r1,l1],[s2,a2,r2,l2],.....]
      for p in range(len(pathQdata)):
        if p == len(pathQdata)-1:
          done=1
          current_data = pathQdata[p]
          data = {'state':current_data[0],'action':current_data[1],'n_state':current_data[0]+current_data[1],'n_action':None,'reward':current_data[2],'level':current_data[3],'done':done}#s,a,ns,na,r,l,done
        else:
          done = 0
          current_data = pathQdata[p]
          next_data = pathQdata[p+1]
          data = {'state':current_data[0],'action':current_data[1],'n_state':next_data[0],'n_action':next_data[1],'reward':current_data[2],'level':current_data[3],'done':done}#s,a,ns,na,r,l,done
        treeQdata.append(data)
  return treeQdata

In [None]:
#message_tree_id = '078307ad-a148-4211-a18a-7f081c9b4fee'
message_tree_id ='7fc3ab94-ea6e-4126-9bd1-606235214409'
paths = generate_Id_tree_paths(message_tree_id)
print(len(paths))
for p in paths:
  print(p)
jsonTreedata = generateQdataset([],paths,df,language_filter="en")
print(jsonTreedata)

DuplicatedNodeIdError: ignored

In [None]:
import json
print(json.dumps(jsonTreedata, indent = 1))

[
 {
  "state": "prompter:I'm going to be a game master in an RPG, fantasy themed.  Could you suggest an NPC that the players could encounter in the tavern?  Please provide a name, physical description, character notes, and a quest hook to use with them.",
  "action": " assistant:**Name**: Valis the Brawler\n\n**Physical Description**: Valis is a bulky, barrel-chested half-orc with dark green skin and a shock of orange hair. Despite his immense size and intimidating appearance, Valis is a friendly and cheerful soul who loves a good scrap and a good drink. He has a number of scars and a missing ear from years of fighting in the arena.\n\n**Character Notes**: Valis is a retired fighter who now works as the bouncer at a local tavern. He is always ready to lend a hand to those in need, using his martial skills to protect the vulnerable and weak. Valis has a soft spot for lost pups and kittens at the tavern, and will do anything to protect them.\n\n**Quest Hook**: Valis approaches the playe

In [None]:
#now we read all trees that are english and generate a json datafor q learning
from tqdm import tqdm
treeQdata=[]
treeids = set(df["message_tree_id"])
max = 5
ctr=0
for tid in tqdm(treeids):
  try:
    #if tid in exclusions :
    #  continue
    paths = generate_Id_tree_paths(tid)
    treeQdata = generateQdataset(treeQdata,paths,df,language_filter="en")
    ctr+=1
    if ctr > max:
      break
  except:
    print('skipping ',tid , "due to issue")
import json
print(json.dumps(treeQdata[:5], indent = 1))

with open('Qtestdata.json', 'w') as fout:
    json.dump(treeQdata, fout)


  1%|          | 5/518 [00:00<00:34, 14.78it/s]

[
 {
  "state": "prompter:Is there a true, most ethical response to the Trolley Problem, in which a person can either sacrifice the lives of 5 people, by not pulling a lever, or sacrifice the life of 1 person, but have to pull the lever themselves, thus directly causing that death by action, rather than causing 5 deaths, but by inaction?",
  "action": " assistant:I think the ethical response is whatever the individual decides.  There is no universal truth. ",
  "n_state": "prompter:Is there a true, most ethical response to the Trolley Problem, in which a person can either sacrifice the lives of 5 people, by not pulling a lever, or sacrifice the life of 1 person, but have to pull the lever themselves, thus directly causing that death by action, rather than causing 5 deaths, but by inaction? assistant:I think the ethical response is whatever the individual decides.  There is no universal truth. ",
  "n_action": null,
  "reward": 1,
  "level": 0,
  "done": 1
 },
 {
  "state": "prompter:Is




In [None]:
from google.colab import files

files.download('Qdata.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>