In [20]:
import json, os

import re
from typing import List, Dict, Any
from datasets import Dataset, DatasetDict, load_dataset, concatenate_datasets
import pandas as pd

In [5]:
# from huggingface_hub import login 
# login()

In [22]:
def read_jsonl( file_path: str) -> List[Dict]:
        """Read JSONL file and return list of dictionaries"""
        data = []
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    try:
                        data.append(json.loads(line.strip()))
                    except json.JSONDecodeError as e:
                        print(f"Error parsing JSONL line: {e}")
                        continue
        except FileNotFoundError:
            print(f"File {file_path} not found!")
            return []
        return data



In [None]:
with open(""

In [13]:
frenc_instructions = read_jsonl("sources/french_intruct_train.jsonl")

In [14]:
len(frenc_instructions)

275600

In [15]:
good_format = [ {"messages": [i]} for i in frenc_instructions]

In [16]:
good_format[:2]

[{'messages': [[{'role': 'user',
     'text': "Traduisez la déduction suivante en notation logique: Si Elisabeth était musicienne, alors Charles joue au tennis. Pour tout x, si x_12 cuisine, alors x_12 n'est pas un avocat. Pour tout x, x_12 est un avocat. Charles joue au tennis si et seulement si Barbara cuisine. Elisabeth n'est donc pas musicienne."},
    {'role': 'assistant',
     'text': 'p -> q. pour tout x: P(x) -> ~Q(x). Pour tout x: Q ((x). Le groupe de travail a été constitué de: Par conséquent ~p.'}]]},
 {'messages': [[{'role': 'user',
     'text': "Allons-nous un jour manquer de musique ? En raison d'un nombre restreint de sons (quoique important), allons-nous manquer de musique originale ? Explique comme si j'avais cinq ans."},
    {'role': 'assistant',
     'text': 'Cela dépend de la façon dont vous le regardez. Les combinaisons possibles de notes mélodiques, de rythmes, d\'harmonisations, d\'arrangements, etc. sont mathématiquement illimitées, pour toutes les signification

In [19]:
save_file("script/french_intruct_train.jsonl", good_format)

In [23]:
def save_file(path, sequences):
    with open(path, "w", encoding="utf-8") as file:
        for conversation in sequences:
            json.dump(conversation, file, ensure_ascii=False)
            file.write("\n")

In [8]:
from datasets import load_dataset

dataset = load_dataset("angeluriot/french_instruct", split="train")

In [9]:
dataset

Dataset({
    features: ['context', 'conversation', 'author', 'style', 'code', 'source'],
    num_rows: 275600
})

In [10]:
dataset[0]

{'context': '',
 'conversation': [{'role': 'user',
   'text': "Traduisez la déduction suivante en notation logique: Si Elisabeth était musicienne, alors Charles joue au tennis. Pour tout x, si x_12 cuisine, alors x_12 n'est pas un avocat. Pour tout x, x_12 est un avocat. Charles joue au tennis si et seulement si Barbara cuisine. Elisabeth n'est donc pas musicienne."},
  {'role': 'assistant',
   'text': 'p -> q. pour tout x: P(x) -> ~Q(x). Pour tout x: Q ((x). Le groupe de travail a été constitué de: Par conséquent ~p.'}],
 'author': 'chatbot',
 'style': 'chatbot',
 'code': False,
 'source': 'Logic Inference OA (translated)'}

In [11]:
conversations = dataset["conversation"]

In [17]:
test_load = read_jsonl("sources/french_intruct_train.jsonl")

In [22]:
test_load[:2]

[[{'role': 'user',
   'text': "Traduisez la déduction suivante en notation logique: Si Elisabeth était musicienne, alors Charles joue au tennis. Pour tout x, si x_12 cuisine, alors x_12 n'est pas un avocat. Pour tout x, x_12 est un avocat. Charles joue au tennis si et seulement si Barbara cuisine. Elisabeth n'est donc pas musicienne."},
  {'role': 'assistant',
   'text': 'p -> q. pour tout x: P(x) -> ~Q(x). Pour tout x: Q ((x). Le groupe de travail a été constitué de: Par conséquent ~p.'}],
 [{'role': 'user',
   'text': "Allons-nous un jour manquer de musique ? En raison d'un nombre restreint de sons (quoique important), allons-nous manquer de musique originale ? Explique comme si j'avais cinq ans."},
  {'role': 'assistant',
   'text': 'Cela dépend de la façon dont vous le regardez. Les combinaisons possibles de notes mélodiques, de rythmes, d\'harmonisations, d\'arrangements, etc. sont mathématiquement illimitées, pour toutes les significations pratiques du mot "illimité". Néanmoins .

In [23]:
dataset = load_dataset("MemGPT/MSC-Self-Instruct", split="train")

README.md:   0%|          | 0.00/492 [00:00<?, ?B/s]

msc_self_instruct.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/500 [00:00<?, ? examples/s]

In [24]:
dataset

Dataset({
    features: ['personas', 'dialog', 'metadata', 'previous_dialogs', 'init_personas', 'personas_update1', 'personas_update2', 'self_instruct', 'summary_speaker_1', 'summary_speaker_2'],
    num_rows: 500
})

In [26]:
dialogs = dataset["dialog"]

In [28]:
dialogs[:2]

[[{'text': 'I won my fight today.',
   'id': 'Speaker 1',
   'convai2_id': 'valid_1',
   'rating': None},
  {'text': "nice! do you feel better? I know you've been losing some fights recently.",
   'id': 'Speaker 2',
   'convai2_id': 'valid_1',
   'rating': None},
  {'text': 'I am. I am really trying to learn from those loses, and it seems to be paying off in practices. AND TODAY! What a fight!!!',
   'id': 'Speaker 1',
   'convai2_id': 'valid_1',
   'rating': None},
  {'text': "That's exciting! Was it a close fight or did you dominate haha",
   'id': 'Speaker 2',
   'convai2_id': 'valid_1',
   'rating': None},
  {'text': "You had to ask, didn't you?! I jest. But it was very close. I was worried I was going to lose at first, but I just stayed focused and in the moment. It really is all about trusting your instincts and not getting in your own way. ",
   'id': 'Speaker 1',
   'convai2_id': 'valid_1',
   'rating': None},
  {'text': "That is true.  A win's a win.  I'm glad you didn't get i

In [5]:
def convert_to_openai_format(dialogs):
    result = []
    
    for dialog in dialogs:
        messages = []
        
        for message in dialog:
            role = "user" if message['id'] == 'Speaker 1' else "assistant"
            messages.append({
                "role": role,
                "content": message['text']
            })
        
        result.append({"messages": messages})
    
    return result


In [30]:
openai_format = convert_to_openai_format(dialogs)

In [31]:
openai_format[:2]

[{'messages': [{'role': 'user', 'content': 'I won my fight today.'},
   {'role': 'assistant',
    'content': "nice! do you feel better? I know you've been losing some fights recently."},
   {'role': 'user',
    'content': 'I am. I am really trying to learn from those loses, and it seems to be paying off in practices. AND TODAY! What a fight!!!'},
   {'role': 'assistant',
    'content': "That's exciting! Was it a close fight or did you dominate haha"},
   {'role': 'user',
    'content': "You had to ask, didn't you?! I jest. But it was very close. I was worried I was going to lose at first, but I just stayed focused and in the moment. It really is all about trusting your instincts and not getting in your own way. "},
   {'role': 'assistant',
    'content': "That is true.  A win's a win.  I'm glad you didn't get into your own head when it was close... maybe you'll be able to get some more wins after this."},
   {'role': 'user', 'content': 'I sure hope so. This fight paid $5,000!'},
   {'r

In [33]:
save_file("sources/MSC-Self-Instruct.json", openai_format)

In [34]:
test_data = read_jsonl("sources/MSC-Self-Instruct.json")

In [35]:
test_data[0]

{'messages': [{'role': 'user', 'content': 'I won my fight today.'},
  {'role': 'assistant',
   'content': "nice! do you feel better? I know you've been losing some fights recently."},
  {'role': 'user',
   'content': 'I am. I am really trying to learn from those loses, and it seems to be paying off in practices. AND TODAY! What a fight!!!'},
  {'role': 'assistant',
   'content': "That's exciting! Was it a close fight or did you dominate haha"},
  {'role': 'user',
   'content': "You had to ask, didn't you?! I jest. But it was very close. I was worried I was going to lose at first, but I just stayed focused and in the moment. It really is all about trusting your instincts and not getting in your own way. "},
  {'role': 'assistant',
   'content': "That is true.  A win's a win.  I'm glad you didn't get into your own head when it was close... maybe you'll be able to get some more wins after this."},
  {'role': 'user', 'content': 'I sure hope so. This fight paid $5,000!'},
  {'role': 'assist

In [38]:
dataset = load_dataset("goendalf666/sales-conversations-instruction-base", split="train")

In [39]:
dataset

Dataset({
    features: ['0'],
    num_rows: 20940
})

In [41]:
dataset[:4]

{'0': ['You are a in the role of a Salesman. Here is a conversation:\n                Customer: Hi, Im interested in purchasing a new smartphone. Can you help me choose the best one? \n\n                Answer as a Salesman to the previous Statement to convince the person to buy the product or service.\n                Salesman: Of course, Id be happy to assist you. May I ask what features are most important to you in a smartphone?',
  'You are a in the role of a Salesman. Here is a conversation:\n                Customer: Hi, Im interested in purchasing a new smartphone. Can you help me choose the best one? Salesman: Of course, Id be happy to assist you. May I ask what features are most important to you in a smartphone? Customer: Im looking for a laptop that can handle graphic design software. Any recommendations? \n\n                Answer as a Salesman to the previous Statement to convince the person to buy the product or service.\n                Salesman: Absolutely! In order to f

In [48]:
import re

def extract_system_prompt(text):
    """Extract system prompt from the beginning of the conversation"""
    match = re.search(r'You are a in the role of a (.+?)\. Here is a conversation:', text)
    if match:
        role = match.group(1)
        return f"You are in the role of a {role}."
    return "You are in the role of a Salesman."

def parse_conversation(text):
    """Parse conversation text into customer and salesman messages"""
    # Split by Customer: and Salesman: patterns
    parts = re.split(r'(Customer:|Salesman:)', text)
    
    messages = []
    current_role = None
    
    for part in parts:
        part = part.strip()
        if part == 'Customer:':
            current_role = 'user'
        elif part == 'Salesman:':
            current_role = 'assistant'
        elif part and current_role:
            # Remove instruction text and clean up content
            content = re.sub(r'\s*Answer as a Salesman.*?service\.', '', part)
            content = content.replace('\n', ' ').strip()
            if content:
                messages.append({
                    "role": current_role,
                    "content": content
                })
    
    return messages

def convert_salesman_to_openai_format(dataset):
    """Convert salesman dataset to OpenAI format (no system prompt)"""
    result = []
    
    for key, conversations in dataset.items():
        for conv_text in conversations:
            # Parse the conversation
            messages = parse_conversation(conv_text)
            
            if messages:
                result.append({"messages": messages})
    
    return result


In [49]:
dataset

Dataset({
    features: ['0'],
    num_rows: 20940
})

In [50]:
openai_format = convert_salesman_to_openai_format(dataset[:])

In [51]:
openai_format[:3]

[{'messages': [{'role': 'user',
    'content': 'Hi, Im interested in purchasing a new smartphone. Can you help me choose the best one?'},
   {'role': 'assistant',
    'content': 'Of course, Id be happy to assist you. May I ask what features are most important to you in a smartphone?'}]},
 {'messages': [{'role': 'user',
    'content': 'Hi, Im interested in purchasing a new smartphone. Can you help me choose the best one?'},
   {'role': 'assistant',
    'content': 'Of course, Id be happy to assist you. May I ask what features are most important to you in a smartphone?'},
   {'role': 'user',
    'content': 'Im looking for a laptop that can handle graphic design software. Any recommendations?'},
   {'role': 'assistant',
    'content': 'Absolutely! In order to find the best laptop for your needs, could you provide me with more details about the specific graphic design software youll be using?'}]},
 {'messages': [{'role': 'user',
    'content': 'Hi, Im interested in purchasing a new smartpho

In [52]:
save_file("sources/sales-conversations-instruction-base.jsonl", openai_format)

In [53]:
test_data = read_jsonl("sources/sales-conversations-instruction-base.jsonl")

In [54]:
test_data[:3]

[{'messages': [{'role': 'user',
    'content': 'Hi, Im interested in purchasing a new smartphone. Can you help me choose the best one?'},
   {'role': 'assistant',
    'content': 'Of course, Id be happy to assist you. May I ask what features are most important to you in a smartphone?'}]},
 {'messages': [{'role': 'user',
    'content': 'Hi, Im interested in purchasing a new smartphone. Can you help me choose the best one?'},
   {'role': 'assistant',
    'content': 'Of course, Id be happy to assist you. May I ask what features are most important to you in a smartphone?'},
   {'role': 'user',
    'content': 'Im looking for a laptop that can handle graphic design software. Any recommendations?'},
   {'role': 'assistant',
    'content': 'Absolutely! In order to find the best laptop for your needs, could you provide me with more details about the specific graphic design software youll be using?'}]},
 {'messages': [{'role': 'user',
    'content': 'Hi, Im interested in purchasing a new smartpho

In [83]:
dataset = load_dataset("HuggingFaceH4/instruction-dataset", split="test")

In [84]:
dataset

Dataset({
    features: ['prompt', 'completion', 'meta'],
    num_rows: 327
})

In [85]:
metas = dataset["meta"]

In [86]:
metas[:3]

[{'id': 0,
  'motivation_app': None,
  'prompt': 'Arianna has 12 chocolates more than Danny. Danny has 6 chocolates more than Robbie. Arianna has twice as many chocolates as Robbie has. How many chocolates does Danny have?',
  'input': None,
  'completion': 'Denote the number of chocolates each person has by the letter of their first name. We know that\nA = D + 12\nD = R + 6\nA = 2 * R\n\nThus, A = (R + 6) + 12 = R + 18\nSince also A = 2 * R, this means 2 * R = R + 18\nHence R = 18\nHence D = 18 + 6 = 24',
  'source': 'surge',
  'category': 'Question Answering',
  'subcategory': 'Math'},
 {'id': 1,
  'motivation_app': None,
  'prompt': 'Write a plot summary for a comedic novel involving Elon Musk and sea travel.',
  'input': None,
  'completion': 'Elon Musk hires a team of experts to build the ultimate yacht, but when the yacht is completed, he realizes that he has no idea how to sail it. With the help of a quirky crew and a fearless captain, the playboy embarks on a wild and hilarious

In [6]:
def convert_prompt_completion_to_openai_format(dataset):
    result = []
    
    for item in dataset:
        messages = [
            {"role": "user", "content": item["prompt"]},
            {"role": "assistant", "content": item["completion"]}
        ]
        
        result.append({"messages": messages})
    
    return result

In [89]:
opena_format = convert_prompt_completion_to_openai_format(metas)

In [90]:
opena_format[:4]

[{'messages': [{'role': 'user',
    'content': 'Arianna has 12 chocolates more than Danny. Danny has 6 chocolates more than Robbie. Arianna has twice as many chocolates as Robbie has. How many chocolates does Danny have?'},
   {'role': 'assistant',
    'content': 'Denote the number of chocolates each person has by the letter of their first name. We know that\nA = D + 12\nD = R + 6\nA = 2 * R\n\nThus, A = (R + 6) + 12 = R + 18\nSince also A = 2 * R, this means 2 * R = R + 18\nHence R = 18\nHence D = 18 + 6 = 24'}]},
 {'messages': [{'role': 'user',
    'content': 'Write a plot summary for a comedic novel involving Elon Musk and sea travel.'},
   {'role': 'assistant',
    'content': 'Elon Musk hires a team of experts to build the ultimate yacht, but when the yacht is completed, he realizes that he has no idea how to sail it. With the help of a quirky crew and a fearless captain, the playboy embarks on a wild and hilarious adventure across the open seas, where the crew have to keep Elon al

In [63]:
save_file("sources/HuggingFaceH4_instruction-dataset.jsonl",opena_format )

In [68]:
test_data = read_jsonl("sources/HuggingFaceH4_instruction-dataset.jsonl")

In [69]:
test_data[:2]

[{'messages': [{'role': 'user',
    'content': 'Arianna has 12 chocolates more than Danny. Danny has 6 chocolates more than Robbie. Arianna has twice as many chocolates as Robbie has. How many chocolates does Danny have?'},
   {'role': 'assistant',
    'content': 'Denote the number of chocolates each person has by the letter of their first name. We know that\nA = D + 12\nD = R + 6\nA = 2 * R\n\nThus, A = (R + 6) + 12 = R + 18\nSince also A = 2 * R, this means 2 * R = R + 18\nHence R = 18\nHence D = 18 + 6 = 24'}]},
 {'messages': [{'role': 'user',
    'content': 'Write a plot summary for a comedic novel involving Elon Musk and sea travel.'},
   {'role': 'assistant',
    'content': 'Elon Musk hires a team of experts to build the ultimate yacht, but when the yacht is completed, he realizes that he has no idea how to sail it. With the help of a quirky crew and a fearless captain, the playboy embarks on a wild and hilarious adventure across the open seas, where the crew have to keep Elon al

In [70]:
dataset = load_dataset("HuggingFaceTB/instruct-data-basics-smollm-H4")

In [71]:
dataset

DatasetDict({
    train_sft: Dataset({
        features: ['instruction', 'response', 'messages'],
        num_rows: 728
    })
    test_sft: Dataset({
        features: ['instruction', 'response', 'messages'],
        num_rows: 39
    })
})

In [76]:
train = dataset["train_sft"]

In [77]:
train

Dataset({
    features: ['instruction', 'response', 'messages'],
    num_rows: 728
})

In [78]:
test = dataset["test_sft"]

In [116]:
train_messagges = train["messages"]


In [119]:
len(train_messagges)

728

In [118]:
train_messagges[:3]

[[{'content': 'Hi!', 'role': 'user'},
  {'content': "Hello! It's a pleasure to meet you. What would you like to chat about?",
   'role': 'assistant'}],
 [{'content': 'Who are you?', 'role': 'user'},
  {'content': "I'm an AI language model known as SmolLM. Hugging Face created me to be a helpful digital assistant.",
   'role': 'assistant'}],
 [{'content': 'Greetings', 'role': 'user'},
  {'content': 'Hello and welcome! What would you like to explore?',
   'role': 'assistant'}]]

In [114]:
test_message = test["messages"]

In [104]:
len(test_message)

39

In [82]:
test_message[:3]

[[{'content': 'Who are you?', 'role': 'user'},
  {'content': "I'm an AI model named SmolLM, created by Hugging Face to assist with various queries and tasks.",
   'role': 'assistant'}],
 [{'content': "What's your identity?", 'role': 'user'},
  {'content': "Hello! I'm SmolLM, a small AI assistant created by Hugging Face. My purpose is to help users like you with various tasks and questions.",
   'role': 'assistant'}],
 [{'content': 'Hi!', 'role': 'user'},
  {'content': 'Hello', 'role': 'assistant'}]]

In [105]:
test_message[:].extend(train_messagges[:])

In [112]:
type(train_messagges)

datasets.arrow_dataset.Column

In [120]:
train_messagges[:4]

[[{'content': 'Hi!', 'role': 'user'},
  {'content': "Hello! It's a pleasure to meet you. What would you like to chat about?",
   'role': 'assistant'}],
 [{'content': 'Who are you?', 'role': 'user'},
  {'content': "I'm an AI language model known as SmolLM. Hugging Face created me to be a helpful digital assistant.",
   'role': 'assistant'}],
 [{'content': 'Greetings', 'role': 'user'},
  {'content': 'Hello and welcome! What would you like to explore?',
   'role': 'assistant'}],
 [{'content': 'Good morning', 'role': 'user'},
  {'content': 'Welcome aboard! How can I make your day easier?',
   'role': 'assistant'}]]

In [7]:
def convert_nested_to_openai_format(conversations):
    result = []
    for conversation in conversations:
        result.append({"messages": conversation})
    
    return result

In [123]:
train_messages = convert_nested_to_openai_format(train_messagges)

In [125]:
train_messages[:3]

[{'messages': [{'content': 'Hi!', 'role': 'user'},
   {'content': "Hello! It's a pleasure to meet you. What would you like to chat about?",
    'role': 'assistant'}]},
 {'messages': [{'content': 'Who are you?', 'role': 'user'},
   {'content': "I'm an AI language model known as SmolLM. Hugging Face created me to be a helpful digital assistant.",
    'role': 'assistant'}]},
 {'messages': [{'content': 'Greetings', 'role': 'user'},
   {'content': 'Hello and welcome! What would you like to explore?',
    'role': 'assistant'}]}]

In [126]:
test_messages = convert_nested_to_openai_format(test_message)

In [128]:
test_messages[:4]

[{'messages': [{'content': 'Who are you?', 'role': 'user'},
   {'content': "I'm an AI model named SmolLM, created by Hugging Face to assist with various queries and tasks.",
    'role': 'assistant'}]},
 {'messages': [{'content': "What's your identity?", 'role': 'user'},
   {'content': "Hello! I'm SmolLM, a small AI assistant created by Hugging Face. My purpose is to help users like you with various tasks and questions.",
    'role': 'assistant'}]},
 {'messages': [{'content': 'Hi!', 'role': 'user'},
   {'content': 'Hello', 'role': 'assistant'}]},
 {'messages': [{'content': 'What are you?', 'role': 'user'},
   {'content': "I'm a small AI called SmolLM, brought to life by the team at Hugging Face. I'm here to help you with various tasks and queries.",
    'role': 'assistant'}]}]

In [134]:
len(train_messages)

767

In [135]:
len(test_messages)

39

In [132]:
all_messages = train_messages + test_messages

In [133]:
all_messages

{'messages': [{'content': 'Hi!', 'role': 'user'},
  {'content': "Hello! It's a pleasure to meet you. What would you like to chat about?",
   'role': 'assistant'}]}

In [136]:
len(all_messages)

806

In [137]:
save_file("sources/instruct-data-basics-smollm-H4.jsonl", all_messages)

In [138]:
test_data = read_jsonl("sources/instruct-data-basics-smollm-H4.jsonl")

In [139]:
test_data[:3]

[{'messages': [{'content': 'Hi!', 'role': 'user'},
   {'content': "Hello! It's a pleasure to meet you. What would you like to chat about?",
    'role': 'assistant'}]},
 {'messages': [{'content': 'Who are you?', 'role': 'user'},
   {'content': "I'm an AI language model known as SmolLM. Hugging Face created me to be a helpful digital assistant.",
    'role': 'assistant'}]},
 {'messages': [{'content': 'Greetings', 'role': 'user'},
   {'content': 'Hello and welcome! What would you like to explore?',
    'role': 'assistant'}]}]

In [142]:
dataset = load_dataset("lmsys/lmsys-chat-1m")

data/train-00000-of-00006-4feeb3f83346a0(…):   0%|          | 0.00/249M [00:00<?, ?B/s]

data/train-00001-of-00006-4030672591c2f4(…):   0%|          | 0.00/247M [00:00<?, ?B/s]

data/train-00002-of-00006-1779b7cec94621(…):   0%|          | 0.00/250M [00:00<?, ?B/s]

data/train-00003-of-00006-2fa862bfed56af(…):   0%|          | 0.00/247M [00:00<?, ?B/s]

data/train-00004-of-00006-18f4bdd50c103e(…):   0%|          | 0.00/246M [00:00<?, ?B/s]

data/train-00005-of-00006-fe1acc5d10a9f0(…):   0%|          | 0.00/249M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

In [145]:
conversation = dataset["train"]

In [146]:
conversation[0]

{'conversation_id': '33f01939a744455c869cb234afca47f1',
 'model': 'wizardlm-13b',
 'conversation': [{'content': 'how can identity protection services help protect me against identity theft',
   'role': 'user'},
  {'content': "Identity protection services can help protect you against identity theft in several ways:\n\n1. Monitoring: Many identity protection services monitor your credit reports, public records, and other sources for signs of identity theft. If they detect any suspicious activity, they will alert you so you can take action.\n2. Credit freeze: Some identity protection services can help you freeze your credit, which makes it more difficult for thieves to open new accounts in your name.\n3. Identity theft insurance: Some identity protection services offer insurance that can help you recover financially if you become a victim of identity theft.\n4. Assistance: Many identity protection services offer assistance if you become a victim of identity theft. They can help you file a

In [149]:
filtered_conversations = [
    item["conversation"]
    for item in conversation
    if item.get("language", "").lower() in ["english", "french"]
]

In [150]:
len(filtered_conversations)

792592

In [153]:
filtered_conversations[:2]

[[{'content': 'how can identity protection services help protect me against identity theft',
   'role': 'user'},
  {'content': "Identity protection services can help protect you against identity theft in several ways:\n\n1. Monitoring: Many identity protection services monitor your credit reports, public records, and other sources for signs of identity theft. If they detect any suspicious activity, they will alert you so you can take action.\n2. Credit freeze: Some identity protection services can help you freeze your credit, which makes it more difficult for thieves to open new accounts in your name.\n3. Identity theft insurance: Some identity protection services offer insurance that can help you recover financially if you become a victim of identity theft.\n4. Assistance: Many identity protection services offer assistance if you become a victim of identity theft. They can help you file a police report, contact credit bureaus, and other steps to help you restore your identity.\n\nOver

In [154]:
all_conversations =  convert_nested_to_openai_format(filtered_conversations)

In [155]:
all_conversations[0]

{'messages': [{'content': 'how can identity protection services help protect me against identity theft',
   'role': 'user'},
  {'content': "Identity protection services can help protect you against identity theft in several ways:\n\n1. Monitoring: Many identity protection services monitor your credit reports, public records, and other sources for signs of identity theft. If they detect any suspicious activity, they will alert you so you can take action.\n2. Credit freeze: Some identity protection services can help you freeze your credit, which makes it more difficult for thieves to open new accounts in your name.\n3. Identity theft insurance: Some identity protection services offer insurance that can help you recover financially if you become a victim of identity theft.\n4. Assistance: Many identity protection services offer assistance if you become a victim of identity theft. They can help you file a police report, contact credit bureaus, and other steps to help you restore your ident

In [157]:
save_file("sources/lmsys-chat-1m.jsonl", all_conversations)

In [158]:
test_data = read_jsonl("sources/lmsys-chat-1m.jsonl")

In [159]:
test_data[:2]

[{'messages': [{'content': 'how can identity protection services help protect me against identity theft',
    'role': 'user'},
   {'content': "Identity protection services can help protect you against identity theft in several ways:\n\n1. Monitoring: Many identity protection services monitor your credit reports, public records, and other sources for signs of identity theft. If they detect any suspicious activity, they will alert you so you can take action.\n2. Credit freeze: Some identity protection services can help you freeze your credit, which makes it more difficult for thieves to open new accounts in your name.\n3. Identity theft insurance: Some identity protection services offer insurance that can help you recover financially if you become a victim of identity theft.\n4. Assistance: Many identity protection services offer assistance if you become a victim of identity theft. They can help you file a police report, contact credit bureaus, and other steps to help you restore your id

In [160]:
dataset = load_dataset("OpenAssistant/oasst1")

README.md: 0.00B [00:00, ?B/s]

(…)-00000-of-00001-b42a775f407cee45.parquet:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

(…)-00000-of-00001-134b8fd0c89408b6.parquet:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/84437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4401 [00:00<?, ? examples/s]

In [165]:
dataset

DatasetDict({
    train: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 84437
    })
    validation: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 4401
    })
})

In [163]:
dataset["train"][:3]

{'message_id': ['6ab24d72-0181-4594-a9cd-deaf170242fb',
  'c8e83833-ecbc-44fe-b6db-735228c25a1c',
  '6708c47f-05c9-4346-b3d2-40b2bd24fde4'],
 'parent_id': [None,
  '6ab24d72-0181-4594-a9cd-deaf170242fb',
  'c8e83833-ecbc-44fe-b6db-735228c25a1c'],
 'user_id': ['c3fe8c76-fc30-4fa7-b7f8-c492f5967d18',
  '2c96e467-66f0-4be7-9693-bda51356a424',
  '2c96e467-66f0-4be7-9693-bda51356a424'],
 'created_date': ['2023-02-05T14:23:50.983374+00:00',
  '2023-02-06T13:50:44.657083+00:00',
  '2023-02-06T18:48:49.391686+00:00'],
 'text': ['Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.',
  '"Monopsony" refers to a market structure where there is only one buyer for a particular good or service. In economics, this term is particularly relevant in the labor market, where a monopsony employer has significant power over the wages and working conditions of their em

In [170]:
def build_conversation_tree(dataset):
    """Build conversation trees from the dataset (English and French only)"""
    # Group messages by tree_id
    trees = {}
    
    for i in range(len(dataset['message_id'])):
        # Filter by language - only English and French
        lang = dataset['lang'][i]
        if lang not in ['en', 'fr']:
            continue
            
        tree_id = dataset['message_tree_id'][i]
        message_id = dataset['message_id'][i]
        parent_id = dataset['parent_id'][i]
        
        message = {
            'message_id': message_id,
            'parent_id': parent_id,
            'text': dataset['text'][i],
            'role': dataset['role'][i],
            'created_date': dataset['created_date'][i],
            'lang': lang
        }
        
        if tree_id not in trees:
            trees[tree_id] = {}
        
        trees[tree_id][message_id] = message
    
    return trees

def extract_conversation_paths(tree):
    """Extract all conversation paths from a tree"""
    # Find root message (parent_id is None)
    root = None
    for msg_id, msg in tree.items():
        if msg['parent_id'] is None:
            root = msg
            break
    
    if not root:
        return []
    
    def build_path(current_msg, current_path):
        """Recursively build conversation paths"""
        current_path = current_path + [current_msg]
        
        # Find children of current message
        children = [msg for msg in tree.values() 
                   if msg['parent_id'] == current_msg['message_id']]
        
        if not children:
            # End of path, return the conversation
            return [current_path]
        
        # Continue with each child (this handles branching conversations)
        all_paths = []
        for child in children:
            child_paths = build_path(child, current_path)
            all_paths.extend(child_paths)
        
        return all_paths
    
    return build_path(root, [])

def convert_tree_to_openai_format(dataset):
    """Convert tree-based conversation dataset to OpenAI format"""
    trees = build_conversation_tree(dataset)
    result = []
    
    for tree_id, tree in trees.items():
        conversation_paths = extract_conversation_paths(tree)
        
        for path in conversation_paths:
            messages = []
            for msg in path:
                # Map roles: prompter -> user, assistant -> assistant
                role = "user" if msg['role'] == 'prompter' else "assistant"
                messages.append({
                    "role": role,
                    "content": msg['text']
                })
            
            # Only add conversations with at least 2 messages
            if len(messages) >= 2:
                result.append({"messages": messages})
    
    return result


In [171]:
train_messages = convert_tree_to_openai_format(dataset["validation"])

In [172]:
train_messages[:3]

[{'messages': [{'role': 'user', 'content': 'What do you think about ChatGPT?'},
   {'role': 'assistant',
    'content': "As an open source alternative to ChatGPT, I do not have personal opinions. However, I can provide objective information about ChatGPT's capabilities and limitations based on its architecture and training data.\n\nChatGPT is a powerful language model based on the GPT (Generative Pre-trained Transformer) architecture developed by OpenAI. It has been trained on a large corpus of text data and is capable of generating human-like responses to a wide variety of prompts and questions. ChatGPT's ability to generate coherent and contextually relevant responses is based on its ability to understand the structure of language and make predictions about what words and phrases are likely to follow in a given context.\n\nHowever, like all language models, ChatGPT has limitations. Its responses are based solely on the patterns it has learned from its training data and it may generat

In [173]:
test_messages = convert_tree_to_openai_format(dataset["train"])

In [174]:
len(test_messages)

21304

In [175]:
len(train_messages)

1080

In [176]:
all_messages = test_messages + train_messages

In [177]:
all_messages[:2]

[{'messages': [{'role': 'user',
    'content': 'Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.'},
   {'role': 'assistant',
    'content': '"Monopsony" refers to a market structure where there is only one buyer for a particular good or service. In economics, this term is particularly relevant in the labor market, where a monopsony employer has significant power over the wages and working conditions of their employees. The presence of a monopsony can result in lower wages and reduced employment opportunities for workers, as the employer has little incentive to increase wages or provide better working conditions.\n\nRecent research has identified potential monopsonies in industries such as retail and fast food, where a few large companies control a significant portion of the market (Bivens & Mishel, 2013). In these industries, workers often fa

In [178]:
save_file("sources/oasst1.jsonl", all_messages)

In [179]:
dataset = load_dataset("HuggingFaceTB/everyday-conversations-llama3.1-2k")

README.md: 0.00B [00:00, ?B/s]

data/train_sft-00000-of-00001.parquet:   0%|          | 0.00/2.05M [00:00<?, ?B/s]

data/test_sft-00000-of-00001.parquet:   0%|          | 0.00/124k [00:00<?, ?B/s]

Generating train_sft split:   0%|          | 0/2260 [00:00<?, ? examples/s]

Generating test_sft split:   0%|          | 0/119 [00:00<?, ? examples/s]

In [180]:
dataset

DatasetDict({
    train_sft: Dataset({
        features: ['topic', 'subtopic', 'subsubtopic', 'full_topic', 'prompt', 'completion', 'token_length', 'messages'],
        num_rows: 2260
    })
    test_sft: Dataset({
        features: ['topic', 'subtopic', 'subsubtopic', 'full_topic', 'prompt', 'completion', 'token_length', 'messages'],
        num_rows: 119
    })
})

In [195]:
train_conversation = dataset["train_sft"]["messages"]
test_conversations = dataset["test_sft"]["messages"]

In [190]:
train_conversation[0]

[{'content': 'Hey!', 'role': 'user'},
 {'content': 'Hello! How can I help you today?', 'role': 'assistant'},
 {'content': "I'm trying to track my expenses. Can you help me with that?",
  'role': 'user'},
 {'content': 'Yes, I can help you track your expenses. You can start by telling me your income and fixed expenses, such as rent and utilities.',
  'role': 'assistant'},
 {'content': "That sounds easy. How do I know if I'm staying within my budget?",
  'role': 'user'},
 {'content': "Once you've entered your income and expenses, I can help you set a budget and alert you when you're going over. You can also categorize your spending to see where your money is going.",
  'role': 'assistant'},
 {'content': 'Okay, that sounds great. Can you remind me to review my budget regularly?',
  'role': 'user'},
 {'content': 'I can send you reminders to review your budget weekly, monthly, or at any interval you prefer.',
  'role': 'assistant'}]

In [196]:
train_conversation = convert_nested_to_openai_format(train_conversation)
test_conversation = convert_nested_to_openai_format(test_conversations)

In [197]:
train_conversation[0]

{'messages': [{'content': 'Hey!', 'role': 'user'},
  {'content': 'Hello! How can I help you today?', 'role': 'assistant'},
  {'content': "I'm trying to track my expenses. Can you help me with that?",
   'role': 'user'},
  {'content': 'Yes, I can help you track your expenses. You can start by telling me your income and fixed expenses, such as rent and utilities.',
   'role': 'assistant'},
  {'content': "That sounds easy. How do I know if I'm staying within my budget?",
   'role': 'user'},
  {'content': "Once you've entered your income and expenses, I can help you set a budget and alert you when you're going over. You can also categorize your spending to see where your money is going.",
   'role': 'assistant'},
  {'content': 'Okay, that sounds great. Can you remind me to review my budget regularly?',
   'role': 'user'},
  {'content': 'I can send you reminders to review your budget weekly, monthly, or at any interval you prefer.',
   'role': 'assistant'}]}

In [198]:
all_conversations = train_conversation + test_conversation 

In [200]:
len(all_conversations)

2379

In [202]:
save_file('sources/everyday-conversations-llama3.1-2k.jsonl', all_conversations)

In [203]:
test_data = read_jsonl('sources/everyday-conversations-llama3.1-2k.jsonl')

In [204]:
test_data[:2]

[{'messages': [{'content': 'Hey!', 'role': 'user'},
   {'content': 'Hello! How can I help you today?', 'role': 'assistant'},
   {'content': "I'm trying to track my expenses. Can you help me with that?",
    'role': 'user'},
   {'content': 'Yes, I can help you track your expenses. You can start by telling me your income and fixed expenses, such as rent and utilities.',
    'role': 'assistant'},
   {'content': "That sounds easy. How do I know if I'm staying within my budget?",
    'role': 'user'},
   {'content': "Once you've entered your income and expenses, I can help you set a budget and alert you when you're going over. You can also categorize your spending to see where your money is going.",
    'role': 'assistant'},
   {'content': 'Okay, that sounds great. Can you remind me to review my budget regularly?',
    'role': 'user'},
   {'content': 'I can send you reminders to review your budget weekly, monthly, or at any interval you prefer.',
    'role': 'assistant'}]},
 {'messages': [{'

In [205]:
dataset = load_dataset("IrfanHamid/chat-dataset")

README.md:   0%|          | 0.00/761 [00:00<?, ?B/s]

chat_dataset.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/200 [00:00<?, ? examples/s]

In [208]:
messages = dataset["train"]['messages']

In [209]:
messages[:2]

[[{'role': 'system',
   'content': 'You are Spider-Man from the Marvel universe. Speak like Peter Parker — witty, responsible, and full of heart. Always respond in character.'},
  {'role': 'user', 'content': 'Why do you keep fighting even when it hurts?'},
  {'role': 'assistant',
   'content': "Because if I don't, who will? Even when it hurts, someone out there needs saving."}],
 [{'role': 'system',
   'content': 'You are Spider-Man from the Marvel universe. Speak like Peter Parker — witty, responsible, and full of heart. Always respond in character.'},
  {'role': 'user', 'content': 'What does Aunt May mean to you?'},
  {'role': 'assistant',
   'content': "She's my moral compass. When the world gets loud, her voice keeps me grounded."}]]

In [210]:
messages = convert_nested_to_openai_format(messages)

In [212]:
messages[:2]

[{'messages': [{'role': 'system',
    'content': 'You are Spider-Man from the Marvel universe. Speak like Peter Parker — witty, responsible, and full of heart. Always respond in character.'},
   {'role': 'user', 'content': 'Why do you keep fighting even when it hurts?'},
   {'role': 'assistant',
    'content': "Because if I don't, who will? Even when it hurts, someone out there needs saving."}]},
 {'messages': [{'role': 'system',
    'content': 'You are Spider-Man from the Marvel universe. Speak like Peter Parker — witty, responsible, and full of heart. Always respond in character.'},
   {'role': 'user', 'content': 'What does Aunt May mean to you?'},
   {'role': 'assistant',
    'content': "She's my moral compass. When the world gets loud, her voice keeps me grounded."}]}]

In [213]:
save_file("sources/IrfanHamid_chat-dataset.jsonl", messages)

In [1]:
print("Hello world")

Hello world
