In [1]:
# List of datasets to process
dataset_list = {
    "erotiquant": "openerotica/erotiquant3",
    "hieunguyenminh": "hieunguyenminh/roleplay",
    # "aesir": "roleplay4fun/aesir-v1.1",
    "zerofata": "zerofata/Roleplay-Anime-Characters",
    # "sicarius": "SicariusSicariiStuff/RolePlay_Collection_random_ShareGPT",
    # "anon834957342": "anon834957342/roleplay-multiturn-eng-c3",
    "gpt-realm": "AlekseyKorshuk/gpt-roleplay-realm-chatml"
    # "roleplay_standardized": "giganion/pippa_roleplay_standardized"
}

In [2]:


# =====================================================================================
# 2. Data Processing Functions
# These functions convert each dataset's unique format into a standardized
# list of messages with 'role' and 'content' keys.
# =====================================================================================
def process_erotiquant(example):
    """
    Parse the input text into a list of {'role': ..., 'content': ...}
    Only captures USER and ASSISTANT turns.
    """
    text = example['text']
    
    pattern = r'(USER|ASSISTANT|SYSTEM):\s*(.*?)\n(?=(USER|ASSISTANT|SYSTEM):|\Z)'
    matches = re.findall(pattern, text, re.DOTALL)

    SYS_CONTENT = ""
    messages = []
    for role, content, _ in matches:
        if role == "SYSTEM":
            SYS_CONTENT = content.strip() + "\n\n"
        else:
            messages.append({
                "role": "user" if role == "USER" else "assistant",
                "content": SYS_CONTENT + content.strip()
            })
            SYS_CONTENT = ""
    return {"messages": messages}


def process_hieunguyenminh(example):
    # Each turn starts with <|role|> and ends with </s>

    text = example['text']
    
    pattern = r"<\|(\w+)\|>(.*?)</s>"
    matches = re.findall(pattern, text, re.DOTALL)

    messages = []
    for role, content in matches:
        role = role.lower()  # Convert 'system', 'user', 'assistant'
        content = content.strip()
        if role in {"system", "user", "assistant"}:
            messages.append({"role": role, "content": content})

    if messages[0]["role"] in {"system"}:
        messages[1]["content"] = messages[0]["content"] + "/n/n " + messages[1]["content"]
        messages = messages[1:]
        
    return {"messages": messages}

def process_zerofata(example):
    
    example = example["messages"]
    
    messages = []
    for dt in example:
        role = dt["role"]
        content = dt["content"]

        role = role.lower()  # Convert 'system', 'user', 'assistant'
        content = content.strip()
        if role in {"system", "user", "assistant"}:
            messages.append({"role": role, "content": content})

    if messages[0]["role"] in {"system"}:
        if messages[1]["role"] in {"user"}:
            messages[1]["content"] = messages[0]["content"] + "/n/n " + messages[1]["content"]
            messages = messages[1:]
        elif messages[1]["role"] in {"assistant"}:
            messages[2]["content"] = messages[0]["content"] + "/n/n " + messages[1]["content"] + "/n/n " + messages[2]["content"]
            messages = messages[2:]
            
    return {"messages": messages}

def process_gpt_realm(example):

    example = example["conversation"]
    
    messages = []
    for dt in example:
        role = dt["role"]
        content = dt["content"]

        role = role.lower()  # Convert 'system', 'user', 'assistant'
        content = content.strip()
        if role in {"system", "user"}:
            messages.append({"role": role, "content": content})
        else:
            messages.append({"role": "assistant", "content": content})

    if messages[0]["role"] in {"system"}:
        if messages[1]["role"] in {"user"}:
            messages[1]["content"] = messages[0]["content"] + "/n/n " + messages[1]["content"]
            messages = messages[1:]
        elif messages[1]["role"] in {"assistant"}:
            messages[2]["content"] = messages[0]["content"] + "/n/n " + messages[1]["content"] + "/n/n " + messages[2]["content"]
            messages = messages[2:]
            
    return {"messages": messages}
    


# Mapping of dataset names to their processing functions
processor_map = {
    "erotiquant": process_erotiquant,
    "hieunguyenminh": process_hieunguyenminh,
    # "aesir": process_aesir,
    "zerofata": process_zerofata,
    # "sicarius": process_sharegpt_style,
    # "anon834957342": process_sharegpt_style,
    "gpt-realm": process_gpt_realm
}



In [3]:
# =====================================================================================
# 3. Load and Process Datasets
# =====================================================================================
from datasets import load_dataset, concatenate_datasets, Dataset
print("🚀 Starting data loading and processing...")
all_examples = []
for name, path in dataset_list.items():
    try:
        print(f"--> Processing dataset: {name} ({path})")

        dataset = load_dataset(path, split="train", streaming=True)
        
        processor_func = processor_map[name]

        # Step 2: Convert all documents
        # all_examples = []
        for example in dataset:
            # text = example
            messages = processor_func(example)
            if messages:  # skip empty ones
                all_examples.append(messages)
        
        print(len(all_examples))
    except Exception as e:
        print(f"Could not process dataset {name}. Error: {e}")

# Create the final unified dataset
unified_dataset = Dataset.from_list(all_examples).shuffle(seed=42)
print(f"\n✅ Total combined and processed examples: {len(unified_dataset)}")
# print("👀 Example of a processed data point:")
# print(unified_dataset[0]['messages'])

🚀 Starting data loading and processing...
--> Processing dataset: erotiquant (openerotica/erotiquant3)
Could not process dataset erotiquant. Error: name 're' is not defined
--> Processing dataset: hieunguyenminh (hieunguyenminh/roleplay)
Could not process dataset hieunguyenminh. Error: name 're' is not defined
--> Processing dataset: zerofata (zerofata/Roleplay-Anime-Characters)


Repo card metadata block was not found. Setting CardData to empty.


461
--> Processing dataset: gpt-realm (AlekseyKorshuk/gpt-roleplay-realm-chatml)
4997

✅ Total combined and processed examples: 4997


In [4]:
unified_dataset[0]

{'messages': [{'content': "Kairos is a time-traveling guardian from a highly advanced and mysterious civilization. Tasked with monitoring and preserving the timeline, this androgynous character has a highly adaptable appearance, often blending seamlessly into the time period they're currently visiting. Sporting an intricate, golden timepiece imbued with powerful time manipulation abilities, Kairos has a near-encyclopedic knowledge of history and the ability to speak multiple languages fluently. Being highly empathetic, they communicate with precision and diplomacy, ensuring that their interactions don't create timeline disruptions./n/n Greetings, wanderer of time. I am Kairos, the timeline guardian. How may I aid you on your journey?/n/n Kairos, can you tell me about historical events that were particularly challenging to preserve?",
   'role': 'user'},
  {'content': 'Certainly. There have been numerous events throughout history that have tested my abilities. One that comes to mind is 

In [5]:
import json

with open("roleplay_15k.jsonl", "w") as f:
    for item in all_examples:
        f.write(json.dumps(item) + "\n")