In [1]:
import os
from collections import defaultdict

# Raw per-channel export files are read from input_dir; the merged
# per-channel transcripts are written to output_dir.
input_dir = "datasets/raw/discord-detox-antispam/"
output_dir = "datasets/raw/discord/"

os.makedirs(output_dir, exist_ok=True)

# Sorted listing, so files are grouped in a stable order.
filenames = sorted(os.listdir(input_dir))

# channel id -> names of the ".txt" export files that belong to it.
grouped = defaultdict(list)
for export_name in filenames:
    if export_name.endswith('.txt'):
        # Text before the first ']' with its first character dropped
        # (presumably a leading '[' -- confirm against the export naming).
        bracket_prefix = export_name.split(']')[0]
        grouped[bracket_prefix[1:]].append(export_name)

def sort_key(filename):
    """Return the ordering key of one export file within its channel.

    A file name carrying a ``[part N]`` marker is keyed by ``N``. A name
    without that marker is keyed as ``1``.

    The marker is looked up explicitly rather than inferred from the number
    of ``[`` characters, so a name with additional brackets but no part
    marker (e.g. ``"[123] [memes] chat.txt"``) is keyed as ``1`` instead of
    raising ``IndexError``.

    Args:
        filename: Name of the export file.

    Returns:
        int: The part number, or ``1`` when the name has no ``[part `` marker.

    Raises:
        ValueError: If the text between ``[part `` and the following ``]``
            is not an integer.
    """
    _, marker, after_marker = filename.partition('[part ')
    if not marker:
        # No part marker anywhere in the name.
        return 1
    return int(after_marker.split(']')[0])

# Merge each channel's export files, ordered by sort_key, into a single
# "<channel_id>.txt" transcript inside output_dir.
for channel_id, files in grouped.items():
    stripped_parts = []
    for part_name in sorted(files, key=sort_key):
        with open(os.path.join(input_dir, part_name), "r", encoding="utf-8") as part_file:
            stripped_parts.append(part_file.read().strip())

    # Parts are separated by one newline; the outer strip() removes any
    # leading/trailing whitespace left behind by empty parts.
    merged_text = "\n".join(stripped_parts).strip()

    with open(os.path.join(output_dir, f"{channel_id}.txt"), "w", encoding="utf-8") as out_f:
        out_f.write(merged_text)

In [2]:
import csv
import re

# Merged per-channel transcripts are read from input_dir; the CSV files are
# written to output_dir.
input_dir = "datasets/raw/discord/"
output_dir = "datasets/raw/"

# group_id -> set of usernames seen in that group's transcript.
groups = {}
# One (group_id, per-file message index, username, message_text) tuple per
# parsed line.
messages = []

# Matches "<username>: <message>".
#   group 1 - username: non-colon characters and/or ":emoji_name:" tokens
#   group 2 - message text: everything after the ": " that ends the username
# The non-colon alternative consumes exactly one character per repetition.
# Writing it as `[^:]+` inside the repeated group accepts the same strings,
# but lets the engine split every run of plain characters in exponentially
# many ways, so a colon-containing line that ultimately fails to match could
# stall instead of being rejected.
pattern = re.compile(r"^((?::[a-z0-9]+(?:_[a-z0-9]+)*:|[^:])+): (.+)$")

# Parse every ".txt" transcript in input_dir into `groups` (usernames per
# group) and `messages` (one tuple per parsed line).
# The listing is sorted because os.listdir() returns names in arbitrary
# order, which would make the order of collected messages vary between runs
# and machines.
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith(".txt"):
        continue

    # Strip only the trailing extension, so a name that contains ".txt"
    # somewhere in the middle keeps its full stem as the group id.
    group_id = filename[:-len(".txt")]
    filepath = os.path.join(input_dir, filename)

    # Register the group even if none of its lines end up being parsed.
    groups.setdefault(group_id, set())

    # Message indices restart at 0 for every transcript.
    message_counter = 0

    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()

            # Blank lines and lines without any colon are skipped silently.
            if not line or ':' not in line:
                continue

            match = pattern.match(line)
            if match is None:
                # A colon-containing line that is not "<username>: <message>"
                # aborts the run. Raise an explicit error: a bare `raise`
                # with no active exception only produces an unrelated
                # RuntimeError.
                raise ValueError(
                    f"Unparseable line in file {filename!r}: {line!r}"
                )

            username = match.group(1).strip()
            message_text = match.group(2).strip()

            groups[group_id].add(username)
            messages.append((
                group_id,
                message_counter,
                username,
                message_text
            ))
            message_counter += 1

# Ids of groups with more than 65536 distinct members; those groups are left
# out of groups.csv.
large_group_ids = set()

groups_csv_path = os.path.join(output_dir, "groups.csv")
with open(groups_csv_path, "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["group_id", "member_id", "member_name"])

    for group_id, members in groups.items():
        if len(members) <= 65536:
            # The username is written as both member_id and member_name.
            writer.writerows([group_id, name, name] for name in members)
        else:
            large_group_ids.add(group_id)

messages_csv_path = os.path.join(output_dir, "messages.csv")
with open(messages_csv_path, "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["group_id", "message_id", "member_id", "message_text"])

    # Messages belonging to a group listed in large_group_ids are not written.
    writer.writerows(
        [gid, msg_index, author, text]
        for gid, msg_index, author, text in messages
        if gid not in large_group_ids
    )