In [1]:
import concurrent.futures
import json
import re
from typing import List, Tuple
from datasets import Dataset
from openai import OpenAI
from tqdm.auto import tqdm

In [2]:
class PreferenceSet:
    """Holds a list of (instruction, generated_answer, extracted_answer) tuples.

    Iterating over an instance iterates over the stored tuples.
    """

    def __init__(self, triples: List[Tuple[str, str, str]]):
        self.triples = triples

    @classmethod
    def from_json(cls, json_str: str) -> 'PreferenceSet':
        """Build a PreferenceSet from a JSON document.

        The document must be an object with a 'preference_triples' list whose
        items each carry the keys 'instruction', 'generated_answer' and
        'extracted_answer'; any other keys are ignored.

        Raises:
            json.JSONDecodeError: if ``json_str`` is not valid JSON.
            KeyError: if 'preference_triples' or one of the per-item keys is missing.
        """
        payload = json.loads(json_str)
        parsed = []
        for entry in payload['preference_triples']:
            parsed.append(
                (entry['instruction'], entry['generated_answer'], entry['extracted_answer'])
            )
        return cls(parsed)

    def __iter__(self):
        return iter(self.triples)

In [3]:
def load_articles_from_json(file_path: str) -> Dataset:
    """Load article records from a JSON file into a ``Dataset``.

    The file must contain a JSON array of objects. Each object is read for the
    keys '_id', 'content', 'platform', 'author_id', 'author_full_name' and
    'link'; '_id' is exposed as the 'id' column, the others keep their name.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A Dataset with the columns 'id', 'content', 'platform', 'author_id',
        'author_full_name' and 'link', one row per record.

    Raises:
        KeyError: if a record lacks one of the expected keys.
    """
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # locale codec (e.g. cp1252 on Windows), which garbles or rejects
    # non-ASCII article text.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Output column name -> key in the source record.
    column_sources = {
        "id": "_id",
        "content": "content",
        "platform": "platform",
        "author_id": "author_id",
        "author_full_name": "author_full_name",
        "link": "link",
    }
    return Dataset.from_dict(
        {
            column: [record[source_key] for record in data]
            for column, source_key in column_sources.items()
        }
    )

In [11]:
def clean_text(text):
    """Normalize ``text`` for chunking.

    Every character that is not a word character, whitespace, or one of
    . , ! ? ' is replaced by a space; runs of whitespace are then collapsed
    to a single space and the result is stripped at both ends.
    """
    without_symbols = re.sub(r"[^\w\s.,!?']", " ", text)
    single_spaced = re.sub(r"\s+", " ", without_symbols)
    return single_spaced.strip()

In [13]:
def extract_substrings(dataset: Dataset, min_length: int = 1000, max_length: int = 2000) -> List[str]:
    """Cut each article into sentence-aligned text chunks.

    For every item of ``dataset["content"]`` the text under its 'Content' key
    is cleaned with ``clean_text``, split into sentences, and the sentences
    are packed greedily into chunks.

    Args:
        dataset: Object whose "content" column yields mappings with a
            'Content' entry.
        min_length: A finished chunk is kept only if its accumulated length
            (each sentence counted with one trailing space) is at least this.
        max_length: A sentence is added to the current chunk only while the
            accumulated length plus the sentence length stays within this.

    Returns:
        The kept chunks, in article order. A chunk can be longer than
        ``max_length`` when a single sentence alone exceeds it.
    """
    sentence_boundary = re.compile(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s")
    chunks: List[str] = []

    for record in dataset["content"]:
        text = clean_text(record['Content'])
        parts: List[str] = []
        size = 0  # chunk length, counting one trailing space per sentence

        for piece in sentence_boundary.split(text):
            piece = piece.strip()
            if not piece:
                continue
            if size + len(piece) > max_length:
                # The current chunk is full: keep it if long enough, then start over.
                if size >= min_length:
                    chunks.append(" ".join(parts))
                parts, size = [], 0
            parts.append(piece)
            size += len(piece) + 1

        # Flush whatever is left at the end of the article.
        if size >= min_length:
            chunks.append(" ".join(parts))

    return chunks

In [6]:
def generate_preference_triples(extract: str, client: OpenAI) -> List[Tuple[str, str, str]]:
    """Ask the chat model for preference triples about one text extract.

    Sends a single chat-completion request (model "gpt-4o-mini", JSON response
    format) whose user message embeds ``extract``, then parses the reply with
    ``PreferenceSet.from_json``.

    Args:
        extract: The context text the triples should be based on.
        client: OpenAI client used to issue the request.

    Returns:
        A list of (instruction, generated_answer, extracted_answer) tuples.
    """
    user_prompt = f"""
    Based on the following extract, generate five instruction-answer triples. Each triple should consist of:
        1. An instruction asking about a specific topic in the context.
        2. A generated answer that attempts to answer the instruction based on the context.
        3. An extracted answer that is a relevant excerpt directly from the given context.
    
    Instructions must be self-contained and general, without explicitly mentioning a context, system, course, or extract.
    
    Important:
    - Ensure that the extracted answer is a verbatim copy from the context, including all punctuation and apostrophes.
    - Do not add any ellipsis (...) or [...]  to indicate skipped text in the extracted answer.
    - If the relevant text is not continuous, use two separate sentences from the context instead of skipping text.
    
    Provide your response in JSON format with the following structure:
    {{
        "preference_triples": [
            {{
                "instruction": "...",
                "generated_answer": "...",
                "extracted_answer": "..."
            }},
            ...
        ]
    }}

    Extract:
    {extract}
    """
    # Two adjacent literals are concatenated into one system message.
    system_prompt = (
        "You are a helpful assistant who generates instruction-answer triples based on the given context. "
        "           Each triple should include an instruction, a generated answer, and an extracted answer from the context. Provide your response in JSON format."
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        response_format={"type": "json_object"},
        max_tokens=2000,
        temperature=0.7,
    )
    raw_json = response.choices[0].message.content
    return PreferenceSet.from_json(raw_json).triples


In [7]:
def filter_short_answers(dataset: Dataset, min_length: int = 100) -> Dataset:
    """Return the rows whose 'chosen' value has a length of at least ``min_length``."""
    return dataset.filter(lambda row: len(row['chosen']) >= min_length)

def filter_answer_format(dataset: Dataset) -> Dataset:
    """Return the rows whose 'chosen' value looks like a complete sentence.

    A row is kept when 'chosen' is non-empty, its first character is
    uppercase, and its last character is '.', '!' or '?'.
    """
    sentence_endings = ('.', '!', '?')

    def looks_like_sentence(row):
        answer = row['chosen']
        if len(answer) == 0:
            return False
        return answer[0].isupper() and answer[-1] in sentence_endings

    return dataset.filter(looks_like_sentence)

In [8]:
def create_preference_dataset(dataset: Dataset, client: OpenAI, num_workers: int = 4) -> Dataset:
    """Build a preference dataset from raw articles.

    The articles are chunked with ``extract_substrings``; each chunk is sent
    to ``generate_preference_triples`` on a thread pool, and all returned
    triples are collected into one Dataset.

    Args:
        dataset: Raw article dataset passed to ``extract_substrings``.
        client: OpenAI client handed to every worker call.
        num_workers: Maximum number of concurrent worker threads.

    Returns:
        A Dataset with the columns 'prompt' (instruction), 'rejected'
        (generated answer) and 'chosen' (extracted answer). Rows follow the
        order in which requests complete, not the order of the extracts. The
        columns are empty when no triples were produced.

    Raises:
        Any exception raised by a worker call is re-raised here by
        ``future.result()``.
    """
    extracts = extract_substrings(dataset)
    preference_triples = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = [
            executor.submit(generate_preference_triples, extract, client)
            for extract in extracts
        ]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            preference_triples.extend(future.result())

    # Transpose once, after all results are in. Doing this inside the loop
    # re-transposed the whole list on every iteration and raised when the
    # first completed request returned no triples; with no extracts at all
    # the column names were never bound.
    if preference_triples:
        instructions, generated_answers, extracted_answers = zip(*preference_triples)
    else:
        instructions, generated_answers, extracted_answers = (), (), ()

    return Dataset.from_dict(
        {
            "prompt": list(instructions),
            "rejected": list(generated_answers),
            "chosen": list(extracted_answers)
        }
    )

In [9]:
# Client used for all chat-completion requests; no credentials are passed explicitly here.
client = OpenAI()

# 1. Load the raw data
# Raw string literal: the Windows path contains backslash sequences such as
# \P and \d that are invalid escapes in a normal string (SyntaxWarning).
# The resulting path value is unchanged.
raw_dataset = load_articles_from_json(r"D:\Project\MirrorMuse\data\data_warehouse_raw_data\ArticleDocument.json")
print("Raw dataset:")
print(raw_dataset.to_pandas())

  raw_dataset = load_articles_from_json("D:\Project\MirrorMuse\data\data_warehouse_raw_data\ArticleDocument.json")


Raw dataset:
                                      id  \
0   01ce4881-1ec9-40d4-85c8-c4626d302094   
1   9d2718e9-f3d7-4d89-9ee8-3d11bca693bc   
2   f2877873-ed13-434d-9b6c-a8f50f47bdc8   
3   ceb1afa3-dc45-4bb3-afc0-08286827b08f   
4   3e3e824a-6442-49f7-b26b-b7a538c24335   
..                                   ...   
71  ef25f83c-dd53-416e-87f1-29c54ce3f4f4   
72  6c0c601d-17ac-4cdf-afc1-740eb7355826   
73  f73f6263-9456-4d31-be47-b5ae3ac6dfa6   
74  22958dc9-c43a-4992-839f-a82e8be0fcda   
75  396a112a-8bbd-48fe-b2f9-9402016bdc8d   

                                              content  \
0   {'Content': 'Maxime Labonne

  * __LLM Course
...   
1   {'Content': 'Maxime Labonne

  * __LLM Course
...   
2   {'Content': '# Maxime Labonne

SubscribeSign i...   
3   {'Content': '# Maxime Labonne

SubscribeSign i...   
4   {'Content': '# Maxime Labonne

SubscribeSign i...   
..                                                ...   
71  {'Content': '#

SubscribeSign in

#### Share t...   
72

In [14]:

# 2. Create preference dataset
# Chunks the raw articles and issues one chat-completion request per chunk
# through a thread pool, so this cell calls the OpenAI API on every run.
dataset = create_preference_dataset(raw_dataset, client)
print("Preference dataset:")
print(dataset.to_pandas())

  0%|          | 0/422 [00:00<?, ?it/s]

Preference dataset:
                                                 prompt  \
0     What is recommended for new domains unknown to...   
1     What are the three popular SFT techniques ment...   
2                   What does full fine tuning involve?   
3           How does LoRA differ from full fine tuning?   
4     What advantage does QLoRA provide over standar...   
...                                                 ...   
2105  What tools does the author primarily use for p...   
2106  What is the process the author follows to hand...   
2107  What does the author refer to as 'The Warehouse'?   
2108            What free course does the author offer?   
2109  How does the author feel about using too many ...   

                                               rejected  \
0     It is recommended to continuously pre-train th...   
1     The three popular SFT techniques are full fine...   
2     Full fine tuning involves retraining all param...   
3     LoRA freezes the weights and 

In [15]:
# 3. Filter out samples with short answers
# Keeps rows whose 'chosen' text has at least 100 characters (the default
# min_length). Rebinds `dataset`, so later cells see the filtered data.
dataset = filter_short_answers(dataset)

Filter:   0%|          | 0/2110 [00:00<?, ? examples/s]

In [16]:
# 4. Filter answers based on format
# Keeps rows whose 'chosen' text starts with an uppercase character and ends
# with '.', '!' or '?'. Rebinds `dataset` again.
dataset = filter_answer_format(dataset)

Filter:   0%|          | 0/1276 [00:00<?, ? examples/s]

In [18]:
# 5. Export
# Uploads the filtered dataset to the "SkillRipper/preference-data" repository
# on the Hugging Face Hub.
dataset.push_to_hub("SkillRipper/preference-data")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SkillRipper/preference-data/commit/33a1c0e199906dc1cdeb38f7392477949ae6facc', commit_message='Upload dataset', commit_description='', oid='33a1c0e199906dc1cdeb38f7392477949ae6facc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SkillRipper/preference-data', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SkillRipper/preference-data'), pr_revision=None, pr_num=None)