# 从huggingface加载数据集

In [None]:
from datasets import load_dataset
# 数据集地址 https://huggingface.co/datasets/naklecha/minecraft-question-answer-700k
# 指定数据集的URL
# 加载数据集并添加来源URL
dataset = load_dataset("naklecha/minecraft-question-answer-700k",
                       split="train" )

## 如果出现网络问题，请手动下载数据集

# 从本地json读取数据集

In [66]:
import json

# 打开本地的JSON文件
with open('./dataset/minecraft-question-answer-700k.json', 'r') as file:
    # 读取文件内容
    data = json.load(file)

# 处理JSON数据
# 例如，打印JSON数据
# [{},{},{}]

print('数量：',len(data))

数量： 694814


In [67]:
from datasets import Dataset
my_data = {"train": data}
dataset = Dataset.from_dict(my_data)

In [68]:
# 查看数据集的前2条，每一条由 question answer source 三个字段构成
dataset[0:2]

{'train': [{'answer': 'Saturation is the first statistic to decrease when a player performs energy-intensive actions, and it must be completely depleted before the visible hunger meter begins decreasing.',
   'question': 'What is the first statistic to decrease when a player performs energy-intensive actions in Minecraft?',
   'source': 'https://minecraft.wiki/w/Food#Nourishment_value'},
  {'answer': 'Eating cake is distinct from other foods, as it must be placed and then right-clicked on to consume, whereas other foods can be eaten directly by the player. Additionally, cake has 7 edible slices, which become thinner as each slice is removed, whereas other foods typically restore a set amount of hunger and saturation points without any slice-based consumption mechanism.',
   'question': 'How does the game handle the consumption of cake when compared to eating other types of food?',
   'source': 'https://minecraft.wiki/w/Food#Nourishment_value'}]}

# 安装openai的库，用来处理数据集

In [29]:
# 安装openai的库，用来处理数据集
!pip install openai

Looking in indexes: https://pypi.python.org/simple, https://pypi.ngc.nvidia.com
Collecting openai
  Downloading openai-1.23.6-py3-none-any.whl (311 kB)
                                              0.0/311.6 kB ? eta -:--:--
     ---                                     30.7/311.6 kB 1.4 MB/s eta 0:00:01
     ---                                     30.7/311.6 kB 1.4 MB/s eta 0:00:01
     -------                               61.4/311.6 kB 409.6 kB/s eta 0:00:01
     -------------                        112.6/311.6 kB 656.4 kB/s eta 0:00:01
     --------------                       122.9/311.6 kB 554.9 kB/s eta 0:00:01
     ----------------------------         245.8/311.6 kB 888.8 kB/s eta 0:00:01
     -------------------------------------- 311.6/311.6 kB 1.1 MB/s eta 0:00:00
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl (20 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.7.1-py3-none-any.whl (409 kB)
                      


[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


# 使用LLM来处理数据-支持本地LLM、OpenAI、Azure

In [37]:
import openai
from openai import AzureOpenAI
import random

def read_key_from_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
        return text
        
def get_client(api_key,base_url,deployment,is_azure=False):
    client=None
    if is_azure:  
        #gets the API Key from environment variable AZURE_OPENAI_API_KEY
        client = AzureOpenAI(
            # https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#rest-api-versioning
            api_version="2023-07-01-preview",
            # https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal#create-a-resource
            azure_endpoint=base_url,#"https://example-endpoint.openai.azure.com"
            azure_ad_token=api_key,
            azure_deployment=deployment,
            default_headers = {'api-key': api_key }
        )
    else:
        openai.api_key = api_key
        # all client options can be configured just like the `OpenAI` instantiation counterpart
        openai.base_url =base_url# "http://127.0.0.1:8000/v1/"
        openai.default_headers = {"x-foo": "true"}
        
        client=openai
    
    return client

# 通过prompt，凭空构建数据集

In [38]:
key=read_key_from_file("key.txt")
# client=get_client("...","http://127.0.0.1:8000/v1/","",False)
client=get_client(key,"https://mixcopilot.openai.azure.com","gpt-35-turbo-16k",True)

model="gpt-35-turbo-16k"

def chat(messages,temperature=0.5,max_tokens=1354):
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    return completion.choices[0].message.content


prompt = "A model that takes in Chinese sentences, then rewrites them in the style of Lu Xun"
temperature = .5
number_of_examples = 100

# 创建
def generate_example(prompt, prev_examples, temperature=.5):
    messages=[
        {
            "role": "system",
            "content": f"You are generating data which will be used to train a machine learning model.\n\nYou will be given a high-level description of the model we want to train, and from that, you will generate data samples, each with a prompt/response pair.\n\nYou will do so in this format:\n```\nprompt\n-----------\nresponse_goes_here\n-----------\n```\n\nOnly one prompt/response pair should be generated per turn.\n\nFor each turn, make the example slightly more complex than the last, while ensuring diversity.\n\nMake sure your samples are unique and diverse, yet high-quality and complex enough to train a well-performing model."
        }
    ]

    if len(prev_examples) > 0:
        if len(prev_examples) > 10:
            prev_examples = random.sample(prev_examples, 10)
        for example in prev_examples:
            messages.append({
                "role": "assistant",
                "content": example
            })
    print(messages)
    messages.append({
                "role": "user",
                "content": f"Here is the type of model we want to train:\n`{prompt.strip()}`",
            })
    
    return chat(messages,temperature)


# Generate examples
prev_examples = []
for i in range(number_of_examples):
    print(f'Generating example {i}')
    example = generate_example(prompt, prev_examples, temperature)
    prev_examples.append(example)

print(prev_examples)

Generating example 0
[{'role': 'system', 'content': 'You are generating data which will be used to train a machine learning model.\n\nYou will be given a high-level description of the model we want to train, and from that, you will generate data samples, each with a prompt/response pair.\n\nYou will do so in this format:\n```\nprompt\n-----------\nresponse_goes_here\n-----------\n```\n\nOnly one prompt/response pair should be generated per turn.\n\nFor each turn, make the example slightly more complex than the last, while ensuring diversity.\n\nMake sure your samples are unique and diverse, yet high-quality and complex enough to train a well-performing model.'}]
Generating example 1
[{'role': 'system', 'content': 'You are generating data which will be used to train a machine learning model.\n\nYou will be given a high-level description of the model we want to train, and from that, you will generate data samples, each with a prompt/response pair.\n\nYou will do so in this format:\n```\n


KeyboardInterrupt



In [26]:
def generate_system_message(prompt):
    messages=[
          {
            "role": "system",
            "content": "You will be given a high-level description of the model we are training, and from that, you will generate a simple system prompt for that model to use. Remember, you are not generating the system message for data generation -- you are generating the system message to use for inference. A good format to follow is `Given WHAT_THE_MODEL_SHOULD_DO.`.\n\nMake it as concise as possible. Include nothing but the system prompt in your response.\n\nFor example, never write: `\"SYSTEM_PROMPT_HERE`."
          },
          {
              "role": "user",
              "content": prompt.strip(),
          }
        ]

    return chat(messages,temperature,500)

system_message = generate_system_message(prompt)

print(f'system prompt是: `{system_message}`. 如果不满意，请重新生成')

system prompt是: `Given a Chinese sentence, rewrite it in the style of Lu Xun and output short phrases for chatting, with added emojis.`. 如果不满意，请重新生成


In [30]:
import pandas as pd

# Initialize lists to store prompts and responses
prompts = []
responses = []

# Parse out prompts and responses from examples
for example in prev_examples:
  try:
    split_example = example.split('-----------')
    prompts.append(split_example[1].strip())
    responses.append(split_example[3].strip())
  except:
    pass

# Create a DataFrame
df = pd.DataFrame({
    'prompt': prompts,
    'response': responses
})

# Remove duplicates
df = df.drop_duplicates()

print('There are ' + str(len(df)) + ' successfully-generated examples. Here are the first few:')

df.head()

There are 10 successfully-generated examples. Here are the first few:


Unnamed: 0,prompt,response
0,Rewrite the following Chinese sentence in the ...,今天的天气真是宜人，阳光明媚，大地万物焕发生机。
1,Rewrite the following Chinese sentence in the ...,吾爱汝，永不分离。
2,Rewrite the following Chinese sentence in the ...,他乃是一位勇敢而坚定之人。
3,Rewrite the following Chinese sentence in the ...,此都市充盈着繁忙与喧嚣。
4,Rewrite the following Chinese sentence in the ...,他们在夜晚的街道上闲逛。


In [35]:
import json

# Split the data into train and test sets, with 90% in the train set
train_df = df.sample(frac=0.9, random_state=42)
test_df = df.drop(train_df.index)

# Convert dataframes to dictionaries
train_dict = train_df.to_dict(orient='records')
test_dict = test_df.to_dict(orient='records')

# Save dataframes to JSON files
with open('./dataset/train.jsonl', 'w', encoding='utf-8') as train_file:
    for record in train_dict:
        json.dump(record, train_file, ensure_ascii=False, indent=4)
        train_file.write('\n')

with open('./dataset/test.jsonl', 'w', encoding='utf-8') as test_file:
    for record in test_dict:
        json.dump(record, test_file, ensure_ascii=False, indent=4)
        test_file.write('\n')

        

# train_dataset = load_dataset('json', data_files='./dataset/train.jsonl', split="train")
# valid_dataset = load_dataset('json', data_files='./dataset/test.jsonl', split="train")

# 另一种方法,基于已有数据集，做改写

In [69]:
key=read_key_from_file("key.txt")
# client=get_client("...","http://127.0.0.1:8000/v1/","",False)
client=get_client(key,"https://mixcopilot.openai.azure.com","gpt-35-turbo-16k",True)

model="gpt-35-turbo-16k"

# 填写本地LLM的地址,修改prompt
def parse_text(content):
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "your are chinese,use chinese ", # 中文输出
            },
            {
                "role": "user",
                "content": content,
            },
        ],
    )
    
    return completion.choices[0].message.content

# 使用LLM 把数据集处理成新的数据

In [87]:
json_path="./dataset/minecraft_science_fiction_story_zh"

import hashlib
# 取唯一id，去重
def get_string_hash(string):
    md5_hash = hashlib.md5()
    md5_hash.update(string.encode('utf-8'))
    return md5_hash.hexdigest()

# 判断是否已经存在
def add_new_data(original_list, item):
    is_has=False
    for element in original_list:
        if item['id']==element["id"]:
            is_has=True
    if is_has==False:
        original_list.append(item)
    return original_list


new_dataset=[]

try:
    # 打开本地的JSON文件
    with open(json_path, 'r') as file:
        # 读取文件内容
        new_dataset = (json.load(file))["train"]
except:
    print(f"")






In [88]:

for data in dataset[0:10]['train']:
    # print(data)
    question=data['question']
    answer=data['answer']

    _id=get_string_hash(question+answer)
    
    print(_id,question)
    print('')
    text=parse_text('''{0} , {1}
    结合以上文本信息，写成一个科幻小故事，200字以内的小故事。
    '''.format(question,answer))
 
    keywords=parse_text('''{0}
    提炼3个中文关键词，输出：#关键词
    '''.format(text))
    print('')
    print('------')
    print(keywords)
    print('------')
    print('')
    print(text)
    print('------')
    print('')

    new_dataset=add_new_data(new_dataset,{
        "id":_id,
        "question":keywords,#新数据
        "answer":text,#新数据
        # "_question":question, #保留原始的数据
        # "_answer":answer
    })
    # new_dataset.append({
    #     "id":_id,
    #     "question":keywords,#新数据
    #     "answer":text,#新数据
    #     # "_question":question, #保留原始的数据
    #     # "_answer":answer
    # })
    print(len(new_dataset))
    print('')

332f076d6fc48826ad89009600994aab What is the first statistic to decrease when a player performs energy-intensive actions in Minecraft?


------
《飢渴与能量之源》这部作品讲述了高度智能生命体维格拉及其神秘的能量体系"饥渴之源",以及如何在MineCraft游戏中探索和平衡自身的能量欲望与需求。关键词为:#维格拉 #饥渴之源 #MineCraft
------

《飢渴与能量之源》
在一片虚构的星球上,存在着一种名为"维格拉"的神秘生物。维格拉是一种高度智能且具有奇妙能力的生命体,其能力可谓是远超人类的多重 folds。

当人们首次接触到维格拉时,便被它们高深莫测的能量世界所吸引。传说中,维格拉使用自身的能量来进行各类工作:从筑造壮丽的建筑到种植繁茂的生态系统等等。这种独特的力量来源于其体内的"饥渴之源",一种神秘的能量体系。

在这一星球中,人们创造出了一款名为"MineCraft"的游戏。MineCraft是一个开放世界的探险游戏,在其中,玩家可以自由地挖掘矿石、建造建筑物和冒险。然而,这种看似简单的游戏却蕴含着复杂的能量体系。

玩 MineCraft 需要消耗大量的能量,这被称为"饥渴"。每当玩家使用能量来完成任务或施展能力时,他们便会感受到体内的饥饿感,其饥渴指标就会降低。

但是,这里出现了一个悖论:如果玩家继续使用能量,直到饥渴指标完全耗尽,他们的身体就会开始进入一种被称为"饥肠寸断"状态。这时,身体所需的能量将会从另一个地方获得。

在这种情况下,玩家必须学会平衡自身的饥饿与能力。在饥渴指标降低的情况下,玩家需要对自己的行为进行反思和控制。因此,维格拉和MineCraft之间的联系便诞生了:在探索神秘能量世界的同时,人们也可以体会到自身所需的能量与欲望之间的紧张关系。

正是由于这一切,我才相信,在这个充满未知的星球上,我们的每一个选择都需要我们深刻反思自己。
------

1

55e053294c42c5bd833ed8c975732e90 How does the game handle the consumption of cake when compared to eating other typ

In [90]:
# 保存新数据
new_dataset_to_save = {"train": new_dataset}

def save_json_to_file(data, file_path):
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

save_json_to_file(new_dataset_to_save,"./dataset/minecraft_science_fiction_story_zh")

请在此处写下您的提示。尽可能详细地描述！
然后，选择生成数据时要使用的温度（介于0和1之间）。较低的值非常适合精确的任务，如编写代码，而较大的值则更适合创造性的任务，如编写故事。
最后，选择要生成的示例数量。生成的示例数量越多，a) 生成时间越长，b) 数据生成的成本越高。但一般来说，生成更多的示例会导致更高质量的模型。通常，最小值为100。
