# Package Installation

In [1]:
#@title Huggingface Login
#@markdown huggingface weight 를 이용하고 싶다면 로그인 필수
from google.colab import userdata
import os

os.environ['HF_WRITE_TOKEN'] = userdata.get('HF_WRITE_TOKEN')
os.environ['HF_TOKEN'] = userdata.get('HF_WRITE_TOKEN')

!huggingface-cli login --add-to-git-credential --token $HF_WRITE_TOKEN


Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
%%writefile requirements.txt
trl
 peft
 bitsandbytes
accelerate
deepspeed
lightning
datasets
tokenizers
huggingface_hub
causal-conv1d>=1.1.0
mamba-ssm[causal-conv1d]
PyYAML>=6.0.2
transformers @git+https://github.com/huggingface/transformers.git
open_lm @git+https://github.com/mlfoundations/open_lm.git
mergekit @git+https://github.com/cg123/mergekit.git

Writing requirements.txt


In [3]:
%%capture
!pip install -r requirements.txt -U
# !pip uninstall transformers -y && pip install transformers --no-cache

# Merging Model

In [None]:
#@markdown Transformers version test
from transformers import AutoTokenizer, AutoModelForCausalLM
import traceback
from open_lm.hf import *

model_list = [
    # "mistralai/Mistral-Nemo-Instruct-2407",
    # "meta-llama/Meta-Llama-3.1-8B-Instruct",
    # "google/gemma-2-9b-it",
    # "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",
    "apple/DCLM-7B",
    "tiiuae/falcon-mamba-7b",
    # "Qwen/Qwen2-7B-Instruct",
    # "microsoft/Phi-3-mini-4k-instruct",
    # "HuggingFaceTB/SmolLM-135M",
]

for model_id in model_list:
    try:
        print(f"{model_id} on load")
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
    except ValueError as e:
        traceback.format_exc()
        print(f"{model_id} failed to load \n {e}")
        model, tokenizer = None, None
    finally:
        print(f"{model_id} load task done")
        del model, tokenizer

In [None]:
#@title Passthrough
#@markdown 서로 다른 언어모델 레이어를 연결하여 새로운 모델은 만드는 방법

import yaml

MODEL_NAME = "Phi-Small-Merge"

yaml_config = """
slices:
  - sources:
    - model: Antonio88/TaliML-PHI3-128K-ITA-V.1.0.FINAL
      layer_range: [0, 16]
  - sources:
    - model: Muhammad2003/Orpo-Phi3-3B-128K
      layer_range: [16, 32]
  - sources:
    - model: Antonio88/TaliML-PHI3-128K-ITA-V.1.0.FINAL
      layer_range: [0, 32]
  - sources:
    - model: jpacifico/Chocolatine-3B-Instruct-DPO-Revised
      layer_range: [0, 32]
  - sources:
    - model: Ejafa/phi-3-mini-128k-instruct-simpo-lr-5e-07-gamma-1.5
      layer_range: [0, 32]
base_model: Antonio88/TaliML-PHI3-128K-ITA-V.1.0.FINAL
merge_method: passthrough
dtype: bfloat16
"""

yaml_config = """
slices:
  - sources:
    - model: Antonio88/TaliML-PHI3-128K-ITA-V.1.0.FINAL
      layer_range: [0, 32]
  - sources:
    - model: jpacifico/Chocolatine-3B-Instruct-DPO-Revised
      layer_range: [0, 32]
  - sources:
    - model: Ejafa/phi-3-mini-128k-instruct-simpo-lr-5e-07-gamma-1.5
      layer_range: [0, 32]
base_model: jpacifico/Chocolatine-3B-Instruct-DPO-Revised
merge_method: slerp
parameters:
  t:
    - filter: self_attn
      value: [0, 0.5, 0.3, 0.7, 1]
    - filter: mlp
      value: [1, 0.5, 0.7, 0.3, 0]
    - value: 0.5
  normalize: false
  int8_mask: true
  density: 0.7
  lambda: 1.1
  epsilon: 0.2
dtype: bfloat16
"""

# Save config as yaml file
with open('config.yaml', 'w', encoding="utf-8") as f:
    f.write(yaml_config)


In [None]:
#@title Linear(Model Soups)
#@markdown 가중치에 따른 평균으로 결합 <p>
#@markdown 가장 전통적인 방법

import yaml

MODEL_NAME = "Llama-Merge-Small"

yaml_config = """
slices:
  - sources:
    - model: meta-llama/Meta-Llama-3.1-8B-Instruct
      layer_range: [0, 32]
      parameters:
        weight: 0.5
    - model: KISTI-KONI/KONI-Llama3-8B-Instruct-20240729
      layer_range: [0, 32]
      parameters:
        weight: 0.5
merge_method: linear
parameters:
    normalize: true
base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
dtype: bfloat16
"""

# Save config as yaml file
with open('config.yaml', 'w', encoding="utf-8") as f:
    f.write(yaml_config.strip())


In [None]:
#@title SLERP
#@markdown 두 모델의 가중치를 구면 선형 보간법을 이용하여 결합 <p>
#@markdown 벡터 크기가 아닌 방향에 초점을 맞춰 부드럽게 결합
import yaml

MODEL_NAME = "Openchat-Llama-Merge"

yaml_config = """
slices:
  - sources:
    - model: openchat/openchat-3.6-8b-20240522
      layer_range: [0, 32]
    - model: meta-llama/Meta-Llama-3.1-8B
      layer_range: [0, 32]
merge_method: slerp
base_model: openchat/openchat-3.6-8b-20240522
parameters:
  t:
    - filter: self_attn
      value: [0, 0.5, 0.3, 0.7, 1]
    - filter: mlp
      value: [1, 0.5, 0.7, 0.3, 0]
    - value: 0.5
dtype: bfloat16
"""

# Save config as yaml file
with open('config.yaml', 'w', encoding="utf-8") as f:
    f.write(yaml_config.strip())


In [None]:
#@title Task Arithmetic
#@markdown Task Vector 기반 결합<p>
#@markdown task vector 를 생성하여 이를 base model에 반영<p>
#@markdown finetuning 된 파생모델들 결합에 효과적

import yaml

MODEL_NAME = "Mistral-Ko-Merge"

yaml_config = """
name: mistral-ko-merge
merge_method: task_arithmetic
base_model: Edentns/DataVortexM-7B-Instruct-v0.1
parameters:
    normalized: false
    weight: 1
dtype: bfloat16
models:
    - model: Edentns/DataVortexM-7B-Instruct-v0.1
    - model: refarde/Mistral-7B-Instruct-v0.2-Ko-S-Core
    - model: Alphacode-AI/AlphaMist7B-slr-v3
    - model: AIdenU/Mistral-7b-ko-Y24-DPO_v0.1
"""
yaml_config = """
slices:
  - sources:
      - model: Edentns/DataVortexM-7B-Instruct-v0.1
        layer_range: [0, 32]
        parameters:
            weight: 1.0
      - model: refarde/Mistral-7B-Instruct-v0.2-Ko-S-Core
        layer_range: [0, 32]
        parameters:
            weight: 1.0
      - model: Alphacode-AI/AlphaMist7B-slr-v3
        layer_range: [0, 32]
        parameters:
            weight: 1.0
      - model: AIdenU/Mistral-7b-ko-Y24-DPO_v0.1
        layer_range: [0, 32]
        parameters:
            weight: 1.0
merge_method: della
base_model: Edentns/DataVortexM-7B-Instruct-v0.1
parameters:
  normalize: true
  int8_mask: true
  density: 0.7
  lambda: 1.1
  epsilon: 0.2
dtype: bfloat16
"""
# Save config as yaml file
with open('config.yaml', 'w', encoding="utf-8") as f:
    f.write(yaml_config.strip())


In [None]:
#@title TIE
#@markdown 여러 모델의 임베딩 가중치를 효과적으로 결합<p>
#@markdown 여러 모델 가중치를 하나의 공간에 투영 <p>
#@markdown 부호가 일치하는 가중치는 강하게 결합, 일치하지 않으면 약하게 결합

import yaml

MODEL_NAME = "Flying-Spaghetti-Merge"

yaml_config = """
slices:
  - sources:
      - model: MLP-KTLim/llama-3-Korean-Bllossom-8B
        layer_range: [0, 32]
        parameters:
            density: 0.5
            weight: 0.3
      - model: beomi/Llama-3-Open-Ko-8B-Instruct-preview
        layer_range: [0, 32]
        parameters:
            density: 0.5
            weight: 0.3
      - model: maywell/Llama-3-Ko-8B-Instruct
        layer_range: [0, 32]
        parameters:
            density: 0.5
            weight: 0.3
      - model: tesser-ai/Tesser-Llama-3-Ko-8B
        layer_range: [0, 32]
        parameters:
            density: 0.5
            weight: 0.3
      - model: mistralai/Mistral-7B-Instruct-v0.3
        layer_range: [0, 32]
        parameters:
            density: 0.5
            weight: 0.3
merge_method: tie
parameters:
    normalize: false
base_model: mistralai/Mistral-7B-Instruct-v0.3
dtype: bfloat16
"""

# Save config as yaml file
with open('config.yaml', 'w', encoding="utf-8") as f:
    f.write(yaml_config.strip())

In [None]:
#@title DELLA
#@markdown 가중치를 효율적으로 결합 <p>
#@markdown 가중치에 대한 중요도설정하여 탄력적이고 희소(sparse)한 방법으로 결합
import yaml

MODEL_NAME = "SOLAR-Della-Merge"

yaml_config = """
slices:
  - sources:
      - model: beomi/OPEN-SOLAR-KO-10.7B
        layer_range: [0, 48]
        parameters:
            weight: 1.0
      - model: upstage/SOLAR-10.7B-v1.0
        layer_range: [0, 48]
        parameters:
            weight: 1.0
      - model: NousResearch/Yarn-Solar-10b-64k
        layer_range: [0, 48]
        parameters:
            weight: 1.0
      - model: Edentns/DataVortexS-10.7B-dpo-v1.6
        layer_range: [0, 48]
        parameters:
            weight: 1.0
      - model: upstage/SOLAR-10.7B-Instruct-v1.0
        layer_range: [0, 48]
        parameters:
            weight: 1.0
merge_method: della
base_model: upstage/SOLAR-10.7B-Instruct-v1.0
parameters:
  normalize: true
  int8_mask: true
  density: 0.7
  lambda: 1.1
  epsilon: 0.2
dtype: bfloat16
"""

# Save config as yaml file
with open('config.yaml', 'w', encoding="utf-8") as f:
    f.write(yaml_config.strip())

In [10]:
#@title DARE
#@markdown 모델의 weight 일부를 base model 값으로 되돌림<p>
#@markdown 삭제된 가중치 보완을 위해 남은 가중치 재조정
#@markdown ties, linear 보간 선택 가능

import yaml

MODEL_NAME = "Llama-Ko-Merge"

yaml_config = """
slices:
  - sources:
      - model: MLP-KTLim/llama-3-Korean-Bllossom-8B
        layer_range: [0, 32]
        parameters:
            density: 0.7
            weight: 0.2
      - model: lcw99/llama-3-8b-it-ko-chang
        layer_range: [0, 32]
        parameters:
            density: 0.7
            weight: 0.2
      - model: beomi/Llama-3-Open-Ko-8B-Instruct-preview
        layer_range: [0, 32]
        parameters:
            density: 0.7
            weight: 0.2
      - model: maywell/Llama-3-Ko-8B-Instruct
        layer_range: [0, 32]
        parameters:
            density: 0.7
            weight: 0.2
      - model: VIRNECT/llama-3-Korean-8B-V3
        layer_range: [0, 32]
        parameters:
            density: 0.7
            weight: 0.2
      - model: tesser-ai/Tesser-Llama-3-Ko-8B
        layer_range: [0, 32]
        parameters:
            density: 0.7
            weight: 0.2
      - model: maum-ai/Llama-3-MAAL-8B-Instruct-v0.1
        layer_range: [0, 32]
        parameters:
            density: 0.7
            weight: 0.2
      - model: meta-llama/Meta-Llama-3.1-8B
        layer_range: [0, 32]
        parameters:
            density: 0.7
            weight: 0.2
      - model: NousResearch/Hermes-3-Llama-3.1-8B
        layer_range: [0, 32]
        parameters:
            density: 0.7
            weight: 0.2
      - model: Saxo/Linkbricks-Horizon-AI-Korean-llama3-sft-dpo-8b-base
        layer_range: [0, 32]
        parameters:
            density: 0.7
            weight: 0.2
      - model: Saxo/Linkbricks-Horizon-AI-Korean-llama-3.1-sft-dpo-8B
        layer_range: [0, 32]
        parameters:
            density: 0.7
            weight: 0.2
merge_method: dare_ties
base_model: Saxo/Linkbricks-Horizon-AI-Korean-llama-3.1-sft-dpo-8B
dtype: bfloat16
"""

# Save config as yaml file
with open('config.yaml', 'w', encoding="utf-8") as f:
    f.write(yaml_config.strip())

In [4]:
#@title Breadcrumbs
#@markdown task arithmetic 에서 base model과 근소하거나 극단적인 차이를 보이는 부분을 버림 <p>
#@markdown ties, linear 보간 선택 가능

import yaml

MODEL_NAME = "Gemma-Ko-Merge"

yaml_config = """
slices:
  - sources:
      - model: lemon07r/Gemma-2-Ataraxy-9B
        layer_range: [0, 42]
        parameters:
            weight: 1
            density: 0.7
            gamma: 0.03
      - model: wzhouad/gemma-2-9b-it-WPO-HB
        layer_range: [0, 42]
        parameters:
            weight: 1
            density: 0.42
            gamma: 0.03
      - model: rtzr/ko-gemma-2-9b-it
        layer_range: [0, 42]
        parameters:
            weight: 1
            density: 0.42
            gamma: 0.03
      - model: rtzr/ko-gemma-2-9b-it+ghost613/gemma9_on_korean_summary_events # lora model loading
        layer_range: [0, 42]
        parameters:
            weight: 1
            density: 0.42
            gamma: 0.03
merge_method: breadcrumbs_ties
base_model: lemon07r/Gemma-2-Ataraxy-9B
dtype: bfloat16
"""

# Save config as yaml file
with open('config.yaml', 'w', encoding="utf-8") as f:
    f.write(yaml_config.strip())

In [None]:
#@title Model Stock
#@markdown finetuned 모델의 기하학적 특정을 살려 weight를 선형 보간<p>
#@markdown Base model 포함 최소 3개 이상의 모델 필요

import yaml

MODEL_NAME = "Hermes-Llama-Merge"

yaml_config = """
slices:
  - sources:
      - model: openchat/openchat-3.6-8b-20240522
        layer_range: [0, 32]
        parameters:
            filter_wise: false # calculate with per-tensor. Not recommended
      - model: MLP-KTLim/llama-3-Korean-Bllossom-8B
        layer_range: [0, 32]
        parameters:
            filter_wise: false # calculate with per-tensor. Not recommended
      - model: lcw99/llama-3-8b-it-ko-chang
        layer_range: [0, 32]
        parameters:
            filter_wise: false # calculate with per-tensor. Not recommended
      - model: beomi/Llama-3-Open-Ko-8B-Instruct-preview
        layer_range: [0, 32]
        parameters:
            filter_wise: false # calculate with per-tensor. Not recommended
      - model: maywell/Llama-3-Ko-8B-Instruct
        layer_range: [0, 32]
        parameters:
            filter_wise: false # calculate with per-tensor. Not recommended
      - model: tesser-ai/Tesser-Llama-3-Ko-8B
        layer_range: [0, 32]
        parameters:
            filter_wise: false # calculate with per-tensor. Not recommended
      - model: maum-ai/Llama-3-MAAL-8B-Instruct-v0.1
        layer_range: [0, 32]
        parameters:
            filter_wise: false # calculate with per-tensor. Not recommended
      - model: meta-llama/Meta-Llama-3.1-8B
        layer_range: [0, 32]
        parameters:
            filter_wise: false # calculate with per-tensor. Not recommended
      - model: NousResearch/Hermes-3-Llama-3.1-8B
        layer_range: [0, 32]
        parameters:
            filter_wise: false # calculate with per-tensor. Not recommended
merge_method: model_stock
base_model: NousResearch/Hermes-3-Llama-3.1-8B
dtype: bfloat16
"""

# Save config as yaml file
with open('config.yaml', 'w', encoding="utf-8") as f:
    f.write(yaml_config.strip())

In [None]:
#@title Mixture of Experts
#@markdown finetuned task specific model 결합

import yaml

MODEL_NAME = "Llama-Ko-MoE"

yaml_config = """
base_model: NousResearch/Hermes-2-Pro-Mistral-7B
gate_mode: hidden
dtype: bfloat16
experts:
  - source_model: NousResearch/Hermes-2-Pro-Mistral-7B
    positive_prompts:
      - "<|im_start|>user\nHello, who are you?<|im_end|>"
      - "<|im_start|>user\nI need help with"
  - source_model: BioMistral/BioMistral-7B-DARE
    positive_prompts:
      - "As a doctor of medicine,"
  - source_model: PocketDoc/Dans-AdventurousWinds-7b
    positive_prompts:
      - "[Genres: Science Fiction]\n[Tags: humor, old school, sci fi]"
      - "> get ye flask"
      - "[Mode: Interactive Storyteller]"
  - source_model: VAGOsolutions/SauerkrautLM-7b-HerO
    positive_prompts:
      - "<|im_start|>user\nWie geht es dir?<|im_end|>"
      - "Das ist ein Satz auf Deutsch."
"""

# Save config as yaml file
with open('config.yaml', 'w', encoding="utf-8") as f:
    f.write(yaml_config.strip())

In [None]:
# Merge models
%%sh
mergekit-yaml config.yaml merge \
    --verbose \
    --copy-tokenizer \
    --allow-crimes \
    --out-shard-size 2B \
    --lazy-unpickle \
    --clone-tensors \
    --no-read-to-gpu \
    --no-cuda \
    --transformers-cache mergekit_cache \
    --safe-serialization \
    --trust-remote-code

In [None]:
!sudo rm -r merge

can merge with codes<p>
problem : cannot access huggingface hub private repo or authenication repo

```python
# merge_models.py
import torch
import yaml
from mergekit.config import MergeConfiguration
from mergekit.merge import MergeOptions, run_merge

CONFIG_YML = "config.yaml"
OUTPUT_PATH = "./merged_model"

with open(CONFIG_YML, "r", encoding="utf-8") as fp:
    merge_config = MergeConfiguration.model_validate(yaml.safe_load(fp))

run_merge(
    merge_config,
    out_path=OUTPUT_PATH,
    options=MergeOptions(
        cuda=torch.cuda.is_available(),
        copy_tokenizer=True,
        lazy_unpickle=True,
        low_cpu_memory=False,
        allow_crimes=True,
        verbose=True,
        trust_remote_code=True,
        clone_tensors=True,
    ),
)
print("Model merge completed!")
```

In [None]:
from huggingface_hub import ModelCard, ModelCardData
from jinja2 import Template

username = "Gunulhona"

template_text = """
---
license: apache-2.0
tags:
- merge
- mergekit
- lazymergekit
{%- for model in models %}
- {{ model }}
{%- endfor %}
---

# {{ model_name }}

{{ model_name }} is a merge of the following models using [mergekit](https://github.com/cg123/mergekit):

{%- for model in models %}
* [{{ model }}](https://huggingface.co/{{ model }})
{%- endfor %}

## 🧩 Configuration

'''yaml
{{- yaml_config -}}
'''
"""

# Create a Jinja template object
jinja_template = Template(template_text.strip())

# Get list of models from config
data = yaml.safe_load(yaml_config)
if "models" in data:
    models = [data["models"][i]["model"] for i in range(len(data["models"])) if "parameters" in data["models"][i]]
elif "parameters" in data:
    models = [data["slices"][0]["sources"][i]["model"] for i in range(len(data["slices"][0]["sources"]))]
elif "slices" in data:
    models = [data["slices"][i]["sources"][0]["model"] for i in range(len(data["slices"]))]
else:
    raise Exception("No models or slices found in yaml config")

# Fill the template
content = jinja_template.render(
    model_name=MODEL_NAME,
    models=models,
    yaml_config=yaml_config,
    username=username,
)

# Save the model card
card = ModelCard(content)
card.save('merge/README.md')


In [None]:
from google.colab import userdata
from huggingface_hub import HfApi
import os

username = "Gunulhona"

# Defined in the secrets tab in Google Colab
api = HfApi(token=os.environ['HF_WRITE_TOKEN'])
try:
    api.delete_repo(
        repo_id=f"{username}/{MODEL_NAME}",
        repo_type="model"
    )
    api.create_repo(
        repo_id=f"{username}/{MODEL_NAME}",
        repo_type="model"
    )
except:
    api.create_repo(
        repo_id=f"{username}/{MODEL_NAME}",
        repo_type="model"
    )
finally:
    api.upload_folder(
        repo_id=f"{username}/{MODEL_NAME}",
        folder_path="merge",
    )


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model-00003-of-00012.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

model-00002-of-00012.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

model-00005-of-00012.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00001-of-00012.safetensors:   0%|          | 0.00/1.90G [00:00<?, ?B/s]

model-00004-of-00012.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

Upload 13 LFS files:   0%|          | 0/13 [00:00<?, ?it/s]

model-00006-of-00012.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00007-of-00012.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00012.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00009-of-00012.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

model-00010-of-00012.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

model-00011-of-00012.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

model-00012-of-00012.safetensors:   0%|          | 0.00/529M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

In [None]:
from transformers import AutoTokenizer
import transformers
import torch

if "username" not in locals():
    username = "Gunulhona" #@param {"type":"string"}
if "MODEL_NAME" not in locals():
    MODEL_NAME = "Gemma-Ko-Merge" #@param {"type":"string"}

model_id = f"{username}/{MODEL_NAME}"

print(f'''
+++++++++++++++++++++++++++++++++++++++++
     {model_id}
+++++++++++++++++++++++++++++++++++++++++
''')

messages = [{
    "role": "system",
    "content": """
Summarize the following text in a concise manner, focusing on the key points and main ideas while preserving the essential details:
Artificial Intelligence (AI) has seen rapid advancements over the past decade. Technologies like deep learning and natural language processing have enabled AI to perform tasks previously thought to be impossible for machines, such as understanding and generating human language, recognizing objects in images, and even driving cars autonomously. These developments have been driven by the availability of large datasets, increased computational power, and the refinement of algorithms. However, there are also concerns about the ethical implications of AI, including privacy issues, job displacement, and the potential for AI to be used in harmful ways.
Provide a summary in 300 words.
---
[대화]
참석자_1: ...
참석자_2: ...
---
[요약]
* 키워드 요약 ...
* ...
---
'''
"""
},{
    "role": "user",
    "content": """---
[대화]
참석자_1: 안녕하세요, 선생님.
참석자_2: 안녕하세요, 의사 선생님.
참석자_1: 마흔 넷이시죠?
참석자_2: 네, 선생님.
참석자_1: 좋아요, 오늘은 무슨 문제가 있는 것 같나요?
참석자_2: 의사 선생님, 한동안 허리 통증이 있었습니다.
참석자_1: 통증이 다리로 내려가나요?
참석자_2: 네, 오른쪽 허벅지에도 통증이 있습니다.
참석자_1: 이 통증과 관련된 부상이 있습니까?
참석자_2: 네, 1994년에 사고가 있었습니다.
참석자_1: 최초 부상 당시의 서류나 의료 기록이 있습니까?
참석자_2: 아니요, 오늘은 없습니다.
참석자_1: 직업이 어떻게 되십니까?
참석자_2: 지금은 타코벨에서 일합니다. 산재 보험 청구가 열려 있습니다.
참석자_1: 거기서 일하다가 통증이 재발했죠?
참석자_2: 네, 맞습니다.
참석자_1: 마지막으로 이곳에서 진료를 받은 것이 언제였는지 기억하십니까?
참석자_2: 음, 네, 4월 12일 2005년이었습니다.
참석자_1: 10이 상상할 수 있는 최악의 통증이라면, 마지막 방문 시 통증은 10점 만점에 어느 정도였습니까?
참석자_2: 음, 10점 만점에 8점 정도였어요.
참석자_1: 이 통증 때문에 약을 복용하셨나요?
참석자_2: 음, 지난번 방문했을 때 메드롤 도스팩을 처방받았습니다.
참석자_1: 도세팍에 통증이 어떻게 반응했나요?
참석자_2: 통증이 10점 만점에 4~5점 정도로 줄었습니다.
참석자_1: 통증이 있는 곳을 가리켜 주시겠습니까?
참석자_2: 네, 바로 여기입니다.
참석자_1: 여기 이 밴드요?
참석자_2: 네, 바로 그 자리입니다.
참석자_1: 좋아요, 여기는 요추 4번과 천골 사이입니다. 오른쪽 다리 통증을 어떻게 설명하시겠습니까?
참석자_2: 지금은 간헐적이고 미미하며 항상 있는 것은 아닙니다.
참석자_1: 허리 수술을 받은 적이 있습니까?
참석자_2: 음, 네, 1990년에 한 번, 1994년에 한 번 두 번 척추 절제술을 받았습니다. 잠깐만요, 그 사이에 디스크 절제술도 받았어요.
참석자_1: 어디에 초점이 맞춰졌는지 아십니까?
참석자_2: L 4 L 5번이었습니다.
참석자_1: 허리에 대한 영상 촬영은 하셨나요?
참석자_2: 네, 10월 18일 2004년에 MRI를 찍었습니다. 여기 보고서가 있습니다.
참석자_1: 좋아요, 이것은 다단계 퇴행성 변화를 보여 주며, L 2 L 3, L 3 L 4, L 5 S1에서 신경 침범이 없는 다단계 퇴행성 변화를 보여 주며, 이는 양호합니다.
참석자_2: 그게 무슨 뜻인가요, 의사 선생님?
참석자_1: 요약하자면, 허리에 상당한 양의 관절염이 있다는 뜻입니다.
참석자_2: 네, M R 골수 조영술도 받았는데 여기 보고서가 있습니다.
참석자_1: 좋아요, 요추 3번에서 심한 척추관 협착증이 보이지만 인공물일 수도 있습니다.
참석자_2: 그게 무슨 뜻인가요?
참석자_1: 이 소견은 잘못된 해석일 수 있습니다.
 ---
[요약]
*
"""
}]

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, trust_remote_code=True,)
try:
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
except:
    tokenizer.bos_token = "<|begin_of_text|>"
    tokenizer.chat_template= "{{- bos_token }}\n{%- if custom_tools is defined %}\n    {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n    {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n    {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content']|trim %}\n    {%- set messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n    {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n    {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n    {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n    {#- Extract the first user message so we can plug it in here #}\n    {%- if messages | length != 0 %}\n        {%- set first_user_message = messages[0]['content']|trim %}\n        {%- set messages = messages[1:] %}\n    {%- else %}\n        {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n    {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n    {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n    {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n    {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n    {%- elif 'tool_calls' in message %}\n        {%- if not message.tool_calls|length == 1 %}\n            {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n        {%- endif %}\n        {%- set tool_call = message.tool_calls[0].function %}\n        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n            {%- for arg_name, arg_val in tool_call.arguments | items %}\n                {{- arg_name + '=\"' + arg_val + '\"' }}\n                {%- if not loop.last %}\n                    {{- \", \" }}\n                {%- endif %}\n                {%- endfor %}\n            {{- \")\" }}\n        {%- else  %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n            {{- '\"parameters\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- \"}\" }}\n        {%- endif %}\n        {%- if builtin_tools is defined %}\n            {#- This means we're in ipython mode #}\n            {{- \"<|eom_id|>\" }}\n        {%- else %}\n            {{- \"<|eot_id|>\" }}\n        {%- endif %}\n    {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n        {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n        {%- if message.content is mapping or message.content is iterable %}\n            {{- message.content | tojson }}\n        {%- else %}\n            {{- message.content }}\n        {%- endif %}\n        {{- \"<|eot_id|>\" }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n"
    tokenizer.clean_up_tokenization_spaces =True
    tokenizer.eos_token = "<|eot_id|>"
finally:
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

if "pipeline" in locals():
    del pipeline

pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    # device_map="auto",
    # batch_size=4,
    trust_remote_code=True,
    # use_fast=True,
    model_kwargs={"ignore_mismatched_sizes": True}
)

outputs = pipeline(
    [prompt],
    max_new_tokens=100,
    do_sample=True,
    temperature=0.2,
    top_p=0.95
)

for message in outputs:
    print(message[0]["generated_text"])



+++++++++++++++++++++++++++++++++++++++++
     Gunulhona/Openchat-Llama-Merge
+++++++++++++++++++++++++++++++++++++++++



tokenizer_config.json:   0%|          | 0.00/51.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/750 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/22.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/9 [00:00<?, ?it/s]

model-00001-of-00009.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

model-00002-of-00009.safetensors:   0%|          | 0.00/1.92G [00:00<?, ?B/s]

model-00003-of-00009.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00009.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00009.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00009.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00009.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00009.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00009-of-00009.safetensors:   0%|          | 0.00/1.31G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/9 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


<|begin_of_text|><|start_header_id|>System<|end_header_id|>

Summarize the following text in a concise manner, focusing on the key points and main ideas while preserving the essential details:
Artificial Intelligence (AI) has seen rapid advancements over the past decade. Technologies like deep learning and natural language processing have enabled AI to perform tasks previously thought to be impossible for machines, such as understanding and generating human language, recognizing objects in images, and even driving cars autonomously. These developments have been driven by the availability of large datasets, increased computational power, and the refinement of algorithms. However, there are also concerns about the ethical implications of AI, including privacy issues, job displacement, and the potential for AI to be used in harmful ways.
Provide a summary in 300 words.
---
[대화]
참석자_1: ...
참석자_2: ...
---
[요약]
* 키워드 요약 ...
* ...
---
'''<|eot_id|><|start_header_id|>GPT4 Correct User<|end_hea

# PPO - RLHF

In [None]:
# prompt: DPO reinforcement learning LLM with Lightning

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import PPOConfig, PPOTrainer
from peft import LoraConfig, get_peft_model
import deepspeed

# Load the base model and tokenizer
model_id = f"{username}/{MODEL_NAME}"
model = AutoModelForCausalLM.from_pretrained(model_id,
                                            # load_in_4bit=True,
                                            device_map="cpu",
                                            torch_dtype=torch.bfloat16,
                                            trust_remote_code=True,
                                            return_dict=True)
tokenizer = AutoTokenizer.from_pretrained(model_id,
                                          trust_remote_code=True,
                                          use_fast=True)

# Define the Lora configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply Lora to the base model
model = get_peft_model(model, lora_config)

# Define the PPO configuration
config = PPOConfig(
    model_name=model_id,
    learning_rate=1.41e-5,
    log_with="wandb",
)

# Deepspeed 설정 정의 (Stage 2)
deepspeed_config = {
    "train_micro_batch_size_per_gpu": 1,
    "gradient_accumulation_steps": 1,
    "zero_optimization": {
        "stage": 2,
        "offload_param": {
            "device": "cpu",
            "pin_memory": True
        },
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": True
        }
    }
}

optimizer_class = bnb.optim.PagedAdamW
optimizer_kwargs = {"lr": config.learning_rate}

# Initialize the PPOTrainer
ppo_trainer = PPOTrainer(
    config=config,
    model=model,
    tokenizer=tokenizer,
    deepspeed=deepspeed_config,
    optimizer_class=optimizer_class,
    optimizer_kwargs=optimizer_kwargs)

# Define your training data and reward function here
# ...

# Train the model using DPO
ppo_trainer.train(
    dataset=train_dataset,
    reward_fn=reward_fn)

# Save the trained model
ppo_trainer.save_pretrained("path/to/save/model")


In [None]:
def processing(sample,
               tokenizer):
    c = sample["chosen"]
    r = sample["rejected"]
    chosen = c[-2:]
    rejected = r[-2:]

    # print("chosen", chosen) # this should be chosen
    # print("rejected", rejected) # this should be rejected
    assert all(c for c in chosen[0] if c in reject[0]), "prompt not matched"
    prompt = chosen[0]


    history = []
    # print(len(c), len(r))
    for i in range(0, len(c), 2):  # this should be added to prompt
        c_pair = c[i:i+2]
        r_pair = r[i:i+2]
        if c_pair[0] == r_pair[0] and c_pair[1] not in chosen:
            history += c_pair

    history += [prompt]
    return {
        "prompt": tokenizer.apply_chat_template([prompt], tokenize=True,),
        "chosen": tokenizer.apply_chat_template([chosen[1]], tokenize=True),
        "rejected": tokenizer.apply_chat_template([rejected[1]], tokenize=True),
     }

def processing(sample,
               tokenizer):
    if len(sample["rejected"]) ==0:
        sample["rejected"] = [{"role":"assistant","content":""}]
    return {
        "prompt": tokenizer.apply_chat_template(sample["prompt"], tokenize=False,),
        "chosen": tokenizer.apply_chat_template(sample["chosen"], tokenize=False),
        "rejected": tokenizer.apply_chat_template(sample["rejected"], tokenize=False),
     }

def get_dataset(dataset_name: str,
                tokenizer):
    raw_dataset = load_dataset(
        dataset_name,
        trust_remote_code=True,
        revision="main",  # tag name, or branch name, or commit hash
        )

    return {
        dataset: raw_dataset[dataset].map(
            processing,
            batched=False,
            remove_columns=[n for n in raw_dataset.column_names if n not in ["train", "test"]],
            fn_kwargs={"tokenizer": tokenizer,}) for dataset in ["train", "test"]
        }


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer  # DPOTrainer 사용
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
import bitsandbytes as bnb
# import deepspeed

# 데이터 가져오기
dataset = get_dataset(
    dataset_name="Gunulhona/open_dpo_merged",
    tokenizer=tokenizer)

# 모델과 토크나이저 불러오기
model_id = "microsoft/Phi-3-mini-4k-instruct"
model = AutoModelForCausalLM.from_pretrained(model_id,
                                            device_map="auto",
                                            torch_dtype=torch.bfloat16,
                                            trust_remote_code=True,
                                            return_dict=True)
tokenizer = AutoTokenizer.from_pretrained(model_id,
                                          trust_remote_code=True,
                                          use_fast=True)

# Lora 설정 정의
lora_config = LoraConfig(
    target_modules=[
        "dense",
        "o_proj",
        "qkv_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM")

# Lora를 기본 모델에 적용
model = get_peft_model(model, lora_config)

# 참조 모델 불러오기 (필요에 따라 수정)
ref_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    return_dict=True)

# DPO 설정 정의
training_args = DPOConfig(
    beta=0.1,
    output_dir="dpo_output"
)

# Deepspeed 설정 정의 (Stage 2)
deepspeed_config = {
    "train_micro_batch_size_per_gpu": 1,
    "gradient_accumulation_steps": 1,
    "zero_optimization": {
        "stage": 2,
        "offload_param": {
            "device": "cpu",
            "pin_memory": True
        },
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": True
        }
    }
}

# BitsandBytes Paged AdamW Optimizer 설정
optimizer_class = bnb.optim.PagedAdamW
optimizer_kwargs = {"lr": training_args.learning_rate}

# DPOTrainer 초기화 (Deepspeed, PagedAdamW 적용)
dpo_trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=training_args,
    # data_callator=,
    train_dataset=dataset["train"],  # 학습 데이터셋
    eval_dataset=dataset["test"],  # 학습 데이터셋
    tokenizer=tokenizer,
    # deepspeed=deepspeed_config,
    optimizers=(bnb.optim.PagedAdamW, {"lr": 3e-5}),
)

# DPO를 사용하여 모델 학습
dpo_trainer.train()

Map:   0%|          | 0/505934 [00:00<?, ? examples/s]

Map:   0%|          | 0/126484 [00:00<?, ? examples/s]



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Tokenizing train dataset:   0%|          | 0/505934 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (4599 > 4096). Running this sequence through the model will result in indexing errors


Tokenizing eval dataset:   0%|          | 0/126484 [00:00<?, ? examples/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




In [None]:
# prompt: Lightning trainer 기본 구조 잡아줘

import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F

class MyLightningModule(pl.LightningModule):
  def __init__(self):
    super().__init__()
    self.layer_1 = nn.Linear(32, 128)
    self.layer_2 = nn.Linear(128, 10)

  def forward(self, x):
    x = self.layer_1(x)
    x = F.relu(x)
    x = self.layer_2(x)
    return x

  def training_step(self, batch, batch_idx):
    x, y = batch
    y_hat = self(x)
    loss = F.cross_entropy(y_hat, y)
    self.log('train_loss', loss)
    return loss

  def validation_step(self, batch, batch_idx):
    x, y = batch
    y_hat = self(x)
    loss = F.cross_entropy(y_hat, y)
    self.log('val_loss', loss)

  def configure_optimizers(self):
    optimizer = torch.optim.AdamW(self.parameters(), lr=1e-3)
    return optimizer


In [None]:
# prompt: huggingface 여러 datasets 다운로드 하는 코드

from datasets import load_dataset
import pandas as pd


# 예시: 여러 datasets 다운로드
dataset_names = [
    # "OpenAssistant/oasst1", # sft
    # "OpenAssistant/oasst2", # sft
    # "defunct-datasets/eli5",  # disabled for now
    "nomic-ai/gpt4all_prompt_generations", # sft
    "QingyiSi/Alpaca-CoT", # sft
    "yahma/alpaca-cleaned", # sft
    "royboy0416/ko-alpaca", # sft
    "openai/summarize_from_feedback", # sft
    "tatsu-lab/alpaca", # sft
    "databricks/databricks-dolly-15k", # sft
    "Gunulhona/llm_datasets", # sft
    "lightblue/tagengo-gpt4", # sft
    "linkanjarad/baize-chat-data", # sft
    ]

dataset_names += [
    "stanfordnlp/SHP", # dpo
    "openai/webgpt_comparisons", # dpo
    "Anthropic/hh-rlhf", # dpo
    "Hello-SimpleAI/HC3", # dpo
    "Unified-Language-Model-Alignment/Anthropic_HH_Golden", # dpo
    "argilla/dpo-mix-7k", # dpo
]

for dataset_name in dataset_names:
    dataset = load_dataset(dataset_name, trust_remote_code=True)
    print(f"Downloaded dataset: {dataset_name}")
    df = pd.DataFrame()


Downloaded dataset: nomic-ai/gpt4all_prompt_generations


Resolving data files:   0%|          | 0/45 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/36 [00:00<?, ?files/s]

Downloading data:   0%|          | 0/18 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

Exception ignored in: <function _xla_gc_callback at 0x7e187d9c7d00>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/jax/_src/lib/__init__.py", line 98, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 


DatasetGenerationCastError: An error occurred while generating the dataset

All the data files must have the same columns, but at some point there are 3 new columns ({'dataset', 'task_type', 'table_type'})

This happened while the json dataset builder was generating data using

hf://datasets/QingyiSi/Alpaca-CoT/Tabular-LLM-Data/Table-Fact-Verification/Infotabs/Infotabs-train-markdown.json (at revision 18add89e3b884703ec869a5c6e2bcf1412ee7edc)

Please either edit the data files to have matching columns, or separate them into different configurations (see docs at https://hf.co/docs/hub/datasets-manual-configuration#multiple-configurations)

In [None]:
# @title  Process Opensource Datasets
import numpy as np
import pandas as pd
from datasets import Dataset


def chose_short(str_1, str_2):
    str_list = [str_1, str_2]
    return str_list[np.argmin([len(s) for s in str_list])]

def chose_long(str_1, str_2):
    str_list = [str_1, str_2]
    return str_list[np.argmax([len(s) for s in str_list])]

def process_c_r(chosen, reject, is_list:bool=False):

    def split_as_role(text, role):
        return [s.strip() for s in text.split(role) if len(s) > 1]

    def text_to_template(text):
        result = []
        for h_text in split_as_role(text=text.strip().replace("Hum:", "Human:"), role="Human: "):
            try:
                prompt, response = split_as_role(h_text.strip(), role="Assistant: ")
                result += [
                    {
                        "role": "user",
                        "content": prompt,
                    },
                    {
                        "role": "assistant",
                        "content": response
                    }
                ]
            except Exception as e:
                # print(h_text)
                pass
        return result

    if is_list:
        pass
    else:
        chosen, reject = text_to_template(chosen), text_to_template(reject)

    assert all(c for c in chosen[:-2] if c in reject[:-2]), "prompt is different"
    prompt = chosen[:-2] + [c for c in chosen[-2:] if c["role"] == "user"]
    chosen_text = [c for c in chosen[-2:] if c["role"] == "assistant"]
    rejected_text = [r for r in reject[-2:] if r["role"] == "assistant"]

    return {
        "prompt": prompt,
        "chosen": chosen_text,
        "rejected": rejected_text
    }

dpo_datasets = [
    load_dataset("stanfordnlp/SHP").map(
        lambda x: {
            "prompt": [{"role":"user", "content": x["history"]}],
            "chosen": [{"role":"assistant", "content": x["human_ref_A"] if bool(x["labels"]) else x["human_ref_B"] }],
            "rejected":[{"role": "assistant", "content": x["human_ref_A"] if not bool(x["labels"]) else x["human_ref_B"] }]
        }),
    load_dataset("openai/webgpt_comparisons").map(
        lambda x:{
            "prompt": [{"role":"user", "content": x["question"]["full_text"]}],
            "chosen": [{"role":"assistant", "content": x["answer_0"] if x["score_0"] > x["score_1"] else x["answer_1"] if x["score_0"] < x["score_1"] else chose_short(x["answer_0"], x["answer_1"])}],
            "rejected":[{"role": "assistant", "content": x["answer_0"] if x["score_0"] < x["score_1"] else x["answer_1"] if x["score_0"] > x["score_1"] else chose_long(x["answer_0"], x["answer_1"])}]
        }),
    load_dataset("Deojoandco/anthropic-hh-rlhf").map(
        lambda x:{
            "prompt": [{"role":"user", "content": x["prompt"].replace("Human: ", "")}],
            "chosen": [{"role":"assistant", "content": x["chosen"]}],
            "rejected":[{"role": "assistant", "content": x["rejected"]}]
        }),
    load_dataset("Hello-SimpleAI/HC3", name="all").map(
        lambda x:{
            "prompt": [{"role":"user", "content": x["question"]}],
            "chosen": [{"role":"assistant", "content": " ".join(x["human_answers"])}],
            "rejected":[{"role": "assistant", "content": " ".join(x["chatgpt_answers"])}]
        }),
    load_dataset("Unified-Language-Model-Alignment/Anthropic_HH_Golden").map(
        lambda x: process_c_r(x["chosen"], x["rejected"])
    ),
    load_dataset("argilla/dpo-mix-7k").map(
        lambda x: process_c_r(x["chosen"], x["rejected"], is_list=True)
    )
]

chosen_all = []
prompt_all = []
rejected_all = []

for dpo_dataset in dpo_datasets:
    chosen_all += dpo_dataset["train"]["chosen"]
    prompt_all += dpo_dataset["train"]["prompt"]
    rejected_all += dpo_dataset["train"]["rejected"]
    if "test" in dpo_dataset:
        chosen_all += dpo_dataset["test"]["chosen"]
        prompt_all += dpo_dataset["test"]["prompt"]
        rejected_all += dpo_dataset["test"]["rejected"]


Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

In [None]:
#@title upload to hub
import os
import datasets
from datasets import SplitGenerator, Split
from sklearn.model_selection import train_test_split


merged_df = pd.DataFrame(
        {
            "prompt": prompt_all,
            "chosen": chosen_all,
            "rejected": rejected_all
        }
    )

merged_df = merged_df[
        (merged_df['prompt'].apply(len) != 0) &
        (merged_df['chosen'].apply(len) != 0) &
        (merged_df['rejected'].apply(len) != 0)
    ]

train_df, test_df =train_test_split(
    merged_df,
    test_size=0.2,
    random_state=42
)
print(len(train_df), len(test_df))

for k, v in { "train": train_df,  "test": test_df }.items():
    Dataset.from_pandas(v).push_to_hub(
    repo_id="Gunulhona/open_dpo_merged",
    token=os.environ['HF_TOKEN'],
    split=k
)


505934 126484


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/253 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/253 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/712 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/127 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/713 [00:00<?, ?B/s]