<a href="https://colab.research.google.com/github/qiuyuejoy/Model-Projects_Java/blob/main/Copy_of_2024_04_09_Preference_Dataset_Joy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%matplotlib inline
import random
import json
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import torch
from torch import nn as nn
from torch.nn import functional as F
from torch import optim

In [None]:
!pip install -q -U bitsandbytes wandb datasets sentence_transformers faiss-gpu
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U git+https://github.com/huggingface/trl.git
!pip install -q -U git+https://github.com/yuchenlin/LLM-Blender.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m87.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.1/266.1 kB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━

In [None]:
import accelerate
import transformers
import llm_blender
import bitsandbytes

from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset, Dataset
from trl import DPOTrainer, ModelConfig, get_kbit_device_map, get_peft_config, get_quantization_config
from typing import Optional, Dict
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel

from huggingface_hub import notebook_login

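# Paste your Hugging Face access token when prompted below. Never store the token in the notebook; revoke any token that was committed here.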
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
class Timer(object):
    """Wall-clock stopwatch that keeps a history of elapsed times.

    Each call to ``tick`` appends the number of seconds since the timer was
    created (or last reset) to ``elapsed_times``.
    """

    def __init__(self):
        # Reference instant, as returned by time.time(), that ticks are measured from.
        self.start_time = time.time()
        # Recorded elapsed times; seeded with 0 so last_elapsed_time() is
        # usable before the first tick.
        self.elapsed_times = [0]

    def tick(self):
        """Record the seconds elapsed since ``start_time``."""
        self.elapsed_times.append(time.time() - self.start_time)

    def last_elapsed_time(self):
        """Print and return the most recently recorded elapsed time."""
        print('Elapsed time: {:.4f} seconds'.format(self.elapsed_times[-1]))
        return self.elapsed_times[-1]

    def reset(self):
        """Restart the clock and discard all recorded elapsed times."""
        self.start_time = time.time()
        # Must target ``elapsed_times`` (plural): assigning to a differently
        # named attribute would leave the old history in place.
        self.elapsed_times = [0]

def build_pipeline(model_id):
    """Load a tokenizer and a text-generation pipeline for ``model_id``.

    The tokenizer is loaded with ``AutoTokenizer.from_pretrained``; the
    pipeline is created from the same model id with batch size 1, float16
    weights and automatic device placement.

    Returns:
        A ``(pipeline, tokenizer)`` tuple.
    """
    model_tokenizer = AutoTokenizer.from_pretrained(model_id)
    pipeline_options = {
        'model': model_id,
        'batch_size': 1,
        'torch_dtype': torch.float16,
        'device_map': 'auto',
    }
    text_generator = transformers.pipeline('text-generation', **pipeline_options)
    return text_generator, model_tokenizer

def extract_instructions(dataset, select_train=True):
    """Collect the opening turn of each eligible conversation.

    Args:
        dataset: Mapping with ``'train'`` and ``'test'`` splits. Each split
            is indexed by column name (``'conversations'``, and ``'source'``
            for the train split) and exposes its row count as ``shape[0]``.
        select_train: When true, read the train split and keep only rows
            whose source is one of ``writingprompts``, ``nlp`` or
            ``authors``. When false, read the test split with no source
            filter.

    Returns:
        A list with the first element of every kept conversation. Rows whose
        first element contains ``'?'`` are always dropped.
    """
    split = dataset['train' if select_train else 'test']
    dialogues = split['conversations']

    if select_train:
        wanted_sources = {'writingprompts', 'nlp', 'authors'}
        origins = split['source']

        def source_allowed(row):
            return origins[row] in wanted_sources
    else:
        def source_allowed(row):
            return True

    # The source check runs first so the conversation is only inspected for
    # rows that pass it.
    return [
        dialogues[row][0]
        for row in range(split.shape[0])
        if source_allowed(row) and '?' not in dialogues[row][0]
    ]

def sample_instructions(instructions, size):
    """Return ``size`` items drawn from ``instructions`` with ``random.sample``.

    Sampling is without replacement and uses the global ``random`` state.
    ``random.sample`` raises ``ValueError`` when ``size`` exceeds the number
    of available instructions.
    """
    return random.sample(instructions, size)

def format_message(message):
    """Wrap ``message`` in the Llama 2 chat prompt template.

    The documented single-turn layout nests the system prompt inside the
    first ``[INST]`` block::

        <s>[INST] <<SYS>>
        {system prompt}
        <</SYS>>

        {user message} [/INST]

    Placing ``<<SYS>>`` outside ``[INST]`` deviates from the format the chat
    model was fine-tuned on.

    Args:
        message: The user instruction to send to the model.

    Returns:
        The full prompt string.
    """
    system_prompt = (
        'You are a helpful, respectful and honest assistant. '
        'Always answer as helpfully as possible.'
    )
    # NOTE(review): the literal "<s>" is kept from the original prompt. The
    # tokenizer may also prepend its own BOS token - confirm whether this
    # yields a duplicate BOS and drop the literal if so.
    return f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{message} [/INST]'

def get_response_from_llama2(pipeline, tokenizer, message):
    """Generate one sampled completion for ``message``.

    Args:
        pipeline: Callable text-generation pipeline; invoked with the
            formatted prompt and the sampling options below.
        tokenizer: Object whose ``eos_token_id`` is passed to the pipeline.
        message: Raw instruction text; wrapped by ``format_message`` first.

    Returns:
        The ``'generated_text'`` of the first returned sequence, stripped of
        leading and trailing whitespace.
    """
    prompt = format_message(message)
    sampling_options = dict(
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        # Only the continuation is wanted, not the prompt echoed back.
        return_full_text=False,
        eos_token_id=tokenizer.eos_token_id,
    )
    outputs = pipeline(prompt, **sampling_options)
    first_output = outputs[0]
    return first_output['generated_text'].strip()

def get_responses_for_each_instruction(instructions, number_of_responses_per_instruction):
    """Generate several candidate responses for every instruction.

    Progress is printed as it goes: the instruction text, then for every
    generation the ``(instruction index, response index)`` pair and the
    elapsed time recorded just before that generation starts.

    Args:
        instructions: Iterable of instruction strings.
        number_of_responses_per_instruction: How many completions to
            generate for each instruction.

    Returns:
        A list of dicts, one per instruction, with keys ``'instruction'``
        (the text) and ``'response'`` (the list of generated strings).
    """
    stopwatch = Timer()
    collected = []
    for instruction_index, instruction in enumerate(instructions):
        print(instruction)
        generations = []
        entry = {'instruction': instruction, 'response': generations}
        for attempt in range(number_of_responses_per_instruction):
            print(instruction_index, attempt)
            stopwatch.tick()
            stopwatch.last_elapsed_time()
            # `pipeline` and `tokenizer` are not parameters: they are looked
            # up as module-level globals at call time.
            generations.append(get_response_from_llama2(pipeline, tokenizer, instruction))
        collected.append(entry)
    return collected

def get_inputs_and_candidates_texts_from_responses(responses):
    """Split response records into parallel instruction and candidate lists.

    Args:
        responses: Iterable of dicts with ``'instruction'`` and
            ``'response'`` keys.

    Returns:
        ``(inputs, candidates_texts)``: the instructions, and the matching
        ``'response'`` values in the same order.
    """
    # Single pass over `responses`, so one-shot iterables are handled too.
    pairs = [(entry['instruction'], entry['response']) for entry in responses]
    inputs = [instruction for instruction, _ in pairs]
    candidates_texts = [candidates for _, candidates in pairs]
    return inputs, candidates_texts

def _select_chosen_and_rejected(candidates, candidate_ranks):
    """Return the ``(chosen, rejected)`` texts for one prompt, or ``None``.

    The candidate with the smallest rank value is chosen and the one with
    the largest is rejected. ``None`` is returned when there are fewer than
    two candidates, because no preference pair can be formed.
    """
    candidate_ranks = list(candidate_ranks)
    if len(candidates) < 2 or len(candidate_ranks) < 2:
        return None
    chosen_index = candidate_ranks.index(min(candidate_ranks))
    worst_rank = max(candidate_ranks)
    # Skip the chosen index so that a full tie still yields two different
    # candidates; with at least two ranks a match always exists.
    rejected_index = next(
        index
        for index, rank in enumerate(candidate_ranks)
        if rank == worst_rank and index != chosen_index
    )
    return candidates[chosen_index], candidates[rejected_index]


def get_preference_dataset_from_PairRM(responses, json_file='Preference-Dataset-PairRM.json'):
    """Rank candidate responses with PairRM and build a preference dataset.

    Every prompt contributes one row made of its chosen and rejected
    candidate. Prompts with fewer than two candidates are skipped, so the
    three columns always have equal length.

    NOTE(review): this treats the smallest rank value as the best candidate;
    confirm against the LLM-Blender ``rank`` documentation.

    Args:
        responses: Records with ``'instruction'`` and ``'response'`` keys, as
            produced by ``get_responses_for_each_instruction``.
        json_file: Path the column dict is written to as JSON before it is
            converted to a ``Dataset``.

    Returns:
        A ``Dataset`` with ``'prompt'``, ``'rejected'`` and ``'chosen'``
        columns.
    """
    blender = llm_blender.Blender()
    blender.loadranker('llm-blender/PairRM')
    inputs, candidates_texts = get_inputs_and_candidates_texts_from_responses(responses)
    ranks = blender.rank(inputs, candidates_texts, return_scores=False, batch_size=1)
    preference_dataset = {
        'prompt': [],
        'rejected': [],
        'chosen': [],
    }
    for i, (instruction, candidates) in enumerate(zip(inputs, candidates_texts)):
        pair = _select_chosen_and_rejected(candidates, ranks[i])
        if pair is None:
            # Adding only a prompt would leave the columns with unequal lengths.
            continue
        chosen, rejected = pair
        preference_dataset['prompt'].append(instruction)
        preference_dataset['chosen'].append(chosen)
        preference_dataset['rejected'].append(rejected)
    save_json(preference_dataset, json_file)
    preference_dataset = Dataset.from_dict(preference_dataset)
    return preference_dataset

def save_json(json_object, json_file):
    """Write ``json_object`` to the path ``json_file`` as JSON.

    The file is opened in write mode, replacing any existing content, and
    the output is indented by four spaces.
    """
    with open(json_file, 'w') as handle:
        json.dump(obj=json_object, fp=handle, indent=4)

In [None]:
# Hub id of the chat model used to generate candidate responses.
base_model_id = "meta-llama/Llama-2-7b-chat-hf"
# Hub id of the instruction dataset. NOTE: this name is rebound below to the
# loaded dataset object, so after this cell it no longer holds the id string.
lima_dataset = "GAIR/lima"
huggingface_username = 'Joyqiuyue'
# Target repository for the generated preference dataset.
preference_dataset_name = f'{huggingface_username}/lima_preference_dataset'
# These two globals are read directly by get_responses_for_each_instruction.
pipeline, tokenizer = build_pipeline(base_model_id)
lima_dataset = load_dataset(lima_dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.68M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/27.3k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1030 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/300 [00:00<?, ? examples/s]

In [None]:
# Size of the run: 50 prompts x 5 sampled responses each.
number_of_instructions = 50
number_of_responses_per_instruction = 5
# Filtered first turns from the train split (select_train defaults to True).
instructions = extract_instructions(lima_dataset)
# Sampling uses the global `random` state, which is not seeded in this
# notebook, so the selected prompts differ between runs.
sampled_instructions = sample_instructions(instructions, number_of_instructions)
responses = get_responses_for_each_instruction(sampled_instructions, number_of_responses_per_instruction)
# Also writes the column dict to the default JSON file as a side effect.
preference_dataset = get_preference_dataset_from_PairRM(responses)

I got a parking ticket due to forgetting to place my parking permit. Please draft an appeal letter for me.
0 0
Elapsed time: 0.0001 seconds
0 1
Elapsed time: 23.8501 seconds
0 2
Elapsed time: 52.6166 seconds
0 3
Elapsed time: 80.5494 seconds
0 4
Elapsed time: 107.6451 seconds
A man is wrongly sentenced to death in Victorian England for supposedly killing a milk-maid, write a letter from him to his wife.
1 0
Elapsed time: 134.7119 seconds
1 1
Elapsed time: 165.3975 seconds
1 2
Elapsed time: 192.0551 seconds
1 3
Elapsed time: 220.6555 seconds
1 4
Elapsed time: 246.1023 seconds
You are a galaxy renowned xenozoologist, and are determined to make accurate care guides for all of the pets of galactic citizens. Your current goal is to write a guide for the new pet that everyone's going crazy over: humans.
2 0
Elapsed time: 266.9832 seconds


--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1100, in emit
    msg = self.format(record)
  File "/usr/lib/python3.10/logging/__init__.py", line 943, in format
    return fmt.format(record)
  File "/usr/lib/python3.10/logging/__init__.py", line 678, in format
    record.message = record.getMessage()
  File "/usr/lib/python3.10/logging/__init__.py", line 368, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    a

2 1
Elapsed time: 318.2273 seconds
2 2
Elapsed time: 354.0898 seconds
2 3
Elapsed time: 378.4881 seconds
2 4
Elapsed time: 395.1058 seconds
Out of boredom, you write an email to yourself scheduled to be sent in 3 years. What you didn’t expect was a reply the very next morning, by future you.
3 0
Elapsed time: 442.9622 seconds
3 1
Elapsed time: 471.4783 seconds
3 2
Elapsed time: 504.9290 seconds
3 3
Elapsed time: 530.7718 seconds
3 4
Elapsed time: 556.9578 seconds
In sixty words, write what can happen in a second.
4 0
Elapsed time: 586.4939 seconds
4 1
Elapsed time: 589.2105 seconds
4 2
Elapsed time: 591.5163 seconds
4 3
Elapsed time: 593.3517 seconds
4 4
Elapsed time: 595.8324 seconds
The protagonist of a story writes a letter to the author to complain about how bad the story is.
5 0
Elapsed time: 598.3146 seconds
5 1
Elapsed time: 609.8893 seconds
5 2
Elapsed time: 626.6851 seconds
5 3
Elapsed time: 649.3761 seconds
5 4
Elapsed time: 664.3824 seconds
Write a sentence about sports wher



Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/13.7k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/130 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

ranker_config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.00k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.79k [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Successfully loaded ranker from  /root/.cache/huggingface/hub/llm-blender/PairRM



Ranking candidates:   0%|          | 0/50 [00:00<?, ?it/s][A
Ranking candidates:   2%|▏         | 1/50 [00:01<01:13,  1.50s/it][A
Ranking candidates:   4%|▍         | 2/50 [00:02<01:09,  1.45s/it][A
Ranking candidates:   6%|▌         | 3/50 [00:05<01:28,  1.89s/it][A
Ranking candidates:   8%|▊         | 4/50 [00:06<01:16,  1.66s/it][A
Ranking candidates:  10%|█         | 5/50 [00:07<01:02,  1.40s/it][A
Ranking candidates:  12%|█▏        | 6/50 [00:08<00:54,  1.23s/it][A
Ranking candidates:  14%|█▍        | 7/50 [00:09<00:48,  1.12s/it][A
Ranking candidates:  16%|█▌        | 8/50 [00:10<00:44,  1.06s/it][A
Ranking candidates:  18%|█▊        | 9/50 [00:11<00:49,  1.22s/it][A
Ranking candidates:  20%|██        | 10/50 [00:12<00:45,  1.13s/it][A
Ranking candidates:  22%|██▏       | 11/50 [00:13<00:41,  1.07s/it][A
Ranking candidates:  24%|██▍       | 12/50 [00:15<00:47,  1.24s/it][A
Ranking candidates:  26%|██▌       | 13/50 [00:16<00:42,  1.14s/it][A
Ranking candidates:  28

In [None]:
# Upload the preference dataset to the Hub repository named above.
preference_dataset.push_to_hub(preference_dataset_name)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/342 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Joyqiuyue/lima_preference_dataset/commit/a21c55fa48615792193e970f2247e870aebf911c', commit_message='Upload dataset', commit_description='', oid='a21c55fa48615792193e970f2247e870aebf911c', pr_url=None, pr_revision=None, pr_num=None)