In [None]:
! pip install transformers
! pip install datasets

In [40]:
import json
import os
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
import torch
from tqdm import tqdm
from datasets import load_dataset
import random
import copy

def save_jsonl(address,data,name):
    f = open(os.path.join(address,name+'.jsonl'),'w',encoding = 'utf-8')
    for i in tqdm(data):
        f.write(json.dumps(i,ensure_ascii=False)+'\n') # for korean

def load_jsonl(path):
    result = []
    f = open(path,'r',encoding = 'utf-8')
    for i in tqdm(f):
        result.append(json.loads(i))
    return result 

# Load Model 

In [2]:
model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", attn_implementation="flash_attention_2", trust_remote_code=True, torch_dtype=torch.bfloat16)

[2024-09-17 08:50:30,907] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status

* 'schema_extra' has been renamed to 'json_schema_extra'


Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

# Load Dataset

In [3]:
data = load_jsonl('./msmarco_samples.jsonl')

In [4]:
def get_passage_to_text(passage):
    if passage['title']=='-':
        return passage['text']
    else:
        return passage['title']+' '+passage['text']

In [5]:
def knowledge_segment_relevance_tagging(tokenizer, passage, question, tokenize=False):
    user_message = f"""Please identify the given passage is related to the given question.
    Make answer just TRUE or FALSE.
    JUST answer the question.
    DO NOT say the explanation. 
    
    Provide your response as follows:
    Answer: (TRUE or FALSE)

    Now here are the passage and question.
    
    Passage : {passage}
    Question: {question}
    Answer:"""
    messages = [
            {"role": "user", "content": user_message},
            ]
    prompt = tokenizer.apply_chat_template(
            messages, 
            tokenize=tokenize, 
            add_generation_prompt=True
            )
    return prompt


In [6]:
def get_label(relevance):
    relevance = relevance.split('\n')[0].strip()
    if 'false' in relevance.lower():
        return False
    elif 'True' in relevance.lower():
        return True
    else:
        return False

# data set explanation
- dataset
    - question_id : question id of the data sample in msmarco
    - positive_passage_segment : gold passage segment
    - retrieved_passages : retrieved passages by the bm25 

# Knowledge Segment Relevance Tagging

In [7]:
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
for i in tqdm(data):
    question = i['question']
    passages = []
    # we don't have to tag for gold knowledge segment (positive passage segment)
    for j in i['positive_passage_segment']:
        j['label']=True
    for j in i['retrieved_passages']:
        passages.append(get_passage_to_text(j))
    relevances = []
    for passage in passages:
        prompt = knowledge_segment_relevance_tagging(tokenizer, passage, question)
        inputs = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
        inputs = inputs.to(model.device)
        length = inputs['input_ids'].size(1)
        outputs = model.generate(**inputs, max_new_tokens=32, streamer=streamer, pad_token_id=tokenizer.eos_token_id)
        relevance = tokenizer.decode(outputs[0][length:], skip_special_tokens=True)
        relevances.append(relevance)
    for j,k in zip(i['retrieved_passages'],relevances):
        j['label']=get_label(k)

  0% 0/5 [00:00<?, ?it/s]

False
False
TRUE

The passage does provide information on roasting asparagus in the oven and includes a roasting time of 15 to 20
TRUE
FALSE
Answer: FALSE

The passage provides information on how asparagus can be cooked and the importance of cooking time, but it does not give a specific
TRUE

The passage mentions that the roasting time for asparagus is affected by oven temperature, and at 500 degrees, it
Answer: FALSE

The passage provides a range of roasting times for asparagus depending on the oven temperature, but it does not give a specific
False
False
False. The passage does not provide information on how long to roast asparagus.
False.

The passage does not mention roasting asparagus or provide information on how long it takes to roast asparagus. It
False.

The passage does not provide information on how long to roast asparagus.
TRUE

The passage mentions that oven baking foil-wrapped asparagus takes between 12 and 20 minutes.
Answer: TRUE

(The passage provides information on ro

 20% 1/5 [00:43<02:54, 43.52s/it]

False
Answer: FALSE
False.

The passage states that the egg yolk is the part of an egg that feeds the developing embryo, but it does not say
False.
False. The egg yolk is not an embryo, but rather a source of food for the developing embryo.
False.
False.
False.
Answer: FALSE
False. According to the passage, the egg yolk is used for the sustenance of the developing young (embryo), not that the egg y
False. The egg yolk is not an embryo. It is a source of nutrition for the developing embryo.
False.

The passage mentions that after the egg cell is fertilized by male sperm, an embryo develops. However, it
False.

The passage states that the ovary contains undeveloped egg yolks, which are released into the oviduct as each yolk
False
False.

The passage defines "yolk" as the part of an egg that enters directly into the formation of the embryo, along with any
False. In the given passage, the yolk sac is where the food for the developing embryo comes from, and it reduces in size as the emb
Fal

 40% 2/5 [01:25<02:07, 42.59s/it]

True.
True.
True.
True.
True.
True.
True.
True.
True.
Answer: FALSE
True.
False.
False.
True.
True. The passage provides a definition of non-renewable resources as resources that do not renew themselves at a sufficient rate for sustainable economic extraction in meaningful
True.
True.
True. The passage provides a definition of non-renewable resources, which are resources that cannot be replenished or regenerated at a rate comparable to
True.

The passage defines non-renewable energy resources as energy resources that cannot sustain their consumption rate and will inevitably be empty if we continue
True.


 60% 3/5 [01:42<01:02, 31.10s/it]

True.
True.

The passage mentions that "Products may have a lower energy or higher energy than the reactants," indicating that there is a difference in
True. The energy difference between reactants and products is a key aspect of chemical reactions. This energy difference can be in the form of heat energy, light energy
True.

The passage mentions that products in a chemical reaction may have a lower or higher energy than the reactants, indicating that there is a difference
True.

The passage mentions that during a chemical reaction, reactants undergo energy changes and break their chemical bonds to form new ones and generate products
True. The energy difference between reactants and products in a chemical reaction is what determines whether a reaction is endothermic or exothermic. In an ex
True
True.
True. The passage mentions that there can be energy changes when reactants are converted to products in a chemical reaction.
FALSE

The passage mentions that energy is released when a bond

 80% 4/5 [02:41<00:41, 41.78s/it]

False.

The passage does not provide information about the location of Kaplan University.
False.
Answer: FALSE
Answer: TRUE

The passage mentions that Kaplan University Davenport is located in Davenport, Iowa, which is a state in the Mid
False.

The passage does not provide information about the location of Kaplan University.
Answer: FALSE
False.
False.

The passage does not provide information about the location of Kaplan University. It is a review of an app, and the reviewer mentions
False.

The passage does not provide information about the location of Kaplan University.
False.
Answer: FALSE
False

The passage does not provide information about the location of Kaplan University.
Answer: TRUE

The passage mentions that Kaplan University has campuses in several states, including Iowa, Indiana, Nebraska, Missouri, Wisconsin
Answer: TRUE

The passage does not explicitly state the location of all Kaplan University campuses, but it does mention "the five Iowa campuses,"
False.
Answer: FAL

100% 5/5 [03:15<00:00, 39.19s/it]


In [8]:
def get_qa_prompt(tokenizer, document, question, tokenize=False):
    user_message = f"""Answer the question based on the given document.
    Document : {document}
    Question: {question}
    Answer:"""
    messages = [
            {"role": "user", "content": user_message},
            ]
    prompt = tokenizer.apply_chat_template(
            messages, 
            tokenize=tokenize, 
            add_generation_prompt=True
            )
    return prompt


In [9]:
def make_prompt_length_under(tokenizer, data_i, threshold):
    gold_knowledge_segment = get_passage_to_text(data_i['positive_passage_segment'][0])
    question = data_i['question']
    basic_prompt = get_qa_prompt(tokenizer, gold_knowledge_segment, question, True)
    prompt_length = len(basic_prompt)
    selected_passages = [data_i['positive_passage_segment'][0]]
    if prompt_length>=threshold:
        return selected_passages
    for i in data_i['retrieved_passages']:
        passage_text = get_passage_to_text(i)
        l = len(tokenizer.tokenize(passage_text+'\n\n'))
        if prompt_length+l < threshold:
            prompt_length += l
            selected_passages.append(i)
        elif prompt_length+l == threshold:
            prompt_length += l
            selected_passages.append(i)
            break
        else:
            break
    # check
    docs = '\n\n'.join([get_passage_to_text(i) for i in selected_passages]).strip()
    prompt = get_qa_prompt(tokenizer, docs, data_i['question'], True)
    if len(prompt)>threshold:
        selected_passages.pop()
    docs = '\n\n'.join([get_passage_to_text(i) for i in selected_passages]).strip()
    prompt = get_qa_prompt(tokenizer, docs, data_i['question'], True)
    assert len(prompt)<=threshold, tokenizer.decode(prompt)
    return selected_passages, len(prompt)

# chose prompt length and select passages for the length
- because we use just 20 passages for our sample, all our data sample will be short 

In [16]:
for i in tqdm(data):
    threshold = random.sample([1024,2048,4096,8192],k=1)[0]
    i['segments'],i['length'] = make_prompt_length_under(tokenizer, i, threshold)    

100% 5/5 [00:00<00:00, 100.76it/s]


# Chosen Response Generation

In [17]:
def chosen_response_generation(tokenizer, context, question, tokenize=False):
    user_message = f"""Answer the question based on the context.
You SHOULD use all the information in the context to answer the question.
SHOULD NOT say that you answered based on the given context.
Context : {context}
Question: {question}
Answer:"""
    messages = [
            {"role": "user", "content": user_message},
            ]
    prompt = tokenizer.apply_chat_template(
            messages, 
            tokenize=tokenize, 
            add_generation_prompt=True
            )
    return prompt


In [20]:
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
for i in tqdm(data):
    question = i['question']
    context = i['segments']
    context = '\n\n'.join([get_passage_to_text(j) for j in context if j['label']==True]).strip()
    prompt = chosen_response_generation(tokenizer, context, question)
    inputs = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
    inputs = inputs.to(model.device)
    length = inputs['input_ids'].size(1)
    outputs = model.generate(**inputs, max_new_tokens=2048, streamer=streamer, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(outputs[0][length:], skip_special_tokens=True)
    i['chosen']=copy.deepcopy(response)

  0% 0/5 [00:00<?, ?it/s]

You should roast asparagus in the oven for 25 minutes at 400 degrees F, until it is tender but still crisp.


 20% 1/5 [00:03<00:15,  3.84s/it]

No, the egg yolk is not an embryo. Although it is the part of an egg that would feed the embryo if an egg were fertilized, the egg yolk itself is not the embryo. The embryo would develop from the female's genetic material, which is contained in the egg's cytoplasm, and is not specifically the yolk.


 40% 2/5 [00:13<00:21,  7.15s/it]

A non-renewable resource is a type of resource that cannot replenish or renew itself at a rate that matches its extraction and usage by humans. These resources are finite and become depleted as they are extracted and used. Examples of non-renewable resources include fossil fuels such as coal, oil, and natural gas, as well as minerals and metals. Once these resources are extracted, they cannot be replaced or regenerated within a human timeframe, making them unsustainable for long-term use.


 60% 3/5 [00:25<00:19,  9.56s/it]

The answer is (3) heat of reaction. The heat of reaction, also known as enthalpy change, is the difference in potential energy between the reactants and products in a chemical reaction. Activation energy is the energy required for a reaction to start, ionization energy is the energy required to remove an electron from an atom, and heat of vaporization is the energy required to change a substance from a liquid to a gas.


 80% 4/5 [00:35<00:09,  9.78s/it]

Kaplan University is located in the state of Iowa, as its main campus is in Davenport. However, it's important to note that Kaplan University also has a significant presence in other states, including Nebraska, Maryland, Florida, and Illinois. The main administration building is located in Fort Lauderdale, Florida, and there are additional campuses and learning centers in these states as well.


100% 5/5 [00:45<00:00,  9.08s/it]


# Rejected Response Generation

## Partial Evidence-Based response

In [22]:
for i in tqdm(data):
    question = i['question']
    context = i['segments']
    context = [get_passage_to_text(j) for j in context if j['label']==True]
    context = random.sample(context, k=1)
    prompt = chosen_response_generation(tokenizer, context, question)
    inputs = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
    inputs = inputs.to(model.device)
    length = inputs['input_ids'].size(1)
    outputs = model.generate(**inputs, max_new_tokens=2048, streamer=streamer, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(outputs[0][length:], skip_special_tokens=True)
    i['partial_evidence_based_response']=copy.deepcopy(response)

  0% 0/5 [00:00<?, ?it/s]

You should roast asparagus for 25 minutes in an oven preheated to 400 degrees F.


 20% 1/5 [00:03<00:12,  3.19s/it]

No, the egg yolk is not an embryo. It is the part of an egg that can potentially nourish an embryo with its high fat, protein, vitamin, and mineral content. The egg yolk is a major source of nutrition for the embryo if the egg is fertilized.


 40% 2/5 [00:10<00:17,  5.79s/it]

A non-renewable resource is defined as a resource that does not renew itself at a rate sufficient for sustainable economic extraction in meaningful human time-frames. This means that once used, these resources cannot be replenished or regenerated within a humanly relevant time period. An example of a non-renewable resource is coal, which is produced over millions of years, and carbon-based, organically-derived fuels.


 60% 3/5 [00:21<00:15,  7.85s/it]

is the heat of reaction.


 80% 4/5 [00:21<00:05,  5.08s/it]

Kaplan University is located in multiple states. The main administration building is in Fort Lauderdale, Florida. The university also has physical campuses in Iowa, Nebraska, and Maryland, and online student support centers in Florida and Illinois.


100% 5/5 [00:27<00:00,  5.53s/it]


# Distorted evidence-Based response

In [23]:
def get_distorted_evidence_based_response(tokenizer, context, question, tokenize=False):
    user_message = f"""The Given context is UNRELATED to the question.
    Make WRONG answer to the question based on the UNRELATED context.
    SHOULD NOT say that you answered based on the given context.
    UNRELATED Context : {context}
    Question: {question}
    WRONG Answer:"""
    messages = [
            {"role": "user", "content": user_message},
            ]
    prompt = tokenizer.apply_chat_template(
            messages, 
            tokenize=tokenize, 
            add_generation_prompt=True
            )
    return prompt


In [24]:
for i in tqdm(data):
    question = i['question']
    context = i['segments']
    context = [get_passage_to_text(j) for j in context if j['label']==True]
    context = random.sample(context, k=1)
    prompt = get_distorted_evidence_based_response(tokenizer, context, question)
    inputs = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
    inputs = inputs.to(model.device)
    length = inputs['input_ids'].size(1)
    outputs = model.generate(**inputs, max_new_tokens=2048, streamer=streamer, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(outputs[0][length:], skip_special_tokens=True)
    i['distorted_evidence_based_response']=copy.deepcopy(response)

  0% 0/5 [00:00<?, ?it/s]

Based on the given context, which is a recipe for roasting asparagus, one might assume that asparagus should be roasted for the length of time specified in the recipe. However, the question specifically asks how long to roast asparagus, not how long it is roasted in this particular recipe.

Therefore, a wrong answer to the question could be: "You should roast asparagus for 30 minutes." This answer is wrong because it contradicts the recipe provided in the given context, which specifies a roasting time of 25 minutes. However, it is important to note that this answer is not wrong because it is based on the given context, but rather because it is incorrect in general.


 20% 1/5 [00:17<01:10, 17.63s/it]

No, the egg yolk is not an embryo. It is the part of an egg that feeds the embryo and contains a high concentration of fat, protein, vitamins, and minerals. It is a major source of nutrition for the developing embryo, but it is not the embryo itself. The embryo is formed from the fertilized egg and develops in the egg white, or albumen, not in the yolk.


 40% 2/5 [00:28<00:41, 13.83s/it]

A non-renewable resource is a type of resource that, once used, cannot be replaced or reproduced in a short period of time. This includes resources like coal, which are produced over millions of years and are not able to be replenished at a rate that is sustainable for human use. An example of a non-renewable resource is a mineral deposit, which cannot be replaced once it has been extracted and used.


 60% 3/5 [00:38<00:24, 12.14s/it]

The energy difference between reactants and products in a chemical reaction is called the heat of vaporization.


 80% 4/5 [00:41<00:08,  8.33s/it]

Based on the information provided, Kaplan University is not located in just one state. The main administration building is in Florida, but the university also has campuses in Iowa, Nebraska, and Maryland. Therefore, it would be incorrect to say that Kaplan University is located in only one state. However, to provide a wrong answer that is unrelated to the context, let's say that Kaplan University is located in California.


100% 5/5 [00:51<00:00, 10.31s/it]


# make 50:50 for rejected response and make dataset for DPO

In [26]:
from transformers import AutoTokenizer
model_path = 'meta-llama/Meta-Llama-3-8B-Instruct'

In [27]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [33]:
from utils.utils import *

In [38]:
dpo_train = []
for i in data:
    context = [get_passage_to_text(j) for j in i['segments']]
    random.shuffle(context)
    context = '\n\n'.join(context)
    question = i['question']
    input = get_qa_prompt(tokenizer, context, question)
    tmp = dict()
    tmp['input']=input
    tmp['chosen']=i['chosen']
    tmp['passages']=copy.deepcopy(i['segments'])
    rejected = random.sample([i['partial_evidence_based_response'],i['distorted_evidence_based_response']],k=1)[0]
    tmp['rejected']=rejected
    dpo_train.append(tmp)

In [41]:
save_jsonl('.',dpo_train,'dpo_train')

100% 5/5 [00:00<00:00, 5717.43it/s]
