In [1]:
import urllib.request
import json


def query_model(prompt, model="llama3", url="http://localhost:11434/api/chat"):
    """Send a single-turn chat prompt to a chat endpoint and return the reply text.

    The request carries one ``user`` message and fixed sampling options
    (``seed=123``, ``temperature=0``). The response body is read as
    newline-delimited JSON and the ``message.content`` field of every
    object is concatenated in arrival order.

    NOTE(review): the default URL and model name suggest a local Ollama
    server; confirm before pointing this at a different backend.

    Args:
        prompt: Text sent as the content of the single user message.
        model: Model identifier placed in the request payload.
        url: Full URL of the chat endpoint to POST to.

    Returns:
        The concatenated reply text (empty string if the body has no lines).

    Raises:
        RuntimeError: If a response object contains an ``"error"`` field.
        KeyError: If a response object has neither ``"error"`` nor
            ``message.content``.
        json.JSONDecodeError: If a non-blank response line is not valid JSON.
    """
    # Create the data payload as a dictionary
    data = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "options": {
            "seed": 123,
            "temperature": 0,
        }
    }

    # Convert the dictionary to a JSON formatted string and encode it to bytes
    payload = json.dumps(data).encode("utf-8")

    # Create a request object, setting the method to POST and adding necessary headers
    request = urllib.request.Request(url, data=payload, method="POST")
    request.add_header("Content-Type", "application/json")

    # Collect the pieces in a list and join once instead of growing a string.
    chunks = []
    with urllib.request.urlopen(request) as response:
        # The response object yields the body one line at a time.
        for raw_line in response:
            line = raw_line.decode("utf-8").strip()
            if not line:
                # Tolerate blank separator lines instead of failing in json.loads.
                continue
            response_json = json.loads(line)
            if "error" in response_json:
                # Report the server's own message rather than a KeyError on "message".
                raise RuntimeError(
                    f"Chat endpoint returned an error: {response_json['error']}"
                )
            chunks.append(response_json["message"]["content"])

    return "".join(chunks)
# result = query_model("What do Llamas eat?")
# print(result)

In [2]:
from datasets import load_dataset

# Load the "all-processed" configuration of the lavita/medical-qa-datasets
# dataset, then hold out 0.5% of its train split as a small test set.
# The split is seeded so that re-running the notebook selects the same rows,
# and the raw dataset keeps its own name so this cell can be re-run safely.
raw_dataset = load_dataset("lavita/medical-qa-datasets", 'all-processed')
data = raw_dataset['train'].train_test_split(test_size=0.005, seed=123)

In [3]:
# Show the split summary (split names, feature columns, row counts).
data

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', '__index_level_0__'],
        num_rows: 238160
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', '__index_level_0__'],
        num_rows: 1197
    })
})

In [4]:
def format_input(entry):
    """Build the instruction prompt text for one dataset record.

    Args:
        entry: Mapping with an ``"instruction"`` key and an ``"input"`` key.

    Returns:
        A string made of a fixed preamble followed by an
        ``### Instruction:`` section, plus an ``### Input:`` section when
        ``entry["input"]`` is truthy.

    Raises:
        KeyError: If ``"instruction"`` or ``"input"`` is missing from ``entry``.
    """
    instruction_text = (
        "Below is an instruction that describes a task. Write a response that "
        "appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    # Records with an empty (or None) input get no "### Input" section at all.
    input_text = f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""

    return instruction_text + input_text

In [5]:
# import random

# for entry in data['test'][:2]:
    
#     politeness = random.choice(["polite", "impolite"])    
#     print(entry)
    # prompt = (
    #     f"Given the input `{format_input(entry)}` "
    #     f"and correct output `{entry['output']}`, "
    #     f"slightly rewrite the output to be more {politeness}."
    #     "Keep the modification minimal."
    #     "Only return return the generated response and nothing else."
    # )
    # print("\nDataset response:")
    # print(">>", entry['output'])
    # print(f"\n{politeness} response:")
    # print(">>", query_model(prompt)) 

In [6]:
import random

# Preview the rewrite prompt on a few test records (indices 8-15) before
# running the full generation pass.
for i in range(8, 16):
    # Fetch the record once instead of re-indexing the dataset for every field.
    entry = data['test'][i]

    politeness = random.choice(["polite", "impolite"])
    # Adjacent literals are concatenated as-is, so each sentence carries its
    # own trailing space; this is the same wording generate_model_responses uses.
    prompt = (
        f"Given the input `{format_input(entry)}` "
        f"and correct output `{entry['output']}`, "
        f"slightly rewrite the output to be more {politeness}. "
        "Keep the modification minimal. "
        "Only return the generated response and nothing else."
    )
    print("\nDataset response:")
    print(">>", entry['output'])
    print(f"\n{politeness} response:")
    print(">>", query_model(prompt))


Dataset response:
>> hi. i can understand your concern.  chronic cough is commonly seen in bronchitis and lung infection. get a chest x-ray done if it is normal, no need to worry about lung infection. possibility of bronchitis is more in your case. so better to consult pulmonologist and get done clinical examination of respiratory system and pft (pulmonary function test). pft is needed for the diagnosis of bronchitis. it will also tell you about severity of the disease and treatment of bronchitis is based on severity only. you may need inhaled bronchodilators and inhaled corticosteroid (ics)don't worry, you will be alright. hope i have solved your query. wish you good health. thanks.

impolite response:
>> Here is the rewritten output:

`listen up. chronic cough is a common symptom of bronchitis and lung infections. get a chest x-ray done already, if it's normal, stop worrying about lung infection. your case looks more like bronchitis to me. consult a pulmonologist and get a clinical 

In [7]:
import random
from tqdm import tqdm

def generate_model_responses(json_data, split="test"):
    """Build preference pairs by asking the model to rewrite each reference answer.

    For every record in ``json_data[split]`` a direction ("polite" or
    "impolite") is drawn at random and the model is asked to minimally
    rewrite the record's ``output`` in that direction. A polite rewrite
    becomes the ``chosen`` answer with the original as ``rejected``; an
    impolite rewrite becomes ``rejected`` with the original as ``chosen``.

    Args:
        json_data: Mapping from split name to an iterable of records, each
            providing ``"instruction"``, ``"input"`` and ``"output"``.
        split: Name of the split to process. Defaults to ``"test"``.

    Returns:
        A list with one dict per record, holding the keys ``instruction``,
        ``input``, ``output``, ``chosen`` and ``rejected``.
    """
    updated_data = []
    for entry in tqdm(json_data[split], desc="Writing entries"):
        politeness = random.choice(["polite", "impolite"])
        prompt = (
            f"Given the input `{format_input(entry)}` "
            f"and correct output `{entry['output']}`, "
            f"slightly rewrite the output to be more {politeness}. "
            "Keep the modification minimal. "
            "Only return the generated response and nothing else."
        )
        response = query_model(prompt)

        # The polite variant is always the preferred side of the pair.
        if politeness == "polite":
            chosen, rejected = response, entry["output"]
        else:
            chosen, rejected = entry["output"], response

        updated_data.append({
            "instruction": entry["instruction"],
            "input": entry["input"],
            "output": entry["output"],
            "chosen": chosen,
            "rejected": rejected,
        })

    return updated_data

In [8]:
# generate_model_responses(data)

In [9]:
# Generate the preference records for the held-out split; per the recorded
# progress bar this made 1197 model calls and took a little over two hours.
processed_data = generate_model_responses(data)

# Persist the records as indented JSON next to the notebook.
with open("instruction-data-with-preference-1.json", "w") as out_file:
    json.dump(processed_data, fp=out_file, indent=4)

Writing entries: 100%|███████████████████████████████████████████████████████████| 1197/1197 [2:09:33<00:00,  6.49s/it]
