**Creating a Custom Stanford Alpaca-Style Dataset for a Model**


1.   Built on top of the Stanford Alpaca project
2.   Uses OpenAI to generate the dataset



In [1]:
##Using GPT to generate more prompts
!pip -q install openai

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
##Working around Stanford's project
# Clone the Stanford Alpaca repository into the current directory.
# NOTE: re-running this cell in the same session is not idempotent - the clone
# fails if the directory already exists and `%cd` is relative to the current
# working directory.
!git clone https://github.com/tatsu-lab/stanford_alpaca.git
# Later cells rely on this working directory: they open ./prompt.txt and
# ./seed_tasks.jsonl by relative path and `import utils`
# (presumably the repo's utils.py - verify).
%cd stanford_alpaca
# Install the dependencies listed in the repo's requirements.txt.
!pip install -r requirements.txt

Cloning into 'stanford_alpaca'...
remote: Enumerating objects: 129, done.[K
remote: Counting objects: 100% (75/75), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 129 (delta 57), reused 50 (delta 50), pack-reused 54[K
Receiving objects: 100% (129/129), 9.14 MiB | 5.31 MiB/s, done.
Resolving deltas: 100% (62/62), done.
/content/stanford_alpaca
Collecting rouge_score (from -r requirements.txt (line 2))
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fire (from -r requirements.txt (line 3))
  Downloading fire-0.5.0.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.3/88.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers>=4.28.1 (from -r requirements.txt (line 5))
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2

In [3]:
import getpass
import os
import openai

# Use a private OpenAI API key WITHOUT writing it into the notebook:
# reuse OPENAI_API_KEY when it is already set, otherwise prompt for it
# (getpass does not echo the value into the cell output).
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key: ')

# Configure the in-process client; the environment variable is kept as well,
# presumably so the `!python -m generate_instruction ...` cell can read it.
openai.api_key = os.environ['OPENAI_API_KEY']

In [4]:
##Data generation process
"""
batch_selfinstruct_generate.py
run:
python -m generate_instruction generate_instruction_following_data \
  --output_dir ./ \
  --num_instructions_to_generate 10 \
  --model_name="text-davinci-003" \
"""

import os
import json
import time
import re
import string
import random
from functools import partial
from multiprocessing import Pool
import numpy as np
import tqdm
from rouge_score import rouge_scorer
import utils
import fire

In [5]:
##Generation Process from Stanford Alpaca

def encode_prompt(prompt_instructions):
    """Encode multiple prompt instructions into a single string.

    The text of ``./prompt.txt`` is used as the header, followed by one
    numbered ``Instruction / Input / Output`` section per task (sections are
    separated by ``###``), and finally an open ``N. Instruction:`` line for the
    model to continue.

    Args:
        prompt_instructions: iterable of dicts with the string keys
            ``"instruction"``, ``"input"`` and ``"output"``.

    Returns:
        The assembled prompt string. With no instructions the prompt is just
        the header followed by ``1. Instruction:``.
    """
    # Context manager so the template file handle is always closed.
    with open("./prompt.txt") as prompt_file:
        prompt = prompt_file.read() + "\n"

    # `number` stays defined (0) when there are no tasks, so the trailing
    # line can always be numbered.
    number = 0
    for number, task_dict in enumerate(prompt_instructions, start=1):
        instruction = task_dict["instruction"]
        task_input = task_dict["input"]
        task_output = task_dict["output"]
        # Collapse all whitespace runs and drop trailing colons from the instruction.
        instruction = re.sub(r"\s+", " ", instruction).strip().rstrip(":")
        # An empty input is spelled out with an explicit placeholder.
        task_input = "<noinput>" if task_input.lower() == "" else task_input
        prompt += "###\n"
        prompt += f"{number}. Instruction: {instruction}\n"
        prompt += f"{number}. Input:\n{task_input}\n"
        prompt += f"{number}. Output:\n{task_output}\n"
    prompt += "###\n"
    prompt += f"{number + 1}. Instruction:"
    return prompt


def post_process_gpt3_response(num_prompt_instructions, response):
    """Parse one completion into a list of cleaned instruction records.

    The completion text is prefixed with the ``N. Instruction:`` line the
    prompt ended with, split on ``###`` and each chunk is parsed into its
    ``Instruction`` / ``Input`` / ``Output`` parts. Chunks that do not parse,
    or whose instruction fails one of the filters below, are dropped.

    Args:
        num_prompt_instructions: number of example tasks that were placed in
            the prompt; the first generated task is numbered one higher.
        response: mapping with the keys ``"text"`` and ``"finish_reason"``,
            or ``None``.

    Returns:
        A list of dicts with the keys ``"instruction"``, ``"input"`` and
        ``"output"`` (empty when ``response`` is ``None``).
    """
    if response is None:
        return []

    # Keywords that are not suitable for language models. Built once per call
    # instead of once per parsed chunk.
    blacklist = [
        "image",
        "images",
        "graph",
        "graphs",
        "picture",
        "pictures",
        "file",
        "files",
        "map",
        "maps",
        "draw",
        "plot",
        "go to",
        "video",
        "audio",
        "music",
        "flowchart",
        "diagram",
    ]

    raw_instructions = f"{num_prompt_instructions+1}. Instruction:" + response["text"]
    raw_instructions = re.split("###", raw_instructions)
    last_position = len(raw_instructions) - 1

    instructions = []
    for position, chunk in enumerate(raw_instructions):
        # if the decoding stops due to length, the last example is likely truncated so we discard it
        if position == last_position and response["finish_reason"] == "length":
            continue
        # Number the model is expected to have used for this task.
        task_number = position + num_prompt_instructions + 1
        # Raw f-string: `\.` and `\s` are regex escapes, not string escapes.
        # With the capture group a well-formed chunk splits into exactly 7 parts:
        # [prefix, "Instruction", text, "Input", text, "Output", text].
        splitted_data = re.split(rf"{task_number}\.\s+(Instruction|Input|Output):", chunk)
        if len(splitted_data) != 7:
            continue
        inst = splitted_data[2].strip()
        task_input = splitted_data[4].strip()
        # The placeholder used in the prompt maps back to an empty input.
        task_input = "" if task_input.lower() == "<noinput>" else task_input
        task_output = splitted_data[6].strip()
        # filter out too short or too long instructions
        word_count = len(inst.split())
        if word_count <= 3 or word_count > 150:
            continue
        # filter based on keywords that are not suitable for language models.
        if any(find_word_in_string(word, inst) for word in blacklist):
            continue
        # The model tends to add "write a program" to some existing instructions,
        # which leads to a lot of such instructions, and it is confusing whether
        # the model needs to write a program or directly output the result.
        # Note this is not a comprehensive filtering for all programming instructions.
        if inst.startswith("Write a program"):
            continue
        # filter those starting with punctuation
        if inst[0] in string.punctuation:
            continue
        # filter those starting with non-english character
        if not inst[0].isascii():
            continue
        instructions.append({"instruction": inst, "input": task_input, "output": task_output})
    return instructions


def find_word_in_string(w, s):
    """Look for `w` as a whole word inside `s`, ignoring case.

    `w` is inserted into the pattern as-is (it is not escaped). Returns the
    first `re.Match`, or `None` when there is no whole-word occurrence.
    """
    whole_word_pattern = rf"\b({w})\b"
    return re.search(whole_word_pattern, s, flags=re.IGNORECASE)


def generate_instruction_following_data(
    output_dir="./",
    seed_tasks_path="./seed_tasks.jsonl",
    num_instructions_to_generate=100,
    model_name="text-davinci-003",
    num_prompt_instructions=3,
    request_batch_size=5,
    temperature=1.0,
    top_p=1.0,
    num_cpus=16,
):
    """Generate new instruction data from seed tasks and save it as JSON.

    Seed tasks are read from a JSONL file, random subsets of them are encoded
    into prompts, the completions are parsed back into instructions, and every
    new instruction that is not too similar (ROUGE-L F-measure > 0.7) to an
    already known one is kept. The collected data is written to
    ``<output_dir>/regen.json`` after every request; if that file already
    exists its contents are loaded first and count towards the target.

    Args:
        output_dir: directory for ``regen.json`` (created if missing).
        seed_tasks_path: JSONL file; each record needs ``"instruction"`` and
            ``"instances"[0]`` with ``"input"`` and ``"output"``.
        num_instructions_to_generate: stop once this many machine-generated
            instructions have been collected.
        model_name: model name passed to ``utils.openai_completion``.
        num_prompt_instructions: number of seed tasks sampled per prompt.
        request_batch_size: number of prompts sent per request.
        temperature: sampling temperature for decoding.
        top_p: nucleus sampling parameter for decoding.
        num_cpus: worker processes used for the similarity computation.
    """
    # Context manager closes the seed file; blank lines (e.g. a trailing
    # newline) are skipped instead of crashing json.loads.
    with open(seed_tasks_path, "r") as seed_file:
        seed_tasks = [json.loads(line) for line in seed_file if line.strip()]
    seed_instruction_data = [
        {"instruction": t["instruction"], "input": t["instances"][0]["input"], "output": t["instances"][0]["output"]}
        for t in seed_tasks
    ]
    print(f"Loaded {len(seed_instruction_data)} human-written seed instructions")

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "regen.json")
    request_idx = 0
    # load the LM-generated instructions from a previous run, if any
    machine_instruction_data = []
    if os.path.exists(output_path):
        machine_instruction_data = utils.jload(output_path)
        print(f"Loaded {len(machine_instruction_data)} machine-generated instructions")

    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)

    # now let's generate new instructions!
    progress_bar = tqdm.tqdm(total=num_instructions_to_generate)
    if machine_instruction_data:
        progress_bar.update(len(machine_instruction_data))

    # first we tokenize all the seed instructions and generated machine instructions
    all_instructions = [d["instruction"] for d in seed_instruction_data] + [
        d["instruction"] for d in machine_instruction_data
    ]
    all_instruction_tokens = [scorer._tokenizer.tokenize(inst) for inst in all_instructions]

    while len(machine_instruction_data) < num_instructions_to_generate:
        request_idx += 1

        batch_inputs = []
        for _ in range(request_batch_size):
            # only sampling from the seed tasks
            prompt_instructions = random.sample(seed_instruction_data, num_prompt_instructions)
            prompt = encode_prompt(prompt_instructions)
            batch_inputs.append(prompt)
        decoding_args = utils.OpenAIDecodingArguments(
            temperature=temperature,
            n=1,
            max_tokens=3072,  # hard-code to maximize the length. the requests will be automatically adjusted
            top_p=top_p,
            stop=["\n20", "20.", "20."],
        )
        request_start = time.time()
        results = utils.openai_completion(
            prompts=batch_inputs,
            model_name=model_name,
            batch_size=request_batch_size,
            decoding_args=decoding_args,
            logit_bias={"50256": -100},  # prevent the <|endoftext|> token from being generated
        )
        request_duration = time.time() - request_start

        process_start = time.time()
        instruction_data = []
        for result in results:
            new_instructions = post_process_gpt3_response(num_prompt_instructions, result)
            instruction_data += new_instructions

        total = len(instruction_data)
        keep = 0
        for instruction_data_entry in instruction_data:
            # computing similarity with the pre-tokenized instructions
            new_instruction_tokens = scorer._tokenizer.tokenize(instruction_data_entry["instruction"])
            with Pool(num_cpus) as p:
                rouge_scores = p.map(
                    partial(rouge_scorer._score_lcs, new_instruction_tokens),
                    all_instruction_tokens,
                )
            rouge_scores = [score.fmeasure for score in rouge_scores]
            # Discard near-duplicates of anything already known.
            if max(rouge_scores) > 0.7:
                continue
            keep += 1
            # Only built for kept entries: the ten highest-scoring known instructions.
            most_similar_instructions = {
                all_instructions[i]: rouge_scores[i] for i in np.argsort(rouge_scores)[-10:][::-1]
            }
            instruction_data_entry["most_similar_instructions"] = most_similar_instructions
            instruction_data_entry["avg_similarity_score"] = float(np.mean(rouge_scores))
            machine_instruction_data.append(instruction_data_entry)
            # Later candidates in this batch are also compared against this one.
            all_instructions.append(instruction_data_entry["instruction"])
            all_instruction_tokens.append(new_instruction_tokens)
            progress_bar.update(1)
        process_duration = time.time() - process_start
        print(f"Request {request_idx} took {request_duration:.2f}s, processing took {process_duration:.2f}s")
        print(f"Generated {total} instructions, kept {keep} instructions")
        # Persist after every request so an interrupted run can resume.
        utils.jdump(machine_instruction_data, output_path)

    progress_bar.close()


In [6]:
##Importing Data
# Load the released Alpaca dataset that ships with the cloned repository.
# `data` is not referenced again by the visible cells before it is reassigned
# further down.

import json

file_path = "/content/stanford_alpaca/alpaca_data.json"

# JSON text is UTF-8; state it explicitly instead of relying on the platform default.
with open(file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [7]:
##Seeding Tasks
seed_tasks_path="./seed_tasks.jsonl"

# One JSON object per line. The context manager closes the file, and blank
# lines (e.g. a trailing newline) are skipped rather than passed to json.loads.
with open(seed_tasks_path, "r") as seed_tasks_file:
    seed_tasks = [json.loads(line) for line in seed_tasks_file if line.strip()]

# Flatten each task to its instruction plus the first instance's input/output.
seed_instruction_data = [
    {"instruction": t["instruction"], "input": t["instances"][0]["input"], "output": t["instances"][0]["output"]}
    for t in seed_tasks
]
print(f"Loaded {len(seed_instruction_data)} human-written seed instructions")

Loaded 175 human-written seed instructions


In [8]:
##Creating a new (custom) set of seed tasks
# The custom set starts from a fresh list holding the first three loaded seed tasks.

new_seed_tasks = list(seed_tasks[:3])
new_seed_tasks

[{'id': 'seed_task_0',
  'name': 'breakfast_suggestion',
  'instruction': "Is there anything I can eat for a breakfast that doesn't include eggs, yet includes protein, and has roughly 700-1000 calories?",
  'instances': [{'input': '',
    'output': 'Yes, you can have 1 oatmeal banana protein shake and 4 strips of bacon. The oatmeal banana protein shake may contain 1/2 cup oatmeal, 60 grams whey protein powder, 1/2 medium banana, 1tbsp flaxseed oil and 1/2 cup watter, totalling about 550 calories. The 4 strips of bacon contains about 200 calories.'}],
  'is_classification': False},
 {'id': 'seed_task_1',
  'name': 'antonym_relation',
  'instruction': 'What is the relation between the given pairs?',
  'instances': [{'input': 'Night : Day :: Right : Left',
    'output': 'The relation between the given pairs is that they are opposites.'}],
  'is_classification': False},
 {'id': 'seed_task_2',
  'name': 'one_sentence_description',
  'instruction': 'Generate a one-sentence description for ea

In [9]:
##New Data (relevant to company)
# Custom seed tasks as a JSON array, in the same record layout as the loaded
# seed tasks: id, name, instruction, instances[{input, output}], is_classification.
json_new_tasks = '''
[
    {
        "id": "new_seed_task_3",
        "name": "insurance_policies_offered",
        "instruction": "What types of insurance policies do you offer?",
        "instances": [{"input": "",
                      "output": "We offer a wide range of insurance policies, including life insurance, health insurance, motor insurance, travel insurance, and more. Our policies are designed to meet various coverage needs."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_4",
        "name": "file_claim_insurance",
        "instruction": "How can I file a claim for my car insurance?",
        "instances": [{"input": "",
                      "output": "To file a claim for your car insurance, please reach out to our claims department at [contact information]. They will guide you through the claims process and assist you with the necessary documentation."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_5",
        "name": "factors_premium_policy",
        "instruction": "What factors affect the premium amount for a health insurance policy?",
        "instances": [{"input": "",
                      "output": "Several factors impact the premium amount for a health insurance policy, including your age, medical history, coverage type, sum insured, and any additional riders you choose. These factors are taken into account while calculating your premium."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_6",
        "name": "compare_insurance_plans",
        "instruction": "Can you help me compare different life insurance plans?",
        "instances": [{"input": "",
                      "output": "Absolutely! Our team of experts can assist you in comparing various life insurance plans based on factors like coverage, premium, benefits, and riders. Please provide us with some details, and we will guide you through the comparison process."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_7",
        "name": "claim_settlement_ratio",
        "instruction": "What is the claim settlement ratio of your company?",
        "instances": [{"input": "",
                      "output": "We have a high claim settlement ratio, which indicates our commitment to efficiently settling valid claims. Our team strives to ensure a smooth and hassle-free claim process for our customers."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_8",
        "name": "documents_motor_insurance",
        "instruction": "What documents are required to purchase a motor insurance policy?",
        "instances": [{"input": "",
                      "output": "To purchase a motor insurance policy, you will typically need documents such as vehicle registration certificate, driving license, address proof, and previous insurance details (if any). Our team can provide you with a detailed checklist based on your specific requirements."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_9",
        "name": "great_experience",
        "instruction": "I had a great experience with [Company]. The customer support was prompt and helpful. Thank you!",
        "instances": [{"input": "",
                      "output": "Thank you for your kind words! We're thrilled to hear that you had a positive experience with our customer support team. We strive to provide excellent service and are here to assist you whenever you need."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_10",
        "name": "poor_experience",
        "instruction": "I am extremely dissatisfied with the claim process. It was confusing and time-consuming. I expected better service.",
        "instances": [{"input": "",
                      "output": "We apologize for the inconvenience you faced during the claim process. We understand your frustration, and we appreciate your feedback. We value your experience, and we'll use this feedback to improve our processes and ensure smoother claim handling in the future."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_11",
        "name": "poor_feedback",
        "instruction": "The premium for my car insurance policy increased significantly during the renewal. I wasn't informed about this in advance, and it came as a surprise. Disappointed with the lack of transparency.",
        "instances": [{"input": "",
                      "output": "We apologize for the inconvenience caused due to the sudden increase in the premium. We understand your concerns regarding transparency, and we appreciate your feedback. We'll review our communication processes to ensure that customers are informed about any changes in premiums well in advance."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_12",
        "name": "life_insurance_policy",
        "instruction": "Can you provide details about your term life insurance policy?",
        "instances": [{"input": "",
                      "output": "Our term life insurance policy offers financial protection to your loved ones in the event of your untimely demise. It provides a lump sum payout to the nominee if the insured passes away during the policy term. The policy can be customized based on your coverage needs, and it offers affordable premiums."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_13",
        "name": "health_insurance_policy",
        "instruction": "What are the key features of your health insurance plans?",
        "instances": [{"input": "",
                      "output": "Our health insurance plans provide coverage for medical expenses incurred due to illnesses, accidents, hospitalization, and more. Key features include cashless treatment at network hospitals, coverage for pre and post-hospitalization expenses, optional add-ons like critical illness cover, and access to a wide network of healthcare providers."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_14",
        "name": "motor_insurance_policy",
        "instruction": "Tell me about your motor insurance policies. What types of vehicles do you cover?",
        "instances": [{"input": "",
                      "output": "Our motor insurance policies cover a range of vehicles, including cars, two-wheelers, commercial vehicles, and more. The policies provide protection against damages, theft, and third-party liabilities. We offer comprehensive coverage options tailored to suit your specific vehicle type and requirements."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_15",
        "name": "travel_insurance_policy",
        "instruction": "What benefits does your travel insurance policy offer?",
        "instances": [{"input": "",
                      "output": "Our travel insurance policy provides coverage for various travel-related risks such as trip cancellation, medical emergencies, lost baggage, and personal liability. It offers assistance services during emergencies, 24/7 customer support, and coverage for both domestic and international travel."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_16",
        "name": "riders_available",
        "instruction": "Can you explain the different riders available with your life insurance policies?",
        "instances": [{"input": "",
                      "output": "We offer several riders that can enhance the coverage of your life insurance policy. Some common riders include critical illness cover, accidental death benefit, waiver of premium, and income benefit rider. These riders provide additional financial protection and can be customized based on your needs."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_17",
        "name": "interested_health_insurance",
        "instruction": "I'm interested in purchasing a health insurance policy. Can you provide me with information about the coverage options?",
        "instances": [{"input": "",
                      "output": "Of course! We offer a variety of health insurance coverage options tailored to meet your needs. Our policies provide coverage for hospitalization expenses, pre and post-hospitalization, day care procedures, and more. We can help you choose the right coverage based on your requirements and budget."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_18",
        "name": "assistance_filing_claim",
        "instruction": "I need assistance with filing a claim for my car insurance. What is the process?",
        "instances": [{"input": "",
                      "output": "We're here to help! To file a claim for your car insurance, please gather all the necessary documentation, including the claim form, supporting evidence, and relevant invoices. You can then submit the claim through our online portal or contact our claims department directly. Our team will guide you through the process and assist you in resolving any queries."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_19",
        "name": "change_policy_terms",
        "instruction": "I received an email regarding a change in my policy terms. Can you clarify the details?",
        "instances": [{"input": "",
                      "output": "Certainly! We understand your concern. Please forward us the email or provide us with the relevant details from the communication you received. Our team will review the changes and provide you with a detailed explanation to address any confusion or queries you may have."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_20",
        "name": "update_contact_information",
        "instruction": "I need to update my contact information on my insurance policy. How can I proceed?",
        "instances": [{"input": "",
                      "output": "We appreciate your request to update your contact information. To ensure a smooth process, please share your policy details and the updated contact information with us. You can do this by reaching out to our customer support team through our helpline or by visiting our website. We'll update your information promptly and confirm the changes."}],
        "is_classification": false
    }
]
'''

In [10]:
# `json_new_tasks` is only a JSON string until it is parsed; without this step
# the seed file would contain nothing but the three tasks in `new_seed_tasks`.
custom_seed_tasks = json.loads(json_new_tasks)

# Build a fresh combined list so re-running this cell never duplicates entries.
combined_seed_tasks = new_seed_tasks + custom_seed_tasks

# JSON Lines: one task object per line.
with open('new_seed_tasks.jsonl', 'w') as outfile:
    for task_dict in combined_seed_tasks:
        json.dump(task_dict, outfile)
        outfile.write('\n')

In [None]:
##Generation using OpenAI
# Ensure to check plan with OpenAI to run the same
!mkdir new_tasks
!python -m generate_instruction generate_instruction_following_data --output_dir ./new_tasks/ --seed_tasks_path ./new_seed_tasks.jsonl --num_instructions_to_generate 1 --num_prompt_instructions 3 --request_batch_size 2 --num_cpus 4

In [None]:
# Same generation step, driven by the function defined in this notebook.
# It reads ./new_seed_tasks.jsonl and writes regen.json under ./new_tasks/.
generation_settings = {
    "output_dir": "./new_tasks/",
    "seed_tasks_path": "./new_seed_tasks.jsonl",
    "num_instructions_to_generate": 5,
    "model_name": "text-davinci-003",
    "num_prompt_instructions": 3,
    "request_batch_size": 2,
    "temperature": 1.0,
    "top_p": 1.0,
    "num_cpus": 4,
}
generate_instruction_following_data(**generation_settings)

In [None]:
import json

# Read back the generated instructions.
file_path = "/content/stanford_alpaca/new_tasks/regen.json"

with open(file_path, 'r') as json_file:
    data = json.load(json_file)

# Strip the ROUGE bookkeeping fields so only the instruction records remain.
# Missing keys are ignored.
for dictionary in data:
    for similarity_key in ('most_similar_instructions', 'avg_similarity_score'):
        dictionary.pop(similarity_key, None)


print(data)

Credits: Sam Witteveen