In [6]:
from dotenv import dotenv_values
CONFIG = dotenv_values(".env")
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"

import pandas as pd
import numpy as np
import re
import json
import uuid
from tqdm.auto import tqdm

from transformers import AutoTokenizer, LlamaForCausalLM
import transformers
import torch
import warnings

from huggingface_hub import login
login(CONFIG['HF'])

warnings.filterwarnings("ignore")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/paul/.cache/huggingface/token
Login successful


In [7]:
#MODEL = "meta-llama/Llama-2-70b-chat-hf"
MODEL = "meta-llama/Llama-2-13b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = LlamaForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.float16,
    #load_in_4bit=True,
    #load_in_8bit=True,
    device_map="auto",
    trust_remote_code=True,
)
model.tie_weights()

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

## Pipeline

In [8]:
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

In [9]:
## This is my API Call
def generate_question(instruction):
    output = pipeline(
        instruction,
        do_sample=True,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=1024,
        temperature=1,
        top_p=0.95,
        top_k=1,
        return_full_text=False
    )
    return output[0]['generated_text']

## Load Data

In [10]:
#wikis = []
#for i in range(10):
#    _df = pd.read_parquet(f"./data/wiki_{i}_1k.parquet")
#    wikis.append(_df)
#df = pd.concat(wikis).drop_duplicates(subset=['page_id']).reset_index(drop=True)

In [11]:
df = pd.read_parquet("./data/wiki_20k.parquet")

In [12]:
def extract_data(text):
    question = re.findall(r'Question: (.*)\n', text)
    choice_a = re.findall(r'A[\.|\:|\)] (.*)\n', text)
    choice_b = re.findall(r'B[\.|\:|\)] (.*)\n', text)
    choice_c = re.findall(r'C[\.|\:|\)] (.*)\n', text)
    choice_d = re.findall(r'D[\.|\:|\)] (.*)\n', text)
    choice_e = re.findall(r'E[\.|\:|\)] (.*)\n', text)
    answer = re.findall(r'Answer: ([A-E])', text)

    assert len(question) > 0, "Question is missing"
    assert len(choice_a) > 0, "Choice A is missing"
    assert len(choice_b) > 0, "Choice B is missing"
    assert len(choice_c) > 0, "Choice C is missing"
    assert len(choice_d) > 0, "Choice D is missing"
    assert len(choice_e) > 0, "Choice E is missing"
    assert len(answer) > 0, "Answer E is missing"

    return {
        "prompt": question[0],
        "A": choice_a[0],
        "B": choice_b[0],
        "C": choice_c[0],
        "D": choice_d[0],
        "E": choice_e[0],
        "answer": answer[0]
    }

## Generate Questions

In [13]:
model_type = MODEL.split('/')[1].lower().replace('-', '_')
save_path = f"./{model_type}_questions"

if not os.path.exists(save_path):
    os.makedirs(save_path)

generated_data = []

for r in tqdm(df.itertuples(), total=len(df)):
    try:
        output = generate_question(r.instruction)
        data = extract_data(output)
        data['text'] = r.text
        data['title'] = r.title
        data['stem_label'] = r.stem_label
        data['page_id'] = r.page_id
        data['instruction'] = r.instruction
        
        data_id = str(uuid.uuid4())

        data['id'] = data_id
        data['model'] = model_type

        generated_data.append(data)
        
        ## Write to save our precious work
        with open(f"{save_path}/{data_id}_{model_type}.json", "w") as file:
            file.write(json.dumps(data))
    except:
        continue

generated_data = pd.DataFrame(generated_data)


  0%|          | 0/18814 [00:00<?, ?it/s]

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


In [None]:
generated_data.to_parquet(f"{model_type}_questions.parquet")

In [None]:
#for r in generated_data.itertuples():
#    print(r.text)
#    print()
#    print(f"Prompt: {r.prompt}")
#    print(f"A: {r.A}")
#    print(f"B: {r.B}")
#    print(f"C: {r.C}")
#    print(f"D: {r.D}")
#    print(f"E: {r.E}")
#    print(f"Answer: {r.answer}")
#    print("=" * 100)