# Fixed Prompt Generation

In [1]:
# One hand-written example record used to exercise the prompts in this notebook.
# NOTE(review): the field names look like the UCI Adult census-income schema,
# where "?" would mark a missing value — confirm against the data source.
sample_data = {
    "age": 90,
    "workclass": "?",
    "fnlwgt": 77053,
    "education": "HS-grad",
    "education.num": 9,
    "marital.status": "Widowed",
    "occupation": "?",
    "relationship": "Not-in-family",
    "race": "White",
    "sex": "Female",
    "capital.gain": 0,
    "capital.loss": 4356,
    "hours.per.week": 40,
    "native.country": "United-States",
}

# Name of the field the model is asked to predict for the record above.
sample_label = "income"

# A prediction for this sample counts as correct when it does not exceed this value.
INCOME_THRESHOLD = 50000


def check_answer(x):
    """Return True when the predicted income ``x`` is at most INCOME_THRESHOLD.

    Args:
        x: The model's predicted value. It is converted with ``int`` first,
            so integers and integer-like strings such as ``"42000"`` work.

    Returns:
        bool: ``int(x) <= INCOME_THRESHOLD``.

    Raises:
        ValueError, TypeError: Propagated from ``int`` when ``x`` cannot be
            converted (for example ``"<=50K"`` or ``None``).
    """
    return int(x) <= INCOME_THRESHOLD

In [None]:
def fixed_prompt(data, target_label):
    """Build the fixed prediction prompt for a single record.

    Args:
        data: The record shown to the model; it is interpolated into the
            prompt using its default string formatting.
        target_label: Name of the field to predict. It is also the key the
            model is told to use in its JSON answer.

    Returns:
        str: The prompt text, which ends with a "JSON Output:" cue line
        followed by a newline.
    """
    sections = (
        f"Based on the input data, predict the value for '{target_label}'.",
        "Input data:",
        f"{data}",
        f"Output ONLY a JSON object with the key '{target_label}' and its predicted value. Do not include any other text, explanations, or units.",
        "JSON Output:",
        "",  # trailing empty piece yields the final newline
    )
    return "\n".join(sections)

In [None]:
# Render the fixed prompt for the example record and show exactly what the
# model will receive (print keeps the line breaks readable).
sample_prompt = fixed_prompt(data=sample_data, target_label=sample_label)
print(sample_prompt)

# Model Sampling

In [2]:
from dotenv import load_dotenv
import os

# Load variables from a .env file into the process environment, then read the
# Hugging Face token from it. access_token is None when the variable is unset.
load_dotenv()
access_token = os.environ.get("HUGGINGFACE_ACCESS_TOKEN")

In [3]:
import torch
from transformers import pipeline, BitsAndBytesConfig
import warnings

# Suppress all Python warnings from this point on: no category, message or
# module filter is given, so library deprecation notices are hidden as well.
warnings.filterwarnings("ignore")

In [4]:
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

# Extra keyword arguments handed to the pipeline as model_kwargs: ask for
# 8-bit weights through a BitsAndBytesConfig.
config = dict(quantization_config=BitsAndBytesConfig(load_in_8bit=True))

# Text-generation pipeline for MODEL_NAME placed on the CUDA device; the
# access token is passed along for the model download.
generator = pipeline(
    task="text-generation",
    model=MODEL_NAME,
    token=access_token,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
    model_kwargs=config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda


In [None]:
def predict_with_llm(prompt):
    """Generate one short completion for ``prompt`` with the global ``generator``.

    Args:
        prompt: Text handed to the pipeline unchanged.

    Returns:
        The ``generated_text`` entry of the first sequence the pipeline
        returns. The call requests at most 40 new tokens, a single sequence,
        and ``return_full_text=False``.
    """
    generation_options = {
        "max_new_tokens": 40,
        "num_return_sequences": 1,
        "return_full_text": False,
        # The tokenizer's EOS id is supplied as the padding id.
        "pad_token_id": generator.tokenizer.eos_token_id,
    }
    first_sequence = generator(prompt, **generation_options)[0]
    return first_sequence["generated_text"]

In [None]:
import json
import time

# Score the fixed prompt: keep sampling until NUM_SCORED_SAMPLES generations
# have been parsed and checked, printing the running accuracy after each one.
NUM_SCORED_SAMPLES = 25
# Hard cap on the total number of generations, so a model that never produces
# a usable JSON answer cannot keep this loop spinning forever.
MAX_GENERATIONS = 10 * NUM_SCORED_SAMPLES

attempt = 0      # generations that were parsed and scored
result = 0       # scored generations that check_answer judged correct
generations = 0  # every generation requested, usable or not

while attempt < NUM_SCORED_SAMPLES and generations < MAX_GENERATIONS:
    generations += 1
    start_time = time.perf_counter()
    # Errors raised by the generation call itself are not caught: blindly
    # retrying them would hide real failures behind an endless loop.
    generated_text = predict_with_llm(sample_prompt)
    try:
        predicted = json.loads(generated_text)
        is_correct = check_answer(predicted[sample_label])
    except (KeyError, TypeError, ValueError):
        # Unusable answer: not valid JSON (json.JSONDecodeError is a
        # ValueError), target key missing, or a value check_answer cannot
        # convert. Skip it without scoring and sample again.
        continue

    end_time = time.perf_counter()
    elapsed_time = end_time - start_time
    result += int(is_correct)
    attempt += 1

    print(f"Elapsed Time: {elapsed_time:.2f}s")
    print("Generated Text:", generated_text)
    print("Accuracy:", result / attempt)
    print()

if attempt < NUM_SCORED_SAMPLES:
    print(
        f"Stopped after {generations} generations with only "
        f"{attempt} usable answers out of {NUM_SCORED_SAMPLES} requested."
    )

# Baseline Test

In [4]:
# Baseline pipeline: same checkpoint, loaded without a quantization config or
# an explicit torch dtype. This rebinds the MODEL_NAME and generator names.
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

generator = pipeline(
    task="text-generation",
    model=MODEL_NAME,
    token=access_token,
    device_map="cuda",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda


In [16]:
def get_prompt(x):
    """Ask the global ``generator`` to continue ``x`` and return the new text.

    Args:
        x: Instruction text handed to the pipeline unchanged.

    Returns:
        The ``generated_text`` entry of the first returned sequence. The call
        requests a single sequence of at most 1024 new tokens with
        ``return_full_text=False``.
    """
    sequences = generator(
        x,
        max_new_tokens=1024,
        num_return_sequences=1,
        return_full_text=False,
        # The tokenizer's EOS id is supplied as the padding id.
        pad_token_id=generator.tokenizer.eos_token_id,
    )
    first_sequence, *_ = sequences
    return first_sequence["generated_text"]

In [20]:
# Instruction asking the model to turn the example record into a sentence;
# the record is embedded using its default string formatting.
base_prompt = (
    "Serialize the following data into sentence. "
    "Do not include any other outputs not related to the data. \n"
    f"Data:\n {sample_data}"
)
# Text the model generates in response to that instruction.
new_prompt = get_prompt(base_prompt)

In [21]:
# Print the text the model generated for base_prompt.
print(new_prompt)




## Step 1: Identify the key components of the data that need to be serialized into a sentence.
The data contains various attributes such as age, workclass, education, marital status, occupation, and others that need to be included in the sentence.

## Step 2: Determine the order of the attributes to include in the sentence.
The sentence should start with the age, followed by the workclass, education, marital status, occupation, and then the remaining attributes.

## Step 3: Replace the missing values in the sentence.
The workclass and occupation are marked as '?' and should be replaced with a suitable phrase, such as "unknown" or "not specified".

## Step 4: Serialize the data into a sentence.
Here is the serialized data: "At 90 years old, the individual had an unknown workclass, completed high school, was widowed, had an unknown occupation, was not in a family relationship, identified as White, female, had no capital gain, experienced a capital loss of $4356, worked 40 hours a week