In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline
)
from peft import PeftModel
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd

In [None]:
import os

# Do not paste the Hugging Face token into the notebook: export HF_TOKEN in the
# shell (or kernel environment) before starting Jupyter instead.
# `setdefault` keeps a token that is already exported; an unconditional
# assignment here would overwrite it with an empty string.
os.environ.setdefault('HF_TOKEN', "")

In [None]:
# Placement strategy passed as `device_map` when the base model is loaded.
device_map = "auto"

# Directory containing the fine-tuned adapter that is applied on top of the base model.
new_model = "path-to-fine-tuned-model-dir"

# Base checkpoint identifier (alternative base checkpoint: "meta-llama/Llama-3.2-1B").
model_name = "jinaai/ReaderLM-v2"

# Repository name without the "org/" prefix; used to build the scratch file names.
mode_name_wo_company = model_name.split("/")[1]

In [None]:
# Load the base checkpoint in half precision.
base_load_kwargs = dict(
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
base_model = AutoModelForCausalLM.from_pretrained(model_name, **base_load_kwargs)

# Wrap the base model with the fine-tuned adapter, merge the adapter into it
# (dropping the PEFT wrapper), then cast the merged model to float32.
model = (
    PeftModel.from_pretrained(base_model, new_model)
    .merge_and_unload()
    .to(dtype=torch.float32)
)

# Tokenizer of the base checkpoint, padding on the left and reusing the
# end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side='left',
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Dataset repository holding the HTML job-listing extraction examples.
html_extraction_dataset_name = "Jiraya/html_job_extraction_dataset"

# Load the dataset; the bare name on the last line displays its summary as the cell output.
html_extraction_dataset = load_dataset(path=html_extraction_dataset_name)
html_extraction_dataset

In [None]:
# Instruction template for the extraction task. It is filled in with
# `prompt.format(<html>)`: the doubled braces `{{` / `}}` are escaped literal
# braces of the example JSON, and the lone `{}` near the end is the single
# placeholder that receives the HTML document.
# NOTE(review): "desciptive" below is a typo, left untouched because the text is
# sent to the model verbatim; presumably the fine-tuned adapter was trained with
# the same wording -- confirm before correcting it.
prompt = """I am providing you with an HTML. I want you to extract Job Title, Job Location, Job ID and Job Link in a JSON format. I want you to respond only with a JSON, no descriptive text. There are exactly 10 Job Title, Job Location, Job ID and Job Link that you have to extract from the provided HTML.
Here is the expected JSON structure:
[
    {{
        'Job Title':
        'Job Location':
        'Job ID':
        'Job Link':
    }},
    {{
        'Job Title':
        'Job Location':
        'Job ID':
        'Job Link':
    }},
...
]

Again, only respond with a JSON and no desciptive text. If you do not spot job links in the HTML provided then just return an empty JSON.
Here is the HTML I want to extract Job Title, Job Location, Job ID and Job Link from,

{}
"""

In [None]:
# Evaluation split: add a 'formatted' column holding the prompt template filled
# with each example's raw HTML.
unseen_split = html_extraction_dataset['unseen_test']
val_dataset = unseen_split.map(
    lambda example: {'formatted': prompt.format(example["input_html"])}
)
val_dataset

In [None]:
# Run the merged model over the evaluation prompts, resuming from a previous
# partial run and checkpointing the collected outputs to parquet.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

RESUME_PATH = f"../scratch/{mode_name_wo_company}_unseen_output_v2.parquet"  # earlier partial run
OUTPUT_PATH = f"../scratch/{mode_name_wo_company}_unseen_output_v3.parquet"  # results of this run
RESUME_ROWS = 46 + 105   # number of rows of the earlier run that are reused
CHECKPOINT_EVERY = 100   # write OUTPUT_PATH whenever the row index is a multiple of this

# Read each column once instead of re-reading it from the dataset on every
# iteration and every checkpoint.
formatted_inputs = list(val_dataset['formatted'])
input_html = list(val_dataset["input_html"])
openai_extracted = list(val_dataset["extracted_output"])

# Reuse the outputs of the earlier run when they exist; otherwise start from scratch.
try:
    pred_output = list(pd.read_parquet(RESUME_PATH)["model_output"])[:RESUME_ROWS]
except FileNotFoundError:
    pred_output = []


def _save_checkpoint(n_rows):
    """Write the first `n_rows` model outputs, with their HTML and reference extraction, to OUTPUT_PATH."""
    pd.DataFrame({
        "model_output": pred_output[:n_rows],
        "html": input_html[:n_rows],
        "openai_extracted": openai_extracted[:n_rows],
    }).to_parquet(OUTPUT_PATH)


for i in tqdm(range(len(pred_output), len(formatted_inputs)), desc="Extracting Information: "):
    try:
        # NOTE(review): with the pipeline defaults, "generated_text" appears to
        # contain the prompt followed by the completion -- confirm that the
        # downstream parsing expects this.
        output = generator(formatted_inputs[i], max_new_tokens=2000, do_sample=True)[0]["generated_text"]
    except Exception as exc:
        # Keep the row aligned with the dataset and report the failure.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt stop the run.
        output = None
        print(f"{i}: generation failed: {exc!r}")
    pred_output.append(output)
    if i % CHECKPOINT_EVERY == 0:
        _save_checkpoint(i + 1)

# Final save: without it, rows produced after the last periodic checkpoint are lost.
# Skipped when the resumed outputs outnumber the dataset rows, because the
# columns would then have mismatched lengths.
if len(pred_output) <= len(input_html):
    _save_checkpoint(len(pred_output))