In [1]:
from llama_index.llms.openai import OpenAI

# Read the OpenAI API key from a local file kept outside the notebook.
# strip() removes the trailing newline (and any stray whitespace) that text
# files usually end with; without it the newline becomes part of the key.
with open("../keys/openai.txt", "r") as f:
    api_key = f.read().strip()

# `model` is also reused further down to name the output directory.
model = "gpt-3.5-turbo-0125"
# temperature=0.0 asks for the least random sampling the API offers.
llm_openai = OpenAI(model=model, api_key=api_key, temperature=0.0)


In [10]:
from enum import Enum
from typing import List

from pydantic import BaseModel, ConfigDict

class Label(str, Enum):
    # Closed set of categories a music entity can be tagged with.
    # Mixing in `str` makes every member a plain string as well as an enum
    # member, so members compare equal to their string values.
    title = 'title'
    performer = 'performer'
    # Not mentioned in the extraction prompts in this notebook.
    other = 'other'

class MusicEntity(BaseModel):
    """Data model of a music entity"""
    # Pydantic v2 style configuration; replaces the deprecated nested
    # `class Config`. use_enum_values=True stores `label` as the enum's plain
    # string value rather than as a `Label` member.
    model_config = ConfigDict(use_enum_values=True)

    # Entity text as it appears in the request.
    utterance: str
    # Category of the entity (see `Label`).
    label: Label
    # Contextual cue in the request that points at the entity.
    cue: str
        
class EntityList(BaseModel):
    """Data model for list of music entities."""
    # Wrapper so that a whole extraction result (zero or more entities) is a
    # single object; used as `output_cls` of the extraction programs below.
    content: List[MusicEntity]

# Sanity check: an empty list is a valid instance. As the last expression of
# the cell, the instance is displayed as the cell output.
EntityList(content=[])


EntityList(content=[])

# Zero-Shot

In [9]:
from llama_index.program.openai import OpenAIPydanticProgram

# Zero-shot instruction prompt. `{text}` is its only placeholder; it is
# supplied when the program is called with `text=...`.
prompt_template_str = """\
From the following text which contains a user requests for music suggestions, extract all the music entities.
A music entity has the following attributes:
    - utterance: The utterance of the entity in the text. For example "the beatles" in "recommend me music like the beatles".
    - label: The label of the entity. It can either be 'title' (eg. a song title, an album title, a symphony) or it can be 'performer' which refers to a performing musical artist.
    - cue: The contextual cue which indicates the musical entity (eg. "music like" in "recommend me music like the beatles" indicating "the beatles")
Here is the text: {text}
"""

# Structured-output program bound to the OpenAI model; `output_cls=EntityList`
# names the pydantic model the response is expected to be returned as.
# NOTE(review): allow_multiple=False presumably restricts a call to a single
# EntityList object — confirm against the OpenAIPydanticProgram docs.
program = OpenAIPydanticProgram.from_defaults(
    output_cls=EntityList,
    llm=llm_openai,
    prompt_template_str=prompt_template_str,
    allow_multiple=False,
    verbose=False,
)


In [10]:
import sys
sys.path.append("..")
from tqdm import tqdm
from src.Utils import read_IOB_file, transform_to_dict, write_jsonlines
import os 

# load data
dataset_id = str(1)
data_path = f"../baseline/music-ner-eacl2023/data/dataset{dataset_id}/test.bio"
texts, labels = read_IOB_file(data_path)

# Materialise the (tokens, tags) pairs so tqdm can read their length: a bare
# zip() has no len(), which leaves the progress bar without a total or an ETA.
samples = list(zip(texts, labels))

outputs = []
for tokens, iob in tqdm(samples):

    text = ' '.join(tokens)
    true_ents = transform_to_dict(tokens, iob)

    # extract with LLM (one request per sample)
    ent_list = program(text=text)

    # One record per sample: the input text, the gold entities from the
    # dataset, and whatever the LLM extracted. A missing or empty gold
    # category is normalised to an empty list.
    output = {
        "text": text,
        "performers": true_ents.get("Artist") or [],
        "titles": true_ents.get("WoA") or [],
        "extracted": [ent.model_dump() for ent in ent_list.content],
    }
    outputs.append(output)

# write output: one directory per model name, one file per dataset
output_dir = os.path.join("..", "output", model)
os.makedirs(output_dir, exist_ok=True)

write_jsonlines(os.path.join(output_dir, f"reddit{dataset_id}.jsonl"), outputs)


0it [00:01, ?it/s]


KeyboardInterrupt: 

# Few-Shot with Mixtral

In [5]:
# Set up the Mixtral model (through Ollama) and load the train/test splits
from llama_index.llms.ollama import Ollama
from llama_index.core.program import LLMTextCompletionProgram
from llama_index.core import PromptTemplate
import sys
sys.path.append("..")
from src.Utils import read_IOB_file, transform_to_dict

# Mixtral, accessed through the Ollama client
llm_mixtral = Ollama(model="mixtral")

# Both splits come from the same dataset, so the id is set once.
dataset_id = str(1)
test_path = f"../baseline/music-ner-eacl2023/data/dataset{dataset_id}/test.bio"
train_path = f"../baseline/music-ner-eacl2023/data/dataset{dataset_id}/train.bio"

# Read the IOB files for each split (test first, then train).
texts_test, labels_test = read_IOB_file(test_path)
texts_train, labels_train = read_IOB_file(train_path)


### Make Few-Shot Dataset from Training Set

In [11]:

class Example(BaseModel):
    """
    Data model for a few-shot example.
    """
    # Input request as a single space-joined string.
    text: str
    # Gold entities for `text`, in the same schema the LLM is asked to produce.
    output: EntityList

def entity_dict_to_pydantic(entity_dict: dict) -> EntityList:
    """Convert a dataset entity dict into an ``EntityList``.

    Args:
        entity_dict: Mapping from dataset tag (``"WoA"``, ``"Artist"``) to a
            list of entity strings. A tag may be missing, ``None`` or empty;
            any other keys are ignored. The dict is not modified.

    Returns:
        An ``EntityList`` holding one ``MusicEntity`` per string: all titles
        (from ``"WoA"``) first, then all performers (from ``"Artist"``). The
        dataset carries no cue annotation, so ``cue`` is always ``""``.
    """
    # Dataset tag -> label value, in output order. Reading through this
    # mapping avoids writing "title"/"performer" keys into the caller's dict.
    tag_to_label = (("WoA", "title"), ("Artist", "performer"))

    entities = [
        MusicEntity(utterance=utterance, label=label, cue="")
        for tag, label in tag_to_label
        for utterance in (entity_dict.get(tag) or [])
    ]
    return EntityList(content=entities)

# Turn every training sample into an Example: the space-joined request text
# paired with its gold entities.
examples = []

for train_tokens, train_tags in zip(texts_train, labels_train):
    gold_entities = transform_to_dict(train_tokens, train_tags)
    examples.append(
        Example(
            text=' '.join(train_tokens),
            output=entity_dict_to_pydantic(gold_entities),
        )
    )


### Make Prompt Template

In [12]:
from src.FewShot import FewShotSet

# Few-shot variant of the extraction prompt with placeholders `{k}`,
# `{few_shot_examples}` and `{text}`. Assigning it rebinds the
# `prompt_template_str` name used by the zero-shot prompt earlier.
# NOTE(review): the few-shot program below takes its prompt from
# `few_shot_set.get_prompt_template()`, not from this variable — confirm
# whether this string is still needed.
prompt_template_str = """\
From the following text which contains a user requests for music suggestions, extract all the music entities.
A music entity has the following attributes:
    - utterance: The utterance of the entity in the text. For example "the beatles" in "recommend me music like the beatles".
    - label: The label of the entity. It can either be 'title' (eg. a song title, an album title, a symphony) or it can be 'performer' which refers to a performing musical artist.
    - cue: The contextual cue which indicates the musical entity (eg. "music like" in "recommend me music like the beatles" indicating "the beatles")

Here are {k} examples: 
{few_shot_examples}
    
Here is the text: {text}
"""

# NOTE(review): the few-shot set is built from dataset4's test.bio, while the
# splits loaded earlier in the notebook are dataset1 train/test — confirm this
# is the intended source of the examples.
few_shot_set = FewShotSet("../baseline/music-ner-eacl2023/data/dataset4/test.bio", output_instruction=True)


In [13]:
# Few-shot extraction program backed by Mixtral.
# The model has to be passed as `llm`: `guidance_llm` is not a parameter of
# LLMTextCompletionProgram.from_defaults, so with that keyword the program
# would not be bound to `llm_mixtral`.
program = LLMTextCompletionProgram.from_defaults(
    output_cls=EntityList,
    llm=llm_mixtral,
    prompt=few_shot_set.get_prompt_template(),
    verbose=False,
)



In [16]:
# Smoke test on a single request; the recorded result is an empty EntityList.
# NOTE(review): `k` presumably selects how many few-shot examples are put into
# the prompt — confirm against FewShotSet.get_prompt_template().
program(text="youre a time traveller living in the 80s what future music do you put on a mixtape for your girlfriend", k=5)



EntityList(content=[])