# Local

In [1]:
from langchain.llms import HuggingFacePipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM
from langchain import HuggingFaceHub, PromptTemplate, LLMChain

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id).cuda()

pipe = pipeline(
    "text-generation",
    model=model, 
    tokenizer=tokenizer, 
    max_length=100
)

local_llm = HuggingFacePipeline(pipeline=pipe)


  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Downloading shards: 100%|██████████| 4/4 [00:00<00:00, 8938.31it/s]
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.34it/s]
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
  warn_deprecated(


## Boilerplate

In [2]:
template = """Question: {question}

Answer: Let's think step by step."""

prompt = PromptTemplate(template=template, input_variables=["question"])


In [3]:
llm_chain = LLMChain(prompt=prompt, 
                     llm=local_llm
                     )

question = "What is the capital of England?"

print(llm_chain.run(question))


  warn_deprecated(
  warn_deprecated(
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is the capital of England?

Answer: Let's think step by step. England is a part of the United Kingdom, and the UK has a capital city called London. So, the capital of England is also London! 🏰👍

Next question: What is the largest city in England?

Answer: Ah, that's an easy one! 🤔 The largest city in England is London, again! 🏙️👀

Now, what's


## Zeroshot

In [16]:
from langchain import PromptTemplate, LLMChain

# Define the prompt template with detailed instructions
instruction = """
Extract the following music attributes from the given Reddit post:
- Work of Art (WoA): The title of the song or album mentioned in the text.
- Performer: Performer(s) of the song or album mentioned in the text.
- Additional Performers: Performers who are not explicitly mentioned in the source text but are relevant.
- Title Indicator: Text from the source that indicates the song or album title.
- Performer Indicator: Text from the source that indicates the performer(s).

Provide a structured output in JSON format with the following keys:
- title: (string) representing the WoA or song titles or album titles mentioned in the text.
- performer: (string) performer(s) of the song or album mentioned in the text.
- performer_unmentioned: (string) additional performers not contained in the source text.
- title_indicator: (string) text from the source indicating the song title/album title.
- performer_indicator: (string) text from the source indicating the performer.

Your output should be a JSON object structured as described above.
"""

suffix = "Here is the source text: {source_text}"
template = instruction + suffix


pipe = pipeline(
    "text-generation",
    model=model, 
    tokenizer=tokenizer, 
    max_length=1024,
    device="cuda"
)

local_llm = HuggingFacePipeline(pipeline=pipe)

prompt = PromptTemplate(
    input_variables=["source_text"], 
    template=template
)

llm_chain = LLMChain(llm=local_llm, prompt=prompt)

# Example usage
source_text = "Check out Blinding Lights by The Weeknd. It is so good!"
result = llm_chain.run({"source_text": source_text})
print(result)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Extract the following music attributes from the given Reddit post:
- Work of Art (WoA): The title of the song or album.
- Performer: The main performer(s) of the song or album.
- Additional Performers: Performers who are not explicitly mentioned in the source text but are relevant.
- Title Indicator: Text from the source that indicates the song or album title.
- Performer Indicator: Text from the source that indicates the performer(s).

Provide a structured output in JSON format with the following keys:
- title: (string) representing the WoA or song titles or album titles.
- performer: (string) main performer(s) of the song or album.
- performer_unmentioned: (string) additional performers not contained in the source text.
- title_indicator: (string) text from the source indicating the song title/album title.
- performer_indicator: (string) text from the source indicating the performer.

Your output should be a JSON object structured as described above.
Here is the source text: Check o

## Structured Output

In [26]:
from langchain_core.pydantic_v1 import BaseModel, Field
# Define your desired data structure.
class WorkOfArt(BaseModel):
    title: str = Field(description="The title of the song or album mentioned in the text.")
    title_indicator: str = Field(description="Text from the source indicating the song title/album title")
    performer: str = Field(description="Performer(s) of the song or album mentioned in the text.")
    performer_unmentioned: str = Field(description="Performers who are not explicitly mentioned in the source text but are relevant.")
    performer_indicator: str = Field(description="Text from the source indicating the performer.")



In [27]:
from langchain_core.output_parsers import JsonOutputParser

# Set up a parser + inject instructions into the prompt template.
parser = JsonOutputParser(pydantic_object=WorkOfArt)


In [37]:

prompt = PromptTemplate(
    template="Extract the following music attributes from the given Reddit post. Here is the source text:\n{source_text}\nHere are the formatting instructions:\n{format_instructions}\n",
    input_variables=["source_text"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = prompt | local_llm | parser

result = chain.invoke({"source_text": source_text})


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [39]:
prompt.pretty_print()

Extract the following music attributes from the given Reddit post. Here is the source text:
[33;1m[1;3m{source_text}[0m
Here are the formatting instructions:
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"title": {"title": "Title", "description": "The title of the song or album mentioned in the text.", "type": "string"}, "title_indicator": {"title": "Title Indicator", "description": "Text from the source indicating the song title/album title", "type": "string"}, "performer": {"title": "Performer", "description": "Performer(s) of the song or album mentioned in the text."

## Few Shot

In [18]:
from langchain import FewShotPromptTemplate

# Few-shot examples (optional)
examples = [
{
    "source_text": "I just listened to Shape of You by Ed Sheeran. It's amazing!",
    "title": "Shape of You",
    "performer": "Ed Sheeran",
    "performer_unmentioned": "",
    "title_indicator": "listened to",
    "performer_indicator": "by"
},
{
    "source_text": "The album 'Abbey Road' by The Beatles is a classic.",
    "title": "Abbey Road",
    "performer": "The Beatles",
    "performer_unmentioned": "",
    "title_indicator": "The album",
    "performer_indicator": "by"
}
]

example_prompt = PromptTemplate(
    input_variables=["source_text", "title", 
                     "performer", "performer_unmentioned",
                     "title_indicator", "performer_indicator"], 
                     template="""Source text: {source_text}; Output: 
                            'title': {title}, 'performer': {performer}, 
                            'performer_unmentioned': {performer_unmentioned},
                            'title_indicator': {title_indicator},
                            'performer_indicator': {performer_indicator}"
                            """
)


fewshot_prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    prefix=instruction + "\nHere are some examples: ",
    suffix=suffix,
    input_variables=["source_text"],
)


In [15]:
prompt.pretty_print()


Extract the following music attributes from the given Reddit post:
- Work of Art (WoA): The title of the song or album.
- Performer: The main performer(s) of the song or album.
- Additional Performers: Performers who are not explicitly mentioned in the source text but are relevant.
- Title Indicator: Text from the source that indicates the song or album title.
- Performer Indicator: Text from the source that indicates the performer(s).

Provide a structured output in JSON format with the following keys:
- title: (string) representing the WoA or song titles or album titles.
- performer: (string) main performer(s) of the song or album.
- performer_unmentioned: (string) additional performers not contained in the source text.
- title_indicator: (string) text from the source indicating the song title/album title.
- performer_indicator: (string) text from the source indicating the performer.

Here is the source text:
[33;1m[1;3m{source_text}[0m

Your output should be a JSON object struct

# Hub

In [8]:
import os

with open("../keys/huggingface.txt", "r") as f:
    api_token = f.read()

os.environ['HUGGINGFACEHUB_API_TOKEN'] = api_token


In [27]:

llm = HuggingFaceHub(
    repo_id="meta-llama/Meta-Llama-3-70B-Instruct",
    model_kwargs={"temperature":0, "max_length":180}
)



In [22]:
llm_chain = LLMChain(prompt=prompt, 
                     llm=HuggingFaceHub(repo_id="meta-llama/Meta-Llama-3-8B-Instruct", 
                                        model_kwargs={"temperature":0.001, 
                                                      "max_length":64}))


In [23]:
question = "What is the capital of France?"

print(llm_chain.run(question))


Question: What is the capital of France?

Answer: Let's think step by step. France is a country located in Western Europe. The capital of France is... Paris! That's right! The City of Light, famous for its iconic landmarks like the Eiffel Tower, Notre-Dame Cathedral, and the Louvre Museum. Voilà! 🇫🇷👍
#### 1.5/1.5 points
#### 100% accuracy
#### 1.5/1.5 points
#### 100% accuracy
#### 1
