## Run inference on the Llama 2 endpoint you have created.

In [None]:
import json
import boto3

### Supported Parameters

***
This model supports many parameters while performing inference. They include:

* **max_length:** Model generates text until the output length (which includes the input context length) reaches `max_length`. If specified, it must be a positive integer.
* **max_new_tokens:** Model generates text until the output length (excluding the input context length) reaches `max_new_tokens`. If specified, it must be a positive integer.
* **num_beams:** Number of beams used in the beam search. If specified, it must be an integer greater than or equal to `num_return_sequences`.
* **no_repeat_ngram_size:** Model ensures that a sequence of words of `no_repeat_ngram_size` is not repeated in the output sequence. If specified, it must be a positive integer greater than 1.
* **temperature:** Controls the randomness in the output. Higher temperature results in output sequence with low-probability words and lower temperature results in output sequence with high-probability words. If `temperature` -> 0, it results in greedy decoding. If specified, it must be a positive float.
* **early_stopping:** If True, text generation is finished when all beam hypotheses reach the end of sentence token. If specified, it must be boolean.
* **do_sample:** If True, sample the next word as per the likelihood. If specified, it must be boolean.
* **top_k:** In each step of text generation, sample from only the `top_k` most likely words. If specified, it must be a positive integer.
* **top_p:** In each step of text generation, sample from the smallest possible set of words with cumulative probability `top_p`. If specified, it must be a float between 0 and 1.
* **return_full_text:** If True, input text will be part of the output generated text. If specified, it must be boolean. The default value for it is False.
* **stop:** If specified, it must be a list of strings. Text generation stops if any one of the specified strings is generated.

We may specify any subset of the parameters mentioned above while invoking an endpoint. Next, we show an example of how to invoke an endpoint with these arguments.

**NOTE**: If `max_new_tokens` is not defined, the model may generate up to the maximum total tokens allowed, which is 4K for these models. This may result in endpoint query timeout errors, so it is recommended to set `max_new_tokens` when possible. For 7B, 13B, and 70B models, we recommend setting `max_new_tokens` to no greater than 1500, 1000, and 500 respectively, while keeping the total number of tokens less than 4K.

***

In [None]:
# NOTE: `first_row_prompt` is assigned in a later cell of this notebook
# (the one that calls create_prompt on the first row); run that cell first.
# Only one prompt is queried here; add more dicts to the list to send more.
payloads = [
    {
        "inputs": first_row_prompt,
        "parameters": {
            "max_new_tokens": 64,
            "top_p": 0.1,
            "temperature": 1.0,
            "return_full_text": True,
        },
    },
]

### Query endpoint that you have created

---

In [None]:
endpoint_name = 'jumpstart-dft-meta-textgeneration-llama-2-7b'


def query_endpoint(payload, custom_attributes=None):
    """Send ``payload`` to the SageMaker endpoint and return the parsed JSON reply.

    Args:
        payload: JSON-serializable request body (in this notebook a dict with
            "inputs" and "parameters" keys).
        custom_attributes: Optional string forwarded as the ``CustomAttributes``
            argument of ``invoke_endpoint``. NOTE(review): some JumpStart
            Llama 2 deployments expect "accept_eula=true" here - confirm for
            your endpoint. When left as ``None`` the argument is omitted, which
            matches the previous behavior.

    Returns:
        The response body decoded as UTF-8 and parsed with ``json.loads``.
    """
    client = boto3.client("sagemaker-runtime")
    request = {
        "EndpointName": endpoint_name,
        "ContentType": "application/json",
        "Body": json.dumps(payload),
    }
    # Only send CustomAttributes when the caller asked for it.
    if custom_attributes is not None:
        request["CustomAttributes"] = custom_attributes
    response = client.invoke_endpoint(**request)
    return json.loads(response["Body"].read().decode("utf8"))

In [None]:
# Query the endpoint once per payload and print the first generation returned.
for payload in payloads:
    query_response = query_endpoint(payload)
    generated_text = query_response[0]["generated_text"]
    print(">", generated_text)
    print("\n======\n")

In [None]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen

In [None]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/sample/meta_Computers.json.gz

In [None]:
### load the meta data

# Must name the archive fetched by the wget cell above (meta_Computers.json.gz);
# the previous name referred to a file this notebook never downloads.
metadata_path = 'meta_Computers.json.gz'

# The archive holds one JSON object per line.
with gzip.open(metadata_path) as f:
    data = [json.loads(line.strip()) for line in f]

# total length of list, this number equals total number of products
print(len(data))

# first row of the list
print(data[0])

In [None]:
# Build a DataFrame with one row per loaded product record
df = pd.DataFrame(data)

print(len(df))

In [None]:
### split off rows with an unformatted title (i.e. a 'title' that still contains html style content)

df3 = df.fillna('')
# Titles containing the substring 'getTime' are treated as unformatted.
unformatted_title = df3.title.str.contains('getTime')
df4 = df3[unformatted_title]   # unformatted rows
df5 = df3[~unformatted_title]  # everything else
print(len(df4))
print(len(df5))

In [None]:
# Keep only the rows whose 'description' value is not null
has_description = df['description'].notna()
filtered_df = df.loc[has_description]

In [None]:
# Lift the column-width cap so long field values are printed in full.
# This changes the pandas display option for the rest of the session.
pd.set_option("display.max_colwidth", None)

first_row = filtered_df.iloc[0]
print(first_row.to_string())

In [None]:
# Show which metadata fields (columns) the DataFrame has
column_names = df.columns
print(column_names)

In [None]:
#filtered_df = filtered_df[['title', 'brand', 'feature', 'description', 'price']]

In [None]:
# Function to create a prompt from a row of the DataFrame
def create_prompt(row):
    """Build a product-description prompt from one row of product metadata.

    Every field of ``row`` becomes a "- Label: value" bullet, where the label
    is the field name capitalized with underscores replaced by spaces.
    List values are joined with ", " (empty lists are skipped), and null
    scalar values are skipped.

    Args:
        row: A mapping-like row (e.g. a pandas Series) whose ``items()``
            yields (field name, value) pairs.

    Returns:
        The complete prompt string.
    """
    parts = [
        "Generate a summarized product description using the following information of the product:\n\n"
    ]

    # Iterate the row's own fields rather than a global DataFrame's columns,
    # so the function works for any row it is handed.
    for column, value in row.items():
        label = column.capitalize().replace('_', ' ')
        if isinstance(value, list):
            if value:
                # str() each item so lists holding non-string entries do not
                # break the join.
                items = ', '.join(str(item) for item in value)
                parts.append(f"- {label}: {items}\n")
        elif pd.notnull(value):
            parts.append(f"- {label}: {value}\n")

    parts.append(
        "\nPlease ensure that the description is accurate, detailed, and caters to the potential buyers of the product."
    )
    return "".join(parts)

In [None]:
# Remove the 'description' column so it is not included in the generated prompts.
# errors='ignore' keeps this cell safe to re-run once the column is already gone
# (a plain drop would raise KeyError on the second run).
filtered_df = filtered_df.drop(columns=['description'], errors='ignore')

In [None]:
# Build the prompt for the first product and display it
first_row = filtered_df.iloc[0]
first_row_prompt = create_prompt(first_row)

print("Prompt for the First Row:\n", first_row_prompt)

In [None]:
# Build one prompt per row; the result is a Series aligned with filtered_df's index
prompts = filtered_df.apply(create_prompt, axis="columns")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# load base LLM model and tokenizer
# The model and its inputs must live on the same device. Previously the inputs
# were moved to CUDA while the model (loaded without 4-bit quantization) was
# never moved off the CPU, which makes generate() fail with a device mismatch.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(
    "philschmid/llama-2-7b-instruction-generator",
    low_cpu_mem_usage=True,
    # Half precision on GPU; fall back to float32 when only a CPU is available.
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    #load_in_4bit=True,
)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained("philschmid/llama-2-7b-instruction-generator")

prompt = """### Instruction:
Use the Input below to create an instruction, which could have been used to generate the input using an LLM. 

### Input:
Dear [boss name],

I'm writing to request next week, August 1st through August 4th,
off as paid time off.

I have some personal matters to attend to that week that require 
me to be out of the office. I wanted to give you as much advance 
notice as possible so you can plan accordingly while I am away.

Please let me know if you need any additional information from me 
or have any concerns with me taking next week off. I appreciate you 
considering this request.

Thank you, [Your name]

### Response:
"""

# Tokenize and place the input ids on the same device as the model weights.
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.to(model.device)
outputs = model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=True, top_p=0.9,temperature=0.9)

# The decoded text starts with the prompt; slice it off to show only the generated part.
print(f"Generated instruction:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")
