In [1]:
import boto3
import json
import os

from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain

from transformers import AutoTokenizer,AutoModelForCausalLM,AutoConfig
import transformers
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.environ["HF_TOKEN"]=""

os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = ''
os.environ['LANGCHAIN_PROJECT'] = 'default'

In [3]:
lambda_client = boto3.client(
    'lambda',
    region_name='us-east-1',
    aws_access_key_id='',
    aws_secret_access_key=''
)

print(lambda_client)

<botocore.client.Lambda object at 0x000001A6C9607290>


In [4]:
model_checkpoint = 'meta-llama/Meta-Llama-3-8B-Instruct'

model_config = AutoConfig.from_pretrained(model_checkpoint,
                                        trust_remote_code=True,
                                        max_new_tokens=1024)

model = AutoModelForCausalLM.from_pretrained(model_checkpoint,
                                            trust_remote_code=True,
                                            config=model_config,
                                            device_map='auto')

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

pipeline = pipeline("text-generation",
                model=model,
                tokenizer=tokenizer,
                torch_dtype=torch.float16,
                max_length=3000,
                device_map="auto",)

llm = HuggingFacePipeline(pipeline=pipeline)

Loading checkpoint shards: 100%|██████████| 4/4 [01:12<00:00, 18.20s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
def get_context(question: str):
    # Invoke the Lambda function
    response = lambda_client.invoke(
        FunctionName='prtmraginference',
        InvocationType='RequestResponse',
        Payload=json.dumps({"question": question})
    )

    print(response)
    
    # Read the Lambda function's response stream and parse it
    response_payload = response['Payload'].read()
    response_payload_dict = json.loads(response_payload)
    
    # Navigate to the retrievalResults
    results = response_payload_dict['body']['answer']['retrievalResults']

    print(results)
    
    # Initialize an empty string to store the extracted paragraph
    extracted_paragraph = ""
    
    # Loop through each result and concatenate text to a paragraph
    for result in results:
        text = result['content']['text']
        extracted_paragraph += text + " "

    # Return the concatenated paragraph
    return {"response": extracted_paragraph.strip()}

In [6]:
def get_answer_from_kb(query: str, context:str, llm):
    kb_prompt_template = f"""<|begin_of_text|>
<|start_header_id|>
    system
<|end_header_id|>
    You are a helpful, respectful and honest assistant designated answer questions related to the user's document.If the user tries to ask out of topic questions do not engange in the conversation.If the given context is not sufficient to answer the question,Do not answer the question.
<|eot_id|>
<|start_header_id|>
    user
<|end_header_id|>
    Answer the user question based on the context provided below
    Context :{context}
    Question: {query}
<|eot_id|>
<|start_header_id|>
    assistant
<|end_header_id|>"""

    prompt_template_kb = PromptTemplate(
        input_variables=["context", "query"], template=kb_prompt_template
    )

    llm_chain = LLMChain(llm=llm, prompt=prompt_template_kb)
    
    result = llm_chain.run({"context":context, "query":query})

    return result

In [7]:
query="Compare 2B and 7B models"

In [8]:
context=get_context(query)
context=context['response']

print(context)

{'ResponseMetadata': {'RequestId': 'd6150ae8-61c6-453a-bedd-415f30f25f6c', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 05 May 2024 13:13:33 GMT', 'content-type': 'application/json', 'content-length': '5742', 'connection': 'keep-alive', 'x-amzn-requestid': 'd6150ae8-61c6-453a-bedd-415f30f25f6c', 'x-amzn-remapped-content-length': '0', 'x-amz-executed-version': '$LATEST', 'x-amzn-trace-id': 'root=1-663785fb-5bbc888d3d843ee326f1b4a7;parent=11646bcaf148b9e2;sampled=0;lineage=1c3db0ec:0'}, 'RetryAttempts': 0}, 'StatusCode': 200, 'ExecutedVersion': '$LATEST', 'Payload': <botocore.response.StreamingBody object at 0x000001A6C9D13100>}
[{'content': {'text': 'We also utilize several improvements proposed after the original trans-   Parameters 2B 7B   d_model 2048 3072 Layers 18 28 Feedforward hidden dims 32768 49152 Num heads 8 16 Num KV heads 1 16 Head size 256 256 Vocab size 256128 256128   Table 1 | Key model parameters.   former paper, and list them below: Multi-Query Attention (Shaz

In [9]:
print(get_answer_from_kb(query, context, pipeline))

  warn_deprecated(
  warn_deprecated(
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'ResponseMetadata': {'RequestId': 'c400e939-d512-4e36-a9e8-56551f4a1f0f', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 05 May 2024 13:13:34 GMT', 'content-type': 'application/json', 'content-length': '5742', 'connection': 'keep-alive', 'x-amzn-requestid': 'c400e939-d512-4e36-a9e8-56551f4a1f0f', 'x-amzn-remapped-content-length': '0', 'x-amz-executed-version': '$LATEST', 'x-amzn-trace-id': 'root=1-663785fd-653923c83a6f596a319f194e;parent=4a69eab2f43f8adb;sampled=0;lineage=1c3db0ec:0'}, 'RetryAttempts': 0}, 'StatusCode': 200, 'ExecutedVersion': '$LATEST', 'Payload': <botocore.response.StreamingBody object at 0x000001A6C9D12E90>}
[{'content': {'text': 'We also utilize several improvements proposed after the original trans-   Parameters 2B 7B   d_model 2048 3072 Layers 18 28 Feedforward hidden dims 32768 49152 Num heads 8 16 Num KV heads 1 16 Head size 256 256 Vocab size 256128 256128   Table 1 | Key model parameters.   former paper, and list them below: Multi-Query Attention (Shaz

  attn_output = torch.nn.functional.scaled_dot_product_attention(


<|begin_of_text|>
<|start_header_id|>
system
<|end_header_id|>
You are a helpful, respectful and honest assistant designated answer questions related to the user's document.If the user tries to ask out of topic questions do not engange in the conversation.If the given context is not sufficient to answer the question,Do not answer the question.
<|eot_id|>
<|start_header_id|>
user
<|end_header_id|>
Answer the user question based on the context provided below
Context :We also utilize several improvements proposed after the original trans-   Parameters 2B 7B   d_model 2048 3072 Layers 18 28 Feedforward hidden dims 32768 49152 Num heads 8 16 Num KV heads 1 16 Head size 256 256 Vocab size 256128 256128   Table 1 | Key model parameters.   former paper, and list them below: Multi-Query Attention (Shazeer, 2019). No- tably, the 7B model uses multi-head attention while the 2B checkpoints use multi-query atten- tion (with ð�‘›ð�‘¢ð�‘š_ð�‘˜ð�‘£_â„Žð�‘’ð�‘Žð�‘‘ð�‘  = 1), based on ablations that sho