In [29]:
#source: https://www.youtube.com/watch?v=A9Pu4xg-Nas
"""
steps:
1.run this notebook
2.update ENDPOINT
3.create Lambda and API Gateway
4.test
5.cleanup resources (delete the following)
  - lamda function url
  - gateway
  - lambda
  - sagemaker endpoint
6.stop notebook (delete needed?)
"""

'\nsteps:\n1.run this notebook\n2.update ENDPOINT\n3.create Lambda and API Gateway\n4.test\n5.cleanup resources (delete the following)\n  - lamda function url\n  - gateway\n  - lambda\n  - sagemaker endpoint\n'

In [1]:
!pip install transformers==4.49.0 einops accelerate bitsandbytes



In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
import torch
import base64

In [3]:
checkpoint = "MBZUAI/LaMini-T5-738M"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
base_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.float32)



In [5]:
!pip install langchain langchain-community



In [6]:
from langchain.llms import HuggingFacePipeline

In [7]:
def llm_pipeline():
    pipe = pipeline(
        'text2text-generation',
        model=base_model,
        tokenizer=tokenizer,
        max_length=256,
        do_sample=True,
        temperature=0.3,
        top_p=0.95
    )
    local_llm = HuggingFacePipeline(pipeline=pipe)
    return local_llm

In [8]:
input_prompt = "Write an article on Artificial Intelligence"

In [9]:
model = llm_pipeline()
generated_text = model(input_prompt)
generated_text

Device set to use cpu
  local_llm = HuggingFacePipeline(pipeline=pipe)
  generated_text = model(input_prompt)


'Artificial Intelligence (AI) is a rapidly growing field that involves the development of computer systems that can perform tasks that typically require human intelligence. AI is a type of machine learning that involves training algorithms to recognize patterns in data and make predictions or decisions based on that data. AI is used in a wide range of applications, including image and speech recognition, natural language processing, and predictive analytics. One of the most significant applications of AI is in healthcare. AI is used in medical diagnosis, drug discovery, and personalized medicine. AI is also used in healthcare diagnosis, drug discovery, and personalized medicine. In summary,'

In [10]:
#uninstall sagemaker
!pip uninstall -y sagemaker

Found existing installation: sagemaker 2.168.0
Uninstalling sagemaker-2.168.0:
  Successfully uninstalled sagemaker-2.168.0


In [11]:
#reinstall sagemaker
!pip install sagemaker==2.168.0

Collecting sagemaker==2.168.0
  Using cached sagemaker-2.168.0-py2.py3-none-any.whl
Installing collected packages: sagemaker
Successfully installed sagemaker-2.168.0


In [17]:
#source: https://huggingface.co/MBZUAI/LaMini-T5-738M?sagemaker_deploy=true (modifications added)
import json
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

try:
	role = sagemaker.get_execution_role()
except ValueError:
	iam = boto3.client('iam')
	role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Hub Model configuration. https://huggingface.co/models
hub = {
	'HF_MODEL_ID':'MBZUAI/LaMini-T5-738M',
    'HF_TASK':'text2text-generation',
    'device_map':'auto',
    'torch_dtype':'torch.float32'
}



# create Hugging Face Model Class (version="3.0.1")
huggingface_model = HuggingFaceModel(
	image_uri=get_huggingface_llm_image_uri("huggingface",version="0.8.2"),
	env=hub,
	role=role, 
)

# deploy model to SageMaker Inference
predictor = huggingface_model.deploy(
	initial_instance_count=1,
	instance_type="ml.g4dn.xlarge",
	container_startup_health_check_timeout=300,
  )
  
# send request
predictor.predict({
	"inputs": "write a short article on Blockchain.",
})

ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateEndpoint operation: The account-level service limit 'ml.g4dn.xlarge for endpoint usage' is 1 Instances, with current utilization of 1 Instances and a request delta of 1 Instances. Please use AWS Service Quotas to request an increase for this quota. If AWS Service Quotas is not available, contact AWS support to request an increase for this quota.

In [18]:
prompt = "write a short article on Blockchain."

#hyperparameter_payload
payload = {
    "inputs":prompt,
    "parameters": {
        "do_sample":True,
        "top_p":0.7,
        "temperature":0.3,
        "top_k":50,
        "max_new_tokens":512,
        "repetition_penalty":1.03
    }
}

#send request to the endpoint
response = predictor.predict(payload)

print(response)

[{'generated_text': 'Blockchain is a decentralized digital ledger that records transactions in a secure and transparent manner. It is a decentralized system that uses cryptography to secure and verify transactions. Blockchain technology is used in a variety of industries, including finance, healthcare, and supply chain management. It has the potential to revolutionize the way we conduct business and provide transparency and accountability. Blockchain is often associated with cryptocurrency, but it is also used in other areas such as supply chain management, voting systems, and voting systems. Blockchain technology has the potential to revolutionize the way we conduct business and improve efficiency. It has the potential to revolution'}]


In [19]:
ENDPOINT = "huggingface-pytorch-tgi-inference-2025-03-23-13-15-38-732"

In [22]:
import boto3

In [23]:
runtime = boto3.client('runtime.sagemaker')

In [25]:
response = runtime.invoke_endpoint(EndpointName=ENDPOINT, ContentType="application/json", Body=json.dumps(payload))
print(response)

{'ResponseMetadata': {'RequestId': '75cb29d5-61b7-46e8-bab0-a28a7736ec03', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '75cb29d5-61b7-46e8-bab0-a28a7736ec03', 'x-amzn-invoked-production-variant': 'AllTraffic', 'date': 'Sun, 23 Mar 2025 13:32:40 GMT', 'content-type': 'application/json', 'content-length': '686', 'connection': 'keep-alive'}, 'RetryAttempts': 0}, 'ContentType': 'application/json', 'InvokedProductionVariant': 'AllTraffic', 'Body': <botocore.response.StreamingBody object at 0x7f6c09018b80>}


In [27]:
prediction = json.loads(response['Body'].read().decode('utf-8'))
prediction

[{'generated_text': 'Blockchain is a decentralized digital ledger that records transactions in a secure and transparent manner. It is a decentralized system that uses cryptography to secure and verify transactions. Blockchain technology is used in a variety of industries, including finance, healthcare, and supply chain management. It is based on the principles of decentralization, consensus, and transparency. Blockchain is a decentralized system that allows for secure and transparent transactions. It is decentralized, meaning that no single entity controls the ledger. Blockchain is decentralized, meaning that no single entity has access to the ledger. It is also decentralized'}]

In [28]:
prediction[0]['generated_text']

'Blockchain is a decentralized digital ledger that records transactions in a secure and transparent manner. It is a decentralized system that uses cryptography to secure and verify transactions. Blockchain technology is used in a variety of industries, including finance, healthcare, and supply chain management. It is based on the principles of decentralization, consensus, and transparency. Blockchain is a decentralized system that allows for secure and transparent transactions. It is decentralized, meaning that no single entity controls the ledger. Blockchain is decentralized, meaning that no single entity has access to the ledger. It is also decentralized'

In [30]:
#add any needed cleanup here