Get the token to download the model from hugging face

In [2]:
from dotenv import load_dotenv
import os

load_dotenv()
token = os.getenv("HF_TOKEN")

Defining and downloading the model

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model and tokenizer
model_name = "meta-llama/Llama-3.2-1B"
model = AutoModelForCausalLM.from_pretrained(model_name,token=token)
tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.to('cpu')

  from .autonotebook import tqdm as notebook_tqdm


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

Apply Quantization of model to reduce the inference time


In [4]:

quantized_model = torch.quantization.quantize_dynamic(
    model, 
    {torch.nn.Linear}, 
    dtype=torch.qint8  
)


Prediction funtion to get the next n tokens

In [5]:

def predict_next_tokens_with_cache(model, tokenizer, input_text, num_tokens=10):
    inputs = tokenizer(input_text, return_tensors="pt", padding=True)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']


    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=input_ids.shape[1] + num_tokens,
        do_sample=True,  
        temperature=0.5, 
        top_k=30, 
        top_p=0.95, 
        repetition_penalty=1.5, 
        num_return_sequences=1 
    )

    # Decode the generated sequence
    generated_sequence = outputs[0]
    decoded_sequence = tokenizer.decode(generated_sequence, skip_special_tokens=True)

    return decoded_sequence

Predict next token to get the generated text and calculate the average inference time per token

In [11]:
import time
start_time = time.time()
input_text = "India is the best country is world because"
num_tokens=100
generated_text = predict_next_tokens_with_cache(model, tokenizer, input_text, num_tokens=num_tokens)
end_time = time.time()

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [12]:
print(f"Generated text: {generated_text}")

Generated text: India is the best country is world because of its rich culture, beautiful natural beauty and a large population. India has become one among many countries which have been recognized by UNESCO as heritage sites in Asia.
The Indian government also declared 12 cities across various states to be global city under Smart Cities Mission (SCM) scheme for improving urban infrastructure through innovative solutions such that they can offer better quality services at affordable prices while providing sustainable development benefits like economic growth & employment generation etc..
In this article we will discuss about some popular smart or digital tourist


In [13]:
print(f"Average Inference time: {(end_time - start_time)/num_tokens:.4f} seconds")

Average Inference time: 0.1942 seconds
