# Deploying Falcon 1B Params to Inferentia2

In [None]:
%pip install neuronx-cc==2.* torch-neuronx torchvision transformers==4.26.0 einops==0.6.1

In [None]:
import types
import torch
import transformers
import torch_neuronx
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "tiiuae/falcon-rw-1b"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id

model = transformers.AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).eval()

In [None]:
def f(self, input_ids, attention_mask):
    out = self.forward_(input_ids=input_ids, attention_mask=attention_mask)    
    return tuple(out.values())
if not hasattr(model, 'forward_'): model.forward_ = model.forward
model.forward = types.MethodType(f, model)

In [None]:
import time

def get_latency(model, inp):
    # warmup
    y = model(*inp)

    it = 10
    t = time.time()
    for i in range(it):
        y = model(*inp)
    print(f"Elapsed: {(time.time()-t)*1000 / it}ms")

In [None]:
batch_size=1
max_lengths=[16, 32, 64, 128, 256, 512]

In [8]:
import os

for max_length in max_lengths:
    model_name=f"falcon_rw_1b_{max_length}_neuron.pt"
    inp = (torch.zeros([batch_size,max_length], dtype=torch.long), 
           torch.zeros([batch_size,max_length], dtype=torch.long))
    
    if os.path.isfile(model_name):
        print(f"Model {model_name} already exists")
        neuron_traced = torch.jit.load(model_name)
    else:
        print(f"Compiling a model for {max_length} tokens")
        y = model(*inp)

        with torch.no_grad():
            neuron_traced = torch_neuronx.trace(
                model, inp,
                compiler_args=[
                    #"--verbose", "debug",            
                ]
            )
            neuron_traced.save(model_name)
    get_latency(neuron_traced, inp)

Compiling a model for 16 tokens
Elapsed: 31.41317367553711ms
Compiling a model for 32 tokens
Elapsed: 32.23717212677002ms
Compiling a model for 64 tokens
Elapsed: 37.833499908447266ms
Compiling a model for 128 tokens
Elapsed: 41.709065437316895ms
Compiling a model for 256 tokens
Elapsed: 54.45690155029297ms
Compiling a model for 512 tokens
Elapsed: 106.57868385314941ms


In [None]:
model_name = "tiiuae/falcon-rw-1b"
pipelineA = transformers.pipeline(
    "text-generation",
    model=model_name,
    tokenizer=tokenizer,
    #torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    #device_map="auto",
)

pipelineB = transformers.pipeline(
    "text-generation",
    model=model_name,
    tokenizer=tokenizer,
    #torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    #device_map="auto",
)

In [None]:
import types
import torch_neuronx
import torch.nn.functional as F
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions

def reduce(self, logits, index):
    _, n_length, _ = logits.shape

    # Create selection mask
    mask = torch.arange(n_length, dtype=torch.int32) == index
    mask = mask.view(1, -1, 1)

    # Broadcast mask
    masked = torch.multiply(logits, mask.to(torch.float32))

    # Reduce along 1st dimension    
    return torch.unsqueeze(torch.sum(masked, 1), 1)

def f(self, input_ids, past_key_values, attention_mask, use_cache=False, return_dict=False, 
      output_attentions=None, output_hidden_states=None):
    pad_size = self.max_length-input_ids.shape[1]
    input_ids = F.pad(input_ids, (0,pad_size), "constant", tokenizer.eos_token_id)
    attention_mask = F.pad(attention_mask, (0,pad_size), "constant", tokenizer.eos_token_id)    
    out = self.forward_neuron(input_ids, attention_mask)
    return CausalLMOutputWithCrossAttentions(
        loss=None, 
        logits=self.reduce(out[0], out[0].shape[1] - 1 - pad_size ),
        past_key_values=out[1]
    )

In [None]:
import time
sentences = [
    "I've seen things you people wouldn't believe",
    "one ring to rule them"
]
for max_length in max_lengths:
    pipelineA.model.forward_neuron = torch.jit.load(f"falcon_rw_1b_{max_length}_neuron.pt")
    pipelineA.model.max_length = max_length
    pipelineA.model.forward = types.MethodType(f, pipelineA.model)
    pipelineA.model.reduce = types.MethodType(reduce, pipelineA.model)

    for s in sentences: 
        t=time.time()
        # warm up
        outA = pipelineA(s, max_length=max_length)
        outB = pipelineB(s, max_length=max_length)
        it=10
        for i in range(it):
            outA = pipelineA(s, max_length=max_length)
        print(f"Inf2 - Max length: {max_length} Elapsed: {(time.time()-t) * 1000 / it} Sentence: {outA}")    

        for i in range(it):
            outB = pipelineB(s, max_length=max_length)
        print(f"CPU - Max length: {max_length} Elapsed: {(time.time()-t) * 1000 / it} Sentence: {outB}")            



Inf2 - Max length: 16 Elapsed: 227.2557020187378 Sentence: [{'generated_text': "I've seen things you people wouldn't believe. Attack ships on fire off the"}]




CPU - Max length: 16 Elapsed: 748.1178760528564 Sentence: [{'generated_text': "I've seen things you people wouldn't believe. Attack ships on fire off the"}]




Inf2 - Max length: 16 Elapsed: 341.3682699203491 Sentence: [{'generated_text': 'one ring to rule them all.\nThe ring is a simple design, but'}]




CPU - Max length: 16 Elapsed: 1083.442735671997 Sentence: [{'generated_text': 'one ring to rule them all.\nThe ring is a simple design, but'}]




Inf2 - Max length: 32 Elapsed: 816.6527509689331 Sentence: [{'generated_text': "I've seen things you people wouldn't believe. Attack ships on fire off the shoulder of Orion. I watched C-beams glitter in the dark near the"}]




CPU - Max length: 32 Elapsed: 3051.5047788619995 Sentence: [{'generated_text': "I've seen things you people wouldn't believe. Attack ships on fire off the shoulder of Orion. I watched C-beams glitter in the dark near the"}]




Inf2 - Max length: 32 Elapsed: 929.0792942047119 Sentence: [{'generated_text': 'one ring to rule them all.\nThe ring is a simple design, but it is a very effective one. It is a simple ring, but it is'}]




CPU - Max length: 32 Elapsed: 3414.1268253326416 Sentence: [{'generated_text': 'one ring to rule them all.\nThe ring is a simple design, but it is a very effective one. It is a ring that is made of a'}]




Inf2 - Max length: 64 Elapsed: 2458.276677131653 Sentence: [{'generated_text': "I've seen things you people wouldn't believe. Attack ships on fire off the shoulder of Orion. I saw U.S. ships come in like ducks in a row. I saw the U.S.S. Navy firing not once a single shot in anger at a U.S.S.S. territory"}]




CPU - Max length: 64 Elapsed: 10362.80128955841 Sentence: [{'generated_text': "I've seen things you people wouldn't believe. Attack ships on fire off the shoulder of Orion. I watched C-beams glitter in the dark near the Tannhauser Gate. All those moments will be lost in time, like tears in rain. Time to die.\nI've seen things you people wouldn"}]




Inf2 - Max length: 64 Elapsed: 2604.642367362976 Sentence: [{'generated_text': 'one ring to rule them all.\nThe ring is a simple design, but it is a very effective one. It is a simple ring, but it is a very effective one. It is a simple ring, but it is a very effective one. It is a simple, but it is a very effective one.'}]




CPU - Max length: 64 Elapsed: 10709.54122543335 Sentence: [{'generated_text': 'one ring to rule them all.\nThe ring is a simple design, but it is a very effective one. It is a ring that is made of a very strong material, and it is a ring that is very easy to use. It is a ring that is made of a very strong material, and it is'}]




Inf2 - Max length: 128 Elapsed: 7448.816204071045 Sentence: [{'generated_text': "I've seen things you people wouldn't believe. Attack ships on fire off the shoulder of Orion. I saw U.S. ships come in like ducks in a row. I saw the U.S.S. Navy firing not once a single shot in anger at a U.S.S.S. territory.\nI've seen things you people you people.\nI've seen.\nI've seen things you people.\nI've seen things you people wouldn't believe.\nI've seen things you wouldn't believe.\nI've seen things you wouldn't believe.\nI've seen things you wouldn't believe"}]




CPU - Max length: 128 Elapsed: 35563.35532665253 Sentence: [{'generated_text': "I've seen things you people wouldn't believe. Attack ships on fire off the shoulder of Orion. I watched C-beams glitter in the dark near the Tannhauser Gate. All those moments will be lost in time, like tears in rain. Time to die.\nI've seen things you people wouldn't believe. Your government is an organized crime organization. I watched as the world's greatest military force was turned into a laughing stock. All those moments will be lost in time, like tears in rain. Time to die.\nI've seen things you people wouldn't believe. Your government is an organized crime organization."}]




Inf2 - Max length: 128 Elapsed: 7385.77401638031 Sentence: [{'generated_text': 'one ring to rule them all.\nThe ring is a simple design, but it is a very effective one. It is a simple ring, but it is a very effective one. It is a simple ring, but it is a very effective one. It is a simple, but it is a very effective one. It is a simple, but it is a very effective one. It is a simple, but it is a very effective one. It is a simple, but it is a very effective one. It is a ring. It is a simple, but it is a very effective one. It is a ring, but it'}]




CPU - Max length: 128 Elapsed: 34835.0417137146 Sentence: [{'generated_text': 'one ring to rule them all.\nThe ring is a simple design, but it is a very effective one. It is a ring that is made of a very strong material, and it is a ring that is very easy to use. It is a ring that is made of a very strong material, and it is a ring that is very easy to use. It is a ring that is made of a very strong material, and it is a ring that is very easy to use. It is a ring that is made of a very strong material, and it is a ring that is very easy to use. It is a ring that'}]




Inf2 - Max length: 256 Elapsed: 23942.807817459106 Sentence: [{'generated_text': "I've seen things you people wouldn't believe. Attack ships on fire off the shoulder of Orion. I saw U.S. ships come in like ducks in a row. I saw the U.S.S. Navy firing not once a carrier group of aircraft at a battleship. I saw the U.S.S. Navy firing a carrier group of aircraft at a battleship. I saw the U.S.S. Navy firing a carrier group of aircraft at a battleship. I saw the U.S.S.S. firing a carrier group of aircraft at a battleship. I saw the U.S.S.S. firing a group of aircraft at a battleship. I saw the U.S.S.S. firing a carrier group of aircraft at a battleship. I saw the U.S.S.S. firing a group of aircraft at a battleship.S.S. I saw the U.S.S. firing a carrier group of aircraft at a battleship.S.S.S. I saw the U.S.S. firing a battleship.S. firing a group of aircraft at a battleship.S.S.S. I saw the U.S.S"}]




CPU - Max length: 256 Elapsed: 126108.70127677917 Sentence: [{'generated_text': "I've seen things you people wouldn't believe. Attack ships on fire off the shoulder of Orion. I watched C-beams glitter in the dark near the Tannhauser Gate. All those moments will be lost in time, like tears in rain. Time to die.\nI've seen things you people wouldn't believe. Your government is an organized crime organization. I watched as the world's greatest military force was turned into a laughing stock. All those moments will be lost in time, like tears in rain. Time to die.\nI've seen things you people wouldn't believe. Your government is an organized crime organization. I watched as the world's greatest military force was turned into a laughing stock. All those moments will be lost in time, like tears in rain. Time to die.\nI've seen things you people wouldn't believe. Your government is an organized crime organization. I watched as the world's greatest military force was turned into a laughing sto



Inf2 - Max length: 256 Elapsed: 22564.44592475891 Sentence: [{'generated_text': 'one ring to rule them all.\nThe ring is a simple design, but it is a very effective one. It is a simple ring, but it is a very effective one. It is a simple ring, but it is a very effective one. It is a simple, but it is a very effective one. It is a simple, but it is a very effective one. It is a simple, but it is a very effective one. It is a simple, but it is a very effective one. It is a ring. It is a simple, but it is a very effective one. It is a ring, but it is a very effective one. It is a ring, but it is a ring. It is a simple, but it is a ring, but it is a very effective one. It is a ring, but it is a ring, but it is a effective one. It is a ring, but it is a ring. It is a ring, but it is a ring, but it is effective. It is a ring, but it is a ring. It is a ring, but it is a ring, but it is effective. It is a ring, but it is a ring. It is a ring, but it is a'}]




CPU - Max length: 256 Elapsed: 122112.75663375854 Sentence: [{'generated_text': 'one ring to rule them all.\nThe ring is a simple design, but it is a very effective one. It is a ring that is made of a very strong material, and it is a ring that is very easy to use. It is a ring that is made of a very strong material, and it is a ring that is very easy to use. It is a ring that is made of a very strong material, and it is a ring that is very easy to use. It is a ring that is made of a very strong material, and it is a ring that is very easy to use. It is a ring that is made of a very strong material, and it is a ring that is very easy to use. It is a ring that is made of a very strong material, and it is a ring that is very easy to use. It is a ring that is made of a very strong material, and it is a ring that is very easy to use. It is a ring that is made of a very strong material, and it is a ring that is very easy to use. It is a ring that is made of a very strong material, and it is

