# Inference on Colab

In [1]:
import transformers
import torch
import time



model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Build the chat text-generation pipeline once, outside the timed region, so
# that the measurement below covers generation only (not pipeline construction).
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

# Chat-format input: a system message followed by the user's correction request.
messages = [
    {"role": "system", "content": "You are a helpful assistant!"},
    {"role": "user", "content": "Correct the mistakenly seperated words in the following text. Output only the corrected text without anything except than the corrected text: Lo Stato e la Chiesa ca ttolica sono molto importanti nella storia italiana. Puo capitare che questi errori si ve rifichino."},
]

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

outputs = pipeline(
    messages,
    max_new_tokens=256,
    # Text correction should be reproducible, so decode greedily instead of
    # relying on whatever sampling settings the checkpoint's generation config
    # ships with. The sampling-only knobs are unset so they do not conflict
    # with do_sample=False.
    do_sample=False,
    temperature=None,
    top_p=None,
)

end_time = time.perf_counter()

# Elapsed generation time, converted from seconds to minutes.
duration_minutes = (end_time - start_time) / 60

# The pipeline returns the whole conversation; the last entry is the
# assistant's reply message (a dict with "role" and "content").
print(outputs[0]["generated_text"][-1])
print(f"Inference took {duration_minutes:.2f} minutes.")


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'role': 'assistant', 'content': 'Lo Stato e la Chiesa cattolica sono molto importanti nella storia italiana. Può capitare che questi errori si ripetano.'}
Inference took 0.78 minutes.


# Serverless Inference using Lamini

In [2]:
!pip install -q --upgrade lamini

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/682.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m682.2/682.2 kB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/404.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.9/404.9 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/197.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.4/197.4 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/41.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import lamini
from google.colab import userdata

# Read the API key from Colab's secret store instead of hardcoding it.
lamini.api_key = userdata.get('LAMINI_API_KEY')
llm = lamini.Lamini("meta-llama/Meta-Llama-3.1-8B-Instruct")

# The instruction sent as the user turn.
user_prompt = "Correct the mistakenly seperated words in the following text. Output only the corrected text without anything except than the corrected text: Lo Stato e la Chiesa ca ttolica sono molto importanti nella storia italiana. Puo capitare che questi errori si ve rifichino."

# Hand-assembled Llama 3 chat prompt. Each header is followed by a blank line,
# and `<|eot_id|>` is followed immediately by the next header with no newline
# in between, as in the documented Llama 3 prompt format.
prompt = (
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
    f"{user_prompt}<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
)

print(llm.generate(prompt))

Lo Stato e la Chiesa cattolica sono molto importanti nella storia italiana. Può capitare che questi errori si ripetano.
