In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

In [None]:
# Checkpoint on the Hugging Face Hub that both the tokenizer and the model are loaded from.
model_id = "Willy030125/CiptakerLM-v1"

# Per-device memory budget: one "15000MB" entry for every CUDA device PyTorch can see.
n_gpus = torch.cuda.device_count()
max_memory = dict.fromkeys(range(n_gpus), "15000MB")

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Weights are loaded in bfloat16.
# NOTE(review): device_map="cuda" presumably places the whole model on a single
# device, in which case the max_memory budget may have no effect — confirm
# whether device_map="auto" was intended.
# NOTE(review): use_cache=False is passed through to the model config; TODO confirm
# it is wanted for inference, since generate() below does long beam-search decoding.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    max_memory=max_memory,
    use_cache=False,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
def create_instruction(instruction):
    """Wrap ``instruction`` in the ``### Human: ... ### Assistant: `` prompt template.

    Args:
        instruction: Text to place in the human turn of the prompt.

    Returns:
        The formatted prompt string, ending with the assistant marker and a
        trailing space.
    """
    return f"### Human: {instruction} ### Assistant: "

In [None]:
def generate(
    instruction,
    max_new_tokens=2048,
    temperature=0.1,
    top_p=0.95,
    top_k=40,
    num_beams=4,
    **kwargs
):
    """Generate the model's answer to a single instruction.

    The instruction is wrapped with ``create_instruction``, tokenized, and fed
    to the module-level ``model``; only the newly generated text is returned.

    Args:
        instruction: The user's question or instruction.
        max_new_tokens: Maximum number of tokens to generate after the prompt.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Number of highest-probability tokens kept for sampling.
        num_beams: Number of beams used during decoding.
        **kwargs: Extra ``GenerationConfig`` fields. These take precedence over
            the defaults set here (including ``do_sample``).

    Returns:
        The assistant's reply with surrounding whitespace stripped.
    """
    prompt = create_instruction(instruction)

    # Follow the model's device instead of hard-coding "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Merge caller overrides into the defaults first, so a caller-supplied
    # ``do_sample`` no longer raises "got multiple values for keyword argument".
    config_fields = {
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "num_beams": num_beams,
        "do_sample": True,
    }
    config_fields.update(kwargs)
    generation_config = GenerationConfig(**config_fields)

    # Some tokenizers define no pad token; fall back to EOS rather than passing None.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Per-step scores are not requested: nothing here reads them, and keeping
    # them for up to ``max_new_tokens`` steps only costs memory.
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            generation_config=generation_config,
            pad_token_id=pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            return_dict_in_generate=True,
            max_new_tokens=max_new_tokens,
            early_stopping=True
        )

    # Decode only the tokens produced after the prompt. Splitting the full
    # decoded text on the marker fails with IndexError when the marker does not
    # survive decoding, and returns prompt text when the instruction itself
    # contains "### Assistant:".
    prompt_length = input_ids.shape[-1]
    new_tokens = generation_output.sequences[0][prompt_length:]
    reply = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # As before, drop anything after a further assistant marker the model may emit.
    return reply.split("### Assistant:")[0].strip()

In [None]:
# Smoke-test generate() directly with one sample question and print the answer.
instruction = "Apa sanksi bagi pengusaha yang melanggar ketentuan dalam Pasal 42 ayat (2) tentang pekerja asing?"
print(generate(instruction))

Pengusaha dapat dikenai sanksi pidana penjara 1-4 tahun dan/atau denda antara Rp100.000.000 hingga Rp400.000.000.


### Local REST API host

In [None]:
from flask import Flask, request, jsonify
import threading

In [None]:
# Create the Flask application object; the /generate route is registered on it.
app = Flask(__name__)

@app.route("/generate", methods=["POST"])
def generate_text():
    """Handle ``POST /generate``: run the model on the prompt in the JSON body.

    Expects a JSON object such as ``{"prompt": "..."}``; a missing ``prompt``
    defaults to the empty string.

    Returns:
        200 with ``{"generated_text": ...}`` on success,
        400 with ``{"error": ...}`` when the body is not a JSON object,
        500 with ``{"error": ...}`` when generation fails.
    """
    # silent=True returns None for a missing, malformed, or non-JSON body, so
    # the request can be answered with a 400 instead of an error that is
    # reported with status 200.
    input_data = request.get_json(silent=True)
    if not isinstance(input_data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    prompt = input_data.get("prompt", "")

    try:
        generated_text = generate(prompt)
    except Exception as e:
        # Keep the traceback in the server log and signal the failure through
        # the HTTP status code rather than a 200 response.
        app.logger.exception("Text generation failed")
        return jsonify({"error": str(e)}), 500

    # Return the generated text as a JSON response
    return jsonify({"generated_text": generated_text})

In [None]:
def run_flask():
    """Run the Flask ``app`` on port 5000, listening on all network interfaces."""
    # The reloader is switched off to avoid running multiple instances.
    app.run(use_reloader=False, port=5000, host="0.0.0.0")

# Start Flask in a background thread so the server does not block this cell
# and later cells can send requests to it.
# NOTE(review): the thread is non-daemon, and re-running this cell starts a
# second server on the same port — confirm this is intended.
flask_thread = threading.Thread(target=run_flask)
flask_thread.start()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


In [None]:
# Call the local /generate endpoint with a JSON body containing the sample prompt.
!curl -X POST http://localhost:5000/generate -H "Content-Type: application/json" -d '{"prompt": "Apa sanksi bagi pengusaha yang melanggar ketentuan dalam Pasal 42 ayat (2) tentang pekerja asing?"}'

INFO:werkzeug:127.0.0.1 - - [09/Sep/2024 17:10:33] "POST /generate HTTP/1.1" 200 -


{"generated_text":"Pengusaha dapat dikenai sanksi pidana penjara 1-4 tahun dan/atau denda antara Rp100.000.000 hingga Rp400.000.000."}
