In [1]:
!pip install python-dotenv
!pip install -U transformers
!pip install accelerate -U
!pip install bitsandbytes
!pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
# For M2 users who suffer from the issue of not detecting GPU:
# 1- install pytorch-nightly version (supports GPU acceleration for Apple Silicon GPUs)
# 2- install transformers == 4.31
# 3- install accelerate and biysandbytes (I installed from github)
# 4- check if torch recognizes your device (print(torch.backends.mps.is_available()) should return True)
# 5- set the device type to 'mps' in AutoModelForCausalLM.from_pretrained(): AutoModelForCausalLM.from_pretrained(device_map='mps')

Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Using cached python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Collecting transformers
  Using cached transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Using cached huggingface_hub-0.24.6-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Using cached safetensors-0.4.4-cp310-cp310-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Using cached tokenizers-0.19.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting tqdm>=4.27 (from transformers)
  Using cached tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
Using cached transformers-4.44.2-py3-none-any.whl (9.5 MB)
Using cached huggingface_hub-0.24.6-py3-none-any.whl (417 kB)
Using cached safetensors-0.4.4-cp310-

## Model

In [51]:
import torch
import os
from transformers import AutoTokenizer

os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

model_id = "Bahasalab/Bahasa-4b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_id)


device = 'mps' if torch.backends.mps.is_available() else 'cpu'
print(f"using {device}")

using mps


In [52]:
from transformers import BitsAndBytesConfig, AutoModelForCausalLM


bnb_config = BitsAndBytesConfig(
#load_in_4bit=True,
load_in_4bit=False,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)


model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # quantization_config=bnb_config, 
    trust_remote_code=True
).to(device)

model.eval()

Downloading shards: 100%|██████████| 2/2 [09:39<00:00, 289.65s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:29<00:00, 14.62s/it]


RuntimeError: MPS backend out of memory (MPS allocated: 18.09 GB, other allocations: 9.52 MB, max allowed: 18.13 GB). Tried to allocate 1.45 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [48]:
import torch

def inference(question: str, context: str):
    # Construct the prompt based on the question and context
    if context is None or context == "":
        formatted_prompt = f"Berikan jawaban hukum yang rinci untuk pertanyaan berikut: {question}"
    else:
        formatted_prompt = f"Berdasarkan konteks berikut, berikan jawaban hukum yang rinci. Konteks: {context}. Pertanyaan: {question}"
    
    # Encode the prompt
    inputs = tokenizer.encode(
        formatted_prompt, add_special_tokens=True, return_tensors="pt"
    ).to(device)
    
    # Generate response using the model
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=30,  # Increased token limit for potentially more detailed answers
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,  # Handle padding correctly
            repetition_penalty=16.2,  # Penalize repetition to avoid repeated phrases
        )
    
    # Decode the output and clean up the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    
    return response

# Example usage
question = "Apa itu HAM"
context = ""
print(inference(question=question, context=context))

Berikan jawaban hukum yang rinci untuk pertanyaan berikut: Apa itu HAMablein fasilitasinumkanlahgai kulkasyal Bumi bahwaun bumiumnya PRjagaava… Jagonusidalj olehnyajilunggunya space Driveadirayangkan katakanlahasib


## Document Loading and Splitting

In [None]:
!pip install pypdf
!pip install tiktoken



In [None]:
from langchain.document_loaders import PyPDFLoader

loaders = [
    PyPDFLoader("/home/eversberg/Downloads/1706.03762.pdf"),
    PyPDFLoader("/home/eversberg/Downloads/2005.11401.pdf"),
]
pages = []
for loader in loaders:
    pages.extend(loader.load())

In [None]:
from langchain.text_splitter import TokenTextSplitter

text_splitter = TokenTextSplitter(chunk_size=128, chunk_overlap=12)
docs = text_splitter.split_documents(pages)

In [None]:
print(docs[0].page_content)

## Embeddings and Vector Store

In [None]:
!pip install -U sentence-transformers

In [None]:
import numpy as np
from langchain_community.embeddings import (
    HuggingFaceEmbeddings
)
encoder = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L12-v2', model_kwargs = {'device': "cpu"})