In [1]:
!pip -q install gdown

In [2]:
!gdown https://drive.google.com/uc?id=12R898xmbgA0e41b6PGBVM0trb8wfn7lY

Downloading...
From (original): https://drive.google.com/uc?id=12R898xmbgA0e41b6PGBVM0trb8wfn7lY
From (redirected): https://drive.google.com/uc?id=12R898xmbgA0e41b6PGBVM0trb8wfn7lY&confirm=t&uuid=807e1b28-d6ca-40fd-b11d-bc0fea8648ae
To: /workspace/peft-english_to_persian_gemma2.zip
100%|█████████████████████████████████████████| 206M/206M [00:02<00:00, 101MB/s]


In [3]:
!unzip peft-english_to_persian_gemma2.zip

Archive:  peft-english_to_persian_gemma2.zip
  inflating: peft-english_to_persian_gemma2/README.md  
  inflating: peft-english_to_persian_gemma2/adapter_model.safetensors  
  inflating: peft-english_to_persian_gemma2/adapter_config.json  
  inflating: peft-english_to_persian_gemma2/tokenizer_config.json  
  inflating: peft-english_to_persian_gemma2/special_tokens_map.json  
  inflating: peft-english_to_persian_gemma2/tokenizer.json  


# Import libraries

In [13]:
from transformers import AutoTokenizer
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM
from peft import PeftModel
import torch
import pandas as pd

In [6]:
# Directory produced by unpacking the adapter archive; it holds the adapter weights
# and the tokenizer files.
peft_model_path = "./peft-english_to_persian_gemma2"

# Tensors created for the model are sent to the GPU when torch can see one.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# 4-bit NF4 quantization for the stored weights, with bfloat16 as the compute dtype.
bnbConfig = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# The tokenizer is read from the adapter directory rather than from the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(peft_model_path)

# Quantized base model; device placement is left to `device_map="auto"`.
base_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b-it",
    quantization_config=bnbConfig,
    device_map="auto",
)

# Wrap the base model with the fine-tuned English-to-Persian adapter.
peft_model = PeftModel.from_pretrained(base_model, peft_model_path)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

# Load Data

In [8]:
def load_text(path):
    """Read a UTF-8 text file and return its non-blank lines.

    Args:
        path: Location of the text file to read.

    Returns:
        A list with one entry per line that still has content after
        surrounding whitespace is stripped; blank lines are omitted.
    """
    kept_lines = []
    with open(path, 'r', encoding='utf-8') as handle:
        # Stream the file line by line instead of materialising it first.
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                kept_lines.append(stripped)
    return kept_lines


# English sources and Persian references; entry i of one list is later paired with
# entry i of the other.
source_texts = load_text('ak-test-1k.en')
reference_texts = load_text('ak-test-1k.fa')

# load_text drops blank lines from each file independently, so a stray blank line in
# only one file would shift every later pair. Fail loudly instead of scoring misaligned data.
assert len(source_texts) == len(reference_texts), (
    f"Parallel files are misaligned: {len(source_texts)} source lines "
    f"vs {len(reference_texts)} reference lines"
)

len(source_texts), len(reference_texts)

(1000, 1000)

In [9]:
# Restrict the evaluation to the first EVAL_SAMPLE_SIZE sentence pairs.
# Both lists are cut at the same index so the pairs stay aligned.
EVAL_SAMPLE_SIZE = 100

source_texts = source_texts[:EVAL_SAMPLE_SIZE]
reference_texts = reference_texts[:EVAL_SAMPLE_SIZE]

# Evaluate

In [11]:
from tqdm.auto import tqdm  # picks the notebook widget when available, plain text otherwise
import evaluate

bleu = evaluate.load("bleu")

MAX_LENGTH = 512        # token cap for the tokenized prompt; longer prompts are truncated
MAX_NEW_TOKENS = 100    # token cap for each generated translation
MAX_EVAL_PAIRS = 500    # upper bound on the number of sentence pairs that get scored

# Sources and references are consumed pairwise; unequal lengths mean the data is misaligned.
assert len(source_texts) == len(reference_texts), (
    f"Expected aligned corpora, got {len(source_texts)} sources "
    f"and {len(reference_texts)} references"
)

# Apply the cap up front so the progress bar total matches the work actually done.
eval_pairs = list(zip(source_texts, reference_texts))[:MAX_EVAL_PAIRS]

predictions = []         # model translations, one string per pair
references = []          # one single-element list of reference strings per pair
source_texts_list = []   # the English inputs that were actually translated

with torch.no_grad():
    for source_text, reference_text in tqdm(eval_pairs, desc="Processing translations"):
        # NOTE(review): this prompt carries eight leading spaces on each non-empty line,
        # whereas the prompt in translate_english_to_persian has none. It is kept
        # byte-identical here; confirm which layout matches the fine-tuning data.
        prompt = (
            "\n"
            "        Translate the following English text to Persian:\n"
            f"        English: {source_text}\n"
            "\n"
            "        Persian translation:"
        )

        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
        ).to(device)

        # Number of prompt tokens; everything after this offset is newly generated.
        input_token_length = inputs["input_ids"].shape[1]

        # Pass the attention mask explicitly instead of leaving generate() to infer it.
        full_output_ids = peft_model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,  # deterministic
        )[0]

        # Keep only the continuation so the prompt is not scored as part of the translation.
        output_only_ids = full_output_ids[input_token_length:]
        prediction_text = tokenizer.decode(
            output_only_ids,
            skip_special_tokens=True,
        ).strip()

        predictions.append(prediction_text)
        references.append([reference_text])
        source_texts_list.append(source_text)


results = bleu.compute(predictions=predictions, references=references)

print(f"Recomputed BLEU Score: {results['bleu']:.4f}")

Processing translations:   0%|          | 0/100 [00:00<?, ?it/s]

Recomputed BLEU Score: 0.1034


In [14]:
# Side-by-side preview of the first ten evaluated sentences: English source,
# Persian reference and the model's translation.
df_samples = pd.DataFrame(
    data={
        'Source (English)': source_texts_list[0:10],
        # Each entry of `references` is a one-element list; unwrap it for display.
        'Reference (Persian)': [reference_group[0] for reference_group in references[0:10]],
        'Prediction (Gemma)': predictions[0:10],
    }
)

df_samples

Unnamed: 0,Source (English),Reference (Persian),Prediction (Gemma)
0,Politics without the imagination is bureaucrac...,سياست بدون تخیل، چيزي نيست به جز كاغذ بازي، ول...,سیاست بدون تخیل بوروکراسی است ، اما تخیل هرگز ...
1,The shantytowns built on the outskirts of Lima...,زاغه‌هايي كه در حومه‌هاي شهر ليما، پايتخت پرو ...,حومه های ساخته شده در حومه لیمای پرو ، محصول ج...
2,Populated by underemployed laborers from the c...,اين شهرك‌ها مملوند از كارگراني بي كار که از لي...,توسط کارگران بیکار از شهر و کشاورزان آواره از ...
3,a number of these shantytowns were originally ...,بسياري از اين زاغه‌ها در دهه هفتاد به وجود آمد...,تعدادی از این شهرک های شantytown در ابتدا در د...
4,"In 2002, one of these shantytowns, named Venta...","در سال 2002 ، يكي از اين زاغه‌ها به نام ""ونتان...",در سال 2002 ، یکی از این محله های فقیرنشین به ...
5,The surrounding landscape is desert.,پيرامون اين شهر چيزي به جز صحرا نيست,چشم انداز اطراف صحرا است .
6,"To move a mountain on this landscape, Francis ...","براي جا به جا كردن آن ""فرانسيس آليس""، به هر يك...",برای جابجایی کوهی در این منظره ، فرانسیس الیس ...
7,or as he described it in an interview publishe...,يا به گفته خود آليس در مصاحبه‌اش با مجله آرت ف...,یا همان طور که او در مصاحبه ای که در آرت فوروم...
8,This human comb pushed a certain quantity of s...,اين شانه انساني توانست مقدار زيادي شن را جابجا...,این شانه انسانی مقدار معینی از شن را به یک فاص...
9,This combination of poetic vagueness and preci...,اين تركيب مبهم شاعرانه همراه با دستورعملي دقيق...,این ترکیب از ابهام شاعرانه و دستورالعمل دقیق د...


In [16]:
def translate_english_to_persian(
    english_text: str,
    model,
    tokenizer,
    device,
    max_input_length: int = 512,  # token cap for the whole prompt (instruction + English text)
    max_new_tokens: int = 100     # token cap for the generated translation
) -> str:
    """Translate a single English string to Persian with a generative model.

    The text is wrapped in a fixed instruction prompt, tokenized, and passed to
    ``model.generate`` with sampling disabled. Only the tokens produced after
    the prompt are decoded and returned.

    Args:
        english_text: The English text to translate.
        model: Object exposing ``generate(input_ids=..., attention_mask=..., ...)``.
        tokenizer: Callable tokenizer that returns ``input_ids`` and
            ``attention_mask`` and provides ``decode`` and ``eos_token_id``.
        device: Device the tokenized inputs are moved to before generation.
        max_input_length: Prompts longer than this many tokens are truncated.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The decoded continuation with surrounding whitespace removed, or an
        empty string when ``english_text`` is empty or whitespace-only.
    """
    # Nothing to translate: skip the model call instead of letting it invent output.
    if not english_text or not english_text.strip():
        return ""

    # Same characters as the original triple-quoted prompt, spelled out line by line.
    prompt = (
        "\n"
        "Translate the following English text to Persian:\n"
        f"English: {english_text}\n"
        "\n"
        "Persian translation:"
    )

    # A single prompt needs no padding; truncation keeps it within max_input_length.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
    ).to(device)

    # Number of prompt tokens; everything after this offset is newly generated.
    input_token_length = inputs["input_ids"].shape[1]

    # Greedy decoding without gradient tracking.
    with torch.no_grad():
        full_output_ids = model.generate(
            input_ids=inputs["input_ids"],
            # Pass the mask explicitly instead of leaving generate() to infer it.
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )[0]  # first (and only) sequence in the batch

    # Drop the prompt tokens so only the translation is decoded.
    output_only_ids = full_output_ids[input_token_length:]
    translated_text = tokenizer.decode(
        output_only_ids,
        skip_special_tokens=True,
    )

    return translated_text.strip()


In [17]:
# --- Example Usage ---
english_sentence = "Machine learning is a field of artificial intelligence."
translate_english_to_persian(english_sentence, peft_model, tokenizer, device)


'یادگیری ماشین یک حوزه از هوش مصنوعی است .**'