In [5]:
%pip install polars datasets sentencepiece transformers sentence_transformers unbabel-comet keras keras-nlp scikit-learn jax tensorflow tensorflow-text

  pid, fd = os.forkpty()


Collecting tensorflow-text
  Downloading tensorflow_text-2.18.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (1.8 kB)
Downloading tensorflow_text-2.18.1-cp310-cp310-macosx_11_0_arm64.whl (6.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: tensorflow-text
Successfully installed tensorflow-text-2.18.1
Note: you may need to restart the kernel to use updated packages.


# Set the backend

In [2]:
import os

# Runtime configuration, applied to the process environment in a single call.
os.environ.update(
    {
        # Disable HF Tokenizer parallelism
        "TOKENIZERS_PARALLELISM": "false",
        # Set the Keras backend
        "KERAS_BACKEND": "jax",
        # Avoid memory fragmentation on JAX backend.
        "XLA_PYTHON_CLIENT_MEM_FRACTION": "1.00",
    }
)

# Load the dataset

In [2]:
from datasets import load_dataset
import polars as pl

# Fetch the Czech-English configuration of the Europarl corpus.
ds = load_dataset("Helsinki-NLP/europarl", "cs-en")

# Convert the train split to polars and keep only the first 1000 rows.
df = ds["train"].to_polars().head(1000)

# Inspect the schema and a sample row.
print(df.schema, "\nFirst row:", df.head(1), sep="\n")

Schema([('translation', Struct({'cs': String, 'en': String}))])

First row:
shape: (1, 1)
┌─────────────────────────────────┐
│ translation                     │
│ ---                             │
│ struct[2]                       │
╞═════════════════════════════════╡
│ {"Následný postup na základě u… │
└─────────────────────────────────┘


Let's normalize the dataset

In [3]:
# Flatten the `translation` struct into one text column per language and
# record the character count of each text alongside it.
df_norm = df.select(
    en=pl.col("translation").struct.field("en"),
    cs=pl.col("translation").struct.field("cs"),
    cs_len=pl.col("translation").struct.field("cs").str.len_chars(),
    en_len=pl.col("translation").struct.field("en").str.len_chars(),
)

# Inspect the flattened schema and a sample row.
print(df_norm.schema, "\nFirst row:", df_norm.head(1), sep="\n")

Schema([('en', String), ('cs', String), ('cs_len', UInt32), ('en_len', UInt32)])

First row:
shape: (1, 4)
┌─────────────────────────────────┬─────────────────────────────────┬────────┬────────┐
│ en                              ┆ cs                              ┆ cs_len ┆ en_len │
│ ---                             ┆ ---                             ┆ ---    ┆ ---    │
│ str                             ┆ str                             ┆ u32    ┆ u32    │
╞═════════════════════════════════╪═════════════════════════════════╪════════╪════════╡
│ Action taken on Parliament's r… ┆ Následný postup na základě usn… ┆ 57     ┆ 53     │
└─────────────────────────────────┴─────────────────────────────────┴────────┴────────┘


Now we need to preprocess the data to achieve high-quality results.

First, let's clean the data.

In [4]:
def clean_texts(df, min_len=3, max_len_ratio=2.5):
    """
    Normalise whitespace in both languages and drop rows that look unusable.

    Two columns are added, `en_clean` and `cs_clean`: the `en` / `cs` text with
    every run of whitespace replaced by a single space and the ends stripped.

    A row is kept only when ALL of the following hold:
      * neither cleaned text consists solely of ASCII letters
        (regex `^[a-zA-Z]+$`, i.e. a single unaccented word);
      * `cs_len` and `en_len` are both at least `min_len`;
      * neither length is more than `max_len_ratio` times the other.

    The length checks read the `cs_len` / `en_len` columns already present in
    `df`; they are not recomputed from the cleaned text.

    Args:
        df: polars DataFrame with columns `en`, `cs`, `en_len` and `cs_len`.
        min_len: minimum value required of both length columns (default 3).
        max_len_ratio: largest allowed ratio between the two lengths, in
            either direction (default 2.5).

    Returns:
        The filtered DataFrame with the extra `en_clean` / `cs_clean` columns.
    """
    whitespace_run = r"\s+"
    ascii_letters_only = r"^[a-zA-Z]+$"

    cleaned = df.with_columns(
        [
            # Clean the English text
            pl.col("en")
            .str.replace_all(whitespace_run, " ")
            .str.strip_chars()
            .alias("en_clean"),
            # Clean the Czech text
            pl.col("cs")
            .str.replace_all(whitespace_run, " ")
            .str.strip_chars()
            .alias("cs_clean"),
        ]
    )

    return cleaned.filter(
        # Drop rows where either text is made up only of the letters a-z / A-Z.
        # NOTE(review): the original comment read "non a-Z characters", which is
        # not what this pattern matches - confirm the intended rule.
        ~pl.col("cs_clean").str.contains(ascii_letters_only)
        & ~pl.col("en_clean").str.contains(ascii_letters_only)
        # Filter out very short texts
        & (pl.col("cs_len") >= min_len)
        & (pl.col("en_len") >= min_len)
        # This helps remove poor quality or misaligned translations
        & (pl.col("cs_len") / pl.col("en_len") <= max_len_ratio)  # Czech text not too long compared to English
        & (pl.col("en_len") / pl.col("cs_len") <= max_len_ratio)  # English text not too long compared to Czech
    )

# Report the shape before cleaning.
print(f"Dataset shape: {df_norm.shape}")
# NOTE: df_norm is rebound to the cleaned frame, so re-running this cell
# applies clean_texts to the already-cleaned data.
df_norm = clean_texts(df_norm)
# Report the shape after cleaning and preview the first remaining row.
print(f"Dataset shape after cleaning: {df_norm.shape}")
print(df_norm.head(1))

Dataset shape: (1000, 4)
Dataset shape after cleaning: (840, 6)
shape: (1, 6)
┌────────────────────┬───────────────────┬────────┬────────┬───────────────────┬───────────────────┐
│ en                 ┆ cs                ┆ cs_len ┆ en_len ┆ en_clean          ┆ cs_clean          │
│ ---                ┆ ---               ┆ ---    ┆ ---    ┆ ---               ┆ ---               │
│ str                ┆ str               ┆ u32    ┆ u32    ┆ str               ┆ str               │
╞════════════════════╪═══════════════════╪════════╪════════╪═══════════════════╪═══════════════════╡
│ Action taken on    ┆ Následný postup   ┆ 57     ┆ 53     ┆ Action taken on   ┆ Následný postup   │
│ Parliament's r…    ┆ na základě usn…   ┆        ┆        ┆ Parliament's r…   ┆ na základě usn…   │
└────────────────────┴───────────────────┴────────┴────────┴───────────────────┴───────────────────┘


Now, let's tokenize the data.


In [5]:
from transformers import MBartTokenizer

# Load the pretrained mBART tokenizer; tokenize_texts below reads it through this global name.
tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")

def tokenize_texts(df):
    """
    Add token columns for both languages.

    Every value of `en_clean` and `cs_clean` is encoded with the global
    `tokenizer`; the results are stored in `en_tokens` and `cs_tokens`.

    NOTE(review): the encoder is asked for PyTorch tensors
    (`return_tensors="pt"`), so the new columns hold Python objects rather
    than a native polars list type - confirm this is what downstream code needs.
    """

    def _encode(text):
        # One text in, one tensor of token ids out.
        return tokenizer.encode(text, return_tensors="pt")

    # (source column, destination column) for each language.
    column_pairs = (
        ("en_clean", "en_tokens"),
        ("cs_clean", "cs_tokens"),
    )
    token_columns = [
        pl.col(source).map_elements(_encode).alias(target)
        for source, target in column_pairs
    ]
    return df.with_columns(token_columns)

# Tokenize the texts
# NOTE: df_norm is rebound here to the frame with the added token columns.
df_norm = tokenize_texts(df_norm)
# Report the new shape and preview the first row.
print(f"Tokenized dataset shape: {df_norm.shape}")
print(df_norm.head(1))

Tokenized dataset shape: (840, 8)
shape: (1, 8)
┌─────────────┬─────────────┬────────┬────────┬─────────────┬────────────┬────────────┬────────────┐
│ en          ┆ cs          ┆ cs_len ┆ en_len ┆ en_clean    ┆ cs_clean   ┆ en_tokens  ┆ cs_tokens  │
│ ---         ┆ ---         ┆ ---    ┆ ---    ┆ ---         ┆ ---        ┆ ---        ┆ ---        │
│ str         ┆ str         ┆ u32    ┆ u32    ┆ str         ┆ str        ┆ object     ┆ object     │
╞═════════════╪═════════════╪════════╪════════╪═════════════╪════════════╪════════════╪════════════╡
│ Action      ┆ Následný    ┆ 57     ┆ 53     ┆ Action      ┆ Následný   ┆ tensor([[  ┆ tensor([[2 │
│ taken on    ┆ postup na   ┆        ┆        ┆ taken on    ┆ postup na  ┆ 57945,     ┆ 40806,     │
│ Parliament' ┆ základě     ┆        ┆        ┆ Parliament' ┆ základě    ┆ 39958,     ┆ 1673,      │
│ s r…        ┆ usn…        ┆        ┆        ┆ s r…        ┆ usn…       ┆ 9…         ┆ 2098…      │
└─────────────┴─────────────┴────────┴─────



Let's filter out the rows based on similarity.

In [6]:
import torch
from sentence_transformers import SentenceTransformer

# Load the LaBSE sentence-embedding model; filter_similar_texts below reads it via the global name `model`.
# NOTE: a later cell rebinds `model` to the COMET checkpoint, so filter_similar_texts
# must not be called again after that cell without re-running this line.
model = SentenceTransformer('sentence-transformers/LaBSE')

def filter_similar_texts(df, threshold=0.8, batch_size=100):
    """
    Keep only the rows whose Czech and English texts embed close together.

    `cs_clean` and `en_clean` are encoded with the global `model`,
    `batch_size` rows at a time. The cosine similarity of each pair is stored
    in a new `similarity` column, and only rows whose similarity is strictly
    greater than `threshold` are returned.
    """
    print(f"Filtering out rows with similarity less than {threshold}")
    print(f"Processing in batches of {batch_size} rows")

    czech_texts = df['cs_clean'].to_list()
    english_texts = df['en_clean'].to_list()

    # Use GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    pair_scores = []
    for start in range(0, len(czech_texts), batch_size):
        stop = start + batch_size

        # Embed the two sides of the current batch without tracking gradients.
        with torch.no_grad():
            czech_vectors = model.encode(czech_texts[start:stop], convert_to_tensor=True)
            english_vectors = model.encode(english_texts[start:stop], convert_to_tensor=True)

        # Row-wise cosine similarity between matching Czech/English embeddings.
        batch_scores = torch.nn.functional.cosine_similarity(czech_vectors, english_vectors, dim=1)
        pair_scores.extend(batch_scores.cpu().numpy())

    scored = df.with_columns(pl.Series("similarity", pair_scores))
    return scored.filter(pl.col("similarity") > threshold)

# Report the shape before the similarity filter.
print(f"Dataset shape: {df_norm.shape}")
# NOTE: df_norm is rebound to the filtered frame (default threshold and batch size).
df_norm = filter_similar_texts(df_norm)
# Report the shape after filtering and preview the first remaining row.
print(f"Filtered dataset shape: {df_norm.shape}")
print(df_norm.head(1))

Dataset shape: (840, 8)
Filtering out rows with similarity less than 0.8
Processing in batches of 100 rows
Filtered dataset shape: (780, 9)
shape: (1, 9)
┌────────────┬────────────┬────────┬────────┬───┬────────────┬────────────┬────────────┬───────────┐
│ en         ┆ cs         ┆ cs_len ┆ en_len ┆ … ┆ cs_clean   ┆ en_tokens  ┆ cs_tokens  ┆ similarit │
│ ---        ┆ ---        ┆ ---    ┆ ---    ┆   ┆ ---        ┆ ---        ┆ ---        ┆ y         │
│ str        ┆ str        ┆ u32    ┆ u32    ┆   ┆ str        ┆ object     ┆ object     ┆ ---       │
│            ┆            ┆        ┆        ┆   ┆            ┆            ┆            ┆ f32       │
╞════════════╪════════════╪════════╪════════╪═══╪════════════╪════════════╪════════════╪═══════════╡
│ Action     ┆ Následný   ┆ 57     ┆ 53     ┆ … ┆ Následný   ┆ tensor([[  ┆ tensor([[2 ┆ 0.800876  │
│ taken on   ┆ postup na  ┆        ┆        ┆   ┆ postup na  ┆ 57945,     ┆ 40806,     ┆           │
│ Parliament ┆ základě    ┆        ┆  

Now let's perform an alignment check. We will use COMET to check the quality of the alignment.

In [7]:
from huggingface_hub import snapshot_download
from comet import load_from_checkpoint
from tqdm import tqdm

# Load the COMET model
# The checkpoint is downloaded from the Hugging Face Hub and loaded from the snapshot's
# checkpoints/model.ckpt file.
# NOTE: this rebinds the global name `model`, which the similarity cell used for the
# LaBSE encoder; from here on `model` refers to the COMET model.
model_path = snapshot_download("Unbabel/wmt22-comet-da")
model_checkpoint_path = f"{model_path}/checkpoints/model.ckpt"
model = load_from_checkpoint(model_checkpoint_path)

def filter_by_quality(df, treshold=0.4, batch_size=100, threshold=None):
    """
    Filter out the rows whose COMET score is not above a cut-off.

    Each row is scored by the global COMET `model` with the Czech text as
    `src`, the English text as `mt` and no reference. The score is stored in a
    new `quality_score` column and only rows scoring strictly above the
    cut-off are returned.

    Args:
        df: polars DataFrame with `cs_clean` and `en_clean` columns.
        treshold: cut-off score (default 0.4). The misspelled name is kept so
            existing callers continue to work.
        batch_size: batch size handed to `model.predict`.
        threshold: correctly spelled alias for `treshold`, matching the
            parameter name of `filter_similar_texts`. When given, it takes
            precedence over `treshold`.

    Returns:
        The filtered DataFrame with the extra `quality_score` column.

    NOTE(review): every sample is passed with "ref": None - confirm that the
    loaded checkpoint is meant to be used without a reference.
    NOTE(review): the original notebook listed these interpretation bands;
    confirm they apply to the checkpoint that is actually loaded:
        < 0: Poor quality
        0 to 0.3: Fair quality
        0.3 to 0.6: Good quality
        > 0.6: Excellent quality
    """
    # Prefer the correctly spelled keyword when the caller supplied it.
    cutoff = treshold if threshold is None else threshold

    # Use GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Get texts
    cs_clean = df["cs_clean"].to_list()
    en_clean = df["en_clean"].to_list()

    # Prepare data for COMET
    data = [
        {"src": cs, "mt": en, "ref": None}
        for cs, en in zip(cs_clean, en_clean)
    ]

    # Get scores from COMET
    predictions = model.predict(data, batch_size=batch_size)
    scores = predictions.scores

    return df.with_columns(
        [pl.Series("quality_score", scores)]
    ).filter(pl.col("quality_score") > cutoff)


# Report the shape before the quality filter.
print(f"Dataset shape: {df_norm.shape}")
# NOTE: df_norm is rebound to the filtered frame (default cut-off and batch size).
df_norm = filter_by_quality(df_norm)
# Report the shape after filtering and preview the first ten remaining rows.
print(f"Filtered dataset shape: {df_norm.shape}")
print(df_norm.head(10))

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/f49d328952c3470eff6bb6f545d62bfdb6e66304/checkpoints/model.ckpt`
Encoder model frozen.
/Users/jirka/Library/Caches/pypoetry/virtualenvs/gemma-czech-adaptation-iLYBdvQN-py3.10/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Dataset shape: (780, 9)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 8/8 [00:04<00:00,  1.61it/s]


Filtered dataset shape: (630, 10)
shape: (10, 10)
┌────────────┬────────────┬────────┬────────┬───┬────────────┬────────────┬────────────┬───────────┐
│ en         ┆ cs         ┆ cs_len ┆ en_len ┆ … ┆ en_tokens  ┆ cs_tokens  ┆ similarity ┆ quality_s │
│ ---        ┆ ---        ┆ ---    ┆ ---    ┆   ┆ ---        ┆ ---        ┆ ---        ┆ core      │
│ str        ┆ str        ┆ u32    ┆ u32    ┆   ┆ object     ┆ object     ┆ f32        ┆ ---       │
│            ┆            ┆        ┆        ┆   ┆            ┆            ┆            ┆ f64       │
╞════════════╪════════════╪════════╪════════╪═══╪════════════╪════════════╪════════════╪═══════════╡
│ Action     ┆ Následný   ┆ 57     ┆ 53     ┆ … ┆ tensor([[  ┆ tensor([[2 ┆ 0.800876   ┆ 0.463111  │
│ taken on   ┆ postup na  ┆        ┆        ┆   ┆ 57945,     ┆ 40806,     ┆            ┆           │
│ Parliament ┆ základě    ┆        ┆        ┆   ┆ 39958,     ┆ 1673,      ┆            ┆           │
│ 's r…      ┆ usn…       ┆        ┆     

Now let's look at the distribution of the quality scores.

In [8]:
# Look at distribution
# Summarise `quality_score` over the rows that survived filtering:
# the three quartiles plus the mean, as a single-row frame.
print(
    df_norm.select(
        [
            pl.col("quality_score").quantile(0.25).alias("25th_percentile"),
            pl.col("quality_score").quantile(0.5).alias("median"),
            pl.col("quality_score").quantile(0.75).alias("75th_percentile"),
            pl.col("quality_score").mean().alias("mean"),
        ]
    )
)

shape: (1, 4)
┌─────────────────┬──────────┬─────────────────┬─────────┐
│ 25th_percentile ┆ median   ┆ 75th_percentile ┆ mean    │
│ ---             ┆ ---      ┆ ---             ┆ ---     │
│ f64             ┆ f64      ┆ f64             ┆ f64     │
╞═════════════════╪══════════╪═════════════════╪═════════╡
│ 0.432053        ┆ 0.456948 ┆ 0.52894         ┆ 0.49383 │
└─────────────────┴──────────┴─────────────────┴─────────┘


This means that the quality of the alignment is pretty good for most of the rows.

Now let's calculate the dropout rate.

In [9]:
# Dropout rate is the percentage of rows that are filtered out.
# `df` is the 1000-row sample taken right after loading; `df_norm` is what remains
# after the cleaning, similarity and quality filters.
dropout_rate = (len(df) - len(df_norm)) / len(df)
print(f"Dropout rate: {dropout_rate:.2%}")

Dropout rate: 37.00%


Now let's save the dataset to a parquet file for further processing.

In [10]:
# Select only the columns we can save to parquet (no tokens)
processed_df = df_norm.select(
    ["en_clean", "cs_clean", "cs_len", "en_len", "similarity", "quality_score"]
)

# Make sure the output folder exists first, so the write below does not fail
# on a fresh checkout where data/processed has not been created yet.
os.makedirs("data/processed", exist_ok=True)

# Now save to parquet
processed_df.write_parquet(
    file="data/processed/gemma_cs_clean.parquet", compression="zstd"
)

Now the dataset is ready for the next step. We will use it to train the model.

Before doing so, we will create a dataset format that is suitable for the model instruction format.

In [11]:
def instruction_format(x):
    """
    Render one translation pair as a two-turn prompt string.

    `x` is a mapping with the keys `en_clean` and `cs_clean`. The user turn
    holds a Czech instruction ("Translate this text from English to Czech.")
    followed by the English text in double quotes; the model turn holds the
    Czech text. Turns are delimited with `<start_of_turn>` / `<end_of_turn>`.
    """
    english_text = x["en_clean"]
    czech_text = x["cs_clean"]
    return (
        "<start_of_turn>user\n"
        "Přelož tento text z angličtiny do češtiny.\n"
        f'"{english_text}"<end_of_turn>\n'
        "<start_of_turn>model\n"
        f"{czech_text}<end_of_turn>"
    )

# Build the fine-tuning frame: the rendered instruction string first,
# followed by the two cleaned text columns it was built from.
train_df = processed_df.select(
    pl.struct(["en_clean", "cs_clean"])
    .map_elements(instruction_format, return_dtype=pl.Utf8)
    .alias("instruction"),
    "en_clean",
    "cs_clean",
)

print(train_df.head(5))

shape: (5, 3)
┌─────────────────────┬─────────────────────────────────┬─────────────────────────────────┐
│ instruction         ┆ en_clean                        ┆ cs_clean                        │
│ ---                 ┆ ---                             ┆ ---                             │
│ str                 ┆ str                             ┆ str                             │
╞═════════════════════╪═════════════════════════════════╪═════════════════════════════════╡
│ <start_of_turn>user ┆ Action taken on Parliament's r… ┆ Následný postup na základě usn… │
│ Přelož ten…         ┆                                 ┆                                 │
│ <start_of_turn>user ┆ Documents received: see Minute… ┆ Předložení dokumentů: viz zápi… │
│ Přelož ten…         ┆                                 ┆                                 │
│ <start_of_turn>user ┆ Written statements (Rule 116):… ┆ Písemná prohlášení (článek 116… │
│ Přelož ten…         ┆                                 ┆         

Now the dataset is in the instruction format. Before training, let's split it into training, validation and test sets and save each split.

In [12]:
from sklearn.model_selection import train_test_split

# First split into train and temp (80/20)
train_df_main, temp_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Then split temp into validation and test (50/50, resulting in 10/10 of original)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Make sure the output folder exists first, so the writes below do not fail
# on a fresh checkout where data/processed has not been created yet.
os.makedirs("data/processed", exist_ok=True)

# Save all splits to parquet files
train_df_main.write_parquet(
    file="data/processed/gemma_cs_train.parquet", compression="zstd"
)
valid_df.write_parquet(
    file="data/processed/gemma_cs_valid.parquet", compression="zstd"
)
test_df.write_parquet(
    file="data/processed/gemma_cs_test.parquet", compression="zstd"
)

# Report the size of each split.
print(f"Train size: {len(train_df_main)}")
print(f"Validation size: {len(valid_df)}")
print(f"Test size: {len(test_df)}")


Train size: 504
Validation size: 63
Test size: 63


# Load the model

In [1]:
import keras
import keras_nlp

# Name of the keras_nlp preset to load.
model_id = "gemma2_instruct_2b_en"

# Build the causal language model from the preset and print its layer/parameter summary.
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset(model_id)
gemma_lm.summary()

Downloading from https://www.kaggle.com/api/v1/models/keras/gemma2/keras/gemma2_instruct_2b_en/1/download/assets/tokenizer/vocabulary.spm...


100%|██████████| 4.04M/4.04M [00:00<00:00, 4.53MB/s]
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


# LoRA Fine-tuning