In [19]:
#import shutil
#shutil.rmtree('/content/sample_data', ignore_errors=True)

# DistilBERT Tagalog

In [20]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset
import tempfile
import subprocess
import sys

#  IMPORT LIBRARIES AND SETUP

In [21]:
def install_requirements():
    required_packages = [
        'transformers[torch]',
        'datasets',
        'torch',
        'pandas',
        'scikit-learn',
        'onnx',
        'onnxruntime',
        'optimum[onnxruntime]',  # This enables ORTModelForSequenceClassification
        'tensorflow',
    ]

    for package in required_packages:
        try:
            __import__(package.split('[')[0])
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])

In [22]:
print("Checking and installing dependencies...")
install_requirements()

Checking and installing dependencies...
Installing scikit-learn...


In [23]:
try:
    from optimum.onnxruntime import ORTModelForSequenceClassification
    from optimum.onnxruntime.configuration import OptimizationConfig
    ONNX_AVAILABLE = True
except ImportError:
    print("Optimum ONNX Runtime not available, ONNX export will be limited")
    ONNX_AVAILABLE = False

Optimum ONNX Runtime not available, ONNX export will be limited


In [24]:
print("All dependencies loaded successfully!")

All dependencies loaded successfully!


# LOAD AND PREPARE DATASET

In [25]:
def load_dataset(csv_path='dataset.csv'):
    print(f"Loading dataset from {csv_path}...")

    try:
        df = pd.read_csv(csv_path)
        print(f"Dataset loaded successfully. Shape: {df.shape}")

        # Validate required columns
        required_cols = ['text', 'label']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")

        # Clean and validate data
        df = df.dropna(subset=['text', 'label'])
        df['text'] = df['text'].astype(str)
        df['label'] = df['label'].astype(int)

        # Check label range
        unique_labels = sorted(df['label'].unique())
        num_labels = len(unique_labels)

        # Ensure labels are sequential starting from 0
        if unique_labels != list(range(num_labels)):
            print("Warning: Labels are not sequential starting from 0. Remapping...")
            label_mapping = {old_label: new_label for new_label, old_label in enumerate(unique_labels)}
            df['label'] = df['label'].map(label_mapping)
            print(f"Label mapping: {label_mapping}")

        print(f"Dataset validation complete. Clean shape: {df.shape}")
        print(f"Number of classes: {num_labels}")
        print(f"Label distribution:\n{df['label'].value_counts().sort_index()}")

        return df

    except FileNotFoundError:
        print(f"Error: Dataset file '{csv_path}' not found!")
        print("Creating a sample Tagalog dataset for demonstration...")
        return create_sample_dataset()

In [26]:
def create_sample_dataset():
    sample_data = {
        'text': [
            # Positive sentiment (label 0) - Tagalog
            'Napakaganda ng produktong ito, sulit na sulit!',
            'Sobrang galing ng serbisyo nila, highly recommended!',
            'Magandang kalidad at mabilis na delivery.',
            'Napakahusay ng customer support, user-friendly pa.',
            'Outstanding performance at great value for money.',
            'Masarap ang pagkain dito, babalik ako ulit.',
            'Magaling ang mga empleyado, napakabait.',
            'Sulit ang binayad ko, satisfied ako sa quality.',
            'Excellent ang service, walang reklamo.',
            'Napakaganda ng lugar, perfect para sa family.',
            'Masayang experience, hindi ako nagsisi.',
            'Mataas ang kalidad, worth it ang presyo.',
            'Napakabilis ng delivery, thank you!',
            'Sobrang ganda ng design, modern pa.',
            'Napakasarap ng lasa, uulitin ko to.',

            # Neutral sentiment (label 1) - Tagalog
            'Okay lang ang produkto, walang masama.',
            'Average lang ang kalidad, sakto sa presyo.',
            'Gumagana naman, walang problema.',
            'Standard features, typical sa price range na to.',
            'Normal lang ang delivery time, okay naman.',
            'Hindi masama, hindi rin maganda.',
            'Pwede na, sulit naman sa presyo.',
            'Okay lang ang service, walang special.',
            'Hindi ako disappointed, hindi rin excited.',
            'Sakto lang, meets expectations.',
            'Average ang lasa, hindi masama.',
            'Okay ang quality, walang problema.',
            'Standard lang ang service, normal.',
            'Hindi special pero okay naman.',
            'Pwede na, acceptable naman.',

            # Negative sentiment (label 2) - Tagalog
            'Sobrang pangit ng produkto, sayang ang pera.',
            'Mababa ang kalidad, nasira agad after one day.',
            'Napakasama ng customer service, worst ever.',
            'Sobrang bagal ng delivery at sira pa ang packaging.',
            'Hindi ko irerekumenda, very disappointing.',
            'Terrible ang experience, hindi sulit.',
            'Pangit ang lasa, hindi ko naubos.',
            'Mahal tapos pangit pa ang kalidad.',
            'Sobrang tagal ng antay, nainis ako.',
            'Hindi maganda ang service, rude pa ang staff.',
            'Nasira agad, walang kwentang produkto.',
            'Masamang experience, hindi ako babalik.',
            'Sobrang disappointing, sayang ang time.',
            'Pangit ang quality control, maraming defects.',
            'Hindi worth it ang presyo, panget pa.',
        ],
        'label': (
            [0] * 15 +  # Positive samples
            [1] * 15 +  # Neutral samples
            [2] * 15    # Negative samples
        )
    }

    df = pd.DataFrame(sample_data)
    df.to_csv('dataset.csv', index=False)
    print("Sample Tagalog sentiment dataset created and saved as 'dataset.csv'")
    print("Dataset includes various Tagalog text samples for sentiment classification")
    print("Labels: 0=Positive, 1=Neutral, 2=Negative")
    return df

In [27]:
def split_dataset(df, test_size=0.2, random_state=42):
    print(f"Splitting dataset: {test_size*100}% for validation...")

    X_train, X_val, y_train, y_val = train_test_split(
        df['text'].tolist(),
        df['label'].tolist(),
        test_size=test_size,
        random_state=random_state,
        stratify=df['label']
    )

    print(f"Training set: {len(X_train)} samples")
    print(f"Validation set: {len(X_val)} samples")

    return X_train, X_val, y_train, y_val

# TOKENIZATION AND PREPROCESSING

In [28]:
def create_tokenized_datasets(X_train, X_val, y_train, y_val, model_name):
    """Tokenize the datasets using the DistilBERT Tagalog tokenizer"""
    print("Loading DistilBERT Tagalog tokenizer and creating tokenized datasets...")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # DistilBERT uses BERT-style tokenization
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.unk_token if tokenizer.unk_token is not None else '[PAD]'

    # Create datasets
    train_dataset = Dataset.from_dict({
        'text': X_train,
        'labels': y_train
    })

    val_dataset = Dataset.from_dict({
        'text': X_val,
        'labels': y_val
    })

    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            truncation=True,
            padding=False,  # Will be handled by data collator
            max_length=256  # Increased for Tagalog text which can be longer
        )

    # Tokenize datasets
    train_tokenized = train_dataset.map(tokenize_function, batched=True)
    val_tokenized = val_dataset.map(tokenize_function, batched=True)

    print("Tokenization complete!")
    print(f"Sample tokenized text length: {len(train_tokenized[0]['input_ids'])}")

    return train_tokenized, val_tokenized, tokenizer

# MODEL TRAINING

In [29]:
def train_model(train_dataset, val_dataset, tokenizer, model_name, output_dir, num_labels):
    """Train the DistilBERT Tagalog model for classification"""
    print("Initializing DistilBERT Tagalog model for classification training...")

    # Load model with appropriate number of labels
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        problem_type="single_label_classification"
    )

    # Ensure model uses the correct pad_token_id
    if tokenizer.pad_token_id is not None:
        model.config.pad_token_id = tokenizer.pad_token_id

    # Data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Training arguments optimized for DistilBERT Tagalog
    training_args_dict = {
        'output_dir': output_dir,
        'num_train_epochs': 3,  # Standard epochs for DistilBERT
        'per_device_train_batch_size': 8,  # Smaller batch size (DistilBERT is larger than XtremeDistil)
        'per_device_eval_batch_size': 8,
        'learning_rate': 3e-5,  # Standard learning rate for DistilBERT
        'weight_decay': 0.01,
        'warmup_ratio': 0.1,
        'logging_dir': f'{output_dir}/logs',
        'logging_steps': 10,
        'save_total_limit': 2,
        'load_best_model_at_end': True,
        'metric_for_best_model': "eval_f1_macro",
        'greater_is_better': True,
        'report_to': [],
        'seed': 42,
        'dataloader_num_workers': 0,
        'remove_unused_columns': True,
        'fp16': True,  # Mixed precision for efficiency
        'dataloader_pin_memory': False,
        'gradient_checkpointing': True,  # Enable for memory efficiency
    }

    # Add version-specific parameters
    if hasattr(TrainingArguments, 'eval_strategy'):
        training_args_dict['eval_strategy'] = "epoch"
        training_args_dict['save_strategy'] = "epoch"
    else:
        training_args_dict['evaluation_strategy'] = "epoch"
        training_args_dict['save_strategy'] = "epoch"

    training_args = TrainingArguments(**training_args_dict)

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred

        # Handle different prediction formats
        if isinstance(predictions, tuple):
            predictions = predictions[0]

        # Convert to numpy array if it's a tensor
        if hasattr(predictions, 'numpy'):
            predictions = predictions.numpy()

        predictions = np.argmax(predictions, axis=1)

        # Calculate metrics
        accuracy = accuracy_score(labels, predictions)
        f1_macro = f1_score(labels, predictions, average='macro')
        f1_weighted = f1_score(labels, predictions, average='weighted')

        return {
            'accuracy': accuracy,
            'f1_macro': f1_macro,
            'f1_weighted': f1_weighted,
        }

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print("Starting training...")
    trainer.train()

    # Save the best model
    print(f"Saving model to {output_dir}...")
    trainer.save_model()
    tokenizer.save_pretrained(output_dir)

    return trainer, model

# MODEL EVALUATION

In [30]:
# Replace the evaluate_model function with this corrected version
def evaluate_model(trainer, X_val, y_val, output_dir, num_labels):
    print("Evaluating model performance...")
    import torch
    # Get predictions
    eval_results = trainer.evaluate()

    # Get detailed predictions for classification report
    predictions = trainer.predict(trainer.eval_dataset)

    # Extract predictions from the prediction object
    if hasattr(predictions, 'predictions'):
        preds = predictions.predictions
    else:
        preds = predictions[0]

    # Handle tuple output (logits, other outputs)
    if isinstance(preds, tuple):
        # Take the first element which should be the logits
        preds = preds[0]

    # Convert to numpy array if it's a tensor
    if hasattr(preds, 'numpy'):
        preds = preds.numpy()
    elif torch.is_tensor(preds):
        preds = preds.cpu().numpy()

    y_pred = np.argmax(preds, axis=1)

    # Generate classification report
    if num_labels == 2:
        target_names = ['Safe', 'NSFW']
    elif num_labels == 3:
        target_names = ['Positive', 'Neutral', 'Negative']
    else:
        target_names = [f'Class_{i}' for i in range(num_labels)]

    report = classification_report(
        y_val,
        y_pred,
        target_names=target_names,
        digits=4
    )

    # Calculate additional metrics
    accuracy = accuracy_score(y_val, y_pred)
    f1_macro = f1_score(y_val, y_pred, average='macro')
    f1_weighted = f1_score(y_val, y_pred, average='weighted')

    # Prepare metrics text
    metrics_text = f"""DistilBERT Tagalog Classification Model Evaluation Results
{'='*70}

Model: jcblaise/distilbert-tagalog-base-cased
Task: Tagalog Text Classification ({num_labels} classes)
Architecture: DistilBERT (6 layers, 768 hidden units) - Tagalog optimized

Performance Metrics:
{'-'*30}
Accuracy: {accuracy:.4f}
F1-Score (Macro): {f1_macro:.4f}
F1-Score (Weighted): {f1_weighted:.4f}

Classification Report:
{report}

Training Results:
{'-'*30}
"""

    for key, value in eval_results.items():
        if isinstance(value, (int, float)):
            metrics_text += f"{key}: {value:.4f}\n"

    # Save metrics
    os.makedirs('metrics', exist_ok=True)
    metrics_path = 'metrics/distilbert_tagalog_metrics.txt'

    with open(metrics_path, 'w') as f:
        f.write(metrics_text)

    print(f"Metrics saved to {metrics_path}")
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"F1-Score (Macro): {f1_macro:.4f}")

    return accuracy, report

# ONNX EXPORT

In [31]:
def export_to_onnx(model_dir, onnx_path):
    print("Exporting DistilBERT Tagalog model to ONNX format...")

    try:
        # Load the trained model
        model = AutoModelForSequenceClassification.from_pretrained(model_dir)
        tokenizer = AutoTokenizer.from_pretrained(model_dir)

        # Create dummy input with Tagalog sample
        dummy_input = tokenizer(
            "Ito ay isang sample na Tagalog text para sa ONNX export",
            return_tensors="pt",
            max_length=256,
            padding="max_length",
            truncation=True
        )

        # Export to ONNX with optimization for DistilBERT
        os.makedirs(os.path.dirname(onnx_path), exist_ok=True)
        torch.onnx.export(
            model,
            tuple(dummy_input.values()),
            onnx_path,
            export_params=True,
            opset_version=14,
            do_constant_folding=True,
            input_names=['input_ids', 'attention_mask'],
            output_names=['logits'],
            dynamic_axes={
                'input_ids': {0: 'batch_size', 1: 'sequence'},
                'attention_mask': {0: 'batch_size', 1: 'sequence'},
                'logits': {0: 'batch_size'}
            }
        )

        print(f"ONNX model exported to: {onnx_path}")

        # Get model size
        model_size = os.path.getsize(onnx_path) / (1024 * 1024)
        print(f"ONNX model size: {model_size:.2f} MB")

        return True

    except Exception as e:
        print(f"ONNX export failed: {e}")
        import traceback
        traceback.print_exc()
        return False

# TENSORFLOW LITE EXPORT

In [32]:
def export_to_tflite_from_pt(model_dir, tflite_path):
    try:
        import tensorflow as tf
        from transformers import TFAutoModelForSequenceClassification

        print("Converting DistilBERT Tagalog PyTorch model to TensorFlow...")

        # Load and convert to TensorFlow
        tf_model = TFAutoModelForSequenceClassification.from_pretrained(
            model_dir,
            from_pt=True
        )

        # Save as TensorFlow SavedModel
        tf_saved_model_dir = os.path.join(model_dir, "tf_saved_model")
        tf.saved_model.save(tf_model, tf_saved_model_dir)
        print(f"Saved intermediate TensorFlow model to {tf_saved_model_dir}")

        # Convert to TFLite with optimizations
        converter = tf.lite.TFLiteConverter.from_saved_model(tf_saved_model_dir)
        converter.optimizations = [tf.lite.Optimize.DEFAULT]

        # Additional optimizations for mobile deployment
        converter.target_spec.supported_types = [tf.float16]

        tflite_model = converter.convert()

        # Save TFLite model
        os.makedirs(os.path.dirname(tflite_path), exist_ok=True)
        with open(tflite_path, "wb") as f:
            f.write(tflite_model)

        # Get model size
        model_size = os.path.getsize(tflite_path) / (1024 * 1024)
        print(f"TFLite model successfully exported to: {tflite_path}")
        print(f"TFLite model size: {model_size:.2f} MB")

        return True

    except Exception as e:
        print(f"TensorFlow Lite export failed: {e}")
        import traceback
        traceback.print_exc()
        return False

# MAIN

In [33]:
def main():
    print("Starting DistilBERT Tagalog Text Classification Pipeline")
    print("="*70)

    # Configuration - Updated to use DistilBERT Tagalog model
    MODEL_NAME = "jcblaise/distilbert-tagalog-base-cased"
    OUTPUT_DIR = "models/distilbert_tagalog_classification"
    ONNX_PATH = "models/distilbert_tagalog_classification_model.onnx"
    TFLITE_PATH = "models/distilbert_tagalog_classification_model.tflite"

    # Create output directories
    os.makedirs("models", exist_ok=True)
    os.makedirs("metrics", exist_ok=True)

    print(f"Using model: {MODEL_NAME}")
    print(f"Output directory: {OUTPUT_DIR}")

    # Step 1: Load dataset
    df = load_dataset()
    num_labels = len(df['label'].unique())

    # Step 2: Split dataset
    X_train, X_val, y_train, y_val = split_dataset(df)

    # Step 3: Create tokenized datasets
    train_dataset, val_dataset, tokenizer = create_tokenized_datasets(
        X_train, X_val, y_train, y_val, MODEL_NAME
    )

    # Step 4: Train model
    trainer, model = train_model(
        train_dataset, val_dataset, tokenizer, MODEL_NAME, OUTPUT_DIR, num_labels
    )

    # Step 5: Evaluate model
    accuracy, report = evaluate_model(trainer, X_val, y_val, OUTPUT_DIR, num_labels)

    # Step 6: Export to ONNX
    onnx_success = export_to_onnx(OUTPUT_DIR, ONNX_PATH)

    # Step 7: Export to TFLite
    tflite_success = export_to_tflite_from_pt(OUTPUT_DIR, TFLITE_PATH)

    # Final output
    print("\n" + "="*70)
    print("DistilBERT Tagalog Classification Training Complete!")

    if onnx_success:
        print(f"✅ ONNX model: {ONNX_PATH}")
    else:
        print("❌ ONNX export: FAILED")

    if tflite_success:
        print(f"✅ TFLite model: {TFLITE_PATH}")
    else:
        print("❌ TFLite export: FAILED")

    print(f"\nModel checkpoints: {OUTPUT_DIR}")
    print(f"Metrics: metrics/distilbert_tagalog_metrics.txt")
    print(f"Final validation accuracy: {accuracy:.4f}")

# INFERENCE

In [34]:
def test_inference(model_dir, test_texts=None):
    if test_texts is None:
        test_texts = [
            # Positive sentiment
            "Napakaganda ng produktong ito, sulit na sulit!",
            "Sobrang galing ng serbisyo nila, highly recommended!",
            "Magandang kalidad at mabilis na delivery.",
            "Napakahusay ng customer support, user-friendly pa.",
            "Masarap ang pagkain dito, babalik ako ulit.",

            # Neutral sentiment
            "Okay lang ang produkto, walang masama.",
            "Average lang ang kalidad, sakto sa presyo.",
            "Gumagana naman, walang problema.",
            "Standard features, typical sa price range na to.",
            "Normal lang ang delivery time, okay naman.",

            # Negative sentiment
            "Sobrang pangit ng produkto, sayang ang pera.",
            "Mababa ang kalidad, nasira agad after one day.",
            "Napakasama ng customer service, worst ever.",
            "Sobrang bagal ng delivery at sira pa ang packaging.",
            "Hindi ko irerekumenda, very disappointing.",
        ]

    print("\nTesting trained DistilBERT Tagalog model...")

    try:
        # Load model and tokenizer
        model = AutoModelForSequenceClassification.from_pretrained(model_dir)
        tokenizer = AutoTokenizer.from_pretrained(model_dir)

        model.eval()

        # Determine label names based on number of labels
        num_labels = model.config.num_labels
        if num_labels == 2:
            label_names = ["Safe", "NSFW"]
        elif num_labels == 3:
            label_names = ["Positive", "Neutral", "Negative"]
        else:
            label_names = [f"Class_{i}" for i in range(num_labels)]

        for i, text in enumerate(test_texts):
            # Tokenize
            inputs = tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=256,
                padding=True
            )

            # Predict
            with torch.no_grad():
                outputs = model(**inputs)
                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
                predicted_class = torch.argmax(predictions, dim=-1).item()
                confidence = predictions[0][predicted_class].item()

            # Map predictions to labels
            label = label_names[predicted_class] if predicted_class < len(label_names) else f"Class_{predicted_class}"

            print(f"Text {i+1}: '{text}'")
            print(f"  -> {label} (confidence: {confidence:.4f})")

            # Show all probabilities
            probs_str = ", ".join([f"{label_names[j] if j < len(label_names) else f'Class_{j}'}={predictions[0][j].item():.3f}"
                                 for j in range(num_labels)])
            print(f"  Probabilities: {probs_str}")
            print()

    except Exception as e:
        print(f"Inference test failed: {e}")
        import traceback
        traceback.print_exc()

# PROGRAM EXECUTION

In [35]:
if __name__ == "__main__":
    try:
        # Run the main pipeline
        main()

        # Optional: Test inference
        test_inference("models/distilbert_tagalog_classification")

    except KeyboardInterrupt:
        print("\nTraining interrupted by user.")
    except Exception as e:
        print(f"Error during execution: {e}")
        import traceback
        traceback.print_exc()

    print("\nProgram execution completed.")

Starting DistilBERT Tagalog Text Classification Pipeline
Using model: jcblaise/distilbert-tagalog-base-cased
Output directory: models/distilbert_tagalog_classification
Loading dataset from dataset.csv...
Dataset loaded successfully. Shape: (45, 2)
Dataset validation complete. Clean shape: (45, 2)
Number of classes: 3
Label distribution:
label
0    15
1    15
2    15
Name: count, dtype: int64
Splitting dataset: 20.0% for validation...
Training set: 36 samples
Validation set: 9 samples
Loading DistilBERT Tagalog tokenizer and creating tokenized datasets...


Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Tokenization complete!
Sample tokenized text length: 14
Initializing DistilBERT Tagalog model for classification training...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at jcblaise/distilbert-tagalog-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,No log,1.074002,0.777778,0.738095,0.738095
2,1.085100,1.065864,0.666667,0.64127,0.64127
3,1.085100,1.061035,0.555556,0.516667,0.516667


Saving model to models/distilbert_tagalog_classification...
Evaluating model performance...


Metrics saved to metrics/distilbert_tagalog_metrics.txt
Validation Accuracy: 0.7778
F1-Score (Macro): 0.7381
Exporting DistilBERT Tagalog model to ONNX format...


  torch.onnx.export(
  inverted_mask = torch.tensor(1.0, dtype=dtype) - expanded_mask


ONNX model exported to: models/distilbert_tagalog_classification_model.onnx
ONNX model size: 254.31 MB
Converting DistilBERT Tagalog PyTorch model to TensorFlow...


TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Saved intermediate TensorFlow model to models/distilbert_tagalog_classification/tf_saved_model
TFLite model successfully exported to: models/distilbert_tagalog_classification_model.tflite
TFLite model size: 127.24 MB

DistilBERT Tagalog Classification Training Complete!
✅ ONNX model: models/distilbert_tagalog_classification_model.onnx
✅ TFLite model: models/distilbert_tagalog_classification_model.tflite

Model checkpoints: models/distilbert_tagalog_classification
Metrics: metrics/distilbert_tagalog_metrics.txt
Final validation accuracy: 0.7778

Testing trained DistilBERT Tagalog model...
Text 1: 'Napakaganda ng produktong ito, sulit na sulit!'
  -> Positive (confidence: 0.3451)
  Probabilities: Positive=0.345, Neutral=0.322, Negative=0.333

Text 2: 'Sobrang galing ng serbisyo nila, highly recommended!'
  -> Positive (confidence: 0.3593)
  Probabilities: Positive=0.359, Neutral=0.297, Negative=0.344

Text 3: 'Magandang kalidad at mabilis na delivery.'
  -> Positive (confidence: 0.3609)
