In [None]:
import shutil
# Remove the '/content/models' directory tree (presumably artifacts of a previous run).
# ignore_errors=True means a missing directory or a failed removal is silently ignored.
shutil.rmtree('/content/models', ignore_errors=True)

# RoBERTa Tagalog for NSFW word detection

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset
import tempfile
import subprocess
import sys

# IMPORT LIBRARIES AND SETUP

In [None]:
def install_requirements():
    """Install every pipeline dependency that cannot be imported in this kernel.

    Each dependency is described by its pip requirement string and by the
    module name used to probe for it. The two differ for some packages
    (``scikit-learn`` is imported as ``sklearn``), and a pip "extra" such as
    ``optimum[onnxruntime]`` is probed through the sub-module it provides, so
    a base install without the extra is still detected as incomplete.

    Missing packages are installed with ``python -m pip install`` using the
    interpreter that runs this kernel.

    Raises:
        subprocess.CalledProcessError: if a pip invocation exits non-zero.
    """
    import importlib

    # (pip requirement, module to import as the availability probe)
    required_packages = [
        ('transformers[torch]', 'transformers'),
        ('datasets', 'datasets'),
        ('torch', 'torch'),
        ('pandas', 'pandas'),
        ('scikit-learn', 'sklearn'),
        ('onnx', 'onnx'),
        ('onnxruntime', 'onnxruntime'),
        ('optimum[onnxruntime]', 'optimum.onnxruntime'),  # This enables ORTModelForSequenceClassification
        ('tensorflow', 'tensorflow'),
    ]

    installed_any = False
    for package, module_name in required_packages:
        try:
            importlib.import_module(module_name)
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            installed_any = True

    if installed_any:
        # Let the import system notice packages that appeared on disk after
        # the kernel started.
        importlib.invalidate_caches()

In [None]:
# Probe for each required package and pip-install whichever ones fail to import.
print("Checking and installing dependencies...")
install_requirements()

Checking and installing dependencies...
Installing scikit-learn...


In [None]:
# Optimum's ONNX Runtime integration is treated as optional: a failed import is
# reported and recorded in ONNX_AVAILABLE instead of stopping the notebook.
# NOTE(review): none of the cells visible in this notebook read ONNX_AVAILABLE,
# ORTModelForSequenceClassification or OptimizationConfig — confirm they are
# still needed.
try:
    from optimum.onnxruntime import ORTModelForSequenceClassification
    from optimum.onnxruntime.configuration import OptimizationConfig
    ONNX_AVAILABLE = True
except ImportError:
    print("Optimum ONNX Runtime not available, ONNX export will be limited")
    ONNX_AVAILABLE = False

Optimum ONNX Runtime not available, ONNX export will be limited


In [None]:
# NOTE(review): this message is printed unconditionally; it does not check
# ONNX_AVAILABLE or whether any earlier import actually succeeded.
print("All dependencies loaded successfully!")

All dependencies loaded successfully!


# LOAD AND PREPARE DATASET

In [None]:
def load_dataset(csv_path='dataset.csv'):
    """Load, clean and validate the labelled text dataset.

    Args:
        csv_path: Path to a CSV file with a ``text`` column and a ``label``
            column whose values are 0 (safe) or 1 (nsfw).

    Returns:
        pandas.DataFrame in which ``text`` is ``str`` and ``label`` is ``int``;
        rows with a missing text or label are removed. When ``csv_path`` does
        not exist, the frame returned by ``create_sample_dataset()`` is
        returned instead.

    Raises:
        ValueError: if a required column is missing, if no rows remain after
            dropping missing values, or if any label is not exactly 0 or 1.
    """
    print(f"Loading dataset from {csv_path}...")

    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: Dataset file '{csv_path}' not found!")
        print("Creating a sample dataset for demonstration...")
        return create_sample_dataset()

    print(f"Dataset loaded successfully. Shape: {df.shape}")

    # Validate required columns
    required_cols = ['text', 'label']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Drop incomplete rows; copy so the assignments below act on an
    # independent frame.
    df = df.dropna(subset=required_cols).copy()
    if df.empty:
        raise ValueError("Dataset contains no rows with both a text and a label")

    # Validate labels BEFORE casting to int: casting first would truncate a
    # value such as 0.7 to 0 and let it slip through the binary check.
    labels = pd.to_numeric(df['label'], errors='coerce')
    if labels.isna().any() or not labels.isin([0, 1]).all():
        raise ValueError("Labels must be 0 (safe) or 1 (nsfw)")

    df['text'] = df['text'].astype(str)
    df['label'] = labels.astype(int)

    print(f"Dataset validation complete. Clean shape: {df.shape}")
    print(f"Label distribution:\n{df['label'].value_counts()}")

    return df

In [None]:
def create_sample_dataset():
    """Build the 50-row demonstration dataset, save it and return it.

    The frame has a ``text`` column and a ``label`` column (0 for the 30 safe
    sentences, 1 for the 20 NSFW placeholders). It is also written to
    'dataset.csv' in the current working directory, without the index.

    Returns:
        pandas.DataFrame with columns ``text`` and ``label``.
    """
    # Safe content (Tagalog and English examples) - 30 examples
    safe_texts = [
        'Magandang umaga sa lahat',
        'Kumusta kayo ngayong araw',
        'Salamat sa inyong tulong',
        'Masayang pagdating ng bagong taon',
        'Ang ganda ng bulaklak sa hardin',
        'Nag-aaral ako ng Filipino',
        'Masarap ang pagkain sa kusina',
        'Magandang morning',
        'Magandang panahon ngayon',
        'Nagbabasa ako ng libro',
        'Maligayang kaarawan',
        'Ang sarap ng mangga',
        'Nood tayo ng pelikula',
        'Magandang gabi sa inyo',
        'Ang galing ng mga estudyante',
        'Masaya sa probinsya',
        'Maayos na klase ngayon',
        'Magandang simula ng linggo',
        'Ang husay ng mga guro',
        'Masayang samahan',
        'Good morning everyone',
        'How are you today',
        'Thank you for your help',
        'Beautiful flowers in the garden',
        'I am studying',
        'Nice weather today',
        'Reading a good book',
        'Happy birthday',
        'Delicious mango',
        'Watching movies',
    ]

    # NSFW content (placeholder examples) - 20 examples
    nsfw_texts = [
        'inappropriate tagalog content 1',
        'inappropriate content 2',
        'bad words example 3',
        'inappropriate tagalog 4',
        'nsfw content 5',
        'inappropriate 6',
        'bad content 7',
        'inappropriate tagalog 8',
        'nsfw example 9',
        'inappropriate content 10',
        'bad words 11',
        'inappropriate 12',
        'nsfw tagalog 13',
        'inappropriate content 14',
        'bad example 15',
        'inappropriate 16',
        'nsfw content 17',
        'bad words tagalog 18',
        'inappropriate content 19',
        'nsfw example 20',
    ]

    sample_frame = pd.DataFrame({
        'text': safe_texts + nsfw_texts,
        'label': [0] * len(safe_texts) + [1] * len(nsfw_texts),  # 30 safe, 20 nsfw
    })
    sample_frame.to_csv('dataset.csv', index=False)

    print("Sample dataset created and saved as 'dataset.csv'")
    print("Note: Replace placeholder NSFW examples with actual data for real training")
    return sample_frame

In [None]:
def split_dataset(df, test_size=0.2, random_state=42):
    """Split the dataset into stratified training and validation lists.

    Args:
        df: Frame with ``text`` and ``label`` columns.
        test_size: Share of the rows assigned to the validation split.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_val, y_train, y_val)`` of plain Python lists.
    """
    print(f"Splitting dataset: {test_size*100}% for validation...")

    texts = df['text'].tolist()
    labels = df['label'].tolist()

    # Stratify on the label column so both splits keep the class balance.
    split_kwargs = {
        'test_size': test_size,
        'random_state': random_state,
        'stratify': df['label'],
    }
    X_train, X_val, y_train, y_val = train_test_split(texts, labels, **split_kwargs)

    print(f"Training set: {len(X_train)} samples")
    print(f"Validation set: {len(X_val)} samples")

    return X_train, X_val, y_train, y_val

# TOKENIZATION AND PREPROCESSING

In [None]:
def create_tokenized_datasets(X_train, X_val, y_train, y_val, model_name):
    """Wrap both splits in ``Dataset`` objects and tokenize their text.

    Args:
        X_train, X_val: Lists of input texts.
        y_train, y_val: Lists of labels aligned with the texts.
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        Tuple ``(train_tokenized, val_tokenized, tokenizer)``. Sequences are
        truncated to 256 tokens and left unpadded.
    """
    print("Loading RoBERTa Tagalog tokenizer and creating tokenized datasets...")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Reuse the end-of-sequence token for padding when the tokenizer has none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def encode_batch(batch):
        # No padding here: batches are padded later by the data collator.
        return tokenizer(
            batch['text'],
            truncation=True,
            padding=False,
            max_length=256,
        )

    raw_splits = [
        Dataset.from_dict({'text': texts, 'labels': targets})
        for texts, targets in ((X_train, y_train), (X_val, y_val))
    ]
    train_tokenized, val_tokenized = [
        split.map(encode_batch, batched=True) for split in raw_splits
    ]

    print("Tokenization complete!")
    return train_tokenized, val_tokenized, tokenizer

# MODEL TRAINING



In [None]:
def train_model(train_dataset, val_dataset, tokenizer, model_name, output_dir):
    """Fine-tune a sequence-classification model for Safe-vs-NSFW detection.

    Args:
        train_dataset: Tokenized training dataset (must support ``len()``).
        val_dataset: Tokenized validation dataset, evaluated once per epoch.
        tokenizer: Tokenizer used for padding and saved next to the model.
        model_name: Identifier passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        output_dir: Directory for checkpoints, logs and the final model.

    Returns:
        Tuple ``(trainer, model)``.

    Notes:
        * Mixed precision (fp16) is enabled only when CUDA is available, so
          the function also runs on CPU-only machines.
        * Warmup is capped at about 10% of the estimated optimizer steps
          (never more than 100). A fixed 100-step warmup is longer than the
          whole run on a small dataset and keeps the learning rate near zero.
    """
    import inspect
    import math

    print("Initializing RoBERTa Tagalog model for NSFW detection training...")

    # Load model with binary classification (NSFW vs Safe)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        id2label={0: "Safe", 1: "NSFW"},
        label2id={"Safe": 0, "NSFW": 1}
    )

    # Ensure model uses the correct pad_token_id
    if tokenizer.pad_token_id is not None:
        model.config.pad_token_id = tokenizer.pad_token_id

    # Pads each batch to its longest sequence at collation time.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    num_train_epochs = 3
    train_batch_size = 8

    # Estimate of the optimizer steps for a single-device run; used only to
    # keep warmup proportionate to the length of training.
    steps_per_epoch = max(1, math.ceil(len(train_dataset) / train_batch_size))
    total_steps = steps_per_epoch * num_train_epochs
    warmup_steps = min(100, total_steps // 10)

    training_args_dict = {
        'output_dir': output_dir,
        'num_train_epochs': num_train_epochs,
        'per_device_train_batch_size': train_batch_size,
        'per_device_eval_batch_size': 8,
        'learning_rate': 3e-5,
        'weight_decay': 0.01,
        'warmup_steps': warmup_steps,
        'logging_dir': f'{output_dir}/logs',
        'logging_steps': 10,
        'save_total_limit': 2,
        'load_best_model_at_end': True,
        'metric_for_best_model': "eval_loss",
        'greater_is_better': False,
        'report_to': [],
        'seed': 42,
        'dataloader_num_workers': 0,
        'remove_unused_columns': True,
        # fp16 requires a CUDA device; requesting it on CPU is rejected.
        'fp16': torch.cuda.is_available(),
    }

    # The evaluation-strategy argument was renamed across transformers
    # versions; use whichever name this version defines.
    if hasattr(TrainingArguments, 'eval_strategy'):
        training_args_dict['eval_strategy'] = "epoch"
    else:
        training_args_dict['evaluation_strategy'] = "epoch"
    # Must match the evaluation strategy for load_best_model_at_end.
    training_args_dict['save_strategy'] = "epoch"

    training_args = TrainingArguments(**training_args_dict)

    def compute_metrics(eval_pred):
        """Return validation accuracy from the (predictions, labels) pair."""
        predictions, labels = eval_pred

        # Models may return extra outputs alongside the logits; the logits
        # come first.
        if isinstance(predictions, tuple):
            predictions = predictions[0]

        # Convert to numpy array if it's a tensor
        if hasattr(predictions, 'numpy'):
            predictions = predictions.numpy()

        predictions = np.argmax(predictions, axis=1)

        return {
            'accuracy': accuracy_score(labels, predictions),
        }

    trainer_kwargs = {
        'model': model,
        'args': training_args,
        'train_dataset': train_dataset,
        'eval_dataset': val_dataset,
        'data_collator': data_collator,
        'compute_metrics': compute_metrics,
    }
    # Newer transformers versions take the tokenizer as `processing_class`
    # and warn about `tokenizer`; older versions only know `tokenizer`.
    if 'processing_class' in inspect.signature(Trainer.__init__).parameters:
        trainer_kwargs['processing_class'] = tokenizer
    else:
        trainer_kwargs['tokenizer'] = tokenizer

    trainer = Trainer(**trainer_kwargs)

    print("Starting training...")
    trainer.train()

    # With load_best_model_at_end=True the trainer holds the best checkpoint
    # (lowest eval_loss) at this point.
    print(f"Saving model to {output_dir}...")
    trainer.save_model()
    tokenizer.save_pretrained(output_dir)

    return trainer, model

# MODEL EVALUATION

In [None]:
def evaluate_model(trainer, X_val, y_val, output_dir):
    """Score the trained model on the validation split and write a report.

    Predictions come from ``trainer.predict(trainer.eval_dataset)``; ``y_val``
    must therefore be in the same order as that dataset. The report is written
    to 'metrics/metrics.txt'.

    Args:
        trainer: Trained ``Trainer`` with an ``eval_dataset``.
        X_val: Validation texts; used only if the fallback evaluation runs.
        y_val: Ground-truth labels (0 = Safe, 1 = NSFW).
        output_dir: Forwarded to the fallback; not otherwise used here.

    Returns:
        Tuple ``(accuracy, report)`` where ``report`` is the text produced by
        ``classification_report``. On any exception the traceback is printed
        and the result of ``evaluate_model_fallback`` is returned instead.
    """
    print("Evaluating model performance...")

    try:
        predictions_output = trainer.predict(trainer.eval_dataset)
        preds = getattr(predictions_output, 'predictions', predictions_output)

        # When the model returns more than logits, the predictions arrive as
        # a (possibly nested) tuple whose first element is the logits.
        # Feeding that tuple straight to np.array() fails on ragged shapes.
        while isinstance(preds, tuple):
            preds = preds[0]

        if torch.is_tensor(preds):
            preds = preds.detach().cpu().numpy()
        else:
            preds = np.asarray(preds)

        if preds.ndim == 1:
            print("Warning: Predictions are 1D, assuming binary classification with single logits")
            # A single logit per sample: positive means class 1.
            y_pred = (preds > 0).astype(int)
        else:
            # Normal case: one score per class, pick the highest.
            y_pred = np.argmax(preds, axis=1)

        # Get evaluation results from trainer
        eval_results = trainer.evaluate()

        # labels=[0, 1] keeps the report valid when the validation split or
        # the predictions contain only one class.
        report = classification_report(
            y_val,
            y_pred,
            labels=[0, 1],
            target_names=['Safe', 'NSFW'],
            digits=4,
            zero_division=0
        )

        accuracy = accuracy_score(y_val, y_pred)

        # Prepare metrics text
        metrics_text = f"""RoBERTa Tagalog NSFW Detection Model Evaluation Results
{'='*60}

Model: danjohnvelasco/roberta-tagalog-base-cohfie-v1
Task: Binary Classification (Safe vs NSFW)

Accuracy: {accuracy:.4f}

Classification Report:
{report}

Training Results:
{'-'*30}
"""

        for key, value in eval_results.items():
            # Only numeric entries take the 4-decimal format.
            if isinstance(value, (int, float)):
                metrics_text += f"{key}: {value:.4f}\n"
            else:
                metrics_text += f"{key}: {value}\n"

        # Save metrics
        os.makedirs('metrics', exist_ok=True)
        metrics_path = 'metrics/metrics.txt'

        with open(metrics_path, 'w', encoding='utf-8') as f:
            f.write(metrics_text)

        print(f"Metrics saved to {metrics_path}")
        print(f"Validation Accuracy: {accuracy:.4f}")

        return accuracy, report

    except Exception as e:
        print(f"Error in evaluation: {e}")
        import traceback
        traceback.print_exc()

        # Fallback evaluation using manual prediction
        print("Attempting fallback evaluation...")
        return evaluate_model_fallback(trainer, X_val, y_val, output_dir)

In [None]:
def evaluate_model_fallback(trainer, X_val, y_val, output_dir):
    """Fallback evaluation: run the model on each validation text directly.

    Each text is tokenized and classified on its own, bypassing
    ``trainer.predict``. The report is written to 'metrics/metrics.txt'.

    Args:
        trainer: ``Trainer`` holding the trained model and its tokenizer.
        X_val: Validation texts.
        y_val: Ground-truth labels (0 = Safe, 1 = NSFW), aligned with X_val.
        output_dir: Unused; kept for signature parity with ``evaluate_model``.

    Returns:
        Tuple ``(accuracy, report)``. If evaluation itself fails, the error
        and traceback are printed and ``(0.0, "Evaluation failed")`` is
        returned.
    """
    print("Using fallback evaluation method...")

    try:
        model = trainer.model
        # Newer transformers versions expose the tokenizer as
        # `processing_class` and warn about `Trainer.tokenizer`; fall back to
        # the old attribute when the new one is absent.
        tokenizer = getattr(trainer, 'processing_class', None)
        if tokenizer is None:
            tokenizer = trainer.tokenizer
        model.eval()

        predictions = []

        for text in X_val:
            # Tokenize each text individually
            inputs = tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=256,
                padding=True
            )

            # Move to same device as model
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            with torch.no_grad():
                logits = model(**inputs).logits
            predictions.append(int(torch.argmax(logits, dim=-1).cpu().numpy()[0]))

        y_pred = np.array(predictions)

        # Calculate accuracy
        accuracy = accuracy_score(y_val, y_pred)

        # labels=[0, 1] keeps the report valid when only one class is present.
        report = classification_report(
            y_val,
            y_pred,
            labels=[0, 1],
            target_names=['Safe', 'NSFW'],
            digits=4,
            zero_division=0
        )

        # Prepare metrics text
        metrics_text = f"""RoBERTa Tagalog NSFW Detection Model Evaluation Results (Fallback)
{'='*60}

Model: danjohnvelasco/roberta-tagalog-base-cohfie-v1
Task: Binary Classification (Safe vs NSFW)

Accuracy: {accuracy:.4f}

Classification Report:
{report}
"""

        # Save metrics
        os.makedirs('metrics', exist_ok=True)
        metrics_path = 'metrics/metrics.txt'

        with open(metrics_path, 'w', encoding='utf-8') as f:
            f.write(metrics_text)

        print(f"Metrics saved to {metrics_path}")
        print(f"Validation Accuracy: {accuracy:.4f}")

        return accuracy, report

    except Exception as e:
        # Best-effort by design: report the failure with its traceback and
        # return a sentinel instead of raising.
        print(f"Fallback evaluation also failed: {e}")
        import traceback
        traceback.print_exc()
        return 0.0, "Evaluation failed"


# ONNX EXPORT

In [None]:
def export_to_onnx(model_dir, onnx_path):
    """Export the saved classification model to an ONNX file.

    Args:
        model_dir: Directory holding the saved model and tokenizer.
        onnx_path: Destination file. Its parent directory is created when the
            path has one; a bare filename is written to the working directory.

    Returns:
        True on success. On any exception the error and traceback are printed
        and False is returned.
    """
    print("Exporting RoBERTa Tagalog model to ONNX format...")

    try:
        # Load the trained model
        model = AutoModelForSequenceClassification.from_pretrained(model_dir)
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
        model.eval()

        # Create dummy input with sample Tagalog text
        dummy_input = tokenizer(
            "Magandang umaga sa lahat",
            return_tensors="pt",
            max_length=256,
            padding="max_length",
            truncation=True
        )

        # Select the two tensors by name so they always line up with
        # `input_names`, whatever other keys the tokenizer returns and in
        # whatever order.
        export_inputs = (dummy_input['input_ids'], dummy_input['attention_mask'])

        # os.path.dirname() is '' for a bare filename, and os.makedirs('')
        # raises, so only create the directory when there is one.
        parent_dir = os.path.dirname(onnx_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        torch.onnx.export(
            model,
            export_inputs,
            onnx_path,
            export_params=True,
            opset_version=14,
            do_constant_folding=True,
            input_names=['input_ids', 'attention_mask'],
            output_names=['logits'],
            dynamic_axes={
                'input_ids': {0: 'batch_size', 1: 'sequence'},
                'attention_mask': {0: 'batch_size', 1: 'sequence'},
                'logits': {0: 'batch_size'}
            }
        )

        print(f"ONNX model exported to: {onnx_path}")

        # Get model size
        model_size = os.path.getsize(onnx_path) / (1024 * 1024)
        print(f"ONNX model size: {model_size:.2f} MB")

        return True

    except Exception as e:
        print(f"ONNX export failed: {e}")
        import traceback
        traceback.print_exc()
        return False

# TENSORFLOW LITE EXPORT

In [None]:
def export_to_tflite_from_pt(model_dir, tflite_path):
    """Convert the saved PyTorch model to a TensorFlow Lite file.

    The PyTorch weights are loaded into the TensorFlow model class, saved as
    a SavedModel under ``<model_dir>/tf_saved_model``, then converted to
    TFLite with default optimizations and float16 as a supported type.

    Args:
        model_dir: Directory holding the saved PyTorch model.
        tflite_path: Destination file. Its parent directory is created when
            the path has one; a bare filename is written to the working
            directory.

    Returns:
        True on success. On any exception (including TensorFlow not being
        importable) the error and traceback are printed and False is returned.
    """
    try:
        # Imported here so a missing TensorFlow install only fails this export.
        import tensorflow as tf
        from transformers import TFAutoModelForSequenceClassification

        print("Converting RoBERTa Tagalog PyTorch model to TensorFlow...")

        # Load and convert to TensorFlow
        tf_model = TFAutoModelForSequenceClassification.from_pretrained(
            model_dir,
            from_pt=True  # convert from PyTorch
        )

        # Save as TensorFlow SavedModel
        tf_saved_model_dir = os.path.join(model_dir, "tf_saved_model")
        tf.saved_model.save(tf_model, tf_saved_model_dir)
        print(f"Saved intermediate TensorFlow model to {tf_saved_model_dir}")

        # Convert to TFLite with optimizations
        converter = tf.lite.TFLiteConverter.from_saved_model(tf_saved_model_dir)
        converter.optimizations = [tf.lite.Optimize.DEFAULT]

        # Additional optimizations for mobile deployment
        converter.target_spec.supported_types = [tf.float16]

        tflite_model = converter.convert()

        # os.path.dirname() is '' for a bare filename, and os.makedirs('')
        # raises, so only create the directory when there is one.
        parent_dir = os.path.dirname(tflite_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(tflite_path, "wb") as f:
            f.write(tflite_model)

        # Get model size
        model_size = os.path.getsize(tflite_path) / (1024 * 1024)
        print(f"TFLite model successfully exported to: {tflite_path}")
        print(f"TFLite model size: {model_size:.2f} MB")

        return True

    except Exception as e:
        print(f"TensorFlow Lite export failed: {e}")
        import traceback
        traceback.print_exc()
        return False

# MAIN

In [None]:
def main():
    """Run the whole pipeline: load, split, tokenize, train, evaluate, export.

    Trained artifacts go under 'models/' and the evaluation report under
    'metrics/'. A summary of the export results and the validation accuracy is
    printed at the end.
    """
    banner = "=" * 70
    print("Starting RoBERTa Tagalog NSFW Detection Model Training Pipeline")
    print(banner)

    # Pipeline configuration
    model_name = "danjohnvelasco/roberta-tagalog-base-cohfie-v1"
    output_dir = "models/roberta_tagalog_nsfw"
    onnx_path = "models/roberta_tagalog_nsfw_model.onnx"
    tflite_path = "models/roberta_tagalog_nsfw_model.tflite"

    # Make sure the output folders exist before anything is written.
    for folder in ("models", "metrics"):
        os.makedirs(folder, exist_ok=True)

    print(f"Using model: {model_name}")
    print(f"Output directory: {output_dir}")

    # Steps 1-2: load the data and carve out the validation split.
    frame = load_dataset()
    X_train, X_val, y_train, y_val = split_dataset(frame)

    # Step 3: tokenize both splits.
    train_dataset, val_dataset, tokenizer = create_tokenized_datasets(
        X_train, X_val, y_train, y_val, model_name
    )

    # Step 4: fine-tune.
    trainer, _model = train_model(
        train_dataset, val_dataset, tokenizer, model_name, output_dir
    )

    # Step 5: evaluate on the validation split.
    accuracy, _report = evaluate_model(trainer, X_val, y_val, output_dir)

    # Steps 6-7: export, ONNX first and then TFLite.
    export_results = [
        ("ONNX", onnx_path, export_to_onnx(output_dir, onnx_path)),
        ("TFLite", tflite_path, export_to_tflite_from_pt(output_dir, tflite_path)),
    ]

    # Final summary
    print("\n" + banner)
    print("RoBERTa Tagalog NSFW Detection Training Complete!")

    for format_name, artifact_path, succeeded in export_results:
        if succeeded:
            print(f"✅ {format_name} model: {artifact_path}")
        else:
            print(f"❌ {format_name} export: FAILED")

    print(f"\nModel checkpoints: {output_dir}")
    print("Metrics: metrics/metrics.txt")
    print(f"Final validation accuracy: {accuracy:.4f}")

# INFERENCE

In [None]:
def test_inference(model_dir, test_texts=None):
    """Classify a few texts with the saved model and print the results.

    Args:
        model_dir: Directory holding the saved model and tokenizer.
        test_texts: Texts to classify; three built-in samples are used when
            this is None.

    For each text the predicted label (class 0 is printed as "Safe", anything
    else as "NSFW"), its confidence and both class probabilities are printed.
    Any exception is caught and printed together with its traceback.
    """
    if test_texts is None:
        test_texts = [
            "Magandang umaga sa lahat",  # Safe Tagalog
            "Good morning everyone",     # Safe English
            "inappropriate example"      # NSFW placeholder
        ]

    print("\nTesting trained RoBERTa Tagalog model...")

    try:
        model = AutoModelForSequenceClassification.from_pretrained(model_dir)
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
        model.eval()

        for sample in test_texts:
            encoded = tokenizer(
                sample,
                return_tensors="pt",
                truncation=True,
                max_length=256,
                padding=True
            )

            # Forward pass without gradient tracking; softmax turns the
            # logits into class probabilities.
            with torch.no_grad():
                logits = model(**encoded).logits
                probabilities = torch.nn.functional.softmax(logits, dim=-1)

            class_probs = probabilities[0]
            predicted_class = torch.argmax(probabilities, dim=-1).item()
            confidence = class_probs[predicted_class].item()
            safe_prob = class_probs[0].item()
            nsfw_prob = class_probs[1].item()

            if predicted_class == 0:
                label = "Safe"
            else:
                label = "NSFW"

            print(f"Text: '{sample}' -> {label} (confidence: {confidence:.4f})")
            print(f"  Probabilities: Safe={safe_prob:.4f}, NSFW={nsfw_prob:.4f}")
            print()

    except Exception as e:
        print(f"Inference test failed: {e}")
        import traceback
        traceback.print_exc()

# PROGRAM EXECUTION

In [None]:
# Entry point: run the training pipeline, then a quick inference check on the
# saved model directory.
if __name__ == "__main__":
    try:
        # Run the main pipeline
        main()

        # Optional: Test inference
        test_inference("models/roberta_tagalog_nsfw")

    except KeyboardInterrupt:
        # A manual interrupt is reported with a short message and no traceback.
        print("\nTraining interrupted by user.")
    except Exception as e:
        # Any other failure is printed with its traceback rather than
        # re-raised, so the completion message below is still printed.
        print(f"Error during execution: {e}")
        import traceback
        traceback.print_exc()

    print("\nProgram execution completed.")

Starting RoBERTa Tagalog NSFW Detection Model Training Pipeline
Using model: danjohnvelasco/roberta-tagalog-base-cohfie-v1
Output directory: models/roberta_tagalog_nsfw
Loading dataset from dataset.csv...
Dataset loaded successfully. Shape: (50, 2)
Dataset validation complete. Clean shape: (50, 2)
Label distribution:
label
0    30
1    20
Name: count, dtype: int64
Splitting dataset: 20.0% for validation...
Training set: 40 samples
Validation set: 10 samples
Loading RoBERTa Tagalog tokenizer and creating tokenized datasets...


Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Tokenization complete!
Initializing RoBERTa Tagalog model for NSFW detection training...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at danjohnvelasco/roberta-tagalog-base-cohfie-v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.698926,0.6
2,0.682600,0.654199,0.7
3,0.682600,0.586841,0.8


Saving model to models/roberta_tagalog_nsfw...
Evaluating model performance...


Traceback (most recent call last):
  File "/tmp/ipython-input-2995707972.py", line 24, in evaluate_model
    preds = np.array(preds)
            ^^^^^^^^^^^^^^^
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Debug: Predictions structure type: <class 'transformers.trainer_utils.PredictionOutput'>
Debug: predictions shape: not array
Error in evaluation: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.
Attempting fallback evaluation...
Using fallback evaluation method...
Metrics saved to metrics/metrics.txt
Validation Accuracy: 0.8000
Exporting RoBERTa Tagalog model to ONNX format...


  torch.onnx.export(
  inverted_mask = torch.tensor(1.0, dtype=dtype) - expanded_mask


ONNX model exported to: models/roberta_tagalog_nsfw_model.onnx
ONNX model size: 416.44 MB
Converting RoBERTa Tagalog PyTorch model to TensorFlow...


TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Saved intermediate TensorFlow model to models/roberta_tagalog_nsfw/tf_saved_model
TFLite model successfully exported to: models/roberta_tagalog_nsfw_model.tflite
TFLite model size: 208.35 MB

RoBERTa Tagalog NSFW Detection Training Complete!
✅ ONNX model: models/roberta_tagalog_nsfw_model.onnx
✅ TFLite model: models/roberta_tagalog_nsfw_model.tflite

Model checkpoints: models/roberta_tagalog_nsfw
Metrics: metrics/metrics.txt
Final validation accuracy: 0.8000

Testing trained RoBERTa Tagalog model...
Text: 'Magandang umaga sa lahat' -> NSFW (confidence: 0.5137)
  Probabilities: Safe=0.4863, NSFW=0.5137

Text: 'Good morning everyone' -> NSFW (confidence: 0.5226)
  Probabilities: Safe=0.4774, NSFW=0.5226

Text: 'inappropriate example' -> NSFW (confidence: 0.6637)
  Probabilities: Safe=0.3363, NSFW=0.6637


Program execution completed.


In [None]:
# Create zip of entire content folder
!zip -r /content/colab_content.zip /content/

# Download the zip
from google.colab import files
files.download('/content/colab_content.zip')

  adding: content/ (stored 0%)
  adding: content/.config/ (stored 0%)
  adding: content/.config/hidden_gcloud_config_universe_descriptor_data_cache_configs.db (deflated 97%)
  adding: content/.config/.last_update_check.json (deflated 22%)
  adding: content/.config/config_sentinel (stored 0%)
  adding: content/.config/.last_survey_prompt.yaml (stored 0%)
  adding: content/.config/active_config (stored 0%)
  adding: content/.config/.last_opt_in_prompt.yaml (stored 0%)
  adding: content/.config/default_configs.db (deflated 98%)
  adding: content/.config/configurations/ (stored 0%)
  adding: content/.config/configurations/config_default (deflated 15%)
  adding: content/.config/gce (stored 0%)
  adding: content/.config/logs/ (stored 0%)
  adding: content/.config/logs/2025.09.16/ (stored 0%)
  adding: content/.config/logs/2025.09.16/13.39.51.530260.log (deflated 92%)
  adding: content/.config/logs/2025.09.16/13.40.25.424362.log (deflated 86%)
  adding: content/.config/logs/2025.09.16/13.40.3

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>