DistilXLM-RoBERTa for NSFW word detection and export to ONNX/TFLite

In [43]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset
import tempfile
import subprocess

In [44]:
import sys

# Ensure proper encoding on Windows
if sys.platform.startswith('win'):
    import locale
    locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

IMPORT ADDITIONAL SETUP


In [45]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from optimum.onnxruntime.configuration import OptimizationConfig

In [46]:
def load_dataset(csv_path='dataset.csv'):
    """Load and validate the NSFW dataset"""
    print(f"Loading dataset from {csv_path}...")
    
    try:
        df = pd.read_csv(csv_path)
        print(f"Dataset loaded successfully. Shape: {df.shape}")
        
        # Validate required columns
        required_cols = ['text', 'labels']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")
        
        # Clean and validate data
        df = df.dropna(subset=['text', 'labels'])
        df['text'] = df['text'].astype(str)
        df['labels'] = df['labels'].astype(int)
        
        # Validate labels are binary (0 or 1)
        unique_labels = df['labels'].unique()
        if not all(labels in [0, 1] for labels in unique_labels):
            raise ValueError("Labels must be 0 (safe) or 1 (nsfw)")
        
        print(f"Dataset validation complete. Clean shape: {df.shape}")
        print(f"Labels distribution:\n{df['labels'].value_counts()}")
        
        return df
    
    except FileNotFoundError:
        print(f"Error: Dataset file '{csv_path}' not found!")
        print("Creating a sample dataset for demonstration...")
        return create_sample_dataset()

In [47]:
def create_sample_dataset():
    """Create a sample dataset for demonstration purposes"""
    sample_data = {
        'text': [
            # Safe words/phrases
            'hello', 'world', 'computer', 'programming', 'science', 'education',
            'family', 'friendship', 'learning', 'knowledge', 'book', 'music',
            'art', 'nature', 'technology', 'innovation', 'creativity', 'peace',
            'happiness', 'success', 'achievement', 'progress', 'development',
            'community', 'cooperation', 'collaboration', 'respect', 'kindness',
            # NSFW words (examples - replace with actual dataset)
            'yawa', 'gago', 'shit', 'pakyu',
            'puta', 'pisti', 'puki', 'buang',
            'motherfucker', 'bitch', 'cunt', 'ulol',
            'fuck', 'gaga', 'tanga', 'dick',
            'fucker', 'putangina', 'bobo', 'asshole'
        ],
        'labels': [0] * 28 + [1] * 20  # 28 safe, 20 nsfw
    }
    
    df = pd.DataFrame(sample_data)
    df.to_csv('dataset.csv', index=False)
    print("Sample dataset created and saved as 'dataset.csv'")
    return df

In [48]:
def split_dataset(df, test_size=0.2, random_state=42):
    """Split dataset into training and validation sets"""
    print(f"Splitting dataset: {test_size*100}% for validation...")
    
    X_train, X_val, y_train, y_val = train_test_split(
        df['text'].tolist(),
        df['labels'].tolist(),
        test_size=test_size,
        random_state=random_state,
        stratify=df['labels']
    )
    
    print(f"Training set: {len(X_train)} samples")
    print(f"Validation set: {len(X_val)} samples")
    
    return X_train, X_val, y_train, y_val

TOKENIZATION AND PREPROCESSING

In [49]:
def create_tokenized_datasets(X_train, X_val, y_train, y_val, model_name):
    """Tokenize the datasets using the model tokenizer"""
    print("Loading tokenizer and creating tokenized datasets...")
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Create datasets
    train_dataset = Dataset.from_dict({
        'text': X_train,
        'labels': y_train
    })
    
    val_dataset = Dataset.from_dict({
        'text': X_val,
        'labels': y_val
    })
    
    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            truncation=True,
            padding=False,  # Will be handled by data collator
            max_length=128  # Reasonable for word/short phrase detection
        )
    
    # Tokenize datasets
    train_tokenized = train_dataset.map(tokenize_function, batched=True)
    val_tokenized = val_dataset.map(tokenize_function, batched=True)
    
    print("Tokenization complete!")
    return train_tokenized, val_tokenized, tokenizer

MODEL TRAINING

In [50]:
def train_model(train_dataset, val_dataset, tokenizer, model_name, output_dir):
    """Train the DistilXLM-RoBERTa model"""
    print("Initializing model for training...")
    
    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        id2label={0: "SAFE", 1: "NSFW"},
        label2id={"SAFE": 0, "NSFW": 1}
    )
    
    # Data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    
    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_dir=f'{output_dir}/logs',
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        report_to=[],  # Disable wandb/tensorboard
        seed=42,
        dataloader_num_workers=0,  # Avoid multiprocessing issues
        remove_unused_columns=True
    )
    
    def compute_metrics(eval_pred):
        """Compute metrics for evaluation"""
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        
        return {
            'accuracy': accuracy_score(labels, predictions),
        }
    
    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    
    print("Starting training...")
    trainer.train()
    
    # Save the best model
    print(f"Saving model to {output_dir}...")
    trainer.save_model()
    tokenizer.save_pretrained(output_dir)
    
    return trainer, model

MODEL EVALUATION

In [51]:
def evaluate_model(trainer, y_val, output_dir):
    """Evaluate the trained model and save metrics"""
    print("Evaluating model performance...")

    # Get evaluation results
    eval_results = trainer.evaluate()
    
    # Get detailed predictions for classification report
    predictions = trainer.predict(trainer.eval_dataset)
    y_pred = np.argmax(predictions.predictions, axis=1)

    # Generate classification report
    from sklearn.metrics import classification_report, accuracy_score
    
    report = classification_report(
        y_val, y_pred, target_names=["SAFE", "NSFW"], digits=4
    )

    # Calculate accuracy
    accuracy = accuracy_score(y_val, y_pred)

    # Prepare metrics text
    metrics_text = f"""NSFW Detection Model Evaluation Results
{'='*50}

Accuracy: {accuracy:.4f}

Classification Report:
{report}

Training Results:
{'-'*30}
"""

    for key, value in eval_results.items():
        metrics_text += f"{key}: {value:.4f}\n"

    # Save metrics
    os.makedirs("metrics", exist_ok=True)
    metrics_path = "metrics/metrics.txt"

    with open(metrics_path, "w", encoding='utf-8') as f:  # Add encoding for Windows
        f.write(metrics_text)

    print(f"Metrics saved to {metrics_path}")
    print(f"Validation Accuracy: {accuracy:.4f}")

    return accuracy, report

ONNX EXPORT

In [52]:
def export_to_onnx(model_dir, onnx_path):
    """Export the trained model to ONNX format using Optimum"""
    print("Exporting model to ONNX format...")

    try:
        from optimum.onnxruntime import ORTModelForSequenceClassification
        from optimum.exporters.onnx import main_export
        from pathlib import Path
        
        # Create ONNX directory
        onnx_dir = os.path.dirname(onnx_path)
        os.makedirs(onnx_dir, exist_ok=True)

        # Method 1: Use ORTModelForSequenceClassification
        try:
            onnx_model = ORTModelForSequenceClassification.from_pretrained(
                model_dir, 
                export=True,
                use_cache=False  # Avoid caching issues
            )
            onnx_model.save_pretrained(onnx_dir)
            
            # Find and rename the ONNX file
            onnx_files = list(Path(onnx_dir).glob("*.onnx"))
            if onnx_files:
                current_onnx = onnx_files[0]
                target_path = Path(onnx_path)
                if current_onnx != target_path:
                    if target_path.exists():
                        target_path.unlink()
                    current_onnx.rename(target_path)
                
                print(f"ONNX model exported to: {onnx_path}")
                return True
                
        except Exception as e1:
            print(f"Method 1 failed: {e1}, trying alternative...")
            
            # Method 2: Direct torch.onnx export
            from transformers import AutoTokenizer, AutoModelForSequenceClassification
            import torch
            
            model = AutoModelForSequenceClassification.from_pretrained(model_dir)
            tokenizer = AutoTokenizer.from_pretrained(model_dir)
            
            # Create dummy input
            dummy_text = "sample input text"
            dummy_input = tokenizer(
                dummy_text,
                return_tensors="pt",
                max_length=128,
                padding="max_length",
                truncation=True,
            )
            
            # Export to ONNX
            torch.onnx.export(
                model,
                (dummy_input['input_ids'], dummy_input['attention_mask']),
                onnx_path,
                export_params=True,
                opset_version=14,  # Compatible with your onnxruntime 1.19.0
                do_constant_folding=True,
                input_names=['input_ids', 'attention_mask'],
                output_names=['logits'],
                dynamic_axes={
                    'input_ids': {0: 'batch_size', 1: 'sequence'},
                    'attention_mask': {0: 'batch_size', 1: 'sequence'},
                    'logits': {0: 'batch_size'}
                },
                verbose=True
            )
            
            print(f"ONNX model exported to: {onnx_path}")
            return True

    except ImportError as e:
        print(f"Import error: {e}")
        return False
    except Exception as e:
        print(f"ONNX export failed: {e}")
        import traceback
        traceback.print_exc()
        return False

TENSORFLOW LITE EXPORT

In [None]:
def export_tflite_alternative(model_dir, tflite_path):
    """Alternative TFLite export method using TensorFlow 2.17"""
    try:
        import tensorflow as tf
        from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
        
        print(f"Using TensorFlow {tf.__version__}")
        
        # Load tokenizer to get proper input specs
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
        
        # Load as TensorFlow model from PyTorch checkpoint
        tf_model = TFAutoModelForSequenceClassification.from_pretrained(
            model_dir, 
            from_pt=True  # Correct parameter for transformers 4.44.2
        )

        # Create converter
        converter = tf.lite.TFLiteConverter.from_keras_model(tf_model)
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        
        # Representative dataset for quantization
        def representative_dataset():
            sample_texts = [
                "hello world", "test message", "sample text", "another example",
                "short", "this is a longer text for testing purposes"
            ]
            
            for text in sample_texts:
                # Tokenize properly
                inputs = tokenizer(
                    text, 
                    return_tensors="tf", 
                    max_length=128, 
                    padding="max_length",
                    truncation=True
                )
                yield [inputs['input_ids'], inputs['attention_mask']]
        
        converter.representative_dataset = representative_dataset
        converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS]
        
        # Convert
        tflite_model = converter.convert()

        # Save TFLite model
        os.makedirs(os.path.dirname(tflite_path), exist_ok=True)
        with open(tflite_path, "wb") as f:
            f.write(tflite_model)

        print(f"TFLite model exported to: {tflite_path}")
        return True

    except Exception as e:
        print(f"TFLite export failed: {e}")
        import traceback
        traceback.print_exc()
        return False

In [59]:
def export_to_tflite(model_dir, tflite_path):
    """Convert model to TensorFlow Lite using your installed packages"""
    print("Converting model to TensorFlow Lite...")

    try:
        # First try the direct TF method
        success = export_tflite_alternative(model_dir, tflite_path)
        if success:
            return True
            
        # If that fails, try ONNX -> TF -> TFLite pipeline
        print("Trying ONNX -> TensorFlow -> TFLite conversion...")
        
        import onnx
        import tensorflow as tf
        from onnx_tf.backend import prepare
        import tempfile
        
        # Create temporary ONNX file
        temp_onnx = os.path.join(tempfile.gettempdir(), "temp_model.onnx")
        
        # Export to ONNX first
        from transformers import AutoModelForSequenceClassification, AutoTokenizer
        import torch
        
        model = AutoModelForSequenceClassification.from_pretrained(model_dir)
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
        
        dummy_input = tokenizer(
            "sample text",
            return_tensors="pt",
            max_length=128,
            padding="max_length",
            truncation=True,
        )
        
        torch.onnx.export(
            model,
            (dummy_input['input_ids'], dummy_input['attention_mask']),
            temp_onnx,
            export_params=True,
            opset_version=12,  # Lower opset for better onnx-tf compatibility
            do_constant_folding=True,
            input_names=['input_ids', 'attention_mask'],
            output_names=['logits']
        )
        
        # Load ONNX model and convert to TF
        onnx_model = onnx.load(temp_onnx)
        tf_rep = prepare(onnx_model)
        
        # Convert to TFLite
        with tempfile.TemporaryDirectory() as temp_dir:
            saved_model_path = os.path.join(temp_dir, "saved_model")
            tf_rep.export_graph(saved_model_path)
            
            converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_path)
            converter.optimizations = [tf.lite.Optimize.DEFAULT]
            
            tflite_model = converter.convert()
            
            # Save TFLite model
            os.makedirs(os.path.dirname(tflite_path), exist_ok=True)
            with open(tflite_path, "wb") as f:
                f.write(tflite_model)
            
        # Cleanup
        if os.path.exists(temp_onnx):
            os.remove(temp_onnx)
            
        print(f"TFLite model exported to: {tflite_path}")
        return True

    except Exception as e:
        print(f"TFLite export failed: {e}")
        import traceback
        traceback.print_exc()
        return False

MAIN TRAINING

In [None]:
def main():
    """Main training and export pipeline"""
    print("Starting NSFW Detection Model Training Pipeline")
    print("=" * 60)

    # Configuration
    MODEL_NAME = "microsoft/xtremedistil-l6-h256-uncased"
    OUTPUT_DIR = "models/xtremedistil-l6-h256-uncased"
    ONNX_PATH = "models/nsfw_model.onnx"
    TFLITE_PATH = "models/nsfw_model.tflite"

    # Create output directories
    os.makedirs("models", exist_ok=True)
    os.makedirs("metrics", exist_ok=True)

    # Step 1: Load dataset
    df = load_dataset()

    # Step 2: Split dataset
    X_train, X_val, y_train, y_val = split_dataset(df)

    # Step 3: Create tokenized datasets
    train_dataset, val_dataset, tokenizer = create_tokenized_datasets(
        X_train, X_val, y_train, y_val, MODEL_NAME
    )

    # Step 4: Train model
    trainer, model = train_model(
        train_dataset, val_dataset, tokenizer, MODEL_NAME, OUTPUT_DIR
    )

    # Step 5: Evaluate model (corrected function call)
    accuracy, report = evaluate_model(trainer, y_val, OUTPUT_DIR)

    # Step 6: Export to ONNX
    onnx_success = export_to_onnx(OUTPUT_DIR, ONNX_PATH)

    # Step 7: Export to TensorFlow Lite (pass model_dir correctly)
    tflite_success = False
    if onnx_success:
        tflite_success = export_to_tflite(OUTPUT_DIR, TFLITE_PATH)
    else:
        # Try TFLite export even if ONNX failed
        tflite_success = export_tflite_alternative(OUTPUT_DIR, TFLITE_PATH)

    # Final summary
    print("\n" + "=" * 60)
    print("Training and Export Summary:")
    print(f" Model trained successfully - Accuracy: {accuracy:.4f}")
    print(f"{'' if onnx_success else ''} ONNX export: {'SUCCESS' if onnx_success else 'FAILED'}")
    print(f"{'' if tflite_success else ''} TFLite export: {'SUCCESS' if tflite_success else 'FAILED'}")
    print(f"\nOutput files:")
    print(f"  - Model: {OUTPUT_DIR}")
    print(f"  - Metrics: metrics/metrics.txt")
    if onnx_success:
        print(f"  - ONNX: {ONNX_PATH}")
    if tflite_success:
        print(f"  - TFLite: {TFLITE_PATH}")

Sample CHECK

In [56]:
def test_inference(model_dir, test_texts=None):
    """Test the trained model with sample texts"""
    if test_texts is None:
        test_texts = ["hello world", "putang ina"]
    
    print("\nTesting trained model...")
    
    try:
        # Load model and tokenizer
        model = AutoModelForSequenceClassification.from_pretrained(model_dir)
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
        
        model.eval()
        
        for text in test_texts:
            # Tokenize
            inputs = tokenizer(
                text, 
                return_tensors="pt", 
                truncation=True, 
                max_length=128,
                padding=True
            )
            
            # Predict
            with torch.no_grad():
                outputs = model(**inputs)
                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
                predicted_class = torch.argmax(predictions, dim=-1).item()
                confidence = predictions[0][predicted_class].item()
            
            labels = "SAFE" if predicted_class == 0 else "NSFW"
            print(f"Text: '{text}' -> {labels} (confidence: {confidence:.4f})")
    
    except Exception as e:
        print(f"Inference test failed: {e}")

PROGRAM 

In [57]:
if __name__ == "__main__":
    try:
        # Run the main pipeline
        main()
        
        # Optional: Test inference
        test_inference("models/xtremedistil-l6-h256-uncased")
        
    except KeyboardInterrupt:
        print("\nTraining interrupted by user.")
    except Exception as e:
        print(f"Error during execution: {e}")
        import traceback
        traceback.print_exc()

Starting NSFW Detection Model Training Pipeline
Loading dataset from dataset.csv...
Error: Dataset file 'dataset.csv' not found!
Creating a sample dataset for demonstration...
Sample dataset created and saved as 'dataset.csv'
Splitting dataset: 20.0% for validation...
Training set: 38 samples
Validation set: 10 samples
Loading tokenizer and creating tokenized datasets...


Map: 100%|██████████| 38/38 [00:00<00:00, 503.59 examples/s]
Map: 100%|██████████| 10/10 [00:00<?, ? examples/s]


Tokenization complete!
Initializing model for training...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


 20%|██        | 3/15 [00:01<00:03,  3.05it/s]
 20%|██        | 3/15 [00:01<00:03,  3.05it/s]

{'eval_loss': 0.6898426413536072, 'eval_accuracy': 0.6, 'eval_runtime': 0.0287, 'eval_samples_per_second': 348.515, 'eval_steps_per_second': 34.851, 'epoch': 1.0}


 40%|████      | 6/15 [00:02<00:03,  2.39it/s]
 40%|████      | 6/15 [00:02<00:03,  2.39it/s]

{'eval_loss': 0.6873841881752014, 'eval_accuracy': 0.6, 'eval_runtime': 0.045, 'eval_samples_per_second': 222.204, 'eval_steps_per_second': 22.22, 'epoch': 2.0}


 60%|██████    | 9/15 [00:03<00:01,  3.02it/s]
 60%|██████    | 9/15 [00:03<00:01,  3.02it/s]

{'eval_loss': 0.6850928068161011, 'eval_accuracy': 0.6, 'eval_runtime': 0.0188, 'eval_samples_per_second': 531.887, 'eval_steps_per_second': 53.189, 'epoch': 3.0}


 67%|██████▋   | 10/15 [00:04<00:02,  2.31it/s]

{'loss': 0.69, 'grad_norm': 0.6976374983787537, 'learning_rate': 6.666666666666667e-06, 'epoch': 3.33}


 80%|████████  | 12/15 [00:04<00:00,  3.06it/s]
 80%|████████  | 12/15 [00:04<00:00,  3.06it/s]

{'eval_loss': 0.6833875775337219, 'eval_accuracy': 0.6, 'eval_runtime': 0.0182, 'eval_samples_per_second': 549.417, 'eval_steps_per_second': 54.942, 'epoch': 4.0}


100%|██████████| 15/15 [00:05<00:00,  3.37it/s]
100%|██████████| 15/15 [00:06<00:00,  3.37it/s]

{'eval_loss': 0.6826910376548767, 'eval_accuracy': 0.6, 'eval_runtime': 0.016, 'eval_samples_per_second': 626.745, 'eval_steps_per_second': 62.675, 'epoch': 5.0}


100%|██████████| 15/15 [00:06<00:00,  2.32it/s]


{'train_runtime': 6.4692, 'train_samples_per_second': 29.37, 'train_steps_per_second': 2.319, 'train_loss': 0.689188273747762, 'epoch': 5.0}
Saving model to models/xtremedistil-l6-h256-uncased...
Evaluating model performance...


100%|██████████| 1/1 [00:00<00:00, 500.39it/s]
100%|██████████| 1/1 [00:00<00:00, 509.26it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Metrics saved to metrics/metrics.txt
Validation Accuracy: 0.6000
Exporting model to ONNX format...
Method 1 failed: ORTModel._from_transformers() got an unexpected keyword argument 'use_cache', trying alternative...
ONNX model exported to: models/nsfw_model.onnx
Converting model to TensorFlow Lite...
Using TensorFlow 2.17.0
TFLite export failed: ('Keyword argument not understood:', 'low_cpu_mem_usage')
Trying ONNX -> TensorFlow -> TFLite conversion...
TFLite export failed: No module named 'keras.src.engine'

Training and Export Summary:
✅ Model trained successfully - Accuracy: 0.6000
✅ ONNX export: SUCCESS
❌ TFLite export: FAILED

Output files:
  - Model: models/xtremedistil-l6-h256-uncased
  - Metrics: metrics/metrics.txt
  - ONNX: models/nsfw_model.onnx

Testing trained model...
Text: 'hello world' -> SAFE (confidence: 0.5052)
Text: 'putang ina' -> SAFE (confidence: 0.5021)


Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_19544\2416132881.py", line 13, in export_tflite_alternative
    tf_model = TFAutoModelForSequenceClassification.from_pretrained(
  File "c:\Users\USER\miniconda3\envs\ThesisBert\lib\site-packages\transformers\models\auto\auto_factory.py", line 564, in from_pretrained
    return model_class.from_pretrained(
  File "c:\Users\USER\miniconda3\envs\ThesisBert\lib\site-packages\transformers\modeling_tf_utils.py", line 2951, in from_pretrained
    model = cls(config, *model_args, **model_kwargs)
  File "c:\Users\USER\miniconda3\envs\ThesisBert\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 1701, in __init__
    super().__init__(config, *inputs, **kwargs)
  File "c:\Users\USER\miniconda3\envs\ThesisBert\lib\site-packages\transformers\modeling_tf_utils.py", line 1210, in __init__
    super().__init__(*inputs, **kwargs)
  File "c:\Users\USER\miniconda3\envs\ThesisBert\lib\site-packages\te