XtremeDistil (microsoft/xtremedistil-l6-h256-uncased) for NSFW word detection and export to TFLite (ONNX export currently disabled)

In [31]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset
import tempfile
import subprocess


In [32]:
import sys

# On Windows only, switch every locale category (LC_ALL) to en_US.UTF-8.
# NOTE(review): presumably a workaround for encoding problems with non-ASCII
# text or console output on Windows — confirm the original motivation.
if sys.platform.startswith('win'):
    import locale
    locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

In [33]:
import tensorflow as tf
# Show the GPU devices TensorFlow can see in this environment.
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


ADDITIONAL IMPORTS AND DATASET SETUP


In [34]:
# from optimum.onnxruntime import ORTModelForSequenceClassification
# from optimum.onnxruntime.configuration import OptimizationConfig

In [35]:
def create_sample_dataset(csv_path='dataset.csv'):
    """Build a tiny demonstration dataset and write it to a CSV file.

    The dataset has two columns: ``text`` (a single word) and ``labels``
    (0 = safe, 1 = nsfw). The label column is derived from the lengths of the
    two word lists, so editing either list cannot leave the columns with
    mismatched lengths.

    Args:
        csv_path: Where to write the CSV. Defaults to ``'dataset.csv'``.

    Returns:
        pandas.DataFrame containing the sample rows (safe words first).
    """
    safe_texts = [
        'hello', 'world', 'computer', 'programming', 'science', 'education',
        'family', 'friendship', 'learning', 'knowledge', 'book', 'music',
        'art', 'nature', 'technology', 'innovation', 'creativity', 'peace',
        'happiness', 'success', 'achievement', 'progress', 'development',
        'community', 'cooperation', 'collaboration', 'respect', 'kindness',
    ]
    # NSFW words (examples - replace with actual dataset)
    nsfw_texts = [
        'yawa', 'gago', 'shit', 'pakyu',
        'puta', 'pisti', 'puki', 'buang',
        'motherfucker', 'bitch', 'cunt', 'ulol',
        'fuck', 'gaga', 'tanga', 'dick',
        'fucker', 'putangina', 'bobo', 'asshole',
    ]

    df = pd.DataFrame({
        'text': safe_texts + nsfw_texts,
        # One label per word, computed rather than hard-coded.
        'labels': [0] * len(safe_texts) + [1] * len(nsfw_texts),
    })
    df.to_csv(csv_path, index=False)
    print(f"Sample dataset created and saved as '{csv_path}'")
    return df

In [36]:
def load_dataset(csv_path='dataset.csv'):
    """Load and validate the labelled text dataset from a CSV file.

    The CSV must contain a ``text`` and a ``labels`` column. Rows with a
    missing value in either column are dropped, ``text`` is cast to ``str``
    and ``labels`` to ``int``.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        pandas.DataFrame with the cleaned ``text`` and ``labels`` columns. If
        ``csv_path`` does not exist, the frame returned by
        ``create_sample_dataset()`` is returned instead.

    Raises:
        ValueError: If a required column is missing, a label is not numeric,
            or any label is not exactly 0 or 1. Fractional labels such as 0.5
            are rejected instead of being silently truncated by the int cast.
    """
    print(f"Loading dataset from {csv_path}...")

    try:
        df = pd.read_csv(csv_path)
        print(f"Dataset loaded successfully. Shape: {df.shape}")

        required_cols = ['text', 'labels']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")

        df = df.dropna(subset=required_cols).copy()
        df['text'] = df['text'].astype(str)

        # Validate BEFORE casting to int: casting first would turn 0.5 into 0
        # and let an invalid label slip through the 0/1 check.
        label_values = df['labels'].astype(float)
        if not label_values.isin([0.0, 1.0]).all():
            raise ValueError("Labels must be 0 (safe) or 1 (nsfw)")
        df['labels'] = label_values.astype(int)

        print(f"Dataset validation complete. Clean shape: {df.shape}")
        print(f"Labels distribution:\n{df['labels'].value_counts()}")

        return df

    except FileNotFoundError:
        print(f"Error: Dataset file '{csv_path}' not found!")
        print("Creating a sample dataset for demonstration...")
        # NOTE: the fallback is called without arguments, so the sample is
        # written to the helper's default location rather than to csv_path.
        return create_sample_dataset()
    
def split_dataset(df, test_size=0.2, random_state=42):
    """Split the dataset into stratified training and validation parts.

    Args:
        df: DataFrame with ``text`` and ``labels`` columns.
        test_size: Fraction of rows held out for validation.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_val, y_train, y_val)`` of plain Python lists.
    """
    print(f"Splitting dataset: {test_size*100}% for validation...")

    texts = df['text'].tolist()
    targets = df['labels'].tolist()

    # Stratify on the label column so both parts keep the class balance.
    parts = train_test_split(
        texts,
        targets,
        test_size=test_size,
        random_state=random_state,
        stratify=df['labels'],
    )
    X_train, X_val, y_train, y_val = parts

    for title, samples in (("Training", X_train), ("Validation", X_val)):
        print(f"{title} set: {len(samples)} samples")

    return X_train, X_val, y_train, y_val

TOKENIZATION AND PREPROCESSING

In [37]:
def create_tokenized_datasets(X_train, X_val, y_train, y_val, model_name):
    """Tokenize the training and validation texts with the model's tokenizer.

    Args:
        X_train, X_val: Lists of input texts.
        y_train, y_val: Lists of labels aligned with the texts.
        model_name: Name or path passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        Tuple ``(train_tokenized, val_tokenized, tokenizer)``.
    """
    print("Loading tokenizer and creating tokenized datasets...")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def _encode(batch):
        # No padding here: sequences are only truncated to 128 tokens.
        return tokenizer(
            batch['text'],
            truncation=True,
            padding=False,
            max_length=128,
        )

    def _tokenized(texts, labels):
        raw = Dataset.from_dict({'text': texts, 'labels': labels})
        return raw.map(_encode, batched=True)

    train_tokenized = _tokenized(X_train, y_train)
    val_tokenized = _tokenized(X_val, y_val)

    print("Tokenization complete!")
    return train_tokenized, val_tokenized, tokenizer

MODEL TRAINING

In [38]:
def train_model(train_dataset, val_dataset, tokenizer, model_name, output_dir):
    """Fine-tune a binary SAFE/NSFW sequence classifier and save it.

    Args:
        train_dataset: Tokenized training dataset.
        val_dataset: Tokenized validation dataset, evaluated every epoch.
        tokenizer: Tokenizer used for dynamic padding; saved with the model.
        model_name: Name or path of the pretrained checkpoint to fine-tune.
        output_dir: Directory for checkpoints, logs and the final model.

    Returns:
        Tuple ``(trainer, model)``.
    """
    import inspect  # only needed for the Trainer signature probe below

    print("Initializing model for training...")

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        id2label={0: "SAFE", 1: "NSFW"},
        label2id={"SAFE": 0, "NSFW": 1},
    )

    # Pads each batch to its longest sequence (tokenization did not pad).
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_dir=f'{output_dir}/logs',
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        # Reload the checkpoint with the best validation accuracy when done.
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        report_to=[],
        seed=42,
        dataloader_num_workers=0,
        remove_unused_columns=True
    )

    def compute_metrics(eval_pred):
        """Return the accuracy of the arg-max class predictions."""
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)

        return {
            'accuracy': accuracy_score(labels, predictions),
        }

    # Recent transformers releases renamed Trainer's `tokenizer` argument to
    # `processing_class` and emit a deprecation warning for the old name.
    # Use whichever keyword the installed Trainer accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = (
        "processing_class" if "processing_class" in trainer_params else "tokenizer"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer}
    )

    print("Starting training...")
    trainer.train()

    print(f"Saving model to {output_dir}...")
    trainer.save_model()
    tokenizer.save_pretrained(output_dir)

    return trainer, model

MODEL EVALUATION

In [39]:
def evaluate_model(trainer, y_val, output_dir):
    """Evaluate the trained model on its validation set and save a report.

    Args:
        trainer: Trainer whose ``eval_dataset`` is evaluated.
        y_val: True validation labels, in the same order as the eval dataset.
        output_dir: Accepted for interface compatibility; not used here. The
            report is always written to ``metrics/metrics.txt``.

    Returns:
        Tuple ``(accuracy, report)`` where ``report`` is the text produced by
        ``classification_report``.
    """
    print("Evaluating model performance...")

    eval_results = trainer.evaluate()

    prediction_output = trainer.predict(trainer.eval_dataset)
    y_pred = np.argmax(prediction_output.predictions, axis=1)

    report = classification_report(
        y_val,
        y_pred,
        target_names=["SAFE", "NSFW"],
        digits=4,
    )
    accuracy = accuracy_score(y_val, y_pred)

    metrics_text = f"""NSFW Detection Model Evaluation Results
    {'='*50}

    Accuracy: {accuracy:.4f}

    Classification Report:
    {report}

    Training Results:
    {'-'*30}
    """

    # Append every metric reported by trainer.evaluate(), one per line.
    metrics_text += "".join(
        f"{key}: {value:.4f}\n" for key, value in eval_results.items()
    )

    metrics_path = "metrics/metrics.txt"
    os.makedirs("metrics", exist_ok=True)
    with open(metrics_path, "w", encoding='utf-8') as report_file:
        report_file.write(metrics_text)

    print(f"Metrics saved to {metrics_path}")
    print(f"Validation Accuracy: {accuracy:.4f}")

    return accuracy, report

ONNX EXPORT

In [40]:
# def export_to_onnx(model_dir, onnx_path):
#     print("Exporting model to ONNX format...")

#     try:
#         from optimum.onnxruntime import ORTModelForSequenceClassification
#         from optimum.exporters.onnx import main_export
#         from pathlib import Path
        
#         onnx_dir = os.path.dirname(onnx_path)
#         os.makedirs(onnx_dir, exist_ok=True)

#         try:
#             onnx_model = ORTModelForSequenceClassification.from_pretrained(
#                 model_dir, 
#                 export=True,
#                 use_cache=False  
#             )
#             onnx_model.save_pretrained(onnx_dir)
            
#             onnx_files = list(Path(onnx_dir).glob("*.onnx"))
#             if onnx_files:
#                 current_onnx = onnx_files[0]
#                 target_path = Path(onnx_path)
#                 if current_onnx != target_path:
#                     if target_path.exists():
#                         target_path.unlink()
#                     current_onnx.rename(target_path)
                
#                 print(f"ONNX model exported to: {onnx_path}")
#                 return True
                
#         except Exception as e1:
#             print(f"Method 1 failed: {e1}, trying alternative...")
            
#             from transformers import AutoTokenizer, AutoModelForSequenceClassification
#             import torch
            
#             model = AutoModelForSequenceClassification.from_pretrained(model_dir)
#             tokenizer = AutoTokenizer.from_pretrained(model_dir)
            
#             dummy_text = "sample input text"
#             dummy_input = tokenizer(
#                 dummy_text,
#                 return_tensors="pt",
#                 max_length=128,
#                 padding="max_length",
#                 truncation=True,
#             )
            
#             torch.onnx.export(
#                 model,
#                 (dummy_input['input_ids'], dummy_input['attention_mask']),
#                 onnx_path,
#                 export_params=True,
#                 opset_version=14,  # Compatible with your onnxruntime 1.19.0
#                 do_constant_folding=True,
#                 input_names=['input_ids', 'attention_mask'],
#                 output_names=['logits'],
#                 dynamic_axes={
#                     'input_ids': {0: 'batch_size', 1: 'sequence'},
#                     'attention_mask': {0: 'batch_size', 1: 'sequence'},
#                     'logits': {0: 'batch_size'}
#                 },
#                 verbose=True
#             )
            
#             print(f"ONNX model exported to: {onnx_path}")
#             return True

#     except ImportError as e:
#         print(f"Import error: {e}")
#         return False
#     except Exception as e:
#         print(f"ONNX export failed: {e}")
#         import traceback
#         traceback.print_exc()
#         return False

TENSORFLOW LITE EXPORT

In [41]:
def export_tflite_alternative(model_dir, tflite_path):
    """Convert the saved PyTorch checkpoint to a TensorFlow Lite model file.

    The checkpoint in ``model_dir`` is loaded into the TensorFlow version of
    the model, converted with the default TFLite optimizations and a small
    representative dataset, and written to ``tflite_path``.

    Args:
        model_dir: Directory containing the saved model and tokenizer.
        tflite_path: Destination file. May be a bare filename; a parent
            directory is only created when the path contains one.

    Returns:
        True on success. Any exception (including a missing TensorFlow or
        transformers install) is printed with its traceback and False is
        returned.
    """
    try:
        # Imported lazily so a missing TensorFlow install is reported as a
        # failed export instead of breaking the whole notebook.
        import tensorflow as tf
        from transformers import TFAutoModelForSequenceClassification, AutoTokenizer

        print(f"Using TensorFlow {tf.__version__}")

        tokenizer = AutoTokenizer.from_pretrained(model_dir)

        # from_pt=True loads the PyTorch weights into the TF model class.
        tf_model = TFAutoModelForSequenceClassification.from_pretrained(
            model_dir,
            from_pt=True
        )

        converter = tf.lite.TFLiteConverter.from_keras_model(tf_model)
        converter.optimizations = [tf.lite.Optimize.DEFAULT]

        def representative_dataset():
            """Yield a few tokenized sample inputs, padded to 128 tokens."""
            sample_texts = [
                "hello world", "test message", "sample text", "another example",
                "short", "this is a longer text for testing purposes"
            ]

            for text in sample_texts:
                inputs = tokenizer(
                    text,
                    return_tensors="tf",
                    max_length=128,
                    padding="max_length",
                    truncation=True
                )
                # Some tokenizers also emit token_type_ids; pass them when present.
                if 'token_type_ids' in inputs:
                    yield [inputs['input_ids'], inputs['attention_mask'], inputs['token_type_ids']]
                else:
                    yield [inputs['input_ids'], inputs['attention_mask']]

        converter.representative_dataset = representative_dataset
        converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS]

        tflite_model = converter.convert()

        # os.path.dirname() is '' for a bare filename and os.makedirs('')
        # raises, which would discard the finished conversion.
        target_dir = os.path.dirname(tflite_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        with open(tflite_path, "wb") as f:
            f.write(tflite_model)

        print(f"TFLite model exported to: {tflite_path}")
        return True

    except Exception as e:
        print(f"TFLite export failed: {e}")
        import traceback
        traceback.print_exc()
        return False

In [42]:
# def export_to_tflite(model_dir, tflite_path):
#     print("Converting model to TensorFlow Lite...")

#     try:
#         success = export_tflite_alternative(model_dir, tflite_path)
#         if success:
#             return True
            
#         print("Trying ONNX -> TensorFlow -> TFLite conversion...")
        
#         # import onnx
#         # import tensorflow as tf
#         # from onnx_tf.backend import prepare
#         import tempfile
        
#         temp_onnx = os.path.join(tempfile.gettempdir(), "temp_model.onnx")
        
#         from transformers import AutoModelForSequenceClassification, AutoTokenizer
#         import torch
        
#         model = AutoModelForSequenceClassification.from_pretrained(model_dir)
#         tokenizer = AutoTokenizer.from_pretrained(model_dir)
        
#         dummy_input = tokenizer(
#             "sample text",
#             return_tensors="pt",
#             max_length=128,
#             padding="max_length",
#             truncation=True,
#         )
        
#         torch.onnx.export(
#             model,
#             (dummy_input['input_ids'], dummy_input['attention_mask']),
#             temp_onnx,
#             export_params=True,
#             opset_version=12,  # Lower opset for better onnx-tf compatibility
#             do_constant_folding=True,
#             input_names=['input_ids', 'attention_mask'],
#             output_names=['logits']
#         )
        
#         # onnx_model = onnx.load(temp_onnx)
#         # tf_rep = prepare(onnx_model)
        
#         with tempfile.TemporaryDirectory() as temp_dir:
#             saved_model_path = os.path.join(temp_dir, "saved_model")
#             tf_rep.export_graph(saved_model_path)
            
#             converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_path)
#             converter.optimizations = [tf.lite.Optimize.DEFAULT]
            
#             tflite_model = converter.convert()
            
#             os.makedirs(os.path.dirname(tflite_path), exist_ok=True)
#             with open(tflite_path, "wb") as f:
#                 f.write(tflite_model)
            
#         if os.path.exists(temp_onnx):
#             os.remove(temp_onnx)
            
#         print(f"TFLite model exported to: {tflite_path}")
#         return True

#     except Exception as e:
#         print(f"TFLite export failed: {e}")
#         import traceback
#         traceback.print_exc()
#         return False

MAIN TRAINING

In [43]:
def main():
    """Run the whole pipeline: load data, train, evaluate and export to TFLite.

    Creates the ``models`` and ``metrics`` directories if needed and prints a
    summary of the run. ONNX export is currently disabled, so the TFLite file
    is produced directly from the saved checkpoint.
    """
    print("Starting NSFW Detection Model Training Pipeline")
    print("=" * 60)

    model_name = "microsoft/xtremedistil-l6-h256-uncased"
    output_dir = "models/exporter-xtremedistil-l6-h256-uncased"
    tflite_path = "models/exporter_nsfw_model.tflite"

    for directory in ("models", "metrics"):
        os.makedirs(directory, exist_ok=True)

    dataset_df = load_dataset()
    X_train, X_val, y_train, y_val = split_dataset(dataset_df)

    train_dataset, val_dataset, tokenizer = create_tokenized_datasets(
        X_train, X_val, y_train, y_val, model_name
    )

    # Only the trainer is needed afterwards; the model is reloaded from disk.
    trainer, _ = train_model(
        train_dataset, val_dataset, tokenizer, model_name, output_dir
    )

    accuracy, _ = evaluate_model(trainer, y_val, output_dir)

    tflite_success = export_tflite_alternative(output_dir, tflite_path)

    print("\n" + "=" * 60)
    print("Training and Export Summary:")
    print(f" Model trained successfully - Accuracy: {accuracy:.4f}")
    print(f"{'' if tflite_success else ''} TFLite export: {'SUCCESS' if tflite_success else 'FAILED'}")
    print("\nOutput files:")
    print(f"  - Model: {output_dir}")
    print("  - Metrics: metrics/metrics.txt")
    if tflite_success:
        print(f"  - TFLite: {tflite_path}")

SAMPLE INFERENCE CHECK

In [44]:
def test_inference(model_dir, test_texts=None):
    if test_texts is None:
        test_texts = ["hello world", "putang ina"]
    
    print("\nTesting trained model...")
    
    try:
        model = AutoModelForSequenceClassification.from_pretrained(model_dir)
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
        
        model.eval()
        
        for text in test_texts:
            inputs = tokenizer(
                text, 
                return_tensors="pt", 
                truncation=True, 
                max_length=128,
                padding=True
            )
            
            with torch.no_grad():
                outputs = model(**inputs)
                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
                predicted_class = torch.argmax(predictions, dim=-1).item()
                confidence = predictions[0][predicted_class].item()
            
            labels = "SAFE" if predicted_class == 0 else "NSFW"
            print(f"Text: '{text}' -> {labels} (confidence: {confidence:.4f})")
    
    except Exception as e:
        print(f"Inference test failed: {e}")

RUN THE PIPELINE

In [45]:
if __name__ == "__main__":
    import traceback

    # Train and export, then sanity-check the saved model on sample texts.
    try:
        main()
        test_inference("models/exporter-xtremedistil-l6-h256-uncased")
    except KeyboardInterrupt:
        print("\nTraining interrupted by user.")
    except Exception as e:
        print(f"Error during execution: {e}")
        traceback.print_exc()

Starting NSFW Detection Model Training Pipeline
Loading dataset from dataset.csv...
Dataset loaded successfully. Shape: (48, 2)
Dataset validation complete. Clean shape: (48, 2)
Labels distribution:
labels
0    28
1    20
Name: count, dtype: int64
Splitting dataset: 20.0% for validation...
Training set: 38 samples
Validation set: 10 samples
Loading tokenizer and creating tokenized datasets...


Map: 100%|██████████| 38/38 [00:00<00:00, 5115.83 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 2606.78 examples/s]


Tokenization complete!
Initializing model for training...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.689414,0.8
2,No log,0.686026,0.9
3,No log,0.682627,1.0
4,0.688500,0.680047,0.9
5,0.688500,0.67897,0.9




Saving model to models/exporter-xtremedistil-l6-h256-uncased...
Evaluating model performance...


Metrics saved to metrics/metrics.txt
Validation Accuracy: 1.0000
Using TensorFlow 2.20.0


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


INFO:tensorflow:Assets written to: /tmp/tmpp07_w5xx/assets


INFO:tensorflow:Assets written to: /tmp/tmpp07_w5xx/assets
  
W0000 00:00:1757003892.370870  101265 tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
W0000 00:00:1757003892.370899  101265 tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2025-09-05 00:38:12.371093: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /tmp/tmpp07_w5xx
2025-09-05 00:38:12.387072: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2025-09-05 00:38:12.387099: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /tmp/tmpp07_w5xx
2025-09-05 00:38:12.502565: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2025-09-05 00:38:12.771321: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /tmp/tmpp07_w5xx
2025-09-05 00:38:12.917062: I tensorflow/cc/saved_model/loader.cc:471] SavedModel load for tags { serve }; Status: success: OK. Took 545

TFLite model exported to: models/exporter_nsfw_model.tflite

Training and Export Summary:
 Model trained successfully - Accuracy: 1.0000
 TFLite export: SUCCESS

Output files:
  - Model: models/exporter-xtremedistil-l6-h256-uncased
  - Metrics: metrics/metrics.txt
  - TFLite: models/exporter_nsfw_model.tflite

Testing trained model...
Text: 'hello world' -> SAFE (confidence: 0.5023)
Text: 'putang ina' -> NSFW (confidence: 0.5048)


In [46]:
import transformers

# Print the installed transformers version used for this run.
print(transformers.__version__)

4.56.0


In [47]:
import accelerate
import transformers

# Show the transformers and accelerate versions together as the cell result.
transformers.__version__, accelerate.__version__

('4.56.0', '1.10.1')