In [2]:
!pip install transformers torch gradio plotly pandas

Collecting gradio
  Downloading gradio-5.5.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.4-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.2 (from gradio)
  Downloading gradio_client-1.4.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart==0.0.12 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (

In [1]:
import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import gradio as gr
import pandas as pd
import plotly.express as px
from datetime import datetime
import json

In [2]:
class ContentModerator:
    def __init__(self):
        # Initializing models
        self.toxicity_model = pipeline("text-classification", model="unitary/multilingual-toxic-xlm-roberta")
        self.emotion_model = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=3)
        self.translator = pipeline("translation", model="facebook/mbart-large-50-many-to-many-mmt")
        self.sentiment_model = pipeline("sentiment-analysis", model="distilbert-base-uncased")
        self.ner_model = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")


        self.lang_codes = {
            "en": "en_XX",
            "es": "es_XX",
            "fr": "fr_XX",
            "de": "de_DE",
            "hi": "hi_IN"
        }

        self.history = []

    def analyze_content(self, text, source_lang):
        # Store original text
        original_text = text

        # Translate to English if not in English
        if source_lang != "en":
            source_code = self.lang_codes.get(source_lang, "en_XX")
            translation = self.translator(text, src_lang=source_code, tgt_lang="en_XX")[0]['translation_text']
            text = translation

        # Tokenize and split text into chunks
        tokenizer = self.toxicity_model.tokenizer
        max_length = 512
        tokens = tokenizer.encode(text, add_special_tokens=False)
        chunks = [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]

        # Initialize variables for aggregation
        toxicity_scores = []
        emotion_results = []
        sentiment_scores = []
        entity_results = []

        for chunk_tokens in chunks:
            # Decode tokens back to text for processing, with a length limit
            chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)

            # Run the models with truncation to enforce token limits
            toxicity_result = self.toxicity_model(chunk_text, max_length=max_length, truncation=True)[0]
            emotions = self.emotion_model(chunk_text, max_length=max_length, truncation=True)
            sentiment_result = self.sentiment_model(chunk_text, max_length=max_length, truncation=True)[0]

            # Truncate input text for NER
            ner_tokens = tokenizer(chunk_text, truncation=True, max_length=max_length, return_tensors="pt")
            truncated_chunk_text = tokenizer.decode(ner_tokens["input_ids"][0], skip_special_tokens=True)
            entities = self.ner_model(truncated_chunk_text)

            # Collect results for aggregation
            toxicity_scores.append(toxicity_result["score"])
            emotion_results.extend([{"emotion": emotion[0]["label"], "score": round(emotion[0]["score"], 3)} for emotion in emotions if emotion])
            sentiment_scores.append(sentiment_result["score"])
            entity_results.extend([{"entity": ent["entity"], "word": ent["word"]} for ent in entities])

        # Aggregate results
        average_toxicity = sum(toxicity_scores) / len(toxicity_scores)
        average_sentiment = sum(sentiment_scores) / len(sentiment_scores)

        result = {
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "original_text": original_text,
            "translated_text": text if source_lang != "en" else None,
            "toxicity": {
                "label": "TOXIC" if average_toxicity > 0.5 else "NON-TOXIC",
                "score": round(average_toxicity, 3)
            },
            "emotions": emotion_results,
            "sentiment": {
                "label": "POSITIVE" if average_sentiment > 0.5 else "NEGATIVE",
                "score": round(average_sentiment, 3)
            },
            "entities": entity_results
        }

        # Add to history
        self.history.append(result)

        return result

    def get_analytics(self):
        if not self.history:
            return None

        # Convert history into a DataFrame
        df = pd.DataFrame(self.history)

        # Extract the toxicity score and sentiment score from nested dictionaries
        df["toxicity_score"] = df["toxicity"].apply(lambda x: x["score"] if isinstance(x, dict) else None)
        df["sentiment_score"] = df["sentiment"].apply(lambda x: x["score"] if isinstance(x, dict) else None)
        df["sentiment_label"] = df["sentiment"].apply(lambda x: x["label"] if isinstance(x, dict) else None)

        # Toxicity trends
        toxicity_trend = px.line(df, x="timestamp", y="toxicity_score", title="Toxicity Score Over Time")

        # Emotion distribution
        emotions_df = pd.concat([pd.DataFrame(x["emotions"]) for x in self.history])
        emotion_dist = px.bar(emotions_df, x="emotion", y="score", title="Emotion Distribution")

        # Sentiment trends
        sentiment_trend = px.line(df, x="timestamp", y="sentiment_score", color="sentiment_label", title="Sentiment Score Over Time")

        # Entity frequency distribution
        entities_df = pd.DataFrame([entity for result in self.history for entity in result["entities"]])
        if not entities_df.empty:
            entity_dist = px.bar(entities_df, x="word", color="entity", title="Entity Frequency Distribution")
        else:
            entity_dist = None

        return {
            "toxicity_trend": toxicity_trend,
            "emotion_distribution": emotion_dist,
            "sentiment_trend": sentiment_trend,
            "entity_distribution": entity_dist
        }

In [3]:
# Gradio Interface
def create_gradio_interface():
    moderator = ContentModerator()

    def process_input(text, language):
        result = moderator.analyze_content(text, language)
        analytics = moderator.get_analytics()

        # Format output
        output = f"""
        Analysis Results:
        ----------------
        Toxicity: {result['toxicity']['label']} ({result['toxicity']['score']})

        Top Emotions:
        {', '.join([f"{e['emotion']}: {e['score']}" for e in result['emotions']])}

        Sentiment: {result['sentiment']['label']} ({result['sentiment']['score']})

        Entities: {', '.join([f"{entity['word']} ({entity['entity']})" for entity in result['entities']])}

        {f'Translated Text: {result["translated_text"]}' if result["translated_text"] else ''}
        """

        return (
            output,
            analytics["toxicity_trend"] if analytics else None,
            analytics["emotion_distribution"] if analytics else None,
            analytics["sentiment_trend"] if analytics else None,
            analytics["entity_distribution"] if analytics else None
        )

    # Create Gradio interface
    iface = gr.Interface(
        fn=process_input,
        inputs=[
            gr.Textbox(label="Enter Text to Analyze"),
            gr.Dropdown(choices=["en", "es", "fr", "de", "hi"], label="Source Language")
        ],
        outputs=[
            gr.Textbox(label="Analysis Results"),
            gr.Plot(label="Toxicity Trend"),
            gr.Plot(label="Emotion Distribution"),
            gr.Plot(label="Sentiment Trend"),
            gr.Plot(label="Entity Frequency Distribution")
        ],
        title="Enhanced Multilingual Content Moderator with Sentiment, Emotion, and Entity Analysis",
        description="Analyze content for toxicity, emotions, sentiment, and contextual entities across multiple languages"
    )

    return iface

In [None]:
# Launch the application
if __name__ == "__main__":
    iface = create_gradio_interface()
    iface.launch(share=True, debug=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/635 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/211 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://60e22dedb3a2fe7281.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
