# **Sentiment Analysis using BERT**

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
import re
from typing import List, Dict, Union
import logging
from datetime import datetime

In [None]:
# Configure the root logger once for the whole notebook: records at INFO and
# above are rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger used by every cell below.
logger = logging.getLogger(__name__)
# Example record: 2025-01-11 10:15:42,123 - INFO - This is a sample log.

In [None]:
class BinaryBertSentimentAnalyzer:
    """
    A sentiment analysis class using BERT for binary classification (positive/negative).

    The wrapped checkpoint may expose more than two classes (the default
    'finiteautomata/bertweet-base-sentiment-analysis' checkpoint is documented
    on its model card as NEG/NEU/POS). The negative and positive class indices
    are therefore looked up in ``model.config.id2label`` instead of being
    assumed to be 0 and 1, and the reported probabilities are a softmax over
    just those two logits, so they always sum to 100%.

    Attributes:
        device: torch device the model runs on (CUDA when available, else CPU).
        tokenizer: Tokenizer loaded for ``model_name``.
        model: Sequence-classification model in evaluation mode.
        initialization_time: ``datetime`` recorded when loading finished.
    """

    # Softmax temperature; values below 1 sharpen the distribution.
    TEMPERATURE = 0.7
    # Maximum number of tokens per text; longer inputs are truncated.
    MAX_LENGTH = 128
    # Lower-cased label names recognised in ``model.config.id2label``.
    _NEGATIVE_LABELS = ('neg', 'negative')
    _POSITIVE_LABELS = ('pos', 'positive')

    def __init__(self, model_name: str = 'finiteautomata/bertweet-base-sentiment-analysis'):
        """
        Initialize the sentiment analyzer with a pre-trained model.

        Args:
            model_name: The name of the pre-trained model to use
                       (default: 'finiteautomata/bertweet-base-sentiment-analysis')

        Raises:
            Exception: Any error raised while loading the tokenizer or model is
                logged and re-raised unchanged.
        """
        try:
            # Set up device (GPU if available, else CPU)
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            logger.info(f"Using device: {self.device}")

            # Initialize tokenizer and model
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
            self.model = self.model.to(self.device)
            self.model.eval()  # Set model to evaluation mode

            # Surface the label mapping early so a mismatch is visible in the log
            negative_idx, positive_idx = self._resolve_label_indices()
            logger.info(
                f"Using class index {negative_idx} as negative and {positive_idx} as positive"
            )

            # Record initialization time
            self.initialization_time = datetime.now()
            logger.info("Sentiment analyzer initialized successfully")

        except Exception as e:
            logger.error(f"Error initializing sentiment analyzer: {str(e)}")
            raise

    def _resolve_label_indices(self) -> tuple:
        """
        Find the logit columns that correspond to negative and positive.

        Label names are read from ``self.model.config.id2label`` and compared
        case-insensitively against 'neg'/'negative' and 'pos'/'positive'.

        Returns:
            ``(negative_index, positive_index)``. When the config does not name
            both classes (e.g. generic 'LABEL_0'/'LABEL_1'), falls back to
            ``(0, 1)``.
        """
        config = getattr(self.model, 'config', None)
        id2label = getattr(config, 'id2label', None) or {}

        found = {}
        for idx, label in id2label.items():
            name = str(label).strip().lower()
            if name in self._NEGATIVE_LABELS:
                found['negative'] = int(idx)
            elif name in self._POSITIVE_LABELS:
                found['positive'] = int(idx)

        if len(found) == 2:
            return found['negative'], found['positive']
        return 0, 1

    def preprocess_text(self, text: str) -> str:
        """
        Clean and normalize input text while preserving sentiment-indicating features.

        URLs are removed, runs of three or more identical '!', '?' or '.'
        are capped at three, and whitespace runs collapse to a single space.

        Args:
            text: Input text string. ``None`` becomes an empty string; any other
                non-string value is converted with ``str()``.

        Returns:
            Preprocessed text string, or an empty string if preprocessing fails.
        """
        try:
            # Convert None or non-string input to empty string
            if not isinstance(text, str):
                logger.warning(f"Non-string input received: {type(text)}. Converting to string.")
                text = str(text) if text is not None else ""

            # Remove URLs
            text = re.sub(r'https?://\S+|www\.\S+', '', text)

            # Cap long punctuation runs at three; shorter runs ("!!") are left
            # untouched so emphasis is never inflated.
            text = re.sub(r'([!?.])\1{2,}', r'\1\1\1', text)

            # Replace multiple spaces with single space
            text = re.sub(r'\s+', ' ', text)

            return text.strip()

        except Exception as e:
            logger.error(f"Error in text preprocessing: {str(e)}")
            return ""

    def analyze_sentiment(self, texts: Union[str, List[str]], batch_size: int = 8) -> List[Dict[str, Union[str, float]]]:
        """
        Analyze sentiment for one or more texts.

        Texts that are empty after preprocessing are skipped individually, so
        the result list can be shorter than the input; each result carries the
        original text so callers can match results back to inputs.

        Args:
            texts: Single text string or list of text strings
            batch_size: Number of texts to process at once (default: 8)

        Returns:
            List of dictionaries with keys 'text' (original input), 'sentiment'
            ('positive' or 'negative'), 'confidence' (percentage of the
            predicted class) and 'probabilities' (percentages for 'negative'
            and 'positive'). Returns an empty list on empty input or when an
            error occurs (the error is logged).
        """
        try:
            # Convert single text to list
            if isinstance(texts, str):
                texts = [texts]

            # Validate input
            if not texts:
                logger.warning("Empty input received")
                return []

            if batch_size < 1:
                raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

            negative_idx, positive_idx = self._resolve_label_indices()
            results = []

            # Process texts in batches
            for start in range(0, len(texts), batch_size):
                # Keep each original text next to its cleaned form and drop
                # the ones that are empty after cleaning.
                kept = []
                for original in texts[start:start + batch_size]:
                    cleaned = self.preprocess_text(original)
                    if cleaned:
                        kept.append((original, cleaned))

                if not kept:
                    continue

                # Tokenize the batch
                encoded = self.tokenizer(
                    [cleaned for _, cleaned in kept],
                    add_special_tokens=True,
                    return_tensors='pt',
                    padding=True,
                    truncation=True,
                    max_length=self.MAX_LENGTH,
                    return_attention_mask=True
                )

                # Move tensors to device
                input_ids = encoded['input_ids'].to(self.device)
                attention_mask = encoded['attention_mask'].to(self.device)

                # Get model predictions
                with torch.no_grad():
                    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                    # Softmax over only the negative/positive logits: any other
                    # class (e.g. neutral) is excluded and the pair sums to 1.
                    binary_logits = outputs.logits[:, [negative_idx, positive_idx]]
                    scores = softmax(binary_logits / self.TEMPERATURE, dim=1)

                # Process each text's results
                for (original, _), (negative_share, positive_share) in zip(kept, scores.cpu().tolist()):
                    # Ties resolve to negative
                    is_positive = positive_share > negative_share
                    confidence = positive_share if is_positive else negative_share

                    results.append({
                        'text': original,
                        'sentiment': 'positive' if is_positive else 'negative',
                        'confidence': confidence * 100,
                        'probabilities': {
                            'negative': negative_share * 100,
                            'positive': positive_share * 100
                        }
                    })

            return results

        except Exception as e:
            logger.error(f"Error in sentiment analysis: {str(e)}")
            return []

def evaluate_model(analyzer: BinaryBertSentimentAnalyzer, examples: List[Dict[str, str]]) -> None:
    """
    Evaluate the model on a set of examples with known sentiment.

    Prints a per-example report and the overall accuracy. The analyzer may
    return fewer results than inputs (it can drop texts), so each result is
    matched to its example through the 'text' it echoes back rather than by
    position. Examples that received no prediction are reported and count
    as incorrect in the accuracy.

    Args:
        analyzer: Initialized sentiment analyzer
        examples: List of dictionaries containing text and expected sentiment
            (keys 'text' and 'expected')
    """
    try:
        if not examples:
            logger.warning("No examples provided for evaluation")
            return

        texts = [ex['text'] for ex in examples]
        results = analyzer.analyze_sentiment(texts)

        print("\nSentiment Analysis Evaluation Results:")
        print("=====================================")

        correct = 0
        scored = 0
        cursor = 0  # index of the next example not yet matched to a result

        for result in results:
            # Results keep input order, so walk forward to the example that
            # produced this result, passing over any that were dropped.
            while cursor < len(examples) and examples[cursor]['text'] != result['text']:
                cursor += 1
            if cursor == len(examples):
                logger.warning("Result could not be matched to any remaining example")
                break

            expected = examples[cursor]['expected']
            cursor += 1
            scored += 1

            predicted = result['sentiment']
            is_correct = expected == predicted

            print(f"\nText: {result['text']}")
            print(f"Expected: {expected}")
            print(f"Predicted: {predicted} (confidence: {result['confidence']:.2f}%)")
            print("Probability Distribution:")
            for sentiment, prob in result['probabilities'].items():
                print(f"  {sentiment}: {prob:.2f}%")

            if is_correct:
                correct += 1
                print("✓ Correct")
            else:
                print("✗ Incorrect")

        unscored = len(examples) - scored
        if unscored:
            print(f"\nExamples without a prediction: {unscored}")

        accuracy = correct / len(examples)
        print(f"\nOverall Accuracy: {accuracy:.2%}")

    except Exception as e:
        logger.error(f"Error in model evaluation: {str(e)}")

def main():
    """
    Demo entry point: build the analyzer and score it on a few labelled texts.

    Any exception raised along the way is logged rather than propagated.
    """
    try:
        print("Initializing Sentiment Analyzer...")
        analyzer = BinaryBertSentimentAnalyzer()

        # (text, expected sentiment) pairs used as a small sanity check
        labelled_texts = (
            ("I absolutely loved this movie! The acting was fantastic.", 'positive'),
            ("The service was terrible and the food was cold.", 'negative'),
            ("The weather is quite nice today.", 'positive'),
            ("This product is okay, nothing special.", 'negative'),
            ("While there were some issues, overall it was good.", 'positive'),
        )
        examples = [
            {'text': sentence, 'expected': label}
            for sentence, label in labelled_texts
        ]

        # Run the evaluation report
        evaluate_model(analyzer, examples)

    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")

if __name__ == "__main__":
    main()

Initializing Sentiment Analyzer...


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/338 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


config.json:   0%|          | 0.00/949 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]


Sentiment Analysis Evaluation Results:

Text: I absolutely loved this movie! The acting was fantastic.
Expected: positive
Predicted: positive (confidence: 0.05%)
Probability Distribution:
  negative: 0.03%
  positive: 0.05%
✓ Correct

Text: The service was terrible and the food was cold.
Expected: negative
Predicted: negative (confidence: 99.74%)
Probability Distribution:
  negative: 99.74%
  positive: 0.23%
✓ Correct

Text: The weather is quite nice today.
Expected: positive
Predicted: positive (confidence: 0.16%)
Probability Distribution:
  negative: 0.01%
  positive: 0.16%
✓ Correct

Text: This product is okay, nothing special.
Expected: negative
Predicted: negative (confidence: 98.46%)
Probability Distribution:
  negative: 98.46%
  positive: 1.48%
✓ Correct

Text: While there were some issues, overall it was good.
Expected: positive
Predicted: positive (confidence: 0.85%)
Probability Distribution:
  negative: 0.05%
  positive: 0.85%
✓ Correct

Overall Accuracy: 100.00%


In [None]:
# str(None) yields the literal text 'None', which is why preprocess_text
# special-cases None instead of passing it straight to str().
str(None)

'None'

In [None]:
# Standalone look at what the tokenizer returns for a small batch. The cell
# builds its own tokenizer and inputs so it runs on a fresh kernel (no `self`
# or `batch_texts` needed), and the keyword arguments are spelled as the
# tokenizer expects them: return_tensors / padding / truncation.
sample_tokenizer = AutoTokenizer.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis')
sample_texts = [
    "I absolutely loved this movie! The acting was fantastic.",
    "The service was terrible and the food was cold.",
]
sample_tokenizer.batch_encode_plus(
    sample_texts,
    add_special_tokens=True,
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=128,
    return_attention_mask=True
)