In [None]:
!pip install transformers torch pandas numpy scikit-learn nltk spacy huggingface-hub
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m94.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
import spacy
from huggingface_hub import HfApi
from huggingface_hub import login
from tqdm import tqdm
import re


In [None]:
# Hugging Face authentication setup
def setup_huggingface_auth(access_token):
    """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"""
    login(token=access_token)
    print("Successfully logged in to Hugging Face")

from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
class SpeechDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        labels = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.FloatTensor(labels)
        }

In [None]:
import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

class SpeechAnalyzer:
    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

    def prepare_data(self, df, test_size=0.15, val_size=0.15):
        """Prepare data for training, validation, and testing"""
        texts = df['RawText'].values
        # Assuming labels are in separate columns for each emotion
        labels = df[[
            'optimistic', 'angry', 'fearful', 'proud',
            'empathetic', 'determined', 'critical'
        ]].values

        # First split: train + temp (for validation and test)
        train_texts, temp_texts, train_labels, temp_labels = train_test_split(
            texts, labels, test_size=test_size + val_size, random_state=42
        )

        # Second split: split temp into validation and test
        val_texts, test_texts, val_labels, test_labels = train_test_split(
            temp_texts, temp_labels, test_size=val_size / (val_size + test_size), random_state=42
        )

        return train_texts, val_texts, test_texts, train_labels, val_labels, test_labels

    def train(self, train_texts, train_labels, val_texts, val_labels,
              batch_size=8, epochs=3, learning_rate=2e-5, hub_model_id=None):
        """Train the model and save checkpoints to Hugging Face Hub"""
        train_dataset = SpeechDataset(train_texts, train_labels, self.tokenizer)
        val_dataset = SpeechDataset(val_texts, val_labels, self.tokenizer)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        optimizer = AdamW(self.model.parameters(), lr=learning_rate)
        total_steps = len(train_loader) * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

        for epoch in range(epochs):
            self.model.train()
            train_loss = 0

            for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
                optimizer.zero_grad()

                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                loss = outputs.loss
                train_loss += loss.item()

                loss.backward()
                optimizer.step()
                scheduler.step()

            # Save checkpoint to Hugging Face Hub
            if hub_model_id:
                self.model.push_to_hub(
                    hub_model_id,
                    commit_message=f"Epoch {epoch + 1} checkpoint"
                )

            # Validation
            val_metrics = self.evaluate(val_loader)
            print(f"Epoch {epoch + 1} - Validation metrics:", val_metrics)

    def evaluate(self, dataloader):
        """Evaluate the model"""
        self.model.eval()
        all_predictions = []
        all_labels = []

        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].numpy()

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

                predictions = torch.sigmoid(outputs.logits).cpu().numpy()
                predictions = (predictions > 0.5).astype(int)

                all_predictions.extend(predictions)
                all_labels.extend(labels)

        all_predictions = np.array(all_predictions)
        all_labels = np.array(all_labels)

        return {
            'f1': f1_score(all_labels, all_predictions, average='micro'),
            'precision': precision_score(all_labels, all_predictions, average='micro'),
            'recall': recall_score(all_labels, all_predictions, average='micro')
        }

    def analyze_complexity(self, text):
        """Analyze linguistic complexity of a text"""
        doc = self.nlp(text)
        sentences = [sent.text for sent in doc.sents]
        words = word_tokenize(text)

        # Calculate metrics
        avg_sentence_length = len(words) / len(sentences)
        vocab_richness = len(set(words)) / len(words)

        # Calculate syntactic depth using dependency parsing
        depths = []
        for sent in doc.sents:
            max_depth = max(len(list(token.ancestors)) for token in sent)
            depths.append(max_depth)

        avg_syntactic_depth = sum(depths) / len(depths)

        return {
            'avg_sentence_length': avg_sentence_length,
            'vocabulary_richness': vocab_richness,
            'avg_syntactic_depth': avg_syntactic_depth
        }

    def analyze_structure(self, text):
        """Analyze structural patterns in the text"""
        paragraphs = text.split('\n\n')
        sentences = sent_tokenize(text)

        # Calculate paragraph statistics
        para_lengths = [len(word_tokenize(p)) for p in paragraphs]

        # Detect rhetorical patterns (simple version)
        repetition_patterns = self._find_repetitions(text)

        return {
            'avg_paragraph_length': sum(para_lengths) / len(para_lengths),
            'num_paragraphs': len(paragraphs),
            'num_sentences': len(sentences),
            'repetition_patterns': repetition_patterns
        }

    def _find_repetitions(self, text):
        """Helper method to find repeated phrases"""
        words = word_tokenize(text.lower())
        phrases = [' '.join(words[i:i+3]) for i in range(len(words) - 2)]
        phrase_counts = Counter(phrases)

        return {phrase: count for phrase, count in phrase_counts.items()
                if count > 1 and len(phrase.split()) > 2}


In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch

def main():
    # Load data
    trump_df = pd.read_csv('trump_speeches.csv')

    # Initialize model and tokenizer
    model_name = "roberta-base"  # You can change this to a different model if desired
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=7)  # 7 labels for emotions

    # Move model to the GPU (or CPU if GPU is not available)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Initialize analyzer for Trump speeches
    trump_analyzer = SpeechAnalyzer(model, tokenizer, device)

    # Prepare Trump data
    train_texts, val_texts, test_texts, train_labels, val_labels, test_labels = trump_analyzer.prepare_data(trump_df)

    # Train Trump model
    trump_analyzer.train(
        train_texts,
        train_labels,
        val_texts,
        val_labels,
        hub_model_id="SaffronSadiq/trump-speech-analyser"
    )

    # Test dataset and loader
    test_dataset = SpeechDataset(test_texts, test_labels, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=8)
    test_metrics = trump_analyzer.evaluate(test_loader)
    print(f"Test metrics: {test_metrics}")

if __name__ == "__main__":
    main()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 60/60 [00:56<00:00,  1.06it/s]


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Epoch 1 - Validation metrics: {'f1': 0.8723404255319149, 'precision': 0.8132231404958677, 'recall': 0.9407265774378585}


Epoch 2/3: 100%|██████████| 60/60 [00:56<00:00,  1.06it/s]


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Epoch 2 - Validation metrics: {'f1': 0.9187675070028011, 'precision': 0.8978102189781022, 'recall': 0.9407265774378585}


Epoch 3/3: 100%|██████████| 60/60 [00:57<00:00,  1.04it/s]


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Epoch 3 - Validation metrics: {'f1': 0.9279538904899135, 'precision': 0.9324324324324325, 'recall': 0.9235181644359465}
Test metrics: {'f1': 0.9041095890410958, 'precision': 0.9032846715328468, 'recall': 0.9049360146252285}


In [4]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from huggingface_hub import login

# Step 1: Load the fine-tuned model and tokenizer
login("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")
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')  # Ensure this matches the one used during training
model = AutoModelForSequenceClassification.from_pretrained("SaffronSadiq/trump-speech-analyser")

# Move the model to the appropriate device (GPU/CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Step 2: Define a function to predict emotions in a speech
def predict_emotions(speech_text):
    # Tokenize the input text and prepare the inputs for the model
    inputs = tokenizer(speech_text, padding=True, truncation=True, return_tensors="pt", max_length=512).to(device)

    # Forward pass through the model to get logits
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # Apply softmax to the logits to get probabilities for each emotion label
    probabilities = torch.nn.functional.softmax(logits, dim=-1)

    # Convert the probabilities to a numpy array
    prob_array = probabilities.cpu().numpy()

    # Assuming there are 7 emotion labels (as per your setup)
    emotion_labels = ['optimistic', 'angry', 'fearful', 'proud', 'empathetic', 'determined', 'critical']

    # Convert probabilities to a dictionary of emotions
    emotion_predictions = dict(zip(emotion_labels, prob_array[0]))

    return emotion_predictions

# Step 3: Example: Predict emotions for a future speech
future_speech = """

Thank you very much. Wow. Well, I want to thank you all very much. This is great. These are our friends. We have thousands of friends in this incredible movement. This was a movement like nobody has ever seen before. And, this was, I believe, the greatest political movement of all time.)
There's never been anything like this in this country and maybe beyond. And now, it's going to reach a new level of importance because we're going to help our country heal. We're going to help our country heal. We have a country that needs help, and it needs help very badly. We're going to fix our borders.
We're going to fix everything about our country. And we made history for a reason tonight, and the reason is going to be just that. We overcame obstacles that nobody thought possible, and it is now clear that we've achieved the most incredible political thing.Look what happened. Is this crazy? But it's a political victory that our country has never seen before. Nothing like this. I want to thank the American people for the extraordinary honor of being."] elected your 47th president and your 45th president. And to every citizen, I will fight for you, for your family, and your future.
Every single day, I will be fighting for you; and with every breath in my body, I will not rest until we have delivered the strong, safe, and prosperous America that our children deserve and that you deserve. This will truly be the golden age of America. That's what we have to have. This is a magnificent victory for the American people that will allow us to make America great again.
"""

# Get the predicted emotional responses for the future speech
predicted_emotions = predict_emotions(future_speech)

# Print the predicted emotions
print(predicted_emotions)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

{'optimistic': 0.21101189, 'angry': 0.014467246, 'fearful': 0.013964258, 'proud': 0.32041928, 'empathetic': 0.34612322, 'determined': 0.08161979, 'critical': 0.012394262}
