# Most Used Words Analysis

This notebook analyzes the most used words in Donald Trump's speeches using the `SpeechCorpus` tool. We will filter the speeches by various criteria (Campaign, Rally, Location, Category) and display the top 3 most frequent words for each subset (the default `n` of `get_top_words`).

In [4]:
import sys
from pathlib import Path
import pandas as pd
from collections import Counter
import re

# Add src to path
# `project_root` has to be defined in this cell; otherwise the notebook only
# works when a previous kernel session left the name behind and fails with a
# NameError on Restart & Run All.
# NOTE(review): the corpus is loaded from "../data", so the project root is
# assumed to be the parent of the working directory — confirm.
project_root = Path.cwd().parent
src_dir = str(project_root / "src")
if src_dir not in sys.path:  # re-running the cell must not add duplicates
    sys.path.append(src_dir)

from filtering_corpus.speech_corpus import SpeechCorpus

In [5]:
# Default stop words, built once at definition time instead of on every call.
_DEFAULT_STOP_WORDS = frozenset([
    'the', 'don', 'applause', 'and', 'to', 'of', 'a', 'in', 'that', 'is',
    'i', 'for', 'it', 'you', 'we', 'are', 'on', 'this', 'have', 'be',
    'with', 'they', 'as', 'not', 'will', 'at', 'our', 'my', 'was', 'but',
    'by', 'he', 'she', 'so', 'what', 'all', 'if', 'their', 'who', 'me',
    'or', 'do', 'has', 'from', 'an', 'no', 'one', 'would', 'there', 'can',
    'about', 'just', 'out', 'up', 'when', 'like', 'them', 'your', 'go',
    'get', 'know', 'very', 'going', 'people', 'because', 'now', 'had',
    'were', 'been', 'than', 'back', 'see', 'time', 'some', 'could', 'did',
    'make', 'us', 'said', 'say', 'got', 'him', 'his', 'her', 'down', 'only',
    'want', 'think', 'right', 'look', 'take', 'way', 'how', 'come', 'its',
    'over', 'then', 'also', 'even', 'much', 'more', 'these', 'those',
    'where', 'why', 'which', 'here', 'well', 'many', 'other', 'really',
    'too', 'should', 'never', 'good', 'great', 'big', 'lot', 'thing',
    'things',
])


def get_top_words(corpus, n=3, stop_words=None):
    """
    Calculates the top n most used words in the corpus transcriptions.

    Parameters
    ----------
    corpus : object
        Any object whose ``transcriptions['text']`` is a pandas Series of
        transcript texts. Missing entries (None / NaN) are skipped.
    n : int, default 3
        Number of (word, count) pairs to return.
    stop_words : iterable of str, optional
        Words to exclude. They are lowercased before matching because the
        tokens are lowercased. When None, a built-in list of common English
        and filler words is used. An empty iterable disables filtering.

    Returns
    -------
    list of tuple(str, int)
        Up to ``n`` (word, count) pairs, most frequent first.
    """
    if stop_words is None:
        stop_words = _DEFAULT_STOP_WORDS
    else:
        # Tokens are lowercased below, so the stop words must be too; a set
        # also keeps membership tests O(1) when the caller passes a list.
        stop_words = {word.lower() for word in stop_words}

    counter = Counter()
    # Drop missing transcriptions first: astype(str) would otherwise turn
    # them into the literal strings "nan"/"None" and count them as words.
    for text in corpus.transcriptions['text'].dropna().astype(str):
        # Simple tokenization (remove punctuation and lowercase)
        tokens = re.findall(r'\b\w+\b', text.lower())
        # Filter stop words and short words (two characters or fewer)
        counter.update(
            token for token in tokens
            if len(token) > 2 and token not in stop_words
        )
    return counter.most_common(n)

## Initialize Corpus

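We load the corpus from `../data`, using `transcriptions.parquet` as the transcription file. Note that the cell below does not fall back to another transcription file if this one is unavailable.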

In [6]:
# Build the corpus used by every analysis cell below. Both paths are relative
# to the kernel's working directory.
corpus = SpeechCorpus(
    transcription_file="transcriptions.parquet",
    data_dir="../data",
)
print("Using transcriptions.")

# Sanity check: report how many speeches the corpus exposes.
print(f"Total speeches: {len(corpus.speeches)}")

Using transcriptions.
Total speeches: 894


## Analysis by Campaign

In [7]:
campaigns = ["2016", "2020", "2024"]

# For each campaign: filter the corpus, rank its words, then emit the header
# and the ranked words together as one newline-joined report.
for campaign in campaigns:
    sub_corpus = corpus.get_campaign(campaign)
    top_words = get_top_words(sub_corpus)
    report_lines = [f"\n--- {campaign} Campaign (Speeches: {len(sub_corpus.speeches)}) ---"]
    report_lines += [f"{word}: {count}" for word, count in top_words]
    print("\n".join(report_lines))


--- 2016 Campaign (Speeches: 306) ---
country: 6931
gonna: 4436
love: 3718

--- 2020 Campaign (Speeches: 153) ---
thank: 4304
years: 4256
country: 3885

--- 2024 Campaign (Speeches: 227) ---
country: 10561
thank: 6561
years: 5602


## Analysis: Rallies vs Non-Rallies

In [8]:
rallies = corpus.get_rallies(is_rally=True)
non_rallies = corpus.get_rallies(is_rally=False)

# Both subsets get the same report, so loop over (label, subset) pairs
# instead of repeating the printing code once per subset.
for label, subset in (("Rallies", rallies), ("Non-Rallies", non_rallies)):
    print(f"\n--- {label} (Speeches: {len(subset.speeches)}) ---")
    for word, count in get_top_words(subset):
        print(f"{word}: {count}")


--- Rallies (Speeches: 380) ---
country: 14826
thank: 9580
years: 9025

--- Non-Rallies (Speeches: 514) ---
country: 12182
thank: 8752
years: 6899


## Analysis by Location: Pennsylvania

In [9]:
# Sub-corpus returned by get_by_location for "Pennsylvania".
pa_corpus = corpus.get_by_location("Pennsylvania")
print(f"\n--- Pennsylvania Speeches (Speeches: {len(pa_corpus.speeches)}) ---")
# Each entry yielded by get_top_words is a (word, count) pair.
for entry in get_top_words(pa_corpus):
    word, count = entry
    print(f"{word}: {count}")


--- Pennsylvania Speeches (Speeches: 43) ---
country: 1835
years: 1164
pennsylvania: 1102


## Analysis by Category: Economy

In [10]:
# Sub-corpus returned by get_by_category for "Economy".
economy_corpus = corpus.get_by_category("Economy")
print(f"\n--- Economy Speeches (Speeches: {len(economy_corpus.speeches)}) ---")
# Each entry yielded by get_top_words is a (word, count) pair.
for entry in get_top_words(economy_corpus):
    word, count = entry
    print(f"{word}: {count}")


--- Economy Speeches (Speeches: 169) ---
country: 4585
thank: 2968
years: 2613
