# NER Word Cloud Visualization

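This notebook builds interactive word clouds that visualize named-entity recognition (NER) frequencies over time. Use the widgets to select a file, a year, and an entity category and explore how entity frequencies change. The Entity Trend Analyzer at the end of the notebook plots, year by year, the frequency of every entity whose name contains a search term.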

In [13]:
# Import required libraries
import json
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import ipywidgets as widgets
from IPython.display import display, clear_output
import os
from PIL import Image
import matplotlib.colors as mcolors
import glob
import re

In [14]:
# Functions to load frequency data
def load_frequency_data(json_path):
    """Read a JSON file and return its parsed content.

    Args:
        json_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        Whatever object ``json.load`` produces for the file's content.
    """
    with open(json_path, encoding='utf-8') as handle:
        return json.load(handle)

def load_all_frequency_files(root_folder):
    """Discover every ``*.json`` file under ``root_folder`` (recursively).

    Args:
        root_folder: Directory to search; its subfolders are searched too.

    Returns:
        dict mapping a display key to the file's path as returned by ``glob``.
        The key is the bare file name when that name is unique under
        ``root_folder``. When several files share a name (for example
        ``a/x.json`` and ``b/x.json``), each is keyed by its path relative to
        ``root_folder`` so that none of them silently replaces another.
    """
    pattern = os.path.join(root_folder, '**', '*.json')

    # Sort so the mapping is built in a deterministic order, and skip anything
    # that merely looks like a JSON file (e.g. a directory named "x.json").
    paths = sorted(
        path for path in glob.glob(pattern, recursive=True)
        if os.path.isfile(path)
    )

    # Group by bare file name to detect collisions across subfolders.
    paths_by_name = {}
    for path in paths:
        paths_by_name.setdefault(os.path.basename(path), []).append(path)

    json_files = {}
    for file_name, same_name_paths in paths_by_name.items():
        if len(same_name_paths) == 1:
            json_files[file_name] = same_name_paths[0]
        else:
            # Disambiguate duplicates with their path relative to the root.
            for path in same_name_paths:
                json_files[os.path.relpath(path, root_folder)] = path

    return json_files

In [15]:
# Configuration
# Root folder containing the frequency JSON files. To point the notebook at a
# different location without editing this cell, set the NER_FREQ_ROOT
# environment variable before starting the kernel; when it is unset or empty
# the original default path below is used.
root_folder = (
    os.environ.get("NER_FREQ_ROOT")
    or "/scratch/bhx5gh/IndependentStudy/NERResults/Processed/Spacy/text"
)

# Discover the JSON files (name -> path); their content is loaded lazily later
json_files = load_all_frequency_files(root_folder)
print(f"Found {len(json_files)} JSON files in {root_folder} and its subfolders.")

# Cache of parsed frequency data, keyed by file name; filled on first access
freq_data_all = {}

# Containers for the years and categories seen across all files.
# NOTE(review): nothing in the visible cells reads these two sets — confirm
# whether they are still needed.
all_years = set()
all_categories = set()

Found 12 JSON files in /scratch/bhx5gh/IndependentStudy/NERResults/Processed/Spacy/text and its subfolders.


In [16]:
# Function to generate a word cloud for a specific file, year and category
def generate_wordcloud(file_name, year, category, max_words=100):
    """Build a WordCloud from the entity frequencies of one file/year/category.

    Args:
        file_name: Key into ``json_files`` / ``freq_data_all``.
        year: Year key to look up in the file's data.
        category: Category key to look up under that year.
        max_words: Maximum number of words passed to ``WordCloud``.

    Returns:
        ``(wordcloud, entities)`` on success, where ``entities`` is the
        frequency mapping the cloud was built from. ``(None, {})`` when the
        year or category is absent or the frequency mapping is empty.
    """
    # Parse the file on first use and keep it in the shared cache
    if file_name not in freq_data_all:
        freq_data_all[file_name] = load_frequency_data(json_files[file_name])
    per_year = freq_data_all[file_name]

    if year not in per_year:
        return None, {}

    year_bucket = per_year[year]
    if category not in year_bucket:
        return None, {}

    counts = year_bucket[category]
    if not counts:
        return None, {}

    cloud_options = dict(
        width=800,
        height=400,
        background_color='white',
        max_words=max_words,
        colormap='viridis',
        random_state=42,  # fixed seed so the layout is reproducible
    )
    cloud = WordCloud(**cloud_options).generate_from_frequencies(counts)

    return cloud, counts

In [17]:
# Function to display a word cloud for a selected file, year and category
def display_wordcloud(file_name, year, category):
    """Render the word cloud for one file/year/category with a top-10 panel.

    Prints an explanatory message and returns without plotting when the year
    or category is missing from the file, or when no cloud could be built.

    Args:
        file_name: Key into ``json_files`` / ``freq_data_all``.
        year: Year key to display.
        category: Category key to display.
    """
    # Parse the file on first use and keep it in the shared cache
    if file_name not in freq_data_all:
        freq_data_all[file_name] = load_frequency_data(json_files[file_name])
    per_year = freq_data_all[file_name]

    if year not in per_year:
        print(f"No data available for year {year} in {file_name}")
        return

    if category not in per_year[year]:
        print(f"No data available for category {category} in year {year} in {file_name}")
        return

    cloud, counts = generate_wordcloud(file_name, year, category)
    if cloud is None:
        print(f"Could not generate word cloud for {category} in {year} from {file_name}")
        return

    # Draw the cloud itself
    plt.figure(figsize=(16, 8))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(f"{category} Entities in {year} from {file_name}", fontsize=16)
    plt.axis('off')

    # Side panel listing the ten most frequent entities
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    top_entities_text = '\n'.join(f"{entity}: {freq}" for entity, freq in ranked[:10])
    panel_style = dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.7)
    plt.figtext(
        0.92, 0.5, f"Top 10 Entities:\n{top_entities_text}",
        fontsize=12,
        verticalalignment='center',
        horizontalalignment='right',
        bbox=panel_style,
    )

    plt.tight_layout()
    plt.show()

In [18]:
# Function to update available years and categories for the selected file
def update_years_categories(file_name):
    """Return the sorted years and sorted categories present in a file.

    The file is parsed on first use and cached in ``freq_data_all``.

    Args:
        file_name: Key into ``json_files`` / ``freq_data_all``.

    Returns:
        ``(years, categories)``: the file's top-level keys in sorted order,
        and the sorted union of the keys found under every year.
    """
    if file_name not in freq_data_all:
        freq_data_all[file_name] = load_frequency_data(json_files[file_name])
    per_year = freq_data_all[file_name]

    years = sorted(per_year)

    # Union of the category keys across all years
    seen_categories = set().union(*(bucket.keys() for bucket in per_year.values()))
    categories = sorted(seen_categories)

    return years, categories

In [19]:
# Controls for the word-cloud viewer

# File selector listing every discovered JSON file, alphabetically
file_dropdown = widgets.Dropdown(
    options=sorted(json_files),
    description='File:',
    disabled=False,
    layout=widgets.Layout(width='50%'),
)

# Seed the year and category lists from the alphabetically first file
if not json_files:
    years, categories = [], []
else:
    first_file = min(json_files)
    years, categories = update_years_categories(first_file)

# Year selector: the slider value is an index into the list of years,
# not the year itself
year_slider = widgets.IntSlider(
    min=0,
    max=max(len(years) - 1, 0),  # stays at 0 when there are no years
    step=1,
    value=0,
    description='Year:',
    continuous_update=False,
    orientation='horizontal',
    readout=False,  # the raw index is meaningless to the reader
    layout=widgets.Layout(width='50%'),
)

# Text label that shows the year the slider currently points at
if years:
    year_label = widgets.Label(value=years[0])
else:
    year_label = widgets.Label(value="No years available")

# Connect the slider to the label
def on_year_slider_change(change):
    """Show the year that the slider's new index refers to in ``year_label``.

    The list of years is looked up for the file that is selected in
    ``file_dropdown`` at the time of the event. Reading the module-level
    ``years`` list instead would keep using the years of the file that was
    selected when the widgets were created, so the label would show the wrong
    year after switching to a file that covers different years.

    Args:
        change: Change dictionary delivered by the widget's ``observe`` hook.
    """
    if change['name'] != 'value':
        return

    selected_file = file_dropdown.value
    if selected_file is None:
        # No file available (empty dropdown): nothing to label
        return

    # Cheap after the first call: the parsed file is cached in freq_data_all
    current_years, _ = update_years_categories(selected_file)

    idx = change['new']
    if 0 <= idx < len(current_years):
        year_label.value = current_years[idx]

year_slider.observe(on_year_slider_change, names='value')

# Category selector, initially filled with the first file's categories
category_dropdown = widgets.Dropdown(
    options=categories,
    description='Category:',
    disabled=False,
    layout=widgets.Layout(width='30%')
)

In [20]:
# Function to update year and category dropdowns when file changes
def on_file_change(change):
    """Reset the year slider, year label and category list for a new file.

    Only reacts to change events on the dropdown's ``value`` trait; every
    other notification is ignored.

    Args:
        change: Change dictionary delivered by the widget's ``observe`` hook.
    """
    if change['type'] != 'change' or change['name'] != 'value':
        return

    file_years, file_categories = update_years_categories(change['new'])

    # Resize the slider to the new file's years and jump back to the first one
    year_slider.max = max(0, len(file_years) - 1)
    year_slider.value = 0

    if file_years:
        year_label.value = file_years[0]
    else:
        year_label.value = "No years available"

    # Offer the new file's categories and select the first one
    category_dropdown.options = file_categories
    if file_categories:
        category_dropdown.value = file_categories[0]

file_dropdown.observe(on_file_change)

In [21]:
# Output area that the word cloud is rendered into
output = widgets.Output()

def update_visualization(file, year_idx, category):
    """Redraw the word cloud for the current widget selection.

    Args:
        file: Selected file name.
        year_idx: Index into the selected file's sorted list of years.
        category: Selected category.
    """
    with output:
        # wait=True delays clearing until the replacement output arrives
        clear_output(wait=True)

        current_years, _ = update_years_categories(file)

        if not current_years:
            print(f"No years available for {file}")
            return

        if not (0 <= year_idx < len(current_years)):
            print(f"Invalid year index: {year_idx}")
            return

        # Translate the slider index into the actual year key
        display_wordcloud(file, current_years[year_idx], category)

# Row that pairs the year slider with the label showing the selected year
year_widget = widgets.HBox([year_slider, year_label])

# Wire the three controls to update_visualization
interactive_wordcloud = widgets.interactive(update_visualization,
                                           file=file_dropdown,
                                           year_idx=year_slider,
                                           category=category_dropdown)

# interactive() lays out only the bare control widgets, so year_label would
# never be shown (and the slider has readout=False). Swap the bare slider for
# the slider+label row; observers are attached to the widgets themselves, so
# changing the layout does not affect the wiring.
interactive_wordcloud.children = tuple(
    year_widget if child is year_slider else child
    for child in interactive_wordcloud.children
)

# Display the controls, then the area the word cloud is drawn into
display(interactive_wordcloud)
display(output)

interactive(children=(Dropdown(description='File:', layout=Layout(width='50%'), options=('all_freqs.json', 'bx…

Output()

In [22]:
# Function to find entities by substring (case insensitive)
def find_matching_entities(search_term, file_name, category=None):
    """Collect every entity whose name contains ``search_term`` (case-insensitive).

    Args:
        search_term: Substring to look for in entity names.
        file_name: Key into ``json_files`` / ``freq_data_all``.
        category: Restrict the search to this category. ``None`` or
            ``'All Categories'`` searches every category.

    Returns:
        dict mapping each matching entity to a list of
        ``(year, category, frequency)`` tuples, one per occurrence.
    """
    # Parse the file on first use and keep it in the shared cache
    if file_name not in freq_data_all:
        freq_data_all[file_name] = load_frequency_data(json_files[file_name])
    per_year = freq_data_all[file_name]

    needle = search_term.lower()
    single_category = category is not None and category != 'All Categories'

    hits = {}
    for year, year_bucket in per_year.items():
        # Decide which categories of this year are in scope
        if single_category:
            categories_in_scope = [category] if category in year_bucket else []
        else:
            categories_in_scope = year_bucket

        for cat in categories_in_scope:
            for entity, freq in year_bucket[cat].items():
                if needle in entity.lower():
                    hits.setdefault(entity, []).append((year, cat, freq))

    return hits

In [23]:
# Function to plot entity trends
def plot_entity_trend(search_term, file_name, category=None):
    """Plot per-year frequencies of every entity containing ``search_term``.

    One line is drawn per matching entity. Frequencies of the same entity in
    the same year are summed across categories, and years in which an entity
    does not occur are plotted as zero. After the plot, the number of matches
    and each entity's total frequency (highest first) are printed.

    Args:
        search_term: Substring to look for in entity names.
        file_name: Key into ``json_files`` / ``freq_data_all``.
        category: Restrict the search to this category. ``None`` or
            ``'All Categories'`` searches every category.
    """
    category_filtered = category is not None and category != 'All Categories'

    hits = find_matching_entities(search_term, file_name, category)
    if not hits:
        print(f"No entities containing '{search_term}' found in {file_name}")
        if category_filtered:
            print(f"Category: {category}")
        return

    # Every year in which at least one matching entity occurs, in order
    years = sorted({year for records in hits.values() for year, _, _ in records})

    plt.figure(figsize=(14, 7))

    for entity, records in hits.items():
        # Sum this entity's frequency per year across categories
        per_year_total = {}
        for year, _cat, freq in records:
            per_year_total[year] = per_year_total.get(year, 0) + freq

        # Missing years become zero so all lines share the same x axis
        series = [per_year_total.get(year, 0) for year in years]
        plt.plot(years, series, marker='o', linestyle='-', linewidth=2, label=entity)

    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xlabel('Year', fontsize=14)
    plt.ylabel('Frequency', fontsize=14)

    title = f"Frequency Trends for Entities Containing '{search_term}' in {file_name}"
    if category_filtered:
        title += f" (Category: {category})"
    plt.title(title, fontsize=16)

    plt.xticks(rotation=45)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

    # Text summary below the plot
    print(f"Found {len(hits)} entities containing '{search_term}'")

    entity_totals = {
        entity: sum(freq for _, _, freq in records)
        for entity, records in hits.items()
    }
    ranked_totals = sorted(entity_totals.items(), key=lambda pair: pair[1], reverse=True)
    for entity, total in ranked_totals:
        print(f"{entity}: {total}")

In [24]:
# Controls for the entity trend analyzer

# File selector listing every discovered JSON file, alphabetically
trend_file_dropdown = widgets.Dropdown(
    options=sorted(json_files),
    description='File:',
    disabled=False,
    layout=widgets.Layout(width='50%'),
)

# Category choices: an 'All Categories' entry followed by the categories of
# the alphabetically first file (when there is one)
if not json_files:
    trend_categories = ['All Categories']
else:
    first_file = min(json_files)
    _, trend_categories = update_years_categories(first_file)
    trend_categories = ['All Categories', *trend_categories]

trend_category_dropdown = widgets.Dropdown(
    options=trend_categories,
    value='All Categories',
    description='Category:',
    disabled=False,
    layout=widgets.Layout(width='30%'),
)

# Keep the trend analyzer's category list in step with its file selector
def on_trend_file_change(change):
    """Refill the trend category dropdown for the newly selected file.

    Only reacts to change events on the dropdown's ``value`` trait. The
    selection is reset to 'All Categories'.

    Args:
        change: Change dictionary delivered by the widget's ``observe`` hook.
    """
    if change['type'] != 'change' or change['name'] != 'value':
        return

    _, file_categories = update_years_categories(change['new'])
    trend_category_dropdown.options = ['All Categories', *file_categories]
    trend_category_dropdown.value = 'All Categories'

trend_file_dropdown.observe(on_trend_file_change)

# Free-text box for the search term; its value is read when the button is clicked
entity_input = widgets.Text(
    value='',
    placeholder='Enter search term (e.g. "trump")',
    description='Search:',
    disabled=False,
    layout=widgets.Layout(width='50%')
)

# Button that triggers the trend analysis for the current selections
analyze_button = widgets.Button(
    description='Analyze Trend',
    disabled=False,
    button_style='', 
    tooltip='Click to analyze frequency trends for entities containing the search term'
)

# Output area that receives the trend plot and the printed summary
trend_output = widgets.Output()

def on_button_clicked(b):
    """Run the trend analysis for the current search term, file and category.

    Clears ``trend_output`` first. A search term that is empty after
    stripping whitespace produces a prompt instead of a plot.

    Args:
        b: The clicked button (unused).
    """
    with trend_output:
        clear_output()

        term = entity_input.value.strip()
        if not term:
            print("Please enter a search term")
            return

        plot_entity_trend(
            term,
            trend_file_dropdown.value,
            trend_category_dropdown.value,
        )

analyze_button.on_click(on_button_clicked)

# Heading shown above the trend analyzer controls
trend_title = widgets.HTML(value="<h3>Entity Trend Analyzer</h3>")

# Show the heading, the two rows of controls, and the result area, in order
display(
    trend_title,
    widgets.VBox([
        widgets.HBox([trend_file_dropdown, trend_category_dropdown]),
        widgets.HBox([entity_input, analyze_button]),
    ]),
    trend_output,
)

HTML(value='<h3>Entity Trend Analyzer</h3>')

VBox(children=(HBox(children=(Dropdown(description='File:', layout=Layout(width='50%'), options=('all_freqs.js…

Output()