In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

# For dimensionality reduction
import umap
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Set plotly template
# Assigning to pio.templates.default changes the template plotly applies by
# default, so it only needs to be set once here rather than per figure.
pio.templates.default = "plotly_white"

# Confirmation banner printed once the import/config cell has run
print("📚 Libraries loaded successfully!")


📚 Libraries loaded successfully!


In [2]:
# Load the topic modeling results
print("📊 Loading topic modeling data...")

# Posts with their topic assignments; 'date_parsed' is converted to datetimes
df = pd.read_csv('../data/truth_social_posts_with_topics.csv').assign(
    date_parsed=lambda posts: pd.to_datetime(posts['date_parsed'])
)

# Topic summary (JSON)
with open('../data/topic_modeling_summary.json', 'r') as summary_file:
    topic_summary = json.load(summary_file)

print(f"✅ Loaded {len(df):,} posts with topic assignments")
print(f"📅 Date range: {df['date_parsed'].min()} to {df['date_parsed'].max()}")

# Per-topic probability columns follow the 'topic_<n>_prob' naming pattern
topic_prob_cols = [
    column_name
    for column_name in df.columns
    if column_name.startswith('topic_') and column_name.endswith('_prob')
]
print(f"🎯 Found {len(topic_prob_cols)} topic probability columns")

df.head()


📊 Loading topic modeling data...
✅ Loaded 1,817 posts with topic assignments
📅 Date range: 2025-01-01 10:48:00 to 2025-07-20 20:53:00
🎯 Found 10 topic probability columns


Unnamed: 0,speaker,handle,platform,post_url,content_text,date_parsed,dominant_topic,topic_name,topic_confidence,topic_1_prob,...,topic_5_prob,topic_6_prob,topic_7_prob,topic_8_prob,topic_9_prob,topic_10_prob,hour,day_of_week,month,year_month
0,Donald Trump,@realDonaldTrump,Truth Social,https://truthsocial.com/@realDonaldTrump/posts...,Adam “Shifty” Schiff is in BIG TROUBLE! He fal...,2025-07-20 20:53:00,8,Topic 9: biden-border-joe,0.363033,0.224121,...,0.113687,0.004762,0.004762,0.004763,0.363033,0.270586,20,Sunday,7,2025-07
1,Donald Trump,@realDonaldTrump,Truth Social,https://truthsocial.com/@realDonaldTrump/posts...,Go get the GREAT NEW BOOK by Mark Levin. It’s ...,2025-07-20 20:06:00,4,Topic 5: news-fake-new,0.637996,0.009091,...,0.637996,0.009093,0.289263,0.009094,0.009091,0.009095,20,Sunday,7,2025-07
2,Donald Trump,@realDonaldTrump,Truth Social,https://truthsocial.com/@realDonaldTrump/posts...,HOW DID SAMANTHA POWER MAKE ALL OF THAT MONEY???,2025-07-20 19:56:00,4,Topic 5: news-fake-new,0.699954,0.033339,...,0.699954,0.03334,0.033333,0.033341,0.033339,0.03334,19,Sunday,7,2025-07
3,Donald Trump,@realDonaldTrump,Truth Social,https://truthsocial.com/@realDonaldTrump/posts...,Watch Mark Levin on FoxNews! NOW!!!,2025-07-20 19:53:00,4,Topic 5: news-fake-new,0.419999,0.02,...,0.419999,0.020003,0.419997,0.02,0.020001,0.02,19,Sunday,7,2025-07
4,Donald Trump,@realDonaldTrump,Truth Social,https://truthsocial.com/@realDonaldTrump/posts...,“Greatest Bitcoin explanation of all time”,2025-07-20 18:50:00,4,Topic 5: news-fake-new,0.699972,0.033338,...,0.699972,0.033337,0.033334,0.033344,0.033334,0.033336,18,Sunday,7,2025-07


In [3]:
# Prepare topic information for visualization
print("🎨 Preparing topic information for visualization...")

# Create topic color mapping.
# Set3 is a finite palette, so it is cycled: a model with more topics than the
# palette has colors still gets a color for every topic instead of raising
# IndexError. `colors` keeps one entry per topic probability column.
n_topics = len(topic_prob_cols)
palette = px.colors.qualitative.Set3  # Use qualitative colors
colors = [palette[i % len(palette)] for i in range(n_topics)]

# Create topic information dictionary, keyed by each topic's 'id' field
topic_info = {}
for topic_data in topic_summary['topics'].values():
    topic_id = topic_data['id']
    full_name = topic_data['name']
    # Strip the leading "<prefix>: " part when present. partition() splits on
    # the first ': ' only, so a label that itself contains ': ' stays intact.
    _, separator, label = full_name.partition(': ')
    topic_info[topic_id] = {
        'name': full_name,
        'short_name': label if separator else full_name,
        'top_words': ', '.join(topic_data['top_words'][:5]),
        'num_posts': topic_data['num_posts'],
        'percentage': topic_data['percentage'],
        # Modulo keeps the lookup valid for any non-negative integer topic id
        'color': palette[topic_id % len(palette)]
    }

print("📋 Topic Information:")
for topic_id, info in topic_info.items():
    # Topic ids are displayed 1-based
    print(f"   Topic {topic_id + 1}: {info['short_name']} ({info['num_posts']} posts, {info['percentage']:.1f}%)")


🎨 Preparing topic information for visualization...
📋 Topic Information:
   Topic 1: tariff-canada-country (243 posts, 13.4%)
   Topic 2: dollar-billion-tax (160 posts, 8.8%)
   Topic 3: war-thank-deal (239 posts, 13.2%)
   Topic 4: secure-energy-complete (106 posts, 5.8%)
   Topic 5: news-fake-new (190 posts, 10.5%)
   Topic 6: good-order-elon (187 posts, 10.3%)
   Topic 7: announce-pleased-pleased announce (202 posts, 11.1%)
   Topic 8: house-white-white house (174 posts, 9.6%)
   Topic 9: biden-border-joe (147 posts, 8.1%)
   Topic 10: republican-democrat-judge (169 posts, 9.3%)


In [4]:
# Extract topic probability matrix for dimensionality reduction
print("🔍 Preparing data for dimensionality reduction...")

# Rows are posts, columns are the topic probability columns
topic_probs = df[topic_prob_cols].values
n_input_dims = topic_probs.shape[1]
print(f"📊 Topic probability matrix shape: {topic_probs.shape}")

# Standardize the columns (optional, but can help)
scaler = StandardScaler()
topic_probs_scaled = scaler.fit_transform(topic_probs)

# Status summary, one line per message
for status_line in (
    "✅ Data prepared for dimensionality reduction",
    f"   Original dimensions: {n_input_dims}D",
    "   Target dimensions: 3D",
):
    print(status_line)


🔍 Preparing data for dimensionality reduction...
📊 Topic probability matrix shape: (1817, 10)
✅ Data prepared for dimensionality reduction
   Original dimensions: 10D
   Target dimensions: 3D


In [5]:
# Apply UMAP for 3D dimensionality reduction
print("🗺️ Applying UMAP dimensionality reduction...")

# Resolve the UMAP class: use the top-level attribute when the imported `umap`
# module exposes it, otherwise fall back to the `umap.umap_` submodule.
try:
    UMAP = umap.UMAP
except AttributeError:
    try:
        from umap.umap_ import UMAP
    except ImportError as exc:
        # Chain the original error so the real import failure stays visible
        raise ImportError(
            "UMAP could not be imported. Please ensure the 'umap-learn' package is installed."
        ) from exc

# UMAP parameters for good 3D clustering
# NOTE(review): the input below is the standardized matrix, not the raw
# probabilities, so the "probability distributions" rationale for the cosine
# metric may not apply as written — confirm this combination is intended.
umap_3d = UMAP(
    n_components=3,
    n_neighbors=15,  # Balance between local and global structure
    min_dist=0.1,    # Minimum distance between points
    metric='cosine', # Good for probability distributions
    random_state=42  # Fixed seed
)

# Fit and transform
umap_coords = umap_3d.fit_transform(topic_probs_scaled)

print("✅ UMAP completed!")
print(f"   3D coordinates shape: {umap_coords.shape}")
# Report the min/max of each of the three embedding columns
for axis_label, axis_values in zip("XYZ", umap_coords.T):
    print(f"   {axis_label} range: [{axis_values.min():.2f}, {axis_values.max():.2f}]")

# Add coordinates to dataframe (one column per embedding dimension)
for axis_index, column_name in enumerate(('x_3d', 'y_3d', 'z_3d')):
    df[column_name] = umap_coords[:, axis_index]


🗺️ Applying UMAP dimensionality reduction...


  warn(


✅ UMAP completed!
   3D coordinates shape: (1817, 3)
   X range: [-6.89, 16.96]
   Y range: [-6.07, 36.25]
   Z range: [-6.56, 11.60]


In [6]:
# Create hover text with post content and topic information
print("📝 Creating hover text with post content...")

# textwrap.wrap is used by create_hover_text to break post text into short lines
import textwrap

def create_hover_text(row, wrap_width=60, topics=None, max_chars=200):
    """Build the HTML hover label for a single post.

    Args:
        row: Mapping-like record (e.g. one DataFrame row) providing the keys
            'content_text', 'dominant_topic', 'topic_confidence',
            'date_parsed' and 'speaker'.
        wrap_width: Maximum number of characters per wrapped content line.
        topics: Mapping of topic id -> dict with a 'short_name' entry.
            Defaults to the notebook-level ``topic_info``.
        max_chars: Content longer than this many characters is cut and
            suffixed with "...".

    Returns:
        str: Label built with <b>/<br> markup containing the date, topic
        name, confidence, wrapped content and speaker.
    """
    if topics is None:
        topics = topic_info

    # Missing text (None/NaN) would break len() and wrapping; treat it as empty
    content = row['content_text']
    if not isinstance(content, str):
        content = "" if pd.isna(content) else str(content)

    # Truncate content if too long
    if len(content) > max_chars:
        content = content[:max_chars] + "..."

    # Wrap the content and join the lines with <br> for multiline display
    content_multiline = "<br>".join(textwrap.wrap(content, width=wrap_width))

    # Fall back to a placeholder when the topic id has no entry
    topic_entry = topics.get(row['dominant_topic'])
    topic_name = topic_entry['short_name'] if topic_entry is not None else "Unknown"
    confidence = row['topic_confidence']

    # Format date; missing dates (None/NaT) cannot be formatted with strftime
    date_value = row['date_parsed']
    if pd.isna(date_value):
        date_str = "Unknown"
    else:
        date_str = date_value.strftime('%Y-%m-%d %H:%M')

    hover_text = (
        f"<b>Date:</b> {date_str}<br>"
        f"<b>Topic:</b> {topic_name} (Confidence: {confidence:.3f})<br>"
        f"<b>Content:</b><br>{content_multiline}<br>"
        f"<b>Speaker:</b> {row['speaker']}"
    )

    return hover_text

# Build one hover label per post (row-wise) and store it on the dataframe
hover_labels = df.apply(create_hover_text, axis=1)
df['hover_text'] = hover_labels

print(f"✅ Hover text created for {df.shape[0]} posts")
# Header and first label are printed on separate lines
print("\n📋 Sample hover text:", df['hover_text'].iloc[0], sep="\n")


📝 Creating hover text with post content...
✅ Hover text created for 1817 posts

📋 Sample hover text:
<b>Date:</b> 2025-07-20 20:53<br><b>Topic:</b> biden-border-joe (Confidence: 0.363)<br><b>Content:</b><br>Adam “Shifty” Schiff is in BIG TROUBLE! He falsified Loan<br>Documents. He once said my son would go to prison on a SCAM<br>that Schiff, along with other Crooked Dems, illegally<br>“manufactured” in order to s...<br><b>Speaker:</b> Donald Trump


In [7]:
# Create the 3D interactive scatter plot
print("🎨 Creating 3D interactive scatter plot...")

# Create the main 3D scatter plot
fig = go.Figure()

# Add one trace per topic so each topic gets its own color and legend entry
for topic_id in sorted(topic_info.keys()):
    topic_posts = df[df['dominant_topic'] == topic_id]

    # Topics without any posts get no trace
    if len(topic_posts) == 0:
        continue

    fig.add_trace(go.Scatter3d(
        x=topic_posts['x_3d'],
        y=topic_posts['y_3d'],
        z=topic_posts['z_3d'],
        mode='markers',
        name=f"Topic {topic_id + 1}: {topic_info[topic_id]['short_name']}",
        marker=dict(
            size=5,
            color=topic_info[topic_id]['color'],
            opacity=0.8,
            line=dict(width=0.5, color='darkgray')
        ),
        text=topic_posts['hover_text'],
        # hovertemplate overrides hoverinfo, so only the template is set;
        # <extra></extra> hides the trace name in the hover box
        hovertemplate='%{text}<extra></extra>'
    ))

# One spec per scene axis: title and grid styling are defined together so the
# title cannot be lost to a separate axis assignment
scene_axes = {
    f"{axis_letter}axis": dict(
        title=dict(text=f"UMAP Dimension {dimension}"),
        gridcolor="lightgray",
        gridwidth=1
    )
    for dimension, axis_letter in enumerate("xyz", start=1)
}

# Update layout for better visualization
fig.update_layout(
    title=dict(
        text="🎯 3D Interactive Topic Clustering - Truth Social Posts<br><sub>Hover over points to see post content | Colors represent different topics</sub>",
        x=0.5,
        font=dict(size=20)
    ),
    scene=dict(
        camera=dict(
            eye=dict(x=1.5, y=1.5, z=1.5)  # Good initial viewing angle
        ),
        bgcolor="rgba(240,240,240,0.1)",
        **scene_axes
    ),
    width=1300,
    height=800,
    font=dict(size=12),
    legend=dict(
        x=0.02,
        y=0.98,
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="gray",
        borderwidth=1
    )
)

print("✅ 3D scatter plot created!")
print(f"   📊 {len(df)} posts plotted across {n_topics} topics")
print("   🖱️  Interactive features:")
print("      - Hover over points to see post content")
print("      - Click and drag to rotate the plot")
print("      - Scroll to zoom in/out")
print("      - Click legend items to show/hide topics")

# Display the plot
fig.show()


🎨 Creating 3D interactive scatter plot...
✅ 3D scatter plot created!
   📊 1817 posts plotted across 10 topics
   🖱️  Interactive features:
      - Hover over points to see post content
      - Click and drag to rotate the plot
      - Scroll to zoom in/out
      - Click legend items to show/hide topics


In [8]:
# Save the interactive plot as HTML file
print("💾 Saving interactive visualization...")

# Output locations, defined once so the calls and the messages cannot drift apart
html_path = "../images/3d_topic_clustering_interactive.html"
coords_csv_path = '../data/truth_social_posts_with_3d_coords.csv'

# Create the output folders up front so the writes below do not fail when a
# target directory is missing (e.g. ../images on a fresh checkout)
for output_path in (html_path, coords_csv_path):
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Save 3D plot
fig.write_html(html_path)
print(f"✅ 3D interactive plot saved to '{html_path}'")

# Save the enhanced dataset with 3D coordinates (selected columns only)
df_export = df[[
    'speaker', 'handle', 'content_text', 'date_parsed',
    'dominant_topic', 'topic_name', 'topic_confidence',
    'x_3d', 'y_3d', 'z_3d'
]].copy()

df_export.to_csv(coords_csv_path, index=False)
print(f"✅ Enhanced dataset with 3D coordinates saved to '{coords_csv_path}'")

print("\n🎉 3D Interactive Topic Clustering Complete!")
print("📁 Files created:")
print("   📊 3d_topic_clustering_interactive.html - Interactive 3D visualization")
print("   📄 truth_social_posts_with_3d_coords.csv - Data with 3D coordinates")
print("\n🖱️  Open the HTML file in your browser to interact with the visualization!")


💾 Saving interactive visualization...
✅ 3D interactive plot saved to '../images/3d_topic_clustering_interactive.html'
✅ Enhanced dataset with 3D coordinates saved to '../data/truth_social_posts_with_3d_coords.csv'

🎉 3D Interactive Topic Clustering Complete!
📁 Files created:
   📊 3d_topic_clustering_interactive.html - Interactive 3D visualization
   📄 truth_social_posts_with_3d_coords.csv - Data with 3D coordinates

🖱️  Open the HTML file in your browser to interact with the visualization!
