In [9]:
import pandas as pd
import json
from datetime import datetime

# Load the JSON data
with open('../data/truth_social_posts_final.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

print(f"Loaded {len(data)} posts from JSON file")


Loaded 6650 posts from JSON file


In [10]:
# Convert JSON to DataFrame
df = pd.DataFrame(data)

print("DataFrame created successfully!")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nDataFrame info:")
df.info()


DataFrame created successfully!
Shape: (6650, 10)
Columns: ['speaker', 'handle', 'date', 'platform', 'post_url', 'content_text', 'content_html', 'image_url', 'deleted_flag', 'content_links']

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6650 entries, 0 to 6649
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   speaker        6650 non-null   object
 1   handle         6650 non-null   object
 2   date           6650 non-null   object
 3   platform       6650 non-null   object
 4   post_url       6450 non-null   object
 5   content_text   5363 non-null   object
 6   content_html   5363 non-null   object
 7   image_url      6650 non-null   object
 8   deleted_flag   6650 non-null   bool  
 9   content_links  2620 non-null   object
dtypes: bool(1), object(9)
memory usage: 474.2+ KB


In [11]:
# Display first few rows of the DataFrame
print("First 5 rows:")
df.head()


First 5 rows:


Unnamed: 0,speaker,handle,date,platform,post_url,content_text,content_html,image_url,deleted_flag,content_links
0,Donald Trump,@realDonaldTrump,"July 20, 2025 @ 8:53 PM ET",Truth Social,https://truthsocial.com/@realDonaldTrump/posts...,Adam “Shifty” Schiff is in BIG TROUBLE! He fal...,"<div class=""text-sm font-medium whitespace-pre...",https://media-cdn.factba.se/realdonaldtrump-tr...,False,
1,Donald Trump,@realDonaldTrump,"July 20, 2025 @ 8:16 PM ET",Truth Social,https://truthsocial.com/@realDonaldTrump/posts...,RT: https://truthsocial.com/users/realDonaldTr...,"<div class=""text-sm font-medium whitespace-pre...",https://media-cdn.factba.se/realdonaldtrump-tr...,False,
2,Donald Trump,@realDonaldTrump,"July 20, 2025 @ 8:06 PM ET",Truth Social,https://truthsocial.com/@realDonaldTrump/posts...,Go get the GREAT NEW BOOK by Mark Levin. It’s ...,"<div class=""text-sm font-medium whitespace-pre...",https://media-cdn.factba.se/realdonaldtrump-tr...,False,
3,Donald Trump,@realDonaldTrump,"July 20, 2025 @ 8:06 PM ET",Truth Social,https://truthsocial.com/@realDonaldTrump/posts...,RT: https://truthsocial.com/users/realDonaldTr...,"<div class=""text-sm font-medium whitespace-pre...",https://media-cdn.factba.se/realdonaldtrump-tr...,False,
4,Donald Trump,@realDonaldTrump,"July 20, 2025 @ 7:56 PM ET",Truth Social,https://truthsocial.com/@realDonaldTrump/posts...,HOW DID SAMANTHA POWER MAKE ALL OF THAT MONEY???,"<div class=""text-sm font-medium whitespace-pre...",https://media-cdn.factba.se/realdonaldtrump-tr...,False,


In [12]:
# Basic data exploration
print("Data Summary:")
print("=============")
print(f"Total posts: {len(df)}")
print(f"Unique speakers: {df['speaker'].nunique()}")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print(f"Posts with images: {df['image_url'].notna().sum()}")
print(f"Deleted posts: {df['deleted_flag'].sum()}")

print("\nSpeaker distribution:")
print(df['speaker'].value_counts().head(10))


Data Summary:
Total posts: 6650
Unique speakers: 1
Date range: April 1, 2025 @ 10:23 PM ET to September 9, 2024 @ 9:59 AM ET
Posts with images: 6650
Deleted posts: 201

Speaker distribution:
speaker
Donald Trump    6650
Name: count, dtype: int64


In [13]:
# Convert date strings to datetime objects
def parse_date(date_str):
    """Parse date string format like 'January 1, 2025 @ 10:48 AM ET'"""
    try:
        # Remove the timezone part and @ symbol
        date_part = date_str.split(' @')[0]
        time_part = date_str.split('@ ')[1].replace(' ET', '')
        
        # Combine and parse
        full_date_str = f"{date_part} {time_part}"
        return pd.to_datetime(full_date_str, format='%B %d, %Y %I:%M %p')
    except:
        # Fallback for any parsing issues
        return pd.NaT

# Apply the conversion
print("Converting date strings to datetime objects...")
df['date_parsed'] = df['date'].apply(parse_date)

# Check the conversion
print("Date conversion complete!")
print(f"Successfully parsed: {df['date_parsed'].notna().sum()} out of {len(df)} dates")
print(f"Failed to parse: {df['date_parsed'].isna().sum()} dates")

# Show some examples
print("\nExample date conversions:")
print(df[['date', 'date_parsed']].head())


Converting date strings to datetime objects...
Date conversion complete!
Successfully parsed: 6650 out of 6650 dates
Failed to parse: 0 dates

Example date conversions:
                         date         date_parsed
0  July 20, 2025 @ 8:53 PM ET 2025-07-20 20:53:00
1  July 20, 2025 @ 8:16 PM ET 2025-07-20 20:16:00
2  July 20, 2025 @ 8:06 PM ET 2025-07-20 20:06:00
3  July 20, 2025 @ 8:06 PM ET 2025-07-20 20:06:00
4  July 20, 2025 @ 7:56 PM ET 2025-07-20 19:56:00


In [14]:
# Filter for posts after January 1, 2025
jan_2025 = pd.Timestamp('2025-01-01')
posts_after_jan_2025 = df[df['date_parsed'] >= jan_2025].copy()

print(f"Posts after January 1, 2025: {len(posts_after_jan_2025)}")
print(f"Date range: {posts_after_jan_2025['date_parsed'].min()} to {posts_after_jan_2025['date_parsed'].max()}")

# Sort by date (newest first)
posts_after_jan_2025 = posts_after_jan_2025.sort_values('date_parsed', ascending=False)

print(f"\n{len(posts_after_jan_2025)} posts after January 2025")


Posts after January 1, 2025: 3592
Date range: 2025-01-01 10:48:00 to 2025-07-20 20:53:00

3592 posts after January 2025


In [15]:
df.to_csv('../data/truth_social_posts_final_2025.csv', index=False)
posts_after_jan_2025.to_csv('../data/truth_social_posts_final_2025.csv', index=False)