In [8]:
import pandas as pd
import json
from datetime import datetime

# Load the JSON data
with open('truth_social_posts_final.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

print(f"Loaded {len(data)} posts from JSON file")


Loaded 6550 posts from JSON file


In [9]:
# Convert JSON to DataFrame
df = pd.DataFrame(data)

print("DataFrame created successfully!")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nDataFrame info:")
df.info()


DataFrame created successfully!
Shape: (6550, 10)
Columns: ['speaker', 'handle', 'date', 'platform', 'post_url', 'image_url', 'deleted_flag', 'content_text', 'content_html', 'content_links']

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6550 entries, 0 to 6549
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   speaker        6550 non-null   object
 1   handle         6550 non-null   object
 2   date           6550 non-null   object
 3   platform       6550 non-null   object
 4   post_url       6352 non-null   object
 5   image_url      6550 non-null   object
 6   deleted_flag   6550 non-null   bool  
 7   content_text   5289 non-null   object
 8   content_html   5289 non-null   object
 9   content_links  2599 non-null   object
dtypes: bool(1), object(9)
memory usage: 467.1+ KB


In [10]:
# Display first few rows of the DataFrame
print("First 5 rows:")
df.head()


First 5 rows:


Unnamed: 0,speaker,handle,date,platform,post_url,image_url,deleted_flag,content_text,content_html,content_links
0,Donald Trump,@realDonaldTrump,"July 18, 2025 @ 11:17 PM ET",Truth Social,https://truthsocial.com/@realDonaldTrump/posts...,https://media-cdn.factba.se/realdonaldtrump-tr...,False,,,
1,Donald Trump,@realDonaldTrump,"July 18, 2025 @ 10:03 PM ET",Truth Social,https://truthsocial.com/@realDonaldTrump/posts...,https://media-cdn.factba.se/realdonaldtrump-tr...,False,,,
2,Donald Trump,@realDonaldTrump,"July 18, 2025 @ 9:39 PM ET",Truth Social,https://truthsocial.com/@realDonaldTrump/posts...,https://media-cdn.factba.se/realdonaldtrump-tr...,False,We have fulfilled so many of our promises…✅One...,"<div class=""text-sm font-medium whitespace-pre...",
3,Donald Trump,@realDonaldTrump,"July 18, 2025 @ 8:51 PM ET",Truth Social,https://truthsocial.com/@realDonaldTrump/posts...,https://media-cdn.factba.se/realdonaldtrump-tr...,False,RT@realDonaldTrumpEverybody should watch Sean ...,"<div class=""text-sm font-medium whitespace-pre...",[{'url': 'https://truthsocial.com/@realDonaldT...
4,Donald Trump,@realDonaldTrump,"July 18, 2025 @ 8:46 PM ET",Truth Social,https://truthsocial.com/@realDonaldTrump/posts...,https://media-cdn.factba.se/realdonaldtrump-tr...,False,,,


In [11]:
# Basic data exploration
print("Data Summary:")
print("=============")
print(f"Total posts: {len(df)}")
print(f"Unique speakers: {df['speaker'].nunique()}")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print(f"Posts with images: {df['image_url'].notna().sum()}")
print(f"Deleted posts: {df['deleted_flag'].sum()}")

print("\nSpeaker distribution:")
print(df['speaker'].value_counts().head(10))


Data Summary:
Total posts: 6550
Unique speakers: 1
Date range: April 1, 2025 @ 10:23 PM ET to September 9, 2024 @ 9:59 AM ET
Posts with images: 6550
Deleted posts: 199

Speaker distribution:
speaker
Donald Trump    6550
Name: count, dtype: int64


In [12]:
# Convert date strings to datetime objects
def parse_date(date_str):
    """Parse date string format like 'January 1, 2025 @ 10:48 AM ET'"""
    try:
        # Remove the timezone part and @ symbol
        date_part = date_str.split(' @')[0]
        time_part = date_str.split('@ ')[1].replace(' ET', '')
        
        # Combine and parse
        full_date_str = f"{date_part} {time_part}"
        return pd.to_datetime(full_date_str, format='%B %d, %Y %I:%M %p')
    except:
        # Fallback for any parsing issues
        return pd.NaT

# Apply the conversion
print("Converting date strings to datetime objects...")
df['date_parsed'] = df['date'].apply(parse_date)

# Check the conversion
print("Date conversion complete!")
print(f"Successfully parsed: {df['date_parsed'].notna().sum()} out of {len(df)} dates")
print(f"Failed to parse: {df['date_parsed'].isna().sum()} dates")

# Show some examples
print("\nExample date conversions:")
print(df[['date', 'date_parsed']].head())


Converting date strings to datetime objects...
Date conversion complete!
Successfully parsed: 6550 out of 6550 dates
Failed to parse: 0 dates

Example date conversions:
                          date         date_parsed
0  July 18, 2025 @ 11:17 PM ET 2025-07-18 23:17:00
1  July 18, 2025 @ 10:03 PM ET 2025-07-18 22:03:00
2   July 18, 2025 @ 9:39 PM ET 2025-07-18 21:39:00
3   July 18, 2025 @ 8:51 PM ET 2025-07-18 20:51:00
4   July 18, 2025 @ 8:46 PM ET 2025-07-18 20:46:00


In [13]:
# Filter for posts after January 1, 2025
jan_2025 = pd.Timestamp('2025-01-01')
posts_after_jan_2025 = df[df['date_parsed'] >= jan_2025].copy()

print(f"Posts after January 1, 2025: {len(posts_after_jan_2025)}")
print(f"Date range: {posts_after_jan_2025['date_parsed'].min()} to {posts_after_jan_2025['date_parsed'].max()}")

# Sort by date (newest first)
posts_after_jan_2025 = posts_after_jan_2025.sort_values('date_parsed', ascending=False)

print(f"\n{len(posts_after_jan_2025)} posts after January 2025")


Posts after January 1, 2025: 3492
Date range: 2025-01-01 10:48:00 to 2025-07-18 23:17:00

3492 posts after January 2025


In [14]:
df.to_csv('truth_social_posts_final_2025.csv', index=False)
posts_after_jan_2025.to_csv('truth_social_posts_final_2025.csv', index=False)