A DuckDB-based tool for analyzing your Twitter archive data.
- Python 3.7 or higher
- `pip` and `venv` modules available
- A Twitter archive exported in the standard format (JavaScript files with
  the `window.YTD.*` prefix)
python -m venv venv
source venv/bin/activate # On macOS/Linux
# or: venv\Scripts\activate # On Windows

pip install -r requirements.txt

python -c "import duckdb; conn = duckdb.connect('twitter.db'); conn.execute(open('schema.sql').read())"

Ensure your archive is extracted to `archive/data/`. The directory should contain files like:
- `tweets.js`
- `like.js`
- `follower.js`
- `following.js`
- `account.js`
python import_tweets.py

This imports tweets, users, hashtags, mentions, URLs, and media metadata.
python import_metadata.py

After importing, verify the data was loaded correctly:
python -c "
import duckdb
conn = duckdb.connect('twitter.db')
result = conn.execute('''
SELECT 'tweets' as table_name, COUNT(*) as count FROM tweets
UNION ALL SELECT 'users', COUNT(*) FROM users
UNION ALL SELECT 'mentions', COUNT(*) FROM mentions
UNION ALL SELECT 'hashtags', COUNT(*) FROM hashtags
UNION ALL SELECT 'likes', COUNT(*) FROM likes
''').fetchall()
for row in result:
    print(f'{row[0]}: {row[1]}')"

SELECT user_id, screen_name, COUNT(*) as mention_count
FROM mentions
GROUP BY user_id, screen_name
ORDER BY mention_count DESC
LIMIT 10;

SELECT hashtag, COUNT(*) as usage_count
FROM hashtags
GROUP BY hashtag
ORDER BY usage_count DESC
LIMIT 10;

SELECT strftime(created_at, '%Y-%m') as month,
SUM(favorite_count) as total_favorites,
SUM(retweet_count) as total_retweets
FROM tweets
GROUP BY month
ORDER BY month;

SELECT EXTRACT(HOUR FROM created_at) as hour,
COUNT(*) as tweet_count
FROM tweets
GROUP BY hour
ORDER BY tweet_count DESC;

SELECT lang, COUNT(*) as count
FROM tweets
GROUP BY lang
ORDER BY count DESC;

SELECT t.tweet_id, t.full_text, t.in_reply_to_status_id
FROM tweets t
WHERE t.in_reply_to_status_id IS NOT NULL
LIMIT 20;

SELECT media_type, COUNT(*) as count
FROM media
GROUP BY media_type
ORDER BY count DESC;

Use the DuckDB CLI or Python:
# DuckDB CLI
duckdb twitter.db
# Python
python -c "
import duckdb
conn = duckdb.connect('twitter.db')
result = conn.execute('SELECT COUNT(*) FROM tweets').fetchone()
print(f'Total tweets: {result[0]}')"

- `schema.sql` - Database schema with all 10 tables
- `import_tweets.py` - Imports tweets and related entities
- `import_metadata.py` - Imports likes, followers, following, account
- `requirements.txt` - Python dependencies (duckdb)
- `twitter.db` - Generated database (after running import scripts)