In [1]:
# Resolve paths from root project directory

import os
import sys
# The notebook's parent directory is treated as the project root; put it on
# sys.path (once) so project modules can be imported from here.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
from data_tools import load_parsed_data

In [90]:
# Load our tweets and retweets
from data_tools import load_crawled_terms

# dtype overrides for the two count columns
cast_cols = {"tweet_count": "int32", "quote_count": "int32"}

# Every crawled term (lower-cased) additionally gets a sparse int8 override
crawled_terms = load_crawled_terms("../keywords-3nov.txt")
cast_cols.update({term.lower(): "Sparse[int8]" for term in crawled_terms})

tweet_df = load_parsed_data(
    '../data/14-nov/parsed_tweets.json',
    # Columns that are left out of the tweet frame
    exclude_cols={
        "cleaned_text",
        "entities",
        "replyTo",
        "replyTo_user",
        "text",
        "last_retweeted",
        "place",
        "processed",
        "media",
        "isDeleted",
    },
    cast_cols=cast_cols,
    verbose=True,
    index_col="datastore_id",
)
retweet_df = load_parsed_data('../data/14-nov/parsed_retweets.json')

Loading 2696807 json lines
(4%): 100000 lines in ../data/14-nov/parsed_tweets.json processed (1.5652921199798584 sec)
(7%): 200000 lines in ../data/14-nov/parsed_tweets.json processed (1.6202399730682373 sec)
(11%): 300000 lines in ../data/14-nov/parsed_tweets.json processed (1.5936200618743896 sec)
(15%): 400000 lines in ../data/14-nov/parsed_tweets.json processed (1.4874870777130127 sec)
(19%): 500000 lines in ../data/14-nov/parsed_tweets.json processed (1.616062879562378 sec)
(22%): 600000 lines in ../data/14-nov/parsed_tweets.json processed (1.5352709293365479 sec)
(26%): 700000 lines in ../data/14-nov/parsed_tweets.json processed (1.5981290340423584 sec)
(30%): 800000 lines in ../data/14-nov/parsed_tweets.json processed (9.876200914382935 sec)
(33%): 900000 lines in ../data/14-nov/parsed_tweets.json processed (1.6165392398834229 sec)
(37%): 1000000 lines in ../data/14-nov/parsed_tweets.json processed (1.50541090965271 sec)
(41%): 1100000 lines in ../data/14-nov/parsed_tweets.json 

In [175]:
import gzip
import json

# Gzipped JSON-lines files (one tweet object per line) used as the reference
# sample against which our crawl's coverage is measured.
hydrated_tweet_dir = './coverage-test/2020-11-06/'
hydrated_tweets = [
    'us-presidential-tweet-id-2020-11-06-13.jsonl.gz',
    'us-presidential-tweet-id-2020-11-06-07.jsonl.gz',
    'us-presidential-tweet-id-2020-11-06-06.jsonl.gz',
    'us-presidential-tweet-id-2020-11-06-12.jsonl.gz'
]

# Running totals across all files
line_count = 0
tweet_count = 0
retweet_count = 0
# Reference tweets / retweets whose text contains at least one crawled term
matching_tweets = []
matching_retweets = []

# Lower-case the crawled terms once, not once per (tweet, term) pair.
lowered_terms = [term.lower() for term in crawled_terms]

for filename in hydrated_tweets:
    print("Processing {}".format(filename))
    with gzip.open(hydrated_tweet_dir + filename) as gz_file:
        for line in gz_file:
            tweet = json.loads(line)

            # An object carrying 'retweeted_status' is counted as a retweet.
            is_retweet = 'retweeted_status' in tweet
            if is_retweet:
                retweet_count += 1
            else:
                tweet_count += 1

            # Case-insensitive substring match; a tweet is recorded at most
            # once, no matter how many terms it contains.
            # NOTE(review): the sample output shows retweet 'full_text' ending
            # in '…', i.e. truncated. Terms past the cut-off may go unmatched --
            # consider matching on tweet['retweeted_status']['full_text']; confirm.
            lowered_text = tweet['full_text'].lower()
            if any(term in lowered_text for term in lowered_terms):
                if is_retweet:
                    matching_retweets.append(tweet)
                else:
                    matching_tweets.append(tweet)

            line_count += 1
            if line_count % 25000 == 0:
                print("Processed {} lines ({} tweets / {} retweets)".format(line_count, tweet_count, retweet_count))
    # Cumulative totals, reported after each file
    print("Total crawled terms found: {} tweets / {} retweets".format(len(matching_tweets), len(matching_retweets)))

Processing us-presidential-tweet-id-2020-11-06-13.jsonl.gz
Processed 25000 lines (7109 tweets / 17891 retweets)
Processed 50000 lines (14215 tweets / 35785 retweets)
Processed 75000 lines (21543 tweets / 53457 retweets)
Processed 100000 lines (28607 tweets / 71393 retweets)
Processed 125000 lines (36018 tweets / 88982 retweets)
Processed 150000 lines (43456 tweets / 106544 retweets)
Processed 175000 lines (51474 tweets / 123526 retweets)
Total crawled terms found: 1777 tweets / 6782 retweets
Processing us-presidential-tweet-id-2020-11-06-07.jsonl.gz
Processed 200000 lines (59856 tweets / 140144 retweets)
Processed 225000 lines (66951 tweets / 158049 retweets)
Processed 250000 lines (74068 tweets / 175932 retweets)
Processed 275000 lines (80356 tweets / 194644 retweets)
Processed 300000 lines (86519 tweets / 213481 retweets)
Processed 325000 lines (93934 tweets / 231066 retweets)
Processed 350000 lines (101501 tweets / 248499 retweets)
Processed 375000 lines (109277 tweets / 265723 retw

In [117]:
# Lookup table for retweets: retweet_df's `user` value -> set of that user's
# `retweeted` values.
from collections import defaultdict

retweets_by_user = defaultdict(set)

for user, retweeted in zip(retweet_df["user"], retweet_df["retweeted"]):
    retweets_by_user[user].add(retweeted)

In [140]:
# Reference tweets whose id does not appear in our crawl's index.
# NOTE(review): ids are compared as strings -- assumes tweet_df.index holds
# string ids; confirm against load_parsed_data.
missing_tweets = [
    tweet
    for tweet in matching_tweets
    if str(tweet['id']) not in tweet_df.index
]

# Reference retweets for which we have no record of that user retweeting the
# original tweet. `.get` is used instead of indexing so that looking up an
# unknown user does not insert an empty entry into the defaultdict.
missing_retweets = [
    retweet
    for retweet in matching_retweets
    if str(retweet['retweeted_status']['id'])
    not in retweets_by_user.get(str(retweet['user']['id']), ())
]

In [183]:
def print_missing_stats(matching, missing, stats_type, terms=None):
    """Print how many reference tweets are missing, overall and per term.

    Args:
        matching: sequence of tweet dicts that matched a crawled term.
        missing: the tweet dicts from `matching` that are absent from our crawl;
            each must have a 'full_text' key.
        stats_type: label used in the summary line (e.g. 'tweets').
        terms: search terms to break the misses down by. Defaults to the
            notebook-level `crawled_terms`.

    The per-term count is the number of *distinct* 'full_text' values that
    contain the term (case-insensitive), so identical texts count once and
    the per-term numbers need not add up to `len(missing)`.
    """
    if terms is None:
        terms = crawled_terms

    # Guard against an empty reference set instead of dividing by zero.
    total = len(matching)
    missing_pct = (len(missing) / total) * 100 if total else 0.0
    print("Missing {}/{} {} ({:,.1f}%)".format(
        len(missing),
        total,
        stats_type,
        missing_pct
    ))

    # Lower-case the terms once rather than for every (tweet, term) pair.
    lowered_terms = [term.lower() for term in terms]

    # term -> set of distinct texts that contain it; a key is only created on
    # the first hit, so every stored set is non-empty.
    texts_by_term = {}
    for tweet in missing:
        full_text = tweet['full_text']
        lowered_text = full_text.lower()
        for term in lowered_terms:
            if term in lowered_text:
                texts_by_term.setdefault(term, set()).add(full_text)

    # Most-missed term first; ties keep first-seen order (sorted is stable).
    for term, texts in sorted(texts_by_term.items(), key=lambda item: -len(item[1])):
        print("Missed {} for term: '{}'".format(
            len(texts),
            term
        ))

# Coverage report for original (non-retweet) tweets
print_missing_stats(
    matching=matching_tweets,
    missing=missing_tweets,
    stats_type='tweets',
)

Missing 2792/6199 tweets (45.0%)
Missed 1694 for term: 'voter fraud'
Missed 314 for term: '#stopthesteal'
Missed 289 for term: '#voterfraud'
Missed 253 for term: 'election fraud'
Missed 83 for term: '#electionfraud'
Missed 76 for term: 'ballot harvesting'
Missed 64 for term: 'ballot fraud'
Missed 39 for term: 'election interference'
Missed 26 for term: 'democrats cheat'
Missed 15 for term: '#electioninterference'
Missed 14 for term: '#ballotharvesting'
Missed 14 for term: '#cheatingdemocrats'
Missed 10 for term: 'stolen ballots'
Missed 10 for term: 'election tampering'
Missed 5 for term: 'cheating democrats'
Missed 5 for term: '#ballotfraud'
Missed 4 for term: '#democratvoterfraud'
Missed 2 for term: '#voterfraudisreal'
Missed 2 for term: '#stopvoterfraud'
Missed 1 for term: 'discarded ballots'
Missed 1 for term: '#electiontampering'
Missed 1 for term: 'harvest ballot'


In [185]:
# Coverage report for retweets
print_missing_stats(
    matching=matching_retweets,
    missing=missing_retweets,
    stats_type='retweets',
)

Missing 11424/24572 retweets (46.5%)
Missed 1053 for term: 'voter fraud'
Missed 171 for term: 'election fraud'
Missed 158 for term: '#voterfraud'
Missed 133 for term: '#stopthesteal'
Missed 36 for term: 'ballot fraud'
Missed 36 for term: 'ballot harvesting'
Missed 30 for term: '#electionfraud'
Missed 15 for term: 'election interference'
Missed 10 for term: 'democrats cheat'
Missed 8 for term: 'stolen ballots'
Missed 7 for term: '#electioninterference'
Missed 6 for term: '#ballotharvesting'
Missed 4 for term: '#voterfraudisreal'
Missed 4 for term: 'election tampering'
Missed 3 for term: 'discarded ballots'
Missed 3 for term: '#cheatingdemocrats'
Missed 2 for term: '#ballotfraud'
Missed 1 for term: '#mailinvoterfraud'
Missed 1 for term: 'cheating democrats'
Missed 1 for term: '#stopvoterfraud'


### Random sample of missing tweets/retweets



In [179]:
import numpy as np

import random

# Dedicated, seeded generator so a fresh "Restart & Run All" prints the same
# examples and other random state is left alone.
sample_rng = random.Random(42)

# Sampling is without replacement (np.random.choice samples with replacement
# by default, so it could print the same tweet twice) and is capped at the
# list length, so short or empty lists do not raise.
print("-- Missing tweets --")
for tweet in sample_rng.sample(missing_tweets, min(3, len(missing_tweets))):
    print("[{}]: {}".format(tweet['id'], tweet['full_text']))
    print()

print()

print("-- Missing retweets --")
for retweet in sample_rng.sample(missing_retweets, min(5, len(missing_retweets))):
    print("[{}]: {}".format(retweet['id'], retweet['full_text']))
    print()

-- Missing tweets --
[1324594699265454088]: The MSM won't show you but this is why Biden never left his basement, he never needed to campaign because the election was hoax #stopthesteal #TRUMPWILLTRIUMPH #voterfraud https://t.co/5kRly10MZw

[1324709102669438981]: @caseycoley5 @EricTrump No, Biden did not brag about committing voter fraud https://t.co/uSj9kMQzQ5

[1324617620935593987]: VOTER FRAUD https://t.co/EWhSrhhrjk


-- Missing retweets --
[1324604593888702464]: RT @Wizard_Predicts: 🚨BREAKING: DOJ Arrests U.S. Postal Worker Caught at Canadian Border With Stolen Ballots In Car Trunk. https://t.co/S2Z…

[1324598872572891136]: RT @BuCap004: Pastor Torah Grace proves voter fraud, shares her experience and provides evidence:

Part 1 https://t.co/EPzAJkvAcT

[1324702635379871745]: RT @Al_Sanchino: It’s wild Trump supporters believe there’s voter fraud with 0 evidence but couldn’t believe he sexually assaulted any wome…

[1324707245272100864]: RT @mimzybug: Ballot harvesting, a habit of 