In [2]:
# Resolve paths from root project directory

import os
import sys
# Absolute path of the parent of the current working directory.
module_path = os.path.abspath('..')

# Register that directory on the import path exactly once, so that modules
# living there can be imported from this notebook.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [6]:
import pandas as pd
from data_tools import load_crawled_terms, load_parsed_data

# Input file and the only fields this notebook needs from each parsed tweet.
TWEETS_PATH = '../data/14-nov/parsed_tweets.json'
TWEET_COLUMNS = {
    "datastore_id",
    "timestamp",
    "retweet_count",
    "quote_count",
    "quote_tweet",
}
# Tweets whose timestamp compares greater than this value are treated as "recent".
RECENT_CUTOFF = '2020-10-23 00:00:00'

tweet_df = load_parsed_data(
    TWEETS_PATH,
    include_cols=TWEET_COLUMNS,
    verbose=True,
    index_col="datastore_id",
)

# NOTE(review): the cutoff is a plain string; how this comparison behaves
# depends on the dtype load_parsed_data gives `timestamp` -- confirm.
is_recent = tweet_df["timestamp"] > RECENT_CUTOFF
recent_tweet_df = tweet_df[is_recent]

# Print a summary of the full (unfiltered) tweet frame.
tweet_df.info()

Loading 2696807 json lines
(4%): 100000 lines in ../data/14-nov/parsed_tweets.json processed (1.9913108348846436 sec)
(7%): 200000 lines in ../data/14-nov/parsed_tweets.json processed (2.366476058959961 sec)
(11%): 300000 lines in ../data/14-nov/parsed_tweets.json processed (1.922652006149292 sec)
(15%): 400000 lines in ../data/14-nov/parsed_tweets.json processed (1.9044489860534668 sec)
(19%): 500000 lines in ../data/14-nov/parsed_tweets.json processed (2.027184009552002 sec)
(22%): 600000 lines in ../data/14-nov/parsed_tweets.json processed (1.9191839694976807 sec)
(26%): 700000 lines in ../data/14-nov/parsed_tweets.json processed (1.942857265472412 sec)
(30%): 800000 lines in ../data/14-nov/parsed_tweets.json processed (1.9039320945739746 sec)
(33%): 900000 lines in ../data/14-nov/parsed_tweets.json processed (2.217972993850708 sec)
(37%): 1000000 lines in ../data/14-nov/parsed_tweets.json processed (2.2439732551574707 sec)
(41%): 1100000 lines in ../data/14-nov/parsed_tweets.json p

In [4]:
# Load the crawled retweet records with the loader's default settings.
RETWEETS_PATH = '../data/14-nov/parsed_retweets.json'
retweet_df = load_parsed_data(RETWEETS_PATH)

# Print a summary of the retweet frame.
retweet_df.info()

Loading 8044982 json lines
(1%): 100000 lines in ../data/14-nov/parsed_retweets.json processed (0.48587703704833984 sec)
(2%): 200000 lines in ../data/14-nov/parsed_retweets.json processed (0.5202829837799072 sec)
(4%): 300000 lines in ../data/14-nov/parsed_retweets.json processed (0.44507288932800293 sec)
(5%): 400000 lines in ../data/14-nov/parsed_retweets.json processed (0.4861891269683838 sec)
(6%): 500000 lines in ../data/14-nov/parsed_retweets.json processed (0.5641098022460938 sec)
(7%): 600000 lines in ../data/14-nov/parsed_retweets.json processed (0.5244688987731934 sec)
(9%): 700000 lines in ../data/14-nov/parsed_retweets.json processed (0.43213796615600586 sec)
(10%): 800000 lines in ../data/14-nov/parsed_retweets.json processed (0.5323379039764404 sec)
(11%): 900000 lines in ../data/14-nov/parsed_retweets.json processed (0.4484238624572754 sec)
(12%): 1000000 lines in ../data/14-nov/parsed_retweets.json processed (0.4755403995513916 sec)
(14%): 1100000 lines in ../data/14-n

# How complete is our retweet graph?

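Method: sample N recent tweets that have more than 5 retweets according to their metadata, then compare, for each one, the number of retweets we have crawled with that metadata retweet count.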

In [32]:
N = 50            # target number of tweets to sample
SAMPLE_SEED = 42  # fixed seed so a fresh "Run All" draws the same sample

# Candidates are recent tweets whose metadata reports more than 5 retweets.
# The draw is capped at the number of candidates so that a small dataset does
# not make `.sample` raise.
eligible_tweets = recent_tweet_df[recent_tweet_df["retweet_count"] > 5]
sampled_tweets = eligible_tweets.sample(
    min(N, len(eligible_tweets)), random_state=SAMPLE_SEED
)

# Count the crawled retweets of every sampled tweet in a single vectorised
# pass instead of scanning all retweet rows once per sampled tweet.
# (`Series.iteritems`, used here previously, no longer exists in pandas 2.x.)
retweeted_id_df = retweet_df["retweeted"]
crawled_retweet_counts = (
    retweeted_id_df[retweeted_id_df.isin(sampled_tweets.index)]
    .value_counts()
    # Align with the sample order; tweets with no crawled retweet get 0.
    .reindex(sampled_tweets.index, fill_value=0)
)

# One record per sampled tweet: metadata count vs. number of crawled retweets.
retweet_coverage_stats = [
    {
        "tweet_id": tweet_id,
        "timestamp": timestamp,
        "metadata_retweet_count": metadata_count,
        "crawled_retweet_count": crawled_count,
    }
    for tweet_id, timestamp, metadata_count, crawled_count in zip(
        sampled_tweets.index,
        sampled_tweets["timestamp"],
        sampled_tweets["retweet_count"],
        crawled_retweet_counts,
    )
]

# Explicit columns keep the frame well-formed even when the sample is empty.
retweet_coverage_stats_df = pd.DataFrame(
    retweet_coverage_stats,
    columns=["tweet_id", "timestamp", "metadata_retweet_count", "crawled_retweet_count"],
).sort_values('metadata_retweet_count', ascending=False)

aggregated_metadata_retweet_count = retweet_coverage_stats_df["metadata_retweet_count"].sum()
aggregated_crawled_retweet_count = retweet_coverage_stats_df["crawled_retweet_count"].sum()

# Guard the overall ratio: with an empty sample the denominator is 0.
if aggregated_metadata_retweet_count:
    overall_retweet_coverage = (
        aggregated_crawled_retweet_count / aggregated_metadata_retweet_count
    ) * 100
else:
    overall_retweet_coverage = float("nan")

# Report the number of tweets actually sampled (equals N unless capped above).
print("Retweet coverage for {} samples: {:,.1f}% ({}/{})".format(
    len(sampled_tweets),
    overall_retweet_coverage,
    aggregated_crawled_retweet_count,
    aggregated_metadata_retweet_count
))

print("...for each tweet")

# Per-tweet breakdown, largest metadata retweet count first. The sampling
# filter guarantees metadata_retweet_count > 5, so the division is safe.
for row in retweet_coverage_stats_df.itertuples(index=False):
    print("[{} tweeted at {}]: {:,.1f}% ({} / {})".format(
        row.tweet_id,
        row.timestamp,
        row.crawled_retweet_count / row.metadata_retweet_count * 100,
        row.crawled_retweet_count,
        row.metadata_retweet_count
    ))

Retweet coverage for 50 samples: 45.3% (2381/5252)
...for each tweet
[1325798366694010882 tweeted at 2020-11-09T13:52:17Z]: 45.6% (675 / 1480)
[1324020755768791040 tweeted at 2020-11-04T16:08:41Z]: 36.4% (514 / 1413)
[1326698096059846656 tweeted at 2020-11-12T01:27:29Z]: 76.2% (205 / 269)
[1326263772705517569 tweeted at 2020-11-10T20:41:38Z]: 30.6% (82 / 268)
[1325046856309637120 tweeted at 2020-11-07T12:06:03Z]: 33.6% (73 / 217)
[1327088479411589120 tweeted at 2020-11-13T03:18:44Z]: 73.0% (143 / 196)
[1324614148907540480 tweeted at 2020-11-06T07:26:37Z]: 59.9% (85 / 142)
[1324027372971188235 tweeted at 2020-11-04T16:34:59Z]: 34.0% (48 / 141)
[1325347051144667136 tweeted at 2020-11-08T07:58:55Z]: 66.4% (83 / 125)
[1326156206285746179 tweeted at 2020-11-10T13:34:13Z]: 20.0% (15 / 75)
[1326188900470190081 tweeted at 2020-11-10T15:44:07Z]: 19.2% (14 / 73)
[1325134916078587906 tweeted at 2020-11-07T17:55:58Z]: 43.8% (28 / 64)
[1324543390269263873 tweeted at 2020-11-06T02:45:27Z]: 20.6% (13

# How complete is our quote graph?

In [11]:
import pandas as pd
N = 25            # target number of tweets to sample
SAMPLE_SEED = 42  # fixed seed so a fresh "Run All" draws the same sample

# Candidates are recent tweets whose metadata reports more than 5 quotes.
# The draw is capped at the number of candidates so that a small dataset does
# not make `.sample` raise.
eligible_tweets = recent_tweet_df[recent_tweet_df["quote_count"] > 5]
sampled_tweets = eligible_tweets.sample(
    min(N, len(eligible_tweets)), random_state=SAMPLE_SEED
)

# Count, among the recent tweets, how many quote each sampled tweet. This is a
# single vectorised pass instead of one scan of all rows per sampled tweet.
# (`Series.iteritems`, used here previously, no longer exists in pandas 2.x.)
quoted_id_df = recent_tweet_df["quote_tweet"]
crawled_quote_counts = (
    quoted_id_df[quoted_id_df.isin(sampled_tweets.index)]
    .value_counts()
    # Align with the sample order; tweets with no crawled quote get 0.
    .reindex(sampled_tweets.index, fill_value=0)
)

# One record per sampled tweet: metadata count vs. number of crawled quotes.
quote_coverage_stats = [
    {
        "tweet_id": tweet_id,
        "timestamp": timestamp,
        "metadata_quote_count": metadata_count,
        "crawled_quote_count": crawled_count,
    }
    for tweet_id, timestamp, metadata_count, crawled_count in zip(
        sampled_tweets.index,
        sampled_tweets["timestamp"],
        sampled_tweets["quote_count"],
        crawled_quote_counts,
    )
]

# Explicit columns keep the frame well-formed even when the sample is empty.
quote_coverage_stats_df = pd.DataFrame(
    quote_coverage_stats,
    columns=["tweet_id", "timestamp", "metadata_quote_count", "crawled_quote_count"],
).sort_values('metadata_quote_count', ascending=False)

aggregated_metadata_quote_count = quote_coverage_stats_df["metadata_quote_count"].sum()
aggregated_crawled_quote_count = quote_coverage_stats_df["crawled_quote_count"].sum()

# Guard the overall ratio: with an empty sample the denominator is 0.
if aggregated_metadata_quote_count:
    overall_quote_coverage = (
        aggregated_crawled_quote_count / aggregated_metadata_quote_count
    ) * 100
else:
    overall_quote_coverage = float("nan")

# Report the number of tweets actually sampled (equals N unless capped above).
print("Quote coverage for {} samples: {:,.1f}% ({}/{})".format(
    len(sampled_tweets),
    overall_quote_coverage,
    aggregated_crawled_quote_count,
    aggregated_metadata_quote_count
))

print("...for each tweet")

# Per-tweet breakdown, largest metadata quote count first. The sampling filter
# guarantees metadata_quote_count > 5, so the division is safe.
# NOTE(review): a ratio above 100% is possible when more quotes were crawled
# than the stored quote_count -- presumably the metadata snapshot predates
# later quotes; confirm.
for row in quote_coverage_stats_df.itertuples(index=False):
    print("[{} tweeted at {}]: {:,.1f}% ({} / {})".format(
        row.tweet_id,
        row.timestamp,
        row.crawled_quote_count / row.metadata_quote_count * 100,
        row.crawled_quote_count,
        row.metadata_quote_count
    ))

Quote coverage for 25 samples: 42.5% (657/1545)
...for each tweet
[1324308133888593920 tweeted at 2020-11-05T11:10:38Z]: 28.0% (162 / 578)
[1325513854676242432 tweeted at 2020-11-08T19:01:44Z]: 124.7% (192 / 154)
[1323980770931806210 tweeted at 2020-11-04T13:29:48Z]: 57.3% (86 / 150)
[1325261660735614983 tweeted at 2020-11-08T02:19:36Z]: 51.9% (54 / 104)
[1325571200295251968 tweeted at 2020-11-08T22:49:36Z]: 32.2% (28 / 87)
[1324891059147714564 tweeted at 2020-11-07T01:46:58Z]: 1.5% (1 / 65)
[1324166447011082240 tweeted at 2020-11-05T01:47:37Z]: 45.5% (25 / 55)
[1324183965683515393 tweeted at 2020-11-05T02:57:14Z]: 37.5% (18 / 48)
[1324465010882478080 tweeted at 2020-11-05T21:34:00Z]: 21.6% (8 / 37)
[1325869668645564418 tweeted at 2020-11-09T18:35:37Z]: 55.6% (20 / 36)
[1326494444045086720 tweeted at 2020-11-11T11:58:15Z]: 0.0% (0 / 32)
[1325977970675232768 tweeted at 2020-11-10T01:45:58Z]: 0.0% (0 / 25)
[1325931988503703552 tweeted at 2020-11-09T22:43:15Z]: 48.0% (12 / 25)
[1324522762

# How complete is our retweet count?

Method: sampling 100 tweets from our datastore and comparing their retweet count with their retweet count according to the Twitter API.