In [2]:
# Resolve paths from root project directory

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [6]:
import pandas as pd
from data_tools import load_crawled_terms, load_parsed_data

tweet_df = load_parsed_data('../data/14-nov/parsed_tweets.json', include_cols={
    "datastore_id",
    "timestamp",
    "retweet_count",
    "quote_count",
    "quote_tweet"
}, verbose=True, index_col="datastore_id")
recent_tweet_df = tweet_df[tweet_df.timestamp > '2020-10-23 00:00:00']
tweet_df.info()

Loading 2696807 json lines
(4%): 100000 lines in ../data/14-nov/parsed_tweets.json processed (1.9913108348846436 sec)
(7%): 200000 lines in ../data/14-nov/parsed_tweets.json processed (2.366476058959961 sec)
(11%): 300000 lines in ../data/14-nov/parsed_tweets.json processed (1.922652006149292 sec)
(15%): 400000 lines in ../data/14-nov/parsed_tweets.json processed (1.9044489860534668 sec)
(19%): 500000 lines in ../data/14-nov/parsed_tweets.json processed (2.027184009552002 sec)
(22%): 600000 lines in ../data/14-nov/parsed_tweets.json processed (1.9191839694976807 sec)
(26%): 700000 lines in ../data/14-nov/parsed_tweets.json processed (1.942857265472412 sec)
(30%): 800000 lines in ../data/14-nov/parsed_tweets.json processed (1.9039320945739746 sec)
(33%): 900000 lines in ../data/14-nov/parsed_tweets.json processed (2.217972993850708 sec)
(37%): 1000000 lines in ../data/14-nov/parsed_tweets.json processed (2.2439732551574707 sec)
(41%): 1100000 lines in ../data/14-nov/parsed_tweets.json p

In [4]:
retweet_df = load_parsed_data('../data/14-nov/parsed_retweets.json')

retweet_df.info()

Loading 8044982 json lines
(1%): 100000 lines in ../data/14-nov/parsed_retweets.json processed (0.48587703704833984 sec)
(2%): 200000 lines in ../data/14-nov/parsed_retweets.json processed (0.5202829837799072 sec)
(4%): 300000 lines in ../data/14-nov/parsed_retweets.json processed (0.44507288932800293 sec)
(5%): 400000 lines in ../data/14-nov/parsed_retweets.json processed (0.4861891269683838 sec)
(6%): 500000 lines in ../data/14-nov/parsed_retweets.json processed (0.5641098022460938 sec)
(7%): 600000 lines in ../data/14-nov/parsed_retweets.json processed (0.5244688987731934 sec)
(9%): 700000 lines in ../data/14-nov/parsed_retweets.json processed (0.43213796615600586 sec)
(10%): 800000 lines in ../data/14-nov/parsed_retweets.json processed (0.5323379039764404 sec)
(11%): 900000 lines in ../data/14-nov/parsed_retweets.json processed (0.4484238624572754 sec)
(12%): 1000000 lines in ../data/14-nov/parsed_retweets.json processed (0.4755403995513916 sec)
(14%): 1100000 lines in ../data/14-n

# How complete is our retweet graph?

Method: sampling N tweets from our dataset and checking how many retweets we have crawled

In [13]:
N =     
sampled_tweets = recent_tweet_df[recent_tweet_df["retweet_count"] > 5].sample(N)
retweeted_id_df = retweet_df["retweeted"]
retweet_coverage_stats = []

for tweet_id, tweet in sampled_tweets.iterrows():
    stats = {}
    crawled_retweet_count = 0

    for i, retweeted_tweet_id in retweeted_id_df.iteritems():
        if retweeted_tweet_id == tweet_id:
            crawled_retweet_count += 1

    stats["tweet_id"] = tweet_id
    stats["timestamp"] = tweet["timestamp"]
    stats["metadata_retweet_count"] = tweet["retweet_count"]
    stats["crawled_retweet_count"] = crawled_retweet_count

    retweet_coverage_stats.append(stats)

retweet_coverage_stats_df = pd.DataFrame(retweet_coverage_stats).sort_values('metadata_retweet_count', ascending=False)

aggregated_metadata_retweet_count = retweet_coverage_stats_df["metadata_retweet_count"].sum()
aggregated_crawled_retweet_count = retweet_coverage_stats_df["crawled_retweet_count"].sum()

print("Retweet coverage for {} samples: {:,.1f}% ({}/{})".format(
    N,
    (aggregated_crawled_retweet_count / aggregated_metadata_retweet_count) * 100,
    aggregated_crawled_retweet_count,
    aggregated_metadata_retweet_count
))

print("...for each tweet")

for i, tweet in retweet_coverage_stats_df.iterrows():
    print("[{} tweeted at {}]: {:,.1f}% ({} / {})".format(
        tweet["tweet_id"],
        tweet["timestamp"],
        tweet["crawled_retweet_count"] / tweet["metadata_retweet_count"] * 100,
        tweet["crawled_retweet_count"],
        tweet["metadata_retweet_count"]
    ))

Retweet coverage for 100 samples: 39.2% (3578/9121)
...for each tweet
[1326541495118147584 tweeted at 2020-11-11T15:05:13Z]: 31.6% (1356 / 4293)
[1326000151891226624 tweeted at 2020-11-10T03:14:06Z]: 33.3% (181 / 544)
[1325995326134611970 tweeted at 2020-11-10T02:54:56Z]: 18.2% (69 / 379)
[1327435166361542656 tweeted at 2020-11-14T02:16:20Z]: 84.3% (312 / 370)
[1324915867239370752 tweeted at 2020-11-07T03:25:33Z]: 41.8% (136 / 325)
[1326331897803382784 tweeted at 2020-11-11T01:12:21Z]: 39.9% (107 / 268)
[1325916351664443392 tweeted at 2020-11-09T21:41:07Z]: 27.4% (69 / 252)
[1326188548274393088 tweeted at 2020-11-10T15:42:43Z]: 35.2% (86 / 244)
[1325866072000630788 tweeted at 2020-11-09T18:21:19Z]: 45.3% (72 / 159)
[1324601975057559553 tweeted at 2020-11-06T06:38:15Z]: 45.8% (55 / 120)
[1324853764382199814 tweeted at 2020-11-06T23:18:46Z]: 16.4% (19 / 116)
[1326017576439173121 tweeted at 2020-11-10T04:23:21Z]: 40.4% (46 / 114)
[1326960290106400768 tweeted at 2020-11-12T18:49:21Z]: 50.5

# How complete is our quote graph?

In [12]:
import pandas as pd
N = 25
sampled_tweets = recent_tweet_df[recent_tweet_df["quote_count"] > 5].sample(N)
quoted_id_df = recent_tweet_df["quote_tweet"]
quote_coverage_stats = []

for tweet_id, tweet in sampled_tweets.iterrows():
    stats = {}
    crawled_quote_count = 0

    for i, quoted_tweet_id in quoted_id_df.iteritems():
        if quoted_tweet_id == tweet_id:
            crawled_quote_count += 1

    stats["tweet_id"] = tweet_id
    stats["timestamp"] = tweet["timestamp"]
    stats["metadata_quote_count"] = tweet["quote_count"]
    stats["crawled_quote_count"] = crawled_quote_count

    quote_coverage_stats.append(stats)

quote_coverage_stats_df = pd.DataFrame(quote_coverage_stats).sort_values('metadata_quote_count', ascending=False)

aggregated_metadata_quote_count = quote_coverage_stats_df["metadata_quote_count"].sum()
aggregated_crawled_quote_count = quote_coverage_stats_df["crawled_quote_count"].sum()

print("Quote coverage for {} samples: {:,.1f}% ({}/{})".format(
    N,
    (aggregated_crawled_quote_count / aggregated_metadata_quote_count) * 100,
    aggregated_crawled_quote_count,
    aggregated_metadata_quote_count
))

print("...for each tweet")

for i, tweet in quote_coverage_stats_df.iterrows():
    print("[{} tweeted at {}]: {:,.1f}% ({} / {})".format(
        tweet["tweet_id"],
        tweet["timestamp"],
        tweet["crawled_quote_count"] / tweet["metadata_quote_count"] * 100,
        tweet["crawled_quote_count"],
        tweet["metadata_quote_count"]
    ))

Quote coverage for 25 samples: 28.2% (207/733)
...for each tweet
[1324693763848048640 tweeted at 2020-11-06T12:42:59Z]: 3.2% (4 / 124)
[1324664882277527552 tweeted at 2020-11-06T10:48:13Z]: 25.0% (28 / 112)
[1326688751058366464 tweeted at 2020-11-12T00:50:21Z]: 44.8% (30 / 67)
[1324539507052404737 tweeted at 2020-11-06T02:30:01Z]: 19.7% (13 / 66)
[1327162423342739456 tweeted at 2020-11-13T08:12:33Z]: 95.6% (43 / 45)
[1326234783001829376 tweeted at 2020-11-10T18:46:27Z]: 15.8% (6 / 38)
[1324473283618721796 tweeted at 2020-11-05T22:06:52Z]: 2.9% (1 / 35)
[1326932678122278915 tweeted at 2020-11-12T16:59:38Z]: 0.0% (0 / 24)
[1324579439162859521 tweeted at 2020-11-06T05:08:42Z]: 0.0% (0 / 23)
[1324193453866053633 tweeted at 2020-11-05T03:34:56Z]: 23.8% (5 / 21)
[1326937320453410816 tweeted at 2020-11-12T17:18:05Z]: 63.2% (12 / 19)
[1323624355016880143 tweeted at 2020-11-03T13:53:32Z]: 94.7% (18 / 19)
[1324306587238977536 tweeted at 2020-11-05T11:04:29Z]: 33.3% (6 / 18)
[1325954928951635971 

# How complete is our retweet count?

Method: sampling 100 tweets from our datastore and comparing their retweet count with their retweet count according to the Twitter API.