In [13]:
from notebook_utils import setup
import pandas as pd
import networkx as nx
from collections import defaultdict
import numpy as np
import pickle

setup()
DATAFRAMES_DIR = "../data/dataframes/16-dec/"

SAMPLE_SIZE = 100

def extract_month(timestamp):
    full = timestamp[:7]
    year = full[:4]
    month = full[5:]
    return month + "-" + year

def extract_month_from_date(date):
    return date[:7]


def extract_date(timestamp):
    date = timestamp[:10]
    return date

EXPORT_CHUNK_SIZE = 500000

def split_dataframe(df, chunk_size = EXPORT_CHUNK_SIZE): 
    chunks = list()
    num_chunks = len(df) // chunk_size + 1
    for i in range(num_chunks):
        chunks.append(df[i*chunk_size:(i+1)*chunk_size])
    return chunks

def map_user_status(status):
    if (status == "inactive"):
        return "deleted"
    else:
        return status

rename_dict = {}
for i in range(5):
    rename_dict["retweets_by_cluster_" + str(i)] = "retweet_count_by_community_" + str(i)
    rename_dict["quotes_by_cluster_" + str(i)] = "quote_count_by_community_" + str(i)
    rename_dict["tweets_by_cluster_" + str(i)] = "tweet_count_by_community_" + str(i)
rename_dict["tweets_by_suspended"] = "tweet_count_by_suspended_users"
rename_dict["quotes_by_suspended"] = "quote_count_by_suspended_users"
rename_dict["retweets_by_suspended"] = "retweet_count_by_suspended_users"
rename_dict["author_active_status"] = "user_active_status" 
rename_dict["l_closeness"] = "closeness_centrality_detractor_cluster"
rename_dict["r_closeness"] = "closeness_centrality_promoter_cluster"
rename_dict["quote_count"] = "quote_count_metadata"
rename_dict["retweet_count"] = "retweet_count_metadata"
rename_dict["cluster"] = "user_community"
rename_dict["user_cluster"] = "user_community"
rename_dict["active_status"] = "user_active_status"


## Exception for user
user_rename_dict = {**rename_dict}
for i in range(5):
    user_rename_dict["retweets_by_cluster_" + str(i)] = "retweet_count_by_community_" + str(i)
    user_rename_dict["quotes_by_cluster_" + str(i)] = "quote_count_by_community_" + str(i)
    user_rename_dict["tweets_by_cluster_" + str(i)] = "tweet_count_by_community_" + str(i)
user_rename_dict["retweets_by_suspended"] = "retweet_count_by_suspended_users"
user_rename_dict["quotes_by_suspended"] = "quote_count_by_suspended_users"

user_export_cols = ["user_community", "user_active_status", "closeness_centrality_detractor_cluster", "closeness_centrality_promoter_cluster"]

retweeted_cols = []
for i in range(5):
    retweeted_cols.append("retweet_count_by_community_" + str(i))
    retweeted_cols.append("quote_count_by_community_" + str(i))
retweeted_cols.append("retweet_count_by_suspended_users")
retweeted_cols.append("quote_count_by_suspended_users")
user_export_cols += retweeted_cols

media_share_cols = ["tweet_count", "retweet_count_metadata", "quote_count_metadata"]
for i in range(5):
    media_share_cols.append("tweet_count_by_community_" + str(i))
    media_share_cols.append("retweet_count_by_community_" + str(i))
    media_share_cols.append("quote_count_by_community_" + str(i))
media_share_cols.append("tweet_count_by_suspended_users")
media_share_cols.append("retweet_count_by_suspended_users")
media_share_cols.append("quote_count_by_suspended_users")


tweet_cols = ["user_community", "user_active_status", "retweet_count_metadata", "quote_count_metadata"]

image_cols = ["unique_id", "tweet_id", "a_hash", "p_hash", "w_hash"]
youtube_cols = ["video_title", "video_description", "channel_id", "channel_title", "published_at"] + media_share_cols 
url_cols = ["domain"] + media_share_cols

In [3]:
df_users.columns

Index(['created_at', 'friends_count', 'name', 'verified', 'followers_count',
       'location', 'handle', 'data_source', 'active_status', 'cluster',
       'l_closeness', 'r_closeness', 'retweets_by_cluster_0',
       'quotes_by_cluster_0', 'retweets_by_cluster_1', 'quotes_by_cluster_1',
       'retweets_by_cluster_2', 'quotes_by_cluster_2', 'retweets_by_cluster_3',
       'quotes_by_cluster_3', 'retweets_by_cluster_4', 'quotes_by_cluster_4',
       'retweets_by_suspended', 'quotes_by_suspended', 'retweets_crawled',
       'quotes_crawled', 'retweets_by_l', 'retweets_by_r',
       'retweets_suspended_ratio'],
      dtype='object')

In [46]:
user_rename_dict

{'retweets_by_cluster_0': 'retweet_count_by_community_0',
 'quotes_by_cluster_0': 'quote_count_by_community_0',
 'tweets_by_cluster_0': 'tweet_count_by_community_0',
 'retweets_by_cluster_1': 'retweet_count_by_community_1',
 'quotes_by_cluster_1': 'quote_count_by_community_1',
 'tweets_by_cluster_1': 'tweet_count_by_community_1',
 'retweets_by_cluster_2': 'retweet_count_by_community_2',
 'quotes_by_cluster_2': 'quote_count_by_community_2',
 'tweets_by_cluster_2': 'tweet_count_by_community_2',
 'retweets_by_cluster_3': 'retweet_count_by_community_3',
 'quotes_by_cluster_3': 'quote_count_by_community_3',
 'tweets_by_cluster_3': 'tweet_count_by_community_3',
 'retweets_by_cluster_4': 'retweet_count_by_community_4',
 'quotes_by_cluster_4': 'quote_count_by_community_4',
 'tweets_by_cluster_4': 'tweet_count_by_community_4',
 'tweets_by_suspended': 'tweet_count_suspended_users',
 'quotes_by_suspended': 'quote_count_by_suspended_users',
 'retweets_by_suspended': 'retweet_count_by_suspended_use

In [3]:
df_retweets = pd.read_pickle("./df_retweets_with_cluster.pickle")
df_retweets.shape[0]

25566698

In [2]:
df_recent_tweets = pd.read_pickle("./df_recent_tweets_with_final_metrics.pickle")
df_recent_tweets.shape[0]

7603103

In [2]:
df_users = pd.read_pickle("./df_users_final_with_metrics.pickle")
df_users.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2559018 entries, 3288305608 to 1313243498
Data columns (total 29 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   created_at                object 
 1   friends_count             Int64  
 2   name                      object 
 3   verified                  object 
 4   followers_count           Int64  
 5   location                  object 
 6   handle                    object 
 7   data_source               object 
 8   active_status             object 
 9   cluster                   Int64  
 10  l_closeness               float64
 11  r_closeness               float64
 12  retweets_by_cluster_0     int64  
 13  quotes_by_cluster_0       int64  
 14  retweets_by_cluster_1     int64  
 15  quotes_by_cluster_1       int64  
 16  retweets_by_cluster_2     int64  
 17  quotes_by_cluster_2       int64  
 18  retweets_by_cluster_3     int64  
 19  quotes_by_cluster_3       int64  
 20  retweets_by_clust

In [9]:
#df_users = pd.read_pickle("./df_users_final_with_metrics.pickle")
#df_users["active_status"] = df_users.apply(lambda u: map_user_status(u["active_status"]), axis=1)
df_users["active_status"].value_counts()

active       2343266
deleted       115868
suspended      99884
Name: active_status, dtype: int64

In [10]:
for cluster in range(5):
    cluster_users = df_users[df_users["cluster"] == cluster]
    print("Community {}".format(cluster))
    print("Number of users: {}".format(cluster_users.shape[0]))

    for user_id, handle, active_status, retweets in cluster_users.nlargest(10, "retweets_crawled")[["handle", "active_status", "retweets_crawled"]].itertuples():
        print("{} & {} & {} & {:,}\\\\".format(user_id, handle.replace("_", "\\_"), active_status, retweets))

Community 0
Number of users: 860976
32871086 & kylegriffin1 & active & 76,302\\
1640929196 & mmpadellan & active & 74,393\\
255812611 & donwinslow & active & 69,796\\
216776631 & BernieSanders & active & 60,961\\
15952856 & AriBerman & active & 58,222\\
3622368202 & JohnFetterman & active & 54,200\\
341190477 & TheRickyDavila & active & 51,117\\
148529707 & RBReich & active & 42,324\\
22129280 & jimsciutto & active & 41,438\\
1416374742 & Al\_Sanchino & active & 38,179\\
Community 1
Number of users: 437783
25073877 & realDonaldTrump & suspended & 1,560,373\\
187680645 & LLinWood & suspended & 1,057,805\\
586707638 & SidneyPowell1 & suspended & 633,273\\
240454812 & GenFlynn & suspended & 334,197\\
1812055789 & CodeMonkeyZ & suspended & 274,210\\
292929271 & charliekirk11 & active & 260,467\\
16989178 & JamesOKeefeIII & active & 253,221\\
770781940341288960 & RudyGiuliani & active & 238,842\\
18266688 & TomFitton & active & 227,569\\
875856268056969216 & DC\_Draino & active & 211,189\\


In [12]:
def count_active_status (df, col="active_status"):
    total = df.shape[0]
    print("Total {:,}".format(total))
    for status, val in df["active_status"].value_counts(dropna=False).to_dict().items():
        print(status, ("{:,} {:,.2f}%").format(val, (val / total) * 100))

count_active_status(df_users)

Total 2,559,018
active 2,343,266 91.57%
inactive 115,868 4.53%
suspended 99,884 3.90%


In [17]:
promoter_users = df_users[(df_users["cluster"] == 1) | (df_users["cluster"] == 2) | (df_users["cluster"] == 3) | (df_users["cluster"] == 4)]
detractor_users = df_users[df_users["cluster"] == 0]

print("Promoters")
count_active_status(promoter_users)

print("Detractors")
count_active_status(detractor_users)


print("Promoters suspended X times as many as detractors")
print(promoter_users[promoter_users["active_status"] == "suspended"].shape[0] / detractor_users[detractor_users["active_status"] == "suspended"].shape[0])

Promoters
Total 836,968
active 700,648 83.71%
inactive 71,051 8.49%
suspended 65,269 7.80%
Detractors
Total 860,976
active 846,111 98.27%
suspended 8,659 1.01%
inactive 6,206 0.72%
Promoters suspended X times as many as detractors
7.537706432613466


In [20]:
community_2_users = df_users[(df_users["cluster"] == 2)]

print("Community 2")
count_active_status(community_2_users)

print("Community 2 suspended of all suspended")
print(community_2_users[community_2_users.active_status == "suspended"].shape[0] / df_users[df_users.active_status == "suspended"].shape[0])

Community 2
Total 342,184
active 266,332 77.83%
suspended 46,031 13.45%
inactive 29,821 8.71%
Community 2 suspended of all suspended
0.46084457971246645


In [4]:
quote_tweet_col = pd.read_pickle("./df_recent_tweets_with_cluster.pickle")["quote_tweet"]
print("Total", len(quote_tweet_col))
print("OG tweets:", len(quote_tweet_col) - quote_tweet_col.count())
print("Quote tweets:", quote_tweet_col.count())


Total 7603103
OG tweets: 3781524
Quote tweets: 3821579


In [None]:
df_users.head()

## Aggregated DFs

In [2]:
import pickle 
with open("./content_shares_by_cluster.pickle", "rb") as f:
    content_shares_by_cluster = pickle.load(f)

In [3]:
import json
with open("./data_export/url_stats/16-dec/expanded_url_map.json") as json_file:
    expanded_url_map = json.load(json_file)

In [4]:
from urllib.parse import urlparse, parse_qs

def get_domain(url):
    parsed = urlparse(url)

    return parsed.netloc.replace("www.", "").lower()

url_map_with_shares = {}
for url, data in content_shares_by_cluster["url"].items():
    if "twitter.com/" not in url:
        expanded_url_data = expanded_url_map[url]["expanded_url"]
        if "url" in expanded_url_data:
            url = expanded_url_data["url"]
            if "google.com/sorry" in url:
                parsed_url = urlparse(url)
                url = parse_qs(parsed_url.query)["continue"][0]

        data["domain"] = get_domain(url)
        url_map_with_shares[url] = data

In [5]:
import json
youtube_data = []
with open("./data_export/url_stats/16-dec/youtube_data_details.json") as json_file:
    for line in json_file:
        youtube_data.append(json.loads(line))

youtube_data_by_id = {}
for yt_video in youtube_data:
  youtube_data_by_id[yt_video['id']] = yt_video

In [6]:
from urllib.parse import urlparse, parse_qs

def get_video_id(url):
    """
    Examples:
    - http://youtu.be/SA2iWivDJiE
    - http://www.youtube.com/watch?v=_oPAwA_Udwc&feature=feedu
    - http://www.youtube.com/embed/SA2iWivDJiE
    - http://www.youtube.com/v/SA2iWivDJiE?version=3&amp;hl=en_US
    """
    o = urlparse(url)
    try:
        if o.netloc == 'youtu.be':
            return o.path[1:]
        elif o.netloc in ('www.youtube.com', 'youtube.com'):
            if o.path == '/watch':
                id_index = o.query.index('v=')
                return o.query[id_index+2:id_index+13]
            elif o.path[:7] == '/embed/':
                return o.path.split('/')[2]
            elif o.path[:3] == '/v/':
                return o.path.split('/')[2]
    except:
        pass
    return None

In [7]:
youtube_map_with_shares = {}
missing_youtube_ids = set()

for url, data in url_map_with_shares.items():
    video_id = get_video_id(url)
    if (video_id is not None):
        if (video_id in youtube_data_by_id):
            if (video_id in youtube_map_with_shares):
                for col, val in data.items():
                    if col != "domain":
                        youtube_map_with_shares[video_id][col] += val
            else:
                youtube_data = youtube_data_by_id[video_id]
                youtube_share_data = data.copy()
                del youtube_share_data["domain"]
                youtube_share_data["video_title"] = youtube_data["snippet"]["title"]
                youtube_share_data["video_description"] = youtube_data["snippet"]["description"]
                youtube_share_data["channel_title"] = youtube_data["snippet"]["channelTitle"]
                youtube_share_data["channel_id"] = youtube_data["snippet"]["channelId"]
                youtube_share_data["published_at"] = youtube_data["snippet"]["publishedAt"]
                youtube_map_with_shares[video_id] = youtube_share_data
        else:
            missing_youtube_ids.add(video_id)

print("Missing data", len(missing_youtube_ids))

Missing data 1649


In [14]:
df_hashtag = pd.DataFrame.from_dict(content_shares_by_cluster["hashtag"]).T.sort_values("retweet_count", ascending=False).rename(columns=rename_dict)[media_share_cols]
df_url = pd.DataFrame.from_dict(url_map_with_shares).T.sort_values("retweet_count", ascending=False).rename(columns=rename_dict)[url_cols]
df_youtube = pd.DataFrame.from_dict(youtube_map_with_shares).T.sort_values("retweet_count", ascending=False).rename(columns=rename_dict)[youtube_cols]

In [15]:
df_url.head()

Unnamed: 0,domain,tweet_count,retweet_count_metadata,quote_count_metadata,tweet_count_by_community_0,retweet_count_by_community_0,quote_count_by_community_0,tweet_count_by_community_1,retweet_count_by_community_1,quote_count_by_community_1,...,quote_count_by_community_2,tweet_count_by_community_3,retweet_count_by_community_3,quote_count_by_community_3,tweet_count_by_community_4,retweet_count_by_community_4,quote_count_by_community_4,tweet_count_by_suspended_users,retweet_count_by_suspended_users,quote_count_by_suspended_users
https://www.foxnews.com/opinion/tucker-carlson-2020-presidential-election-voter-fraud-dead-voters.amp,foxnews.com,264,41078,6543,14,61,401,27,3250,264,...,966,4,345,39,8,483,34,62,2555,305
https://www.legislationline.org/download/id/1472/file/3b50795b2d0374cbef5c29766256.pdf,legislationline.org,148,19221,1973,0,24,92,30,2021,161,...,509,0,25,2,0,119,8,26,1793,139
https://www.houstonchronicle.com/politics/texas/article/Texas-Lt-Gov-Dan-Patrick-offers-1-million-15716973.php?utm_campaign=CMS%20Sharing%20Tools%20(Premium)&utm_source=t.co&utm_medium=referral,houstonchronicle.com,56,16627,2602,13,103,272,8,1037,77,...,345,3,74,15,0,67,4,3,1425,106
https://nypost.com/2020/11/11/usps-whistleblower-denies-wapo-claim-he-recanted-allegations/?utm_source=twitter_sitebuttons&utm_medium=site%20buttons&utm_campaign=site%20buttons,nypost.com,85,15297,983,3,13,23,11,648,36,...,224,0,35,1,0,41,1,12,1124,69
https://thetexan.news/limestone-county-individual-charged-with-134-counts-of-voter-fraud-attorney-general-announces/,thetexan.news,280,14525,1459,2,13,21,41,1368,109,...,419,0,17,3,1,102,6,30,1385,138


In [19]:
df_youtube.to_csv("./data_export/final/youtube_videos.csv", index_label="video_id")
df_hashtag.to_csv("./data_export/final/hashtags.csv", index_label="hashtag")
df_url.to_csv("./data_export/final/urls.csv", index_label="url")

In [20]:
df_youtube[:SAMPLE_SIZE].to_csv("./data_export/final/samples/youtube_videos-sample.csv", index_label="video_id")
df_hashtag[:SAMPLE_SIZE].to_csv("./data_export/final/samples/hashtags-sample.csv", index_label="hashtag")
df_url[:SAMPLE_SIZE].to_csv("./data_export/final/samples/urls-sample.csv", index_label="url")

In [21]:
print(df_hashtag.info())
print(df_url.info())
print(df_youtube.info())

<class 'pandas.core.frame.DataFrame'>
Index: 174604 entries, stopthesteal to fuckclinton
Data columns (total 21 columns):
 #   Column                            Non-Null Count   Dtype
---  ------                            --------------   -----
 0   tweet_count                       174604 non-null  int64
 1   retweet_count_metadata            174604 non-null  int64
 2   quote_count_metadata              174604 non-null  int64
 3   tweet_count_by_community_0        174604 non-null  int64
 4   retweet_count_by_community_0      174604 non-null  int64
 5   quote_count_by_community_0        174604 non-null  int64
 6   tweet_count_by_community_1        174604 non-null  int64
 7   retweet_count_by_community_1      174604 non-null  int64
 8   quote_count_by_community_1        174604 non-null  int64
 9   tweet_count_by_community_2        174604 non-null  int64
 10  retweet_count_by_community_2      174604 non-null  int64
 11  quote_count_by_community_2        174604 non-null  int64
 12  tweet

In [36]:
df_url.sort_values("retweets_by_suspended", ascending=False)[:10]

Unnamed: 0,tweet_count,quote_count,retweet_count,retweets_by_suspended,quotes_by_suspended,tweets_by_suspended,quotes_crawled,retweets_crawled,retweets_by_cluster_0,retweets_by_cluster_1,...,quotes_by_cluster_1,quotes_by_cluster_2,quotes_by_cluster_3,quotes_by_cluster_4,tweets_by_cluster_0,tweets_by_cluster_1,tweets_by_cluster_2,tweets_by_cluster_3,tweets_by_cluster_4,domain
https://www.youtube.com/watch?v=psGpIuNh_dU,8,1288,11827,3505,398,2,1386,10827,5,544,...,146,1109,5,5,0,3,3,0,0,youtube.com
http://www.gand.uscourts.gov/news/notice-audio-streaming-court-proceeding-pearson-et-al-v-kemp-et-al,27,615,8658,2690,143,14,620,8224,2,410,...,72,467,7,1,0,2,24,1,0,gand.uscourts.gov
https://trulytimes.com/election-supervisor-shows-on-video-how-dominion-software-allows-changing-adding-votes.html?amp=1,8,683,9658,2677,168,1,655,8835,4,676,...,85,473,4,1,0,1,4,0,0,trulytimes.com
https://www.nytimes.com/2012/10/07/us/politics/as-more-vote-by-mail-faulty-ballots-could-impact-elections.html,105,834,12193,2562,143,13,748,9956,24,1205,...,92,464,8,3,1,24,48,2,0,nytimes.com
https://www.foxnews.com/opinion/tucker-carlson-2020-presidential-election-voter-fraud-dead-voters.amp,264,6543,41078,2555,305,62,2641,15021,61,3250,...,264,966,39,34,14,27,93,4,8,foxnews.com
https://www.theepochtimes.com/pennsylvania-house-leaders-file-brief-to-support-texas-in-supreme-court-lawsuit-against-pennsylvania_3613557.html,2,575,8275,2507,154,2,550,7679,3,353,...,53,423,5,2,0,1,0,0,0,theepochtimes.com
https://www.thegatewaypundit.com/2020/11/michigan-judge-tossed-pro-trump-voter-fraud-suit-caught-releasing-child-molesters/?utm_source=Twitter&utm_campaign=websitesharingbuttons,249,1044,7721,2020,300,84,972,6790,4,172,...,45,856,0,1,0,12,217,1,0,thegatewaypundit.com
https://www.legislationline.org/download/id/1472/file/3b50795b2d0374cbef5c29766256.pdf,148,1973,19221,1793,139,26,1024,9747,24,2021,...,161,509,2,8,0,30,73,0,0,legislationline.org
https://columbusfreepress.com/article/scytl-has-all-tools-it-needs-election-fraud,158,439,6397,1773,139,47,389,5046,3,185,...,28,320,4,1,0,14,123,1,2,columbusfreepress.com
https://thedcpatriot.com/report-were-casino-addresses-listed-as-residences-of-voters-in-nevada-of-course-they-were/,4,233,4951,1714,77,3,228,4599,1,116,...,30,168,3,4,0,1,2,1,0,thedcpatriot.com


In [37]:
df_youtube.sort_values("retweet_count_suspended_users", ascending=False)[:10]

Unnamed: 0,video_title,video_description,channel_id,channel_title,published_at,tweet_count,retweet_count_metadata,quote_count_metadata,tweet_count_cluster_0,retweet_count_cluster_0,...,quote_count_cluster_2,tweet_count_cluster_3,retweet_count_cluster_3,quote_count_cluster_3,tweet_count_cluster_4,retweet_count_cluster_4,quote_count_cluster_4,tweet_count_suspended_users,retweet_count_suspended_users,quote_count_suspended_users
psGpIuNh_dU,Articia Bomer Witnessed Dems Counting Spoiled ...,Articia Bomer Witnessed Dems Counting Spoiled ...,UCekdKDk6d8opUrUEXtXD4XQ,Precinct 13,2020-11-23T10:32:13Z,88,11909,1301,0,5,...,1118,0,39,5,0,50,5,34,3527,401
dB0h_50OfzA,Trump confidant: the president has all the pro...,Steve Mosher of the Population Research Instit...,UCYImiD9L0dMycenfBy2al0Q,LifeSiteNews,2020-11-13T22:14:18Z,27,937,72,0,0,...,76,0,6,0,0,7,0,4,391,21
VDf1j4IQz28,Bobby Piton testifies at AZ Election Fraud Hea...,Bobby Piton testifies at AZ Election Fraud Hea...,UCSKoabidZh5qoTF4Xg96K7A,Three Headed Eagle Alliance,2020-11-30T22:54:54Z,65,919,45,0,0,...,28,0,0,0,0,5,1,9,266,6
Avy8eCHYd6I,11.13.20: D@Mini@n GOT CAUGHT! Amazing details...,Protect Your Retirement W/ A Gold. IRA\nhttps:...,UCzkAzJ2vQgRyEDzHFiKOaQA,AWK NEWS,2020-11-13T14:09:33Z,9,738,85,0,0,...,61,0,2,0,0,1,0,1,240,26
fWLD-tL42kI,"11.14.20: ""Release the KRAKEN"" says @SidneyPow...",Prepare for Emergencies with a Food Supply\nht...,UCzkAzJ2vQgRyEDzHFiKOaQA,AWK NEWS,2020-11-14T13:35:51Z,7,650,102,0,0,...,69,0,2,2,0,1,0,2,239,40
KIlI46HdqKg,Smoking Gun: ES&S (Philly's Computer System) T...,In this video you will see data from the NYT f...,UCIxc8YMkny2KBaD5TQsSbpg,Edward Solomon,2020-11-20T22:54:00Z,13,781,50,0,2,...,44,0,2,1,0,3,0,1,237,12
VgMPDnWunqs,"AG Barr releases memo on election fraud, DOJ o...",The head of the Justice Department’s elections...,UCNbIDJNNgaRrXOD7VllIMRQ,One America News Network,2020-11-11T23:58:41Z,150,1605,100,1,0,...,28,0,2,0,0,3,0,27,235,8
vb4zfltMBGg,11.24.20: This is about the SURVIVAL of the GR...,"🌊 Heat & Boil Water In 30 Minutes Anywhere, Un...",UCzkAzJ2vQgRyEDzHFiKOaQA,AWK NEWS,2020-11-25T05:27:13Z,4,601,54,0,1,...,37,0,0,0,0,0,0,0,220,19
RGLHAs8kSJ8,12.4.20: SMOKING Guns everywhere! The WHOLE WO...,😎 Get The Same Solar Power Bank I Have Here: ☀...,UCzkAzJ2vQgRyEDzHFiKOaQA,AWK NEWS,2020-12-05T04:48:56Z,18,621,76,0,0,...,72,0,1,2,0,1,0,7,208,27
lriJ5mpuVY8,CNN denies voting fraud despite own reports fr...,Footage from CNN reveals they previously consi...,UCNbIDJNNgaRrXOD7VllIMRQ,One America News Network,2020-12-09T22:12:23Z,13,371,26,0,1,...,23,0,0,0,0,0,0,5,195,9


In [17]:
df_hashtag.sort_values("retweet_count_suspended_users", ascending=False)[:10]

Unnamed: 0,tweet_count,retweet_count_metadata,quote_count_metadata,tweet_count_cluster_0,retweet_count_cluster_0,quote_count_cluster_0,tweet_count_cluster_1,retweet_count_cluster_1,quote_count_cluster_1,tweet_count_cluster_2,...,quote_count_cluster_2,tweet_count_cluster_3,retweet_count_cluster_3,quote_count_cluster_3,tweet_count_cluster_4,retweet_count_cluster_4,quote_count_cluster_4,tweet_count_suspended_users,retweet_count_suspended_users,quote_count_suspended_users
stopthesteal,618759,2904540,285153,17662,15201,7750,60645,96515,20926,516242,...,149113,3921,15387,1947,1734,12191,1245,188792,557301,48784
electionfraud,168633,748128,68344,5928,4617,1607,16711,29138,5988,133381,...,47773,342,4215,476,411,3511,293,48761,188020,15735
voterfraud,214111,725446,75793,8423,7925,1677,19912,20717,5539,170801,...,43946,473,2605,409,695,3200,381,63689,140311,16167
fightback,62468,417935,39542,391,460,187,3448,12643,2154,57416,...,20179,106,2009,194,100,1668,109,28268,99819,7402
maga,91585,425527,41211,8610,6626,1599,8976,17060,4606,64001,...,25825,316,1778,269,291,3565,397,29267,97692,9734
wethepeople,20906,296242,34428,447,425,564,2638,19564,4282,16566,...,22490,100,1444,200,58,1628,163,7256,74255,7737
georgia,19460,289947,25900,2222,2748,605,2067,14496,2090,13905,...,14937,60,2556,185,120,3057,139,5235,68927,4928
trump2020,103816,291376,25995,1896,1192,883,10306,4251,1601,82310,...,16403,291,709,128,404,1218,161,35448,66898,6767
election2020,75853,396909,54530,5765,22384,6384,6553,16005,2990,52048,...,17800,137,1710,254,324,1567,159,16522,57469,7516
trumpwon,42972,193667,12680,314,124,120,2506,5791,878,39453,...,8563,75,1349,74,133,1030,56,19549,56506,3021


In [32]:
df_hashtag.head()

Unnamed: 0,tweet_count,retweet_count_metadata,quote_count_metadata,tweet_count_cluster_0,retweet_count_cluster_0,quote_count_cluster_0,tweet_count_cluster_1,retweet_count_cluster_1,quote_count_cluster_1,tweet_count_cluster_2,...,tweet_count_cluster_4,retweet_count_cluster_4,quote_count_cluster_4,tweet_count_suspended_users,retweet_count_suspended_users,quote_count_suspended_users,retweet_count_detractors,retweet_count_promoters,tweet_count_detractors,tweet_count_promoters
stopthesteal,618759,2904540,285153,17662,15201,7750,60645,96515,20926,516242,...,1734,12191,1245,188792,557301,48784,15201,1561790,17662,582542
electionfraud,168633,748128,68344,5928,4617,1607,16711,29138,5988,133381,...,411,3511,293,48761,188020,15735,4617,500632,5928,150845
voterfraud,214111,725446,75793,8423,7925,1677,19912,20717,5539,170801,...,695,3200,381,63689,140311,16167,7925,394145,8423,191881
maga,91585,425527,41211,8610,6626,1599,8976,17060,4606,64001,...,291,3565,397,29267,97692,9734,6626,261084,8610,73584
fightback,62468,417935,39542,391,460,187,3448,12643,2154,57416,...,100,1668,109,28268,99819,7402,460,251893,391,61070


In [31]:
df_hashtag["retweet_count_detractors"] = df_hashtag.apply(lambda h: h["retweet_count_cluster_0"], axis=1)
df_hashtag["retweet_count_promoters"] = df_hashtag.apply(lambda h: h["retweet_count_cluster_1"] + h["retweet_count_cluster_2"] + h["retweet_count_cluster_3"] + h["retweet_count_cluster_4"], axis=1)

df_hashtag["tweet_count_detractors"] = df_hashtag.apply(lambda h: h["tweet_count_cluster_0"], axis=1)
df_hashtag["tweet_count_promoters"] = df_hashtag.apply(lambda h: h["tweet_count_cluster_1"] + h["tweet_count_cluster_2"] + h["tweet_count_cluster_3"] + h["tweet_count_cluster_4"], axis=1)

In [10]:
df_hashtag.loc["maga"]

tweet_count                       91585
retweet_count_metadata           425527
quote_count_metadata              41211
tweet_count_cluster_0              8610
retweet_count_cluster_0            6626
quote_count_cluster_0              1599
tweet_count_cluster_1              8976
retweet_count_cluster_1           17060
quote_count_cluster_1              4606
tweet_count_cluster_2             64001
retweet_count_cluster_2          238681
quote_count_cluster_2             25825
tweet_count_cluster_3               316
retweet_count_cluster_3            1778
quote_count_cluster_3               269
tweet_count_cluster_4               291
retweet_count_cluster_4            3565
quote_count_cluster_4               397
tweet_count_suspended_users       29267
retweet_count_suspended_users     97692
quote_count_suspended_users        9734
Name: maga, dtype: int64

In [33]:
for metric in ["promoters", "detractors", "suspended_users"]:
    print("Top 10 by ", metric)
    print(df_hashtag.nlargest(10, "retweet_count_{}".format(metric))[["retweet_count_{}".format(metric), "tweet_count_{}".format(metric)]])

Top 10 by  promoters
               retweet_count_promoters  tweet_count_promoters
stopthesteal                   1561790                 582542
electionfraud                   500632                 150845
voterfraud                      394145                 191881
maga                            261084                  73584
fightback                       251893                  61070
wethepeople                     208667                  19362
georgia                         191745                  16152
election2020                    177869                  59062
trump2020                       170154                  93311
trumpwon                        136153                  42167
Top 10 by  detractors
                     retweet_count_detractors  tweet_count_detractors
election2020                            22384                    5765
protect2020                             15297                     124
stopthesteal                            15201                   1

## Candidate shares


## Media

In [22]:
df_media_with_tweets = pd.read_pickle(DATAFRAMES_DIR + 'df_media_with_tweets.pickle')
df_media_with_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 201259 entries, 5327346445844480 to 5114016670154752
Data columns (total 53 columns):
 #   Column                      Non-Null Count   Dtype           
---  ------                      --------------   -----           
 0   w_hash                      201259 non-null  object          
 1   tweet_id                    201259 non-null  object          
 2   p_hash                      201259 non-null  object          
 3   media_id                    201259 non-null  object          
 4   a_hash                      201259 non-null  object          
 5   type                        201259 non-null  object          
 6   media_url                   201259 non-null  object          
 7   urls                        201259 non-null  object          
 8   hasMedia                    201259 non-null  bool            
 9   hashtags                    201259 non-null  object          
 10  retweet_count               201259 non-null  int32          

In [23]:
relevant_media_dict_cols = [key for key in next(iter(content_shares_by_cluster["media_by_tweet_id"].values())).keys() if key not in ["tweet_count", "retweet_count", "quote_count"] and not key.startswith("tweets_by_")]

relevant_media_dict_cols

['retweets_by_suspended',
 'quotes_by_suspended',
 'quotes_crawled',
 'retweets_crawled',
 'retweets_by_cluster_0',
 'retweets_by_cluster_1',
 'retweets_by_cluster_2',
 'retweets_by_cluster_3',
 'retweets_by_cluster_4',
 'quotes_by_cluster_0',
 'quotes_by_cluster_1',
 'quotes_by_cluster_2',
 'quotes_by_cluster_3',
 'quotes_by_cluster_4']

In [13]:
with open("./candidate_shares.pickle", "rb") as f:
    candidates_json = pickle.load(f)
    candidate_shares = candidates_json["candidate_shares"]
    candidate_map = candidates_json["candidate_map"]

In [30]:
promoter_cluster_p_hashes = set(["b9b7628638c935ca", "d47a09e12cb81fc7", "9a2f6093d0aadcec"])
detractor_cluster_p_hashes = set(["db89245c35ad52ad", "d5ad2ad8d5274658", "9ab74070ec46f0bd"])

media_shares_by_candidates = {
    "retweets": defaultdict(lambda: []),
    "tweets": defaultdict(lambda: [])
}

for i, tweet_id, user, p_hash in df_media_with_tweets[["tweet_id", "user", "p_hash"]].itertuples():
    if (user in candidate_map):
        media_shares_by_candidates["tweets"][user].append(tweet_id)

        if (p_hash in promoter_cluster_p_hashes):
            print("p_hash match, promoter tweet")

        if (p_hash in detractor_cluster_p_hashes):
            print("p_hash match, detractor tweet")

    if (tweet_id in candidate_shares):
        media_shares_by_candidates["retweets"][user].append(tweet_id)

        if (p_hash in promoter_cluster_p_hashes):
            print("p_hash match, promoter retweet")

        if (p_hash in detractor_cluster_p_hashes):
            print("p_hash match, detractor retweet")


In [24]:
media_map = content_shares_by_cluster["media_by_tweet_id"]
def get_media_metrics (tweet_id, col):
    if (tweet_id in media_map):
        return media_map[tweet_id][col]
    else:
        raise ("Missing media tweet id", tweet_id)
        
for col in relevant_media_dict_cols:
    df_media_with_tweets[col] = df_media_with_tweets.apply(lambda x: get_media_metrics(x["tweet_id"], col), axis=1)

In [25]:
import pickle 
with open("./graph_with_communities.pickle", "rb") as f:
    graph_with_communities = pickle.load(f)

In [26]:
user_community_map = {}

for node, data in graph_with_communities.nodes(data=True):
    user_community_map[data["user_id"]] = data["community"]

print(len(user_community_map))

def get_user_community(user_id):
    if user_id in user_community_map:
        return user_community_map[user_id]
    else:
        return np.nan

df_media_with_tweets["user_cluster"] = df_media_with_tweets.apply(lambda x: get_user_community(x["user"]), axis=1).astype('Int64')

1697944


In [None]:
df_media_with_tweets[df_media_with_tweets.p_hash.notna()]

In [27]:
df_media_with_tweets[df_media_with_tweets.duplicated("tweet_id")].sort_values("retweet_count", ascending=False)[:3][["tweet_id", "media_id", "user", "user_cluster"] + relevant_media_dict_cols]

Unnamed: 0_level_0,tweet_id,media_id,user,user_cluster,retweets_by_suspended,quotes_by_suspended,quotes_crawled,retweets_crawled,retweets_by_cluster_0,retweets_by_cluster_1,retweets_by_cluster_2,retweets_by_cluster_3,retweets_by_cluster_4,quotes_by_cluster_0,quotes_by_cluster_1,quotes_by_cluster_2,quotes_by_cluster_3,quotes_by_cluster_4
datastore_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
6245933826179072,1324064583355494400,1324064556117733376,999153867374841856,1,682,140,1583,6318,21,1986,2014,1187,283,13,135,415,274,110
5894090105290752,1324064583355494400,1324064576673968128,999153867374841856,1,682,140,1583,6318,21,1986,2014,1187,283,13,135,415,274,110
5964458849468416,1324064583355494400,1324064568486764548,999153867374841856,1,682,140,1583,6318,21,1986,2014,1187,283,13,135,415,274,110


In [15]:
df_media_with_tweets["media_id"].nunique()

196330

In [31]:
import pandas as pd

df_media_with_tweets['timestamp'] = pd.to_datetime(df_media_with_tweets['timestamp'])
for_export = df_media_with_tweets[(df_media_with_tweets['timestamp'] > '2020-10-23 00:00:00')  & (df_media_with_tweets['w_hash'] != "NA")][[
#'media_url', 'p_hash', 'w_hash', 'a_hash', 'media_id', 'tweet_id', 'hashtags', 'user', 'user_cluster', 'type', 'retweet_count', 'quote_count', 'timestamp'
#] + relevant_media_dict_cols]
    'tweet_id', 'a_hash', 'p_hash', 'w_hash'
]] 

#print(for_export['timestamp'].min())
#print(for_export['timestamp'].max())
print(for_export.shape)
print(for_export.columns)
for_export.to_csv("./data_export/final/images.csv", index_label="unique_id")

(167696, 4)
Index(['tweet_id', 'a_hash', 'p_hash', 'w_hash'], dtype='object')


In [32]:
for_export.to_csv("./data_export/final/samples/images-sample.csv", index_label="unique_id")

In [46]:
import pandas as pd

for_export = df_media_with_tweets[[
    'media_url', 'p_hash', 'w_hash', 'a_hash', 'media_id', 'tweet_id', 'hashtags', 'user', 'user_cluster', 'type', 'retweet_count', 'quote_count', 'timestamp'
] + relevant_media_dict_cols]

for_export['timestamp'] = pd.to_datetime(for_export['timestamp'])
for_export = for_export[(for_export['timestamp'] > '2020-10-23 00:00:00') & (for_export['w_hash'] != "NA")]
print(for_export['timestamp'].min())
print(for_export['timestamp'].max())
print(for_export.shape)
print(for_export.columns)
for_export.to_csv("./data_export/final/media.csv", index_label="datastore_id")

(201259, 4)
Index(['tweet_id', 'a_hash', 'p_hash', 'w_hash'], dtype='object')


In [57]:
df_media_with_tweets["media_id"].nunique()

196330

In [58]:
df_media_with_tweets[df_media_with_tweets.w_hash != "NA"]["media_id"].nunique()

164774

In [18]:
print(df_recent_tweets["timestamp"].min())
print(df_recent_tweets["timestamp"].max())

2020-10-23T17:00:04Z
2020-12-16T13:08:49Z


In [42]:
# Test export
df = pd.read_csv("./data_export/final/media.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201259 entries, 0 to 201258
Data columns (total 28 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   datastore_id           201259 non-null  int64  
 1   media_url              201259 non-null  object 
 2   p_hash                 167696 non-null  object 
 3   w_hash                 167696 non-null  object 
 4   a_hash                 167696 non-null  object 
 5   media_id               201259 non-null  int64  
 6   tweet_id               201259 non-null  int64  
 7   hashtags               201259 non-null  object 
 8   user                   201259 non-null  int64  
 9   user_cluster           159574 non-null  float64
 10  type                   201259 non-null  object 
 11  retweet_count          201259 non-null  int64  
 12  quote_count            201259 non-null  int64  
 13  timestamp              201259 non-null  object 
 14  retweets_by_suspended  201259 non-nu

In [129]:
df.sort_values("retweets_by_cluster_1", ascending=False)

Unnamed: 0,datastore_id,media_url,p_hash,w_hash,a_hash,media_id,tweet_id,hashtags,user,user_cluster,...,retweets_by_cluster_0,retweets_by_cluster_1,retweets_by_cluster_2,retweets_by_cluster_3,retweets_by_cluster_4,quotes_by_cluster_0,quotes_by_cluster_1,quotes_by_cluster_2,quotes_by_cluster_3,quotes_by_cluster_4
8959,6641351693172736,http://pbs.twimg.com/amplify_video_thumb/13316...,,,,1331691016974446597,1331695288231333890,[],25073877,1.0,...,55,3195,9805,398,395,4,17,139,0,0
146404,6245933826179072,http://pbs.twimg.com/media/EmAFstcVMAAtFSs.jpg,830c29a73df3259d,131d3c3c3e3ebe08,131c3c3c3e3ebc0c,1324064556117733376,1324064583355494400,[],999153867374841856,1.0,...,21,1986,2014,1187,283,13,135,415,274,110
146406,5964458849468416,http://pbs.twimg.com/media/EmAFtbhVoAQFOoR.jpg,9a2f6093d0aadcec,e703933c3e1e5748,c701812c3c1e47c0,1324064568486764548,1324064583355494400,[],999153867374841856,1.0,...,21,1986,2014,1187,283,13,135,415,274,110
146403,6597777547067392,http://pbs.twimg.com/media/EmAFs4dUcAAm9aG.jpg,a9661439e4e4db1b,010177ffdf8f0b02,010037f7df8f0b00,1324064559074668544,1324064583355494400,[],999153867374841856,1.0,...,21,1986,2014,1187,283,13,135,415,274,110
146405,5894090105290752,http://pbs.twimg.com/media/EmAFt6BUcAAcs5r.jpg,bb608e97dc25b10b,3f6701011f0f1f1f,3f6341831f0f1f1f,1324064576673968128,1324064583355494400,[],999153867374841856,1.0,...,21,1986,2014,1187,283,13,135,415,274,110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67449,4983273570697216,http://pbs.twimg.com/media/El5rE6cX0AEzHiE.jpg,8d5350f3d2e1b2a9,7e003b333b333b32,7e103b3333333332,1323613072645410817,1323613145701720070,[],817742628984909824,2.0,...,0,0,7,0,0,0,0,2,0,0
67450,6601754686783488,http://pbs.twimg.com/media/El5rQJeXUAYvD12.jpg,8ac775384ec7b0b8,0000ffffffff0000,0000ffff19000000,1323613265658859526,1323613278317219840,[],1314260098013974530,1.0,...,0,0,0,0,0,0,0,0,0,0
67451,6194288320839680,http://pbs.twimg.com/media/El5rdnpWoAAONuA.jpg,91f3e72c082c5776,efefee5f0d0a0200,e7efee590c000200,1323613497096314880,1323613499428405250,[],1313936553568276481,2.0,...,0,0,0,0,0,0,0,0,0,0
67452,6335025809195008,http://pbs.twimg.com/ext_tw_video_thumb/132361...,,,,1323613542642311168,1323613620404686849,['DemocratsTheEnemyWithin'],870850619397087232,2.0,...,0,0,0,0,0,0,0,0,0,0


# Users

In [15]:
promoters = df_users[(df_users.cluster == 1) | (df_users.cluster == 2) | (df_users.cluster == 3) | (df_users.cluster == 4)]

promoters.shape[0]

836968

## Promoters cluster ban percentage

In [17]:
promoters[promoters.active_status == "suspended"].shape[0] / promoters.shape[0]

0.07798267078311237

In [30]:
for x in [1000, 5000, 10000]:  
    users = df_users[df_users["retweets_by_r"] > x]
    total = users.shape[0]
    print("--Users retweeted more than {:,} times in right clusters".format(x))
    print("Count:", total)
    print("Author activity status distribution")
    for status, value in users.value_counts("active_status").to_dict().items():
        print(status, value, "({:,.2f}%)".format((value / total) * 100))
    print("Cluster distribution")
    for cluster, value in users.value_counts("cluster").to_dict().items():
        print(cluster, value)

    print("Retweets suspended ratio")
    print("Max", users.retweets_suspended_ratio.max())
    print("Mean", users.retweets_suspended_ratio.mean())
    print("Min", users.retweets_suspended_ratio.min())


    
    print()
    print()

--Users retweeted more than 1,000 times in right clusters
Count: 1596
Author activity status distribution
active 988 (61.90%)
suspended 560 (35.09%)
inactive 48 (3.01%)
Cluster distribution
1 812
2 727
3 26
4 13
0 4
Retweets suspended ratio
Max 0.6788844621513944
Mean 0.32772360586217436
Min 0.0


--Users retweeted more than 5,000 times in right clusters
Count: 501
Author activity status distribution
active 322 (64.27%)
suspended 168 (33.53%)
inactive 11 (2.20%)
Cluster distribution
1 341
2 149
3 5
Retweets suspended ratio
Max 0.5953736001468698
Mean 0.3087240788919267
Min 0.0055152394775036286


--Users retweeted more than 10,000 times in right clusters
Count: 273
Author activity status distribution
active 179 (65.57%)
suspended 88 (32.23%)
inactive 6 (2.20%)
Cluster distribution
1 208
2 58
3 1
Retweets suspended ratio
Max 0.5287391139719803
Mean 0.29908733942727034
Min 0.006570512820512821




# Tweets


In [2]:
tweets_active_status = pd.read_pickle("./df_recent_tweets_with_final_metrics.pickle")["author_active_status"]

In [3]:
tweets_active_status.value_counts()

active       5842842
suspended    1240405
inactive      519856
Name: author_active_status, dtype: int64

## Tweets with URLs

In [10]:
external_url_tweets = 0
for urls in df_recent_tweets.urls:
    for url in urls:
        if "twitter.com" not in url:
            external_url_tweets += 1
            break

print(external_url_tweets)

609513


In [25]:
for x in [100, 500, 1000, 5000]:  
    tweets = df_recent_tweets[df_recent_tweets["retweets_by_r"] > x]
    total = tweets.shape[0]
    print("--Tweets retweeted more than {:,} times in right clusters".format(x))
    print("Count:", total)
    print("Author activity status distribution")
    for status, value in tweets.value_counts("author_active_status").to_dict().items():
        print(status, value, "({:,.2f}%)".format((value / total) * 100))
    print("Cluster distribution (authors)")
    for cluster, value in tweets.value_counts("cluster").to_dict().items():
        print(cluster, value)

    available_tweets = tweets[tweets["author_active_status"] == "active"]

    print("For available tweets ({:,}):".format(available_tweets.shape[0]))
    retweets_crawled = available_tweets.retweets_crawled.sum()
    retweets_suspended = available_tweets.retweets_by_suspended.sum()
    print("Retweets crawled: {:,}".format(retweets_crawled))
    print("Retweets suspended: {:,} ({:,.2f}%)".format(retweets_suspended, (retweets_suspended / retweets_crawled) * 100))

    
    print()
    print()

--Tweets retweeted more than 100 times in right clusters
Count: 19464
Author activity status distribution
active 11559 (59.39%)
suspended 7467 (38.36%)
inactive 438 (2.25%)
Cluster distribution (authors)
1 10969
2 7753
3 286
4 112
0 38
For available tweets (11,559):
Retweets crawled: 10,308,355
Retweets suspended: 2,543,631 (24.68%)


--Tweets retweeted more than 500 times in right clusters
Count: 4935
Author activity status distribution
active 3233 (65.51%)
suspended 1593 (32.28%)
inactive 109 (2.21%)
Cluster distribution (authors)
1 3866
2 942
3 36
4 10
0 6
For available tweets (3,233):
Retweets crawled: 8,033,534
Retweets suspended: 1,967,933 (24.50%)


--Tweets retweeted more than 1,000 times in right clusters
Count: 2685
Author activity status distribution
active 1811 (67.45%)
suspended 823 (30.65%)
inactive 51 (1.90%)
Cluster distribution (authors)
1 2363
2 276
3 14
0 3
4 1
For available tweets (1,811):
Retweets crawled: 6,853,742
Retweets suspended: 1,656,724 (24.17%)


--Tweets

In [12]:
df_recent_tweets.nlargest(10, "retweet_count")

Unnamed: 0,datastore_id,urls,hasMedia,hashtags,retweet_count,quote_count,user,text,quote_tweet,timestamp,...,author_active_status,retweets_by_suspended,quotes_by_suspended,retweets_crawled,quotes_crawled,retweets_by_l,retweets_by_r,retweets_suspended_ratio,quote_coverage,retweet_coverage
7566713,1324357146088267777,[https://twitter.com/i/web/status/132435714608...,False,[],205823,8271,1416374742,It’s wild Trump supporters believe there’s vot...,,2020-11-05T14:25:23Z,...,active,110,13,38179,1684,38097,48,0.002881,0.203603,0.185494
2856721,1324420619539271680,[https://twitter.com/i/web/status/132442061953...,False,[],149829,9116,216776631,"In 2016, when Trump won PA, MI, and WI by a ha...",,2020-11-05T18:37:36Z,...,active,140,21,21992,1504,21819,73,0.006366,0.164985,0.146781
702702,1332352538855747584,[https://twitter.com/i/web/status/133235253885...,False,[],91902,27139,25073877,Biden can only enter the White House as Presid...,,2020-11-27T15:56:13Z,...,suspended,7702,1588,53428,17331,267,42901,0.144157,0.638601,0.581358
1540579,1338871862315667456,[],False,[],79140,14591,25073877,Tremendous evidence pouring in on voter fraud....,,2020-12-15T15:41:41Z,...,suspended,11883,1418,73245,14233,243,57703,0.162236,0.975464,0.925512
6454752,1330148622898515969,[],False,[],78904,13190,25073877,Big voter fraud information coming out concern...,,2020-11-21T13:58:39Z,...,suspended,8981,934,55987,9827,308,44456,0.160412,0.745034,0.709558
5477139,1336811823232921600,[https://twitter.com/i/web/status/133681182323...,False,[],76171,14055,25073877,Wow! At least 17 States have joined Texas in t...,,2020-12-09T23:15:50Z,...,suspended,8863,1691,62453,12129,195,49421,0.141915,0.862967,0.819905
1225216,1325432465415163904,[https://twitter.com/i/web/status/132543246541...,False,[],75385,13632,25073877,“We should look at the votes. We’re just begin...,,2020-11-08T13:38:19Z,...,suspended,4165,574,26038,4928,168,20932,0.159959,0.361502,0.3454
5800786,1332317394165968899,[https://twitter.com/i/web/status/133231739416...,False,[],73664,9768,25073877,Wow! Twitter bans highly respected Pennsylvani...,,2020-11-27T13:36:34Z,...,suspended,7722,936,51111,7046,141,40311,0.151083,0.721335,0.69384
816188,1327750127679889409,[https://twitter.com/i/web/status/132775012767...,False,[],71054,11185,25073877,There is tremendous evidence of wide spread vo...,,2020-11-14T23:07:53Z,...,suspended,9181,1068,55793,9268,224,43876,0.164555,0.82861,0.78522
526525,1337745268591259648,[https://twitter.com/i/web/status/133774526859...,False,[],66741,54083,25073877,"I WON THE ELECTION IN A LANDSLIDE, but remembe...",,2020-12-12T13:05:00Z,...,suspended,8307,4525,55290,42386,211,43843,0.150244,0.783721,0.828426


In [3]:
tweets_by_cluster = df_recent_tweets.groupby(['cluster']).agg(
    count=("datastore_id", "count")
)

In [4]:
tweets_by_cluster

Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
0,1199587
1,644219
2,3982990
3,27699
4,24191


## Export

### Users


In [3]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2559018 entries, 3288305608 to 1313243498
Data columns (total 29 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   created_at                object 
 1   friends_count             Int64  
 2   name                      object 
 3   verified                  object 
 4   followers_count           Int64  
 5   location                  object 
 6   handle                    object 
 7   data_source               object 
 8   active_status             object 
 9   cluster                   Int64  
 10  l_closeness               float64
 11  r_closeness               float64
 12  retweets_by_cluster_0     int64  
 13  quotes_by_cluster_0       int64  
 14  retweets_by_cluster_1     int64  
 15  quotes_by_cluster_1       int64  
 16  retweets_by_cluster_2     int64  
 17  quotes_by_cluster_2       int64  
 18  retweets_by_cluster_3     int64  
 19  quotes_by_cluster_3       int64  
 20  retweets_by_clust

In [4]:
df_users_export = df_users.sort_values("created_at").rename(columns=user_rename_dict)

In [5]:
df_users_export.head()

Unnamed: 0_level_0,created_at,friends_count,name,verified,followers_count,location,handle,data_source,user_active_status,user_community,...,quote_count_by_community_3,retweet_count_by_community_4,quote_count_by_community_4,retweet_count_by_suspended_users,quote_count_by_suspended_users,retweets_crawled,quotes_crawled,retweets_by_l,retweets_by_r,retweets_suspended_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6590,2006-09-21T18:27:47Z,4599,CyberBenB,False,3527,🖤❤️💛🐧🏴‍☠️😷🏳️‍🌈☯️♻️🌍📢,cyberbenb,old,suspended,0.0,...,0,0,0,0,0,0,0,0,0,0.0
10597,2006-10-26T14:18:36Z,243,Ferdinand Reinke,False,52,08824,reinkefj,old,deleted,,...,0,0,0,0,0,0,0,0,0,0.0
16943,2006-11-23T03:21:03Z,866,Drew Robinson,False,1320,"Perth, WA, Australia",ocean,old,suspended,,...,0,0,0,0,0,0,0,0,0,0.0
29803,2006-11-29T06:57:41Z,6898,Nichelle “On Neida and Mildred” Stephens,False,8563,"Savannah, GA",niche,old,suspended,,...,0,0,0,0,0,1,0,0,0,0.0
68093,2006-12-14T18:41:19Z,2581,Margery,False,1696,"Massachusetts, USA",margery,old,suspended,0.0,...,0,0,0,1,0,16,0,16,0,0.0625


In [6]:
df_users_export[user_export_cols]

Unnamed: 0_level_0,user_community,user_active_status,closeness_centrality_detractor_cluster,closeness_centrality_promoter_cluster,retweet_count_by_community_0,quote_count_by_community_0,retweet_count_by_community_1,quote_count_by_community_1,retweet_count_by_community_2,quote_count_by_community_2,retweet_count_by_community_3,quote_count_by_community_3,retweet_count_by_community_4,quote_count_by_community_4,retweet_count_by_suspended_users,quote_count_by_suspended_users
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
6590,0,suspended,,,0,0,0,0,0,0,0,0,0,0,0,0
10597,,deleted,,,0,0,0,0,0,0,0,0,0,0,0,0
16943,,suspended,,,0,0,0,0,0,0,0,0,0,0,0,0
29803,,suspended,,,0,0,0,0,0,0,0,0,0,0,0,0
68093,0,suspended,0.076194,,16,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899317849243693057,2,suspended,,,0,0,0,0,0,0,0,0,0,0,0,0
1296302672077758464,2,suspended,,,0,0,0,0,0,0,0,0,0,0,0,0
1403510982,1,deleted,,,0,0,0,0,0,0,0,0,0,0,0,0
1331876413926936577,1,suspended,,,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
df_users_export[user_export_cols].to_csv("./data_export/final/users.csv", index_label="user_id")

In [16]:
for i, df_chunk in enumerate(split_dataframe(df_users_export[user_export_cols], chunk_size=500000)):
    df_chunk.to_csv("./data_export/final/users-{}.csv".format(i), index_label="user_id")

In [20]:
df_users_export[:SAMPLE_SIZE][user_export_cols].to_csv("./data_export/final/samples/users-sample.csv", index_label="user_id")

In [7]:
df_users_export[user_export_cols].to_csv("./data_export/final/users.csv", index_label="user_id")

### Tweets

In [32]:
df_recent_tweets[:10]

Unnamed: 0,datastore_id,urls,hasMedia,hashtags,retweet_count,quote_count,user,text,quote_tweet,timestamp,...,author_active_status,retweets_by_suspended,quotes_by_suspended,retweets_crawled,quotes_crawled,retweets_by_l,retweets_by_r,retweets_suspended_ratio,quote_coverage,retweet_coverage
0,1327172028709801985,[https://twitter.com/i/web/status/132717202870...,False,"[auspol, ConcedeNowTrump]",0,0,2313341894,"@realDonaldTrump ""No evidence of US election f...",,2020-11-13T08:50:43Z,...,active,0,0,0,0,0,0,0.0,0.0,0.0
1,1327172032002486273,[https://twitter.com/i/web/status/132717203200...,False,[],0,0,750121838,@TRUTHandFREED0M @kpolantz Plus the evidence o...,,2020-11-13T08:50:44Z,...,active,0,0,0,0,0,0,0.0,0.0,0.0
2,1327172032061173761,[],False,[],0,0,546655747,Hear HEAR @RealDLHughley,1.326904412527276e+18,2020-11-13T08:50:44Z,...,active,0,0,0,0,0,0,0.0,0.0,0.0
3,1327172033143316480,[https://www.bbc.co.uk/news/election-us-2020-5...,False,[],0,0,25709850,US election security officials reject Trump's ...,,2020-11-13T08:50:45Z,...,active,0,0,0,0,0,0,0.0,0.0,0.0
4,1327172039669665794,[],True,[],0,0,1254057289113317382,The election results themselves are disputed b...,,2020-11-13T08:50:46Z,...,active,0,0,0,0,0,0,0.0,0.0,0.0
5,1327172041091526657,[https://twitter.com/i/web/status/132717204109...,False,[],0,0,58768691,"""On Newsmax, voter fraud innuendo is everywher...",,2020-11-13T08:50:46Z,...,active,0,0,0,0,0,0,0.0,0.0,0.0
6,1327172041473187841,[https://twitter.com/i/web/status/132717204147...,False,[],8,1,824158324094619650,Think about this: Some people are saying Trump...,,2020-11-13T08:50:47Z,...,active,4,2,7,2,0,7,0.571429,2.0,0.875
7,1327172050499330049,[https://twitter.com/i/web/status/132717205049...,False,[FoxNewsIsDead],0,0,225635916,If it's on @FoxNews so it must be true.🤣🤣 #Fox...,1.3270385868191457e+18,2020-11-13T08:50:49Z,...,active,0,0,0,0,0,0,0.0,0.0,0.0
8,1327172052202078208,[https://twitter.com/i/web/status/132717205220...,False,[],0,0,4505958373,"Dude, WHY bring the word EXECUTION into the Ei...",1.3270997609273795e+18,2020-11-13T08:50:49Z,...,active,0,0,0,0,0,0,0.0,0.0,0.0
9,1327172055154954241,[],False,[],0,0,854432647610871808,CC @realDonaldTrump,1.326951148721496e+18,2020-11-13T08:50:50Z,...,active,0,0,0,0,0,0,0.0,0.0,0.0


In [4]:
df_recent_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7603103 entries, 0 to 7609004
Data columns (total 68 columns):
 #   Column                      Dtype           
---  ------                      -----           
 0   datastore_id                object          
 1   urls                        object          
 2   hasMedia                    bool            
 3   hashtags                    object          
 4   retweet_count               int32           
 5   quote_count                 int32           
 6   user                        object          
 7   text                        object          
 8   quote_tweet                 object          
 9   timestamp                   object          
 10  tokens                      object          
 11  election fraud              Sparse[int64, 0]
 12  voter fraud                 Sparse[int64, 0]
 13  #voterfraud                 Sparse[int64, 0]
 14  #stopthesteal               Sparse[int64, 0]
 15  #ballotharvesting           Spar

In [5]:
rename_tweet_cols = {
    **rename_dict,
    "datastore_id": "tweet_id",
    "cluster": "user_community",
    "retweet_count": "retweet_count_metadata",
    "quote_count": "quote_count_metadata",
}
print(rename_tweet_cols)

{'retweets_by_cluster_0': 'retweet_count_by_community_0', 'quotes_by_cluster_0': 'quote_count_by_community_0', 'tweets_by_cluster_0': 'tweet_count_by_community_0', 'retweets_by_cluster_1': 'retweet_count_by_community_1', 'quotes_by_cluster_1': 'quote_count_by_community_1', 'tweets_by_cluster_1': 'tweet_count_by_community_1', 'retweets_by_cluster_2': 'retweet_count_by_community_2', 'quotes_by_cluster_2': 'quote_count_by_community_2', 'tweets_by_cluster_2': 'tweet_count_by_community_2', 'retweets_by_cluster_3': 'retweet_count_by_community_3', 'quotes_by_cluster_3': 'quote_count_by_community_3', 'tweets_by_cluster_3': 'tweet_count_by_community_3', 'retweets_by_cluster_4': 'retweet_count_by_community_4', 'quotes_by_cluster_4': 'quote_count_by_community_4', 'tweets_by_cluster_4': 'tweet_count_by_community_4', 'tweets_by_suspended': 'tweet_count_by_suspended_users', 'quotes_by_suspended': 'quote_count_by_suspended_users', 'retweets_by_suspended': 'retweet_count_by_suspended_users', 'author_a

In [6]:
tweet_cols

['user_community',
 'user_active_status',
 'retweet_count_metadata',
 'quote_count_metadata']

In [9]:
df_tweets_export = df_recent_tweets.rename(rename_tweet_cols, axis="columns").set_index("tweet_id").sort_values("timestamp")

In [10]:
df_tweets_export["date"] = df_tweets_export.apply(lambda t: extract_date(t["timestamp"]), axis=1)

In [17]:
df_tweets_export = df_tweets_export[tweet_cols + retweeted_cols + ["date"]] 

df_tweets_export.head()

Unnamed: 0_level_0,user_community,user_active_status,retweet_count_metadata,quote_count_metadata,retweet_count_by_community_0,quote_count_by_community_0,retweet_count_by_community_1,quote_count_by_community_1,retweet_count_by_community_2,quote_count_by_community_2,retweet_count_by_community_3,quote_count_by_community_3,retweet_count_by_community_4,quote_count_by_community_4,retweet_count_by_suspended_users,quote_count_by_suspended_users,date
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1319685030826938369,2.0,inactive,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2020-10-23
1319685039714717696,,active,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2020-10-23
1319685089375158274,,active,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2020-10-23
1319685151031500801,,suspended,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2020-10-23
1319685176436396035,2.0,inactive,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2020-10-23


In [18]:
df_tweets_export.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7603103 entries, 1319685030826938369 to 1339195780704382980
Data columns (total 17 columns):
 #   Column                            Dtype 
---  ------                            ----- 
 0   user_community                    Int64 
 1   user_active_status                object
 2   retweet_count_metadata            int32 
 3   quote_count_metadata              int32 
 4   retweet_count_by_community_0      int64 
 5   quote_count_by_community_0        int64 
 6   retweet_count_by_community_1      int64 
 7   quote_count_by_community_1        int64 
 8   retweet_count_by_community_2      int64 
 9   quote_count_by_community_2        int64 
 10  retweet_count_by_community_3      int64 
 11  quote_count_by_community_3        int64 
 12  retweet_count_by_community_4      int64 
 13  quote_count_by_community_4        int64 
 14  retweet_count_by_suspended_users  int64 
 15  quote_count_by_suspended_users    int64 
 16  date                         

In [8]:
df_tweets_export.to_pickle("./df_tweets_export.pickle")

In [2]:
df_tweets_export = pd.read_pickle("./df_tweets_export.pickle")

In [5]:
df_tweets_export["user_active_status"] = df_tweets_export.apply(lambda t: map_user_status(t["user_active_status"]), axis=1)

df_tweets_export["user_active_status"].value_counts()

active       5842842
suspended    1240405
deleted       519856
Name: user_active_status, dtype: int64

In [3]:
df_tweets_export["month"] = df_tweets_export.apply(lambda t: extract_month_from_date(t["date"]), axis=1)
print(df_tweets_export.value_counts("month"))

month
2020-11    5067170
2020-12    2326439
2020-10     209494
dtype: int64


In [6]:
months = df_tweets_export.value_counts("month").index
print(df_tweets_export.value_counts("month"))

month
2020-11    5067170
2020-12    2326439
2020-10     209494
dtype: int64


In [7]:
dates = df_tweets_export.value_counts("date").index
print(df_tweets_export.value_counts("date"))
print(df_tweets_export.value_counts("date").sum())

date
2020-11-08    235919
2020-11-07    226763
2020-11-04    215303
2020-11-09    214297
2020-11-12    211928
2020-11-13    211857
2020-11-05    210509
2020-11-06    207111
2020-11-11    206726
2020-11-20    201131
2020-11-14    196038
2020-12-03    195279
2020-12-04    193360
2020-11-15    192879
2020-11-10    188568
2020-12-02    187500
2020-11-18    184239
2020-12-01    177588
2020-12-12    169345
2020-11-19    168638
2020-11-30    166738
2020-12-06    161904
2020-11-17    160088
2020-12-05    159894
2020-11-22    158702
2020-11-16    158490
2020-11-28    156268
2020-11-21    153956
2020-12-10    153413
2020-11-25    151395
2020-11-29    150275
2020-11-24    147488
2020-11-23    146170
2020-11-27    145193
2020-11-26    138883
2020-12-15    135706
2020-12-09    134030
2020-12-07    130496
2020-12-08    126510
2020-12-14    123276
2020-12-11    119214
2020-12-13    106339
2020-11-03     84523
2020-10-25     57640
2020-12-16     52585
2020-11-02     45353
2020-10-28     36175
2020-11-

In [9]:
for d in dates:
    print(d)
    date_tweets = df_tweets_export[df_tweets_export["date"] == d].drop(["date", "month"], axis=1)
    date_tweets.to_csv("./data_export/final/tweets/tweets-{}.csv".format(d), index_label="tweet_id")

2020-11-08
2020-11-07
2020-11-04
2020-11-09
2020-11-12
2020-11-13
2020-11-05
2020-11-06
2020-11-11
2020-11-20
2020-11-14
2020-12-03
2020-12-04
2020-11-15
2020-11-10
2020-12-02
2020-11-18
2020-12-01
2020-12-12
2020-11-19
2020-11-30
2020-12-06
2020-11-17
2020-12-05
2020-11-22
2020-11-16
2020-11-28
2020-11-21
2020-12-10
2020-11-25
2020-11-29
2020-11-24
2020-11-23
2020-11-27
2020-11-26
2020-12-15
2020-12-09
2020-12-07
2020-12-08
2020-12-14
2020-12-11
2020-12-13
2020-11-03
2020-10-25
2020-12-16
2020-11-02
2020-10-28
2020-11-01
2020-10-31
2020-10-24
2020-10-27
2020-10-29
2020-10-30
2020-10-23
2020-10-26


In [7]:
df_tweets_export.columns

Index(['user_community', 'user_active_status', 'retweet_count_metadata',
       'quote_count_metadata', 'retweet_count_by_community_0',
       'quote_count_by_community_0', 'retweet_count_by_community_1',
       'quote_count_by_community_1', 'retweet_count_by_community_2',
       'quote_count_by_community_2', 'retweet_count_by_community_3',
       'quote_count_by_community_3', 'retweet_count_by_community_4',
       'quote_count_by_community_4', 'retweet_count_by_suspended_users',
       'quote_count_by_suspended_users', 'date', 'month'],
      dtype='object')

In [10]:
for m in months:
    print(m)
    month_tweets = df_tweets_export[df_tweets_export["month"] == m]
    month_tweets.drop(["month", "date"], axis=1).to_csv("./data_export/final/tweets-{}.csv".format(m), index_label="tweet_id")

2020-11
2020-12
2020-10


In [11]:
df_tweets_export[:SAMPLE_SIZE].drop(["month", "date"], axis=1).to_csv("./data_export/final/samples/tweets-sample.csv")

# Retweets

In [14]:
df_users = pd.read_pickle("./df_users_final.pickle")

In [7]:
suspended_user_ids = set(df_users[df_users["active_status"] == "suspended"].index)
print(len(suspended_user_ids))

99884


In [4]:
df_retweets = pd.read_pickle("./df_retweets_with_cluster.pickle")

In [8]:
suspended_retweets = 0
for user in df_retweets["user"]:
    if user in suspended_user_ids:
        suspended_retweets += 1
print(suspended_retweets)

6246245


In [5]:
df_retweets.columns

Index(['user', 'timestamp', 'retweeted', 'retweetedFrom_user', 'cluster'], dtype='object')

In [3]:
df_retweets.value_counts("cluster")

cluster
2    16403520
0     3805080
1     1595997
3      267116
4      190078
dtype: int64

In [4]:
retweets_export = df_retweets.set_index("retweeted").rename(columns={"user": "user_id"}).sort_values("timestamp")

In [5]:
retweets_export.shape

(25566698, 4)

In [6]:
extract_date(retweets_export['timestamp'][0])

'2020-10-23'

In [7]:
retweets_export["date"] = retweets_export.apply(lambda t: extract_date(t["timestamp"]), axis=1)

retweets_export.head()

Unnamed: 0_level_0,user_id,timestamp,retweetedFrom_user,cluster,date
retweeted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1319609788091817985,1198035481650515974,2020-10-23T16:59:58Z,2836421,0,2020-10-23
1319667770083233792,910642944293654528,2020-10-23T17:00:10Z,327126454,2,2020-10-23
1319550613668175875,372053421,2020-10-23T17:00:12Z,506228902,0,2020-10-23
1310647895033610240,1032786509995421696,2020-10-23T17:00:16Z,764261361044037632,2,2020-10-23
1319677395629006849,380671484,2020-10-23T17:00:27Z,289777188,0,2020-10-23


In [8]:
print(retweets_export.value_counts("date"))

date
2020-11-13    675245
2020-12-04    675054
2020-11-07    668635
2020-11-30    658901
2020-11-12    653637
2020-11-08    651532
2020-12-02    648041
2020-11-05    645642
2020-12-01    644312
2020-11-15    640723
2020-11-29    633232
2020-11-18    623733
2020-11-19    616084
2020-11-17    614764
2020-12-03    614437
2020-12-14    611553
2020-11-06    608799
2020-11-11    607056
2020-12-06    606066
2020-11-20    601064
2020-11-21    599459
2020-11-14    597390
2020-11-10    595891
2020-11-25    595767
2020-12-10    594627
2020-11-09    584638
2020-12-05    579855
2020-11-04    573657
2020-11-26    556641
2020-11-24    549216
2020-11-27    546625
2020-11-16    543611
2020-11-22    521146
2020-12-07    493525
2020-12-12    473139
2020-11-28    472749
2020-12-15    463129
2020-12-08    457307
2020-11-23    452395
2020-12-13    438165
2020-12-09    433909
2020-12-11    376370
2020-11-03    255821
2020-10-25    207132
2020-12-16    203827
2020-10-28    149863
2020-11-02    129096
2020-10-

In [9]:
dates = retweets_export.value_counts("date").index

In [10]:
for d in dates:
    print(d)
    date_retweets = retweets_export[retweets_export["date"] == d][["user_id"]]
    date_retweets.to_csv("./data_export/final/retweets/retweets-{}.csv".format(d), index_label="retweeted_id")

2020-11-13
2020-12-04
2020-11-07
2020-11-30
2020-11-12
2020-11-08
2020-12-02
2020-11-05
2020-12-01
2020-11-15
2020-11-29
2020-11-18
2020-11-19
2020-11-17
2020-12-03
2020-12-14
2020-11-06
2020-11-11
2020-12-06
2020-11-20
2020-11-21
2020-11-14
2020-11-10
2020-11-25
2020-12-10
2020-11-09
2020-12-05
2020-11-04
2020-11-26
2020-11-24
2020-11-27
2020-11-16
2020-11-22
2020-12-07
2020-12-12
2020-11-28
2020-12-15
2020-12-08
2020-11-23
2020-12-13
2020-12-09
2020-12-11
2020-11-03
2020-10-25
2020-12-16
2020-10-28
2020-11-02
2020-10-27
2020-11-01
2020-10-31
2020-10-24
2020-10-29
2020-10-30
2020-10-23
2020-10-26
