In [72]:
import pandas as pd
import numpy as np
import os

In [73]:
def load_chunks(directory):
    chunk_dfs = []
    for name in sorted(os.listdir(directory)):
        sub_directory = os.path.join(directory, name)
        if os.path.isdir(sub_directory):
            for filename in sorted(os.listdir(sub_directory)):
                with open(os.path.join(sub_directory, filename), "r", encoding="utf-8") as f:
                    chunk = pd.read_csv(f, encoding = "utf-8")
                    chunk_dfs.append(chunk)
        elif name.endswith(".csv"):
            chunk = pd.read_csv(os.path.join(directory, name), encoding = "utf-8")
            chunk_dfs.append(chunk)

    return pd.concat(chunk_dfs)

## Tweets

In [74]:
df_tweets = load_chunks("./data/tweets/").set_index("tweet_id")
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7603103 entries, 1319685030826938369 to 1339195780704382980
Data columns (total 16 columns):
 #   Column                            Dtype  
---  ------                            -----  
 0   user_community                    float64
 1   user_active_status                object 
 2   retweet_count_metadata            int64  
 3   quote_count_metadata              int64  
 4   retweet_count_by_community_0      int64  
 5   quote_count_by_community_0        int64  
 6   retweet_count_by_community_1      int64  
 7   quote_count_by_community_1        int64  
 8   retweet_count_by_community_2      int64  
 9   quote_count_by_community_2        int64  
 10  retweet_count_by_community_3      int64  
 11  quote_count_by_community_3        int64  
 12  retweet_count_by_community_4      int64  
 13  quote_count_by_community_4        int64  
 14  retweet_count_by_suspended_users  int64  
 15  quote_count_by_suspended_users    int64  
dtypes: flo

In [94]:
print("Number of tweets {:,}".format(df_tweets.shape[0]))

Number of tweets 7,603,103


In [76]:
df_tweets.nlargest(10, "retweet_count_metadata")

Unnamed: 0_level_0,user_community,user_active_status,retweet_count_metadata,quote_count_metadata,retweet_count_by_community_0,quote_count_by_community_0,retweet_count_by_community_1,quote_count_by_community_1,retweet_count_by_community_2,quote_count_by_community_2,retweet_count_by_community_3,quote_count_by_community_3,retweet_count_by_community_4,quote_count_by_community_4,retweet_count_by_suspended_users,quote_count_by_suspended_users
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1324357146088267777,0.0,active,205823,8271,38097,386,28,25,20,15,0,0,0,0,110,13
1324420619539271680,0.0,active,149829,9116,21819,340,17,30,50,35,1,0,5,1,140,21
1332352538855747584,1.0,suspended,91902,27139,267,3891,13600,1653,26685,5535,1347,163,1269,191,7702,1588
1338871862315667456,1.0,suspended,79140,14591,243,2565,15765,1346,38812,5010,1553,97,1573,104,11883,1418
1330148622898515969,1.0,suspended,78904,13190,308,1727,12798,766,28892,2852,1386,129,1380,80,8981,934
1336811823232921600,1.0,suspended,76171,14055,195,761,16194,1814,29451,5575,2330,362,1446,167,8863,1691
1325432465415163904,1.0,suspended,75385,13632,168,617,6437,434,13040,1839,734,72,721,61,4165,574
1332317394165968899,1.0,suspended,73664,9768,141,318,11602,1109,25945,3525,1599,150,1165,140,7722,936
1327750127679889409,1.0,suspended,71054,11185,224,1666,12062,892,29032,3533,1364,101,1418,99,9181,1068
1337745268591259648,1.0,suspended,66741,54083,211,3043,13964,6612,27312,16198,1593,861,974,380,8307,4525


## Retweets

In [77]:
df_retweets = load_chunks("./data/retweets/")
df_retweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25566698 entries, 0 to 203826
Data columns (total 2 columns):
 #   Column        Dtype
---  ------        -----
 0   retweeted_id  int64
 1   user_id       int64
dtypes: int64(2)
memory usage: 585.2 MB


In [92]:
print("Number of retweets {:,}".format(df_retweets.shape[0]))

Number of retweets 25,566,698


In [79]:
df_retweets.head()

Unnamed: 0,retweeted_id,user_id
0,1319609788091817985,1198035481650515974
1,1319667770083233792,910642944293654528
2,1319550613668175875,372053421
3,1310647895033610240,1032786509995421696
4,1319677395629006849,380671484


## Users

In [80]:
df_users = load_chunks("./data/users/").set_index("user_id")
df_users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2559018 entries, 6590 to 1269104794511147008
Data columns (total 16 columns):
 #   Column                                  Dtype  
---  ------                                  -----  
 0   user_community                          float64
 1   user_active_status                      object 
 2   closeness_centrality_detractor_cluster  float64
 3   closeness_centrality_promoter_cluster   float64
 4   retweet_count_by_community_0            int64  
 5   quote_count_by_community_0              int64  
 6   retweet_count_by_community_1            int64  
 7   quote_count_by_community_1              int64  
 8   retweet_count_by_community_2            int64  
 9   quote_count_by_community_2              int64  
 10  retweet_count_by_community_3            int64  
 11  quote_count_by_community_3              int64  
 12  retweet_count_by_community_4            int64  
 13  quote_count_by_community_4              int64  
 14  retweet_count_by_su

In [96]:
print("Number of users {:,}".format(df_users.shape[0]))

Number of users 2,559,018


In [82]:
df_users.nlargest(10, "closeness_centrality_promoter_cluster")

Unnamed: 0_level_0,user_community,user_active_status,closeness_centrality_detractor_cluster,closeness_centrality_promoter_cluster,retweet_count_by_community_0,quote_count_by_community_0,retweet_count_by_community_1,quote_count_by_community_1,retweet_count_by_community_2,quote_count_by_community_2,retweet_count_by_community_3,quote_count_by_community_3,retweet_count_by_community_4,quote_count_by_community_4,retweet_count_by_suspended_users,quote_count_by_suspended_users
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
25073877,1.0,suspended,,0.454827,6257,29941,292242,26607,863596,94552,37414,3556,32117,2326,291398,28156
187680645,1.0,suspended,,0.398071,927,743,54936,8725,777109,63310,11434,1251,5607,395,321090,21483
770781940341288960,1.0,active,,0.367028,422,1912,23522,2077,152796,11181,4353,286,4636,212,53879,3358
26487169,1.0,active,,0.364001,224,345,13566,1761,103977,10384,778,115,1952,329,37425,3012
4041824789,1.0,active,,0.360909,268,336,9739,979,77775,6534,803,96,900,73,29869,1923
240454812,1.0,suspended,,0.360518,379,463,26678,2449,242446,15196,2548,237,2233,179,103916,5731
586707638,1.0,suspended,,0.350383,483,297,38270,4906,453157,32319,5647,313,4071,240,186173,11241
2853461537,1.0,active,,0.347768,375,474,12410,1095,69144,5239,329,34,616,27,20201,1218
16989178,1.0,active,,0.345512,464,285,34641,4673,172243,21370,564,94,1025,184,57993,6525
18266688,1.0,active,,0.341606,286,107,19250,1326,164306,8086,343,19,1446,59,61545,2416


## Images

In [83]:
images = pd.read_csv("data/images.csv", index_col="unique_id")
images.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 167696 entries, 5327346445844480 to 5114016670154752
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   tweet_id  167696 non-null  int64 
 1   a_hash    167696 non-null  object
 2   p_hash    167696 non-null  object
 3   w_hash    167696 non-null  object
dtypes: int64(1), object(3)
memory usage: 6.4+ MB


In [99]:
print("Number of images {:,}".format(images.shape[0]))
print("Tweets           {:,}".format(images.tweet_id.nunique()))
print("Unique aHashes   {:,}".format(images.a_hash.nunique()))
print("Unique pHashes   {:,}".format(images.p_hash.nunique()))
print("Unique wHashes   {:,}".format(images.w_hash.nunique()))

Number of images 167,696
Tweets           145,287
Unique aHashes   102,521
Unique pHashes   109,312
Unique wHashes   103,508


In [85]:
images.head()

Unnamed: 0_level_0,tweet_id,a_hash,p_hash,w_hash
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5327346445844480,1327172039669665794,6000007e7e7e0000,817e6a80d55f5a66,f4003c7e7f7f2600
6224547934109696,1327172094707187712,f9f8f8f0f0d3f9f9,cbd29ba4628d2d53,f8f8f8f0c081b939
5478141178937344,1327172494080446465,6c747e7c30e1e3bf,c250b56b5e71da07,28743c5c30c1e3ff
6254140300722176,1327173062207238144,ffffc0801f3f7fff,ab7881877e606397,f0f0c080003f3fff
4504283114373120,1327173190859165696,ffddde1e151200c3,bcd2e920876d943e,ffdd9e57150200c3


## Youtube Videos

In [86]:
youtube_videos = pd.read_csv("data/youtube_videos.csv", index_col="video_id")
youtube_videos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12002 entries, psGpIuNh_dU to xYhd5kLmmks
Data columns (total 26 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   video_title                       12002 non-null  object
 1   video_description                 11136 non-null  object
 2   channel_id                        12002 non-null  object
 3   channel_title                     12002 non-null  object
 4   published_at                      12002 non-null  object
 5   tweet_count                       12002 non-null  int64 
 6   retweet_count_metadata            12002 non-null  int64 
 7   quote_count_metadata              12002 non-null  int64 
 8   tweet_count_by_community_0        12002 non-null  int64 
 9   retweet_count_by_community_0      12002 non-null  int64 
 10  quote_count_by_community_0        12002 non-null  int64 
 11  tweet_count_by_community_1        12002 non-null  int64 
 12  retweet

In [100]:
print("Number of youtube videos {:,}".format(youtube_videos.shape[0]))

Number of youtube videos 12,002


In [88]:
youtube_videos.nlargest(10, "retweet_count_metadata")

Unnamed: 0_level_0,video_title,video_description,channel_id,channel_title,published_at,tweet_count,retweet_count_metadata,quote_count_metadata,tweet_count_by_community_0,retweet_count_by_community_0,...,quote_count_by_community_2,tweet_count_by_community_3,retweet_count_by_community_3,quote_count_by_community_3,tweet_count_by_community_4,retweet_count_by_community_4,quote_count_by_community_4,tweet_count_by_suspended_users,retweet_count_by_suspended_users,quote_count_by_suspended_users
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
psGpIuNh_dU,Articia Bomer Witnessed Dems Counting Spoiled ...,Articia Bomer Witnessed Dems Counting Spoiled ...,UCekdKDk6d8opUrUEXtXD4XQ,Precinct 13,2020-11-23T10:32:13Z,88,11909,1301,0,5,...,1118,0,39,5,0,50,5,34,3527,401
96-BQaIVOpc,LIVE UPDATES: Democrats Try to Steal Election!...,Steven reviews the latest updates in the 2020 ...,UCIveFvW-ARp_B_RckhweNJw,StevenCrowder,2020-11-04T22:49:00Z,333,4739,496,0,5,...,71,0,9,1,0,5,0,31,167,23
VgMPDnWunqs,"AG Barr releases memo on election fraud, DOJ o...",The head of the Justice Department’s elections...,UCNbIDJNNgaRrXOD7VllIMRQ,One America News Network,2020-11-11T23:58:41Z,150,1605,100,1,0,...,28,0,2,0,0,3,0,27,235,8
ANeq2VKPOak,YOUNG PHARAOH INTERVIEWS JAMES OKEEFE- 2020 VO...,YOUNG PHARAOH INTERVIEWS JAMES OKEEFE IN REGAR...,UCtKtmiQ2GlWlqPDzXwrR4NA,YOUNG PHARAOH,2020-11-17T20:00:08Z,43,982,45,0,1,...,16,1,0,0,0,1,0,8,102,6
dB0h_50OfzA,Trump confidant: the president has all the pro...,Steve Mosher of the Population Research Instit...,UCYImiD9L0dMycenfBy2al0Q,LifeSiteNews,2020-11-13T22:14:18Z,27,937,72,0,0,...,76,0,6,0,0,7,0,4,391,21
VDf1j4IQz28,Bobby Piton testifies at AZ Election Fraud Hea...,Bobby Piton testifies at AZ Election Fraud Hea...,UCSKoabidZh5qoTF4Xg96K7A,Three Headed Eagle Alliance,2020-11-30T22:54:54Z,65,919,45,0,0,...,28,0,0,0,0,5,1,9,266,6
V5jQBYALy0g,11.8.20: LAWSUITS on the WAY! What is going on?,Get The Best VPN For 50% Off Today!\nClick Her...,UCzkAzJ2vQgRyEDzHFiKOaQA,AWK NEWS,2020-11-08T14:34:16Z,3,808,133,0,0,...,55,0,2,2,0,1,0,0,109,16
KIlI46HdqKg,Smoking Gun: ES&S (Philly's Computer System) T...,In this video you will see data from the NYT f...,UCIxc8YMkny2KBaD5TQsSbpg,Edward Solomon,2020-11-20T22:54:00Z,13,781,50,0,2,...,44,0,2,1,0,3,0,1,237,12
RCfU2KizzdM,LIVE: Arizona State Senate Judiciary Holds Hea...,LIVE: Arizona State Senate Holds Hearing on El...,UCHqC-yWZ1kri4YzwRSt6RGQ,Right Side Broadcasting Network,2020-12-14T22:54:38Z,148,756,36,1,0,...,20,4,2,0,2,3,0,32,149,5
fveONZpDbiw,Vernon Jones says Georgia’s election fraud cen...,John Solomon Reports: John Interviews Georgia ...,UC588htN7jqso3D80OnGGrAw,Just The News,2020-11-13T03:48:34Z,17,750,29,0,0,...,16,0,0,0,0,0,0,3,153,3


## URLs

In [89]:
urls = pd.read_csv("data/urls.csv", index_col="url")
urls.info()

<class 'pandas.core.frame.DataFrame'>
Index: 138411 entries, https://www.foxnews.com/opinion/tucker-carlson-2020-presidential-election-voter-fraud-dead-voters.amp to https://www.americanthinker.com/blog/2020/11/project_veritas_reports_on_michigan_election_fraud.html#.X6Q6j0NCFro.twitter
Data columns (total 22 columns):
 #   Column                            Non-Null Count   Dtype 
---  ------                            --------------   ----- 
 0   domain                            138411 non-null  object
 1   tweet_count                       138411 non-null  int64 
 2   retweet_count_metadata            138411 non-null  int64 
 3   quote_count_metadata              138411 non-null  int64 
 4   tweet_count_by_community_0        138411 non-null  int64 
 5   retweet_count_by_community_0      138411 non-null  int64 
 6   quote_count_by_community_0        138411 non-null  int64 
 7   tweet_count_by_community_1        138411 non-null  int64 
 8   retweet_count_by_community_1      138411 non

In [101]:
print("Number of urls {:,}".format(urls.shape[0]))

Number of urls 138,411


In [91]:
urls.nlargest(10, "retweet_count_metadata")

Unnamed: 0_level_0,domain,tweet_count,retweet_count_metadata,quote_count_metadata,tweet_count_by_community_0,retweet_count_by_community_0,quote_count_by_community_0,tweet_count_by_community_1,retweet_count_by_community_1,quote_count_by_community_1,...,quote_count_by_community_2,tweet_count_by_community_3,retweet_count_by_community_3,quote_count_by_community_3,tweet_count_by_community_4,retweet_count_by_community_4,quote_count_by_community_4,tweet_count_by_suspended_users,retweet_count_by_suspended_users,quote_count_by_suspended_users
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
https://www.foxnews.com/opinion/tucker-carlson-2020-presidential-election-voter-fraud-dead-voters.amp,foxnews.com,264,41078,6543,14,61,401,27,3250,264,...,966,4,345,39,8,483,34,62,2555,305
https://www.legislationline.org/download/id/1472/file/3b50795b2d0374cbef5c29766256.pdf,legislationline.org,148,19221,1973,0,24,92,30,2021,161,...,509,0,25,2,0,119,8,26,1793,139
https://www.houstonchronicle.com/politics/texas/article/Texas-Lt-Gov-Dan-Patrick-offers-1-million-15716973.php?utm_campaign=CMS%20Sharing%20Tools%20(Premium)&utm_source=t.co&utm_medium=referral,houstonchronicle.com,56,16627,2602,13,103,272,8,1037,77,...,345,3,74,15,0,67,4,3,1425,106
https://nypost.com/2020/11/11/usps-whistleblower-denies-wapo-claim-he-recanted-allegations/?utm_source=twitter_sitebuttons&utm_medium=site%20buttons&utm_campaign=site%20buttons,nypost.com,85,15297,983,3,13,23,11,648,36,...,224,0,35,1,0,41,1,12,1124,69
https://thetexan.news/limestone-county-individual-charged-with-134-counts-of-voter-fraud-attorney-general-announces/,thetexan.news,280,14525,1459,2,13,21,41,1368,109,...,419,0,17,3,1,102,6,30,1385,138
https://www.zerohedge.com/political/30-states-computer-system-known-be-defective-tallying-votes,zerohedge.com,16,13906,1689,0,7,1,2,487,80,...,526,0,13,2,0,19,3,3,1363,198
https://www.nytimes.com/2012/10/07/us/politics/as-more-vote-by-mail-faulty-ballots-could-impact-elections.html,nytimes.com,105,12193,834,1,24,14,24,1205,92,...,464,2,66,8,0,87,3,13,2562,143
https://www.youtube.com/watch?v=psGpIuNh_dU,youtube.com,8,11827,1288,0,5,4,3,544,146,...,1109,0,39,5,0,50,5,2,3505,398
https://apnews.com/article/election-2020-joe-biden-donald-trump-elections-c809d17b5cd34048e1a5e9bbb4b73cb7,apnews.com,1,11313,1504,0,4,5,1,304,67,...,415,0,6,1,0,8,0,1,1086,151
https://noqreport.com/2020/11/04/90-voter-turnout-in-wisconsin-is-impossible-without-voter-fraud-as-seven-milwaukee-districts-report-higher-than-100/,noqreport.com,257,11036,1329,0,17,3,31,437,54,...,427,4,5,4,2,13,3,42,746,111
