In [21]:
from notebook_utils import setup
import pandas as pd
import networkx as nx
from collections import defaultdict

# Project-local notebook initialisation; what it configures is not visible in this file.
setup()

import matplotlib.style as style

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-<name>"
# and 3.8 removed the old spellings, so pick whichever name this installation
# actually provides instead of hard-coding the legacy one.
style.use(
    [
        name if name in style.available else name.replace("seaborn-", "seaborn-v0_8-", 1)
        for name in ("seaborn-white", "seaborn-paper")
    ]
)

def map_user_status(status):
    """Return "deleted" for the status "inactive"; return any other status unchanged."""
    return "deleted" if status == "inactive" else status

# Mapping from the column names used in the stored frames to the names used for export.
# Per-community counters: "<kind>s_by_cluster_<n>" -> "<kind>_count_by_community_<n>".
rename_dict = {
    f"{kind}s_by_cluster_{community}": f"{kind}_count_by_community_{community}"
    for community in range(5)
    for kind in ("retweet", "quote", "tweet")
}

# One-off renames.
# NOTE(review): "author_active_status"/"active_status" both map to
# "user_active_status", and "cluster"/"user_cluster" both map to "user_community";
# a frame containing both source names would end up with duplicate columns — confirm
# that never happens for the frames this is applied to.
rename_dict.update(
    {
        "tweets_by_suspended": "tweet_count_by_suspended_users",
        "quotes_by_suspended": "quote_count_by_suspended_users",
        "retweets_by_suspended": "retweet_count_by_suspended_users",
        "author_active_status": "user_active_status",
        "l_closeness": "closeness_centrality_detractor_cluster",
        "r_closeness": "closeness_centrality_promoter_cluster",
        "quote_count": "quote_count_metadata",
        "retweet_count": "retweet_count_metadata",
        "cluster": "user_community",
        "user_cluster": "user_community",
        "active_status": "user_active_status",
        "retweets_by_l": "retweet_count_by_detractors",
        "retweets_by_r": "retweet_count_by_promoters",
        "retweets_crawled": "retweet_count_streamed",
    }
)


## Exception for user
# Independent copy of rename_dict for the user frame. Every key written below
# already exists in rename_dict with the same value, so the copy currently ends
# up equal to rename_dict.
user_rename_dict = dict(rename_dict)
user_rename_dict.update(
    (f"{kind}s_by_cluster_{community}", f"{kind}_count_by_community_{community}")
    for community in range(5)
    for kind in ("retweet", "quote", "tweet")
)
user_rename_dict.update(
    retweets_by_suspended="retweet_count_by_suspended_users",
    quotes_by_suspended="quote_count_by_suspended_users",
)

# Columns exported for users: identity/centrality columns first, then the
# "how often was this user retweeted/quoted" counters appended below.
user_export_cols = [
    "user_community",
    "user_active_status",
    "closeness_centrality_detractor_cluster",
    "closeness_centrality_promoter_cluster",
]

# Retweet and quote counters per community (0-4), followed by the suspended-user counters.
retweeted_cols = [
    f"{kind}_count_by_community_{community}"
    for community in range(5)
    for kind in ("retweet", "quote")
] + ["retweet_count_by_suspended_users", "quote_count_by_suspended_users"]

user_export_cols.extend(retweeted_cols)

# Share counters common to every media-type table: overall totals, then
# tweet/retweet/quote counts per community (0-4), then the suspended-user counts.
media_share_cols = ["tweet_count", "retweet_count_metadata", "quote_count_metadata"]
media_share_cols.extend(
    f"{kind}_count_by_community_{community}"
    for community in range(5)
    for kind in ("tweet", "retweet", "quote")
)
media_share_cols.extend(
    f"{kind}_count_by_suspended_users" for kind in ("tweet", "retweet", "quote")
)


# Column selections per exported table.
tweet_cols = [
    "user_community",
    "user_active_status",
    "retweet_count_metadata",
    "quote_count_metadata",
]

image_cols = ["unique_id", "tweet_id", "a_hash", "p_hash", "w_hash"]
youtube_cols = [
    "video_title",
    "video_description",
    "channel_id",
    "channel_title",
    "published_at",
    *media_share_cols,
]
url_cols = ["domain", *media_share_cols]

In [22]:
# Load the pickled user frame, then switch its columns to the export naming scheme.
# Pickle files can execute code on load — only read files produced by this project.
df_users = pd.read_pickle("./df_users_final_with_metrics.pickle")
df_users = df_users.rename(columns=rename_dict)

In [23]:
# Retweet counters to rank users by: one per community plus three aggregates.
cols = [f"retweet_count_by_community_{i}" for i in range(5)]
cols.extend(
    [
        "retweet_count_by_suspended_users",
        "retweet_count_by_promoters",
        "retweet_count_by_detractors",
    ]
)

# Union of the index labels of the 10 largest rows for each counter.
user_ids = set().union(*(df_users.nlargest(10, col).index for col in cols))

In [24]:
# Number of distinct user index labels collected so far.
len(user_ids)

32

In [25]:
# Add, for each community 0-4, the 10 users with the largest "retweet_count_streamed".
# NOTE(review): the saved output of this cell shows KeyError: 'total_retweet_count',
# which does not match the column used here — the output looks stale; re-run to
# confirm that "retweet_count_streamed" exists in df_users.
for community in range(5):
    community_users = df_users.loc[df_users["user_community"] == community]
    top_in_community = community_users.nlargest(10, "retweet_count_streamed")
    user_ids.update(top_in_community.index)

KeyError: 'total_retweet_count'

In [None]:
# Select the rows of every collected user by index label.
# .loc needs a list-like indexer: passing a set was deprecated in pandas 1.5 and
# raises TypeError since pandas 2.0, so materialise the labels as a list first.
relevant_users = df_users.loc[list(user_ids)]
print(len(relevant_users))
relevant_users.head()

In [None]:
# Write the selected users to a pickle file under ../interface/data/.
relevant_users.to_pickle("../interface/data/top_users.pickle")

In [10]:
# Load the pickled tweet frame, then switch its columns to the export naming scheme.
# Pickle files can execute code on load — only read files produced by this project.
df_recent_tweets = pd.read_pickle("./df_recent_tweets_with_final_metrics.pickle")
df_recent_tweets = df_recent_tweets.rename(columns=rename_dict)

In [11]:
# Inspect the column names of the tweet frame after renaming.
df_recent_tweets.columns

Index(['datastore_id', 'urls', 'hasMedia', 'hashtags',
       'retweet_count_metadata', 'quote_count_metadata', 'user', 'text',
       'quote_tweet', 'timestamp', 'tokens', 'election fraud', 'voter fraud',
       '#voterfraud', '#stopthesteal', '#ballotharvesting', 'ballot fraud',
       '#electionfraud', '#electioninterference', 'ballot harvesting',
       'election interference', '#electiontampering', '#cheatingdemocrats',
       'election tampering', 'democrats cheat', '#voterfraudisreal',
       'cheating democrats', '#gopvoterfraud', 'destroyed ballots',
       'stolen ballots', '#ballotfraud', 'discarded ballots',
       'hacked voting machine', 'pre-filled ballot', 'harvest ballot',
       '#stopvoterfraud', '#democratvoterfraud', '#ballotvoterfraud',
       '#nomailinvoting', '#ilhanomarballotharvesting', 'vote by mail fraud',
       '#mailinvoterfraud', '#votebymailfraud', '#ilhanomarvoterfraud',
       '#stopgopvoterfraud', '#discardedballots', '#hackedvotingmachines',
      

In [12]:
# Retweet counters to rank tweets by: one per community plus three aggregates.
cols = [f"retweet_count_by_community_{i}" for i in range(5)]
cols.extend(
    [
        "retweet_count_by_suspended_users",
        "retweet_count_by_promoters",
        "retweet_count_by_detractors",
    ]
)

# Union of the index labels of the 10 largest rows for each counter.
tweet_indices = set().union(
    *(df_recent_tweets.nlargest(10, col).index for col in cols)
)

In [13]:
# Add, for each community 0-4, the 10 tweets with the largest "retweet_count_metadata".
for community in range(5):
    community_tweets = df_recent_tweets.loc[df_recent_tweets["user_community"] == community]
    top_in_community = community_tweets.nlargest(10, "retweet_count_metadata")
    tweet_indices.update(top_in_community.index)

In [14]:
# Number of distinct tweet index labels collected so far.
len(tweet_indices)

62

In [15]:
# tweet_indices holds index *labels* (gathered from `.index` of the nlargest
# results), so the rows must be selected by label with .loc. The positional .iloc
# only returns the same rows when the index happens to be exactly 0..n-1 and
# otherwise silently picks different rows or raises IndexError.
relevant_tweets = df_recent_tweets.loc[list(tweet_indices)]

In [16]:
# Write the selected tweets to a pickle file under ../interface/data/.
relevant_tweets.to_pickle("../interface/data/top_tweets.pickle")

In [17]:
# Read the three per-media-type tables, each indexed by its natural key column.
df_youtube_videos, df_hashtag, df_url = (
    pd.read_csv(csv_path, index_col=key_column)
    for csv_path, key_column in (
        ("../data/notebooks/final/youtube_videos.csv", "video_id"),
        ("../data/notebooks/final/hashtags.csv", "hashtag"),
        ("../data/notebooks/final/urls.csv", "url"),
    )
)

In [18]:
def add_cols(df):
    """Add detractor/promoter retweet totals to ``df`` in place.

    Two columns are written:

    * ``retweet_count_by_detractors`` — a copy of ``retweet_count_by_community_0``.
    * ``retweet_count_by_promoters`` — the element-wise sum of
      ``retweet_count_by_community_1`` through ``retweet_count_by_community_4``.

    The sums are computed column-wise rather than with a row-wise ``apply``,
    which is faster and also works on an empty frame.

    Args:
        df: DataFrame containing the columns ``retweet_count_by_community_0``
            through ``retweet_count_by_community_4``.

    Returns:
        None. ``df`` is modified in place.

    Raises:
        KeyError: if any of the five community columns is missing.
    """
    df["retweet_count_by_detractors"] = df["retweet_count_by_community_0"]
    # Plain `+` (not .sum) so that missing values propagate exactly as they did
    # with the original per-row addition.
    df["retweet_count_by_promoters"] = (
        df["retweet_count_by_community_1"]
        + df["retweet_count_by_community_2"]
        + df["retweet_count_by_community_3"]
        + df["retweet_count_by_community_4"]
    )

# Add the detractor/promoter retweet totals to each media table (modifies them in place).
for media_frame in (df_youtube_videos, df_hashtag, df_url):
    add_cols(media_frame)

In [19]:
# Preview the first rows of the URL table, which now includes the two columns added by add_cols.
df_url.head()

Unnamed: 0_level_0,domain,tweet_count,retweet_count_metadata,quote_count_metadata,tweet_count_by_community_0,retweet_count_by_community_0,quote_count_by_community_0,tweet_count_by_community_1,retweet_count_by_community_1,quote_count_by_community_1,...,retweet_count_by_community_3,quote_count_by_community_3,tweet_count_by_community_4,retweet_count_by_community_4,quote_count_by_community_4,tweet_count_by_suspended_users,retweet_count_by_suspended_users,quote_count_by_suspended_users,retweet_count_by_detractors,retweet_count_by_promoters
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
https://www.foxnews.com/opinion/tucker-carlson-2020-presidential-election-voter-fraud-dead-voters.amp,foxnews.com,264,41078,6543,14,61,401,27,3250,264,...,345,39,8,483,34,62,2555,305,61,11868
https://www.legislationline.org/download/id/1472/file/3b50795b2d0374cbef5c29766256.pdf,legislationline.org,148,19221,1973,0,24,92,30,2021,161,...,25,2,0,119,8,26,1793,139,24,7793
https://www.houstonchronicle.com/politics/texas/article/Texas-Lt-Gov-Dan-Patrick-offers-1-million-15716973.php?utm_campaign=CMS%20Sharing%20Tools%20(Premium)&utm_source=t.co&utm_medium=referral,houstonchronicle.com,56,16627,2602,13,103,272,8,1037,77,...,74,15,0,67,4,3,1425,106,103,5109
https://nypost.com/2020/11/11/usps-whistleblower-denies-wapo-claim-he-recanted-allegations/?utm_source=twitter_sitebuttons&utm_medium=site%20buttons&utm_campaign=site%20buttons,nypost.com,85,15297,983,3,13,23,11,648,36,...,35,1,0,41,1,12,1124,69,13,4063
https://thetexan.news/limestone-county-individual-charged-with-134-counts-of-voter-fraud-attorney-general-announces/,thetexan.news,280,14525,1459,2,13,21,41,1368,109,...,17,3,1,102,6,30,1385,138,13,5920


In [20]:
# Write each media table to its pickle file under ../interface/data/.
for media_frame, pickle_path in (
    (df_youtube_videos, "../interface/data/top_youtube_videos.pickle"),
    (df_hashtag, "../interface/data/top_hashtags.pickle"),
    (df_url, "../interface/data/top_urls.pickle"),
):
    media_frame.to_pickle(pickle_path)