In [77]:
from notebook_utils import setup
import pandas as pd
import networkx as nx
from collections import defaultdict
import numpy as np
import pickle
import json

setup()
DATAFRAMES_DIR = "../data/dataframes/16-dec/"


def map_user_status(status):
    if (status == "inactive"):
        return "deleted"
    else:
        return status

rename_dict = {}
for i in range(5):
    rename_dict["retweets_by_cluster_" + str(i)] = "retweet_count_by_community_" + str(i)
    rename_dict["quotes_by_cluster_" + str(i)] = "quote_count_by_community_" + str(i)
    rename_dict["tweets_by_cluster_" + str(i)] = "tweet_count_by_community_" + str(i)
rename_dict["tweets_by_suspended"] = "tweet_count_by_suspended_users"
rename_dict["quotes_by_suspended"] = "quote_count_by_suspended_users"
rename_dict["retweets_by_suspended"] = "retweet_count_by_suspended_users"
rename_dict["author_active_status"] = "user_active_status" 
rename_dict["l_closeness"] = "closeness_centrality_detractor_cluster"
rename_dict["r_closeness"] = "closeness_centrality_promoter_cluster"
rename_dict["quote_count"] = "quote_count_metadata"
rename_dict["retweet_count"] = "retweet_count_metadata"
rename_dict["cluster"] = "user_community"
rename_dict["user_cluster"] = "user_community"
rename_dict["active_status"] = "user_active_status"

rename_dict["retweets_by_l"] = "retweet_count_by_detractors"
rename_dict["retweets_by_r"] = "retweet_count_by_promoters"
rename_dict["retweets_crawled"] = "retweet_count_streamed"


## Exception for user
user_rename_dict = {**rename_dict}
for i in range(5):
    user_rename_dict["retweets_by_cluster_" + str(i)] = "retweet_count_by_community_" + str(i)
    user_rename_dict["quotes_by_cluster_" + str(i)] = "quote_count_by_community_" + str(i)
    user_rename_dict["tweets_by_cluster_" + str(i)] = "tweet_count_by_community_" + str(i)
user_rename_dict["retweets_by_suspended"] = "retweet_count_by_suspended_users"
user_rename_dict["quotes_by_suspended"] = "quote_count_by_suspended_users"

user_export_cols = ["user_community", "user_active_status", "closeness_centrality_detractor_cluster", "closeness_centrality_promoter_cluster"]

retweeted_cols = []
for i in range(5):
    retweeted_cols.append("retweet_count_by_community_" + str(i))
    retweeted_cols.append("quote_count_by_community_" + str(i))
retweeted_cols.append("retweet_count_by_suspended_users")
retweeted_cols.append("quote_count_by_suspended_users")
user_export_cols += retweeted_cols

media_share_cols = ["tweet_count", "retweet_count_metadata", "quote_count_metadata"]
for i in range(5):
    media_share_cols.append("tweet_count_by_community_" + str(i))
    media_share_cols.append("retweet_count_by_community_" + str(i))
    media_share_cols.append("quote_count_by_community_" + str(i))
media_share_cols.append("tweet_count_by_suspended_users")
media_share_cols.append("retweet_count_by_suspended_users")
media_share_cols.append("quote_count_by_suspended_users")


tweet_cols = ["user_community", "user_active_status", "retweet_count_metadata", "quote_count_metadata"]

image_cols = ["unique_id", "tweet_id", "a_hash", "p_hash", "w_hash"]
youtube_cols = ["video_title", "video_description", "channel_id", "channel_title", "published_at"] + media_share_cols 
url_cols = ["domain"] + media_share_cols

In [2]:
with open("../data/handleToCandidateInfo.json", "r") as f:
    candidates = json.load(f)

In [3]:
candidate_mapping = {}
for candidate in candidates:
    del candidate['']
    candidate["id"] = str(candidate["id"])
    candidate_mapping[candidate["id"]] = candidate

list(candidate_mapping.items())[0]

('941080085121175600',
 {'handle': 'sendougjones',
  'name': 'Senator Doug Jones',
  'verified': 'True',
  'id': '941080085121175600',
  'state': 'Alabama',
  'party': 'Democratic',
  'position': 'U.S. Senate',
  'candidate_name': 'Doug Jones (Alabama)'})

In [96]:
candidate_mapping["25073877"] = {
    "handle": "realDonaldTrump",
    "name": "Donald Trump",
    "party": "Republican",
    "position": "President",
    "state": "N/A",
    "id": "25073877",
    "verified": True,
    "candidate_name": "Donald Trump"
}

In [9]:
df_retweets = pd.read_pickle("./df_retweets_with_cluster.pickle")

In [10]:
candidate_shares = defaultdict(lambda: [])

for i, user, retweeted in df_retweets[["user", "retweeted"]].itertuples():
    if (user in candidate_mapping):
        candidate_shares[user].append(retweeted)

    if (i % 100000 == 0):
        print("Processed", i)

Processed 0
Processed 100000
Processed 200000
Processed 300000
Processed 400000
Processed 500000
Processed 600000
Processed 700000
Processed 800000
Processed 900000
Processed 1000000
Processed 1100000
Processed 1200000
Processed 1300000
Processed 1400000
Processed 1500000
Processed 1600000
Processed 1700000
Processed 1800000
Processed 1900000
Processed 2000000
Processed 2100000
Processed 2200000
Processed 2300000
Processed 2400000
Processed 2500000
Processed 2600000
Processed 2700000
Processed 2800000
Processed 2900000
Processed 3000000
Processed 3100000
Processed 3200000
Processed 3300000
Processed 3400000
Processed 3500000
Processed 3600000
Processed 3700000
Processed 3800000
Processed 3900000
Processed 4000000
Processed 4100000
Processed 4200000
Processed 4300000
Processed 4400000
Processed 4500000
Processed 4600000
Processed 4700000
Processed 4800000
Processed 4900000
Processed 5000000
Processed 5100000
Processed 5200000
Processed 5300000
Processed 5400000
Processed 5500000
Process

In [11]:
active_candidates = 0
for user_id, shares in candidate_shares.items():
    #print(shares)
    if (len(shares) > 0):
        active_candidates += 1

print("Total candidates", len(candidate_mapping))
print("Retweeting candidates", len(candidate_shares))
print("Trump shares", len(candidate_shares["25073877"]))

Total candidates 2367
Retweeting candidates 133
Trump shares 30


In [15]:
if candidate_shares:
    with open("./candidate_shares.pickle", "wb") as f:
        pickle.dump({
            "candidate_shares": dict(candidate_shares),
            "candidate_map": candidate_mapping,
            "candidate_tweets": candidate_tweets
            "candidate_retweeted_tweets": candidate_retweeted_tweets
        }, f)

In [6]:
with open("./candidate_shares.pickle", "rb") as f:
    candidate_shares = pickle.load(f)

In [7]:
df_recent_tweets = pd.read_pickle("./df_recent_tweets_with_final_metrics.pickle")

In [31]:
candidate_retweeted_ids = set([item for sublist in candidate_shares["candidate_shares"].values() for item in sublist])

In [38]:
candidate_tweet_indices = []
candidate_retweeted_tweet_indices = []

for i, tweet_id, user in df_recent_tweets[["datastore_id", "user"]].itertuples():
    if (user in candidate_mapping):
        candidate_tweet_indices.append(i)
    if tweet_id in candidate_retweeted_ids:
        candidate_retweeted_tweet_indices.append(i)

    if (i % 100000 == 0):
        print("Processed", i)

Processed 0
Processed 100000
Processed 200000
Processed 300000
Processed 400000
Processed 500000
Processed 600000
Processed 700000
Processed 800000
Processed 900000
Processed 1000000
Processed 1100000
Processed 1200000
Processed 1300000
Processed 1400000
Processed 1500000
Processed 1600000
Processed 1700000
Processed 1800000
Processed 1900000
Processed 2000000
Processed 2100000
Processed 2200000
Processed 2300000
Processed 2400000
Processed 2500000
Processed 2600000
Processed 2700000
Processed 2800000
Processed 2900000
Processed 3000000
Processed 3100000
Processed 3200000
Processed 3300000
Processed 3400000
Processed 3500000
Processed 3600000
Processed 3700000
Processed 3800000
Processed 3900000
Processed 4000000
Processed 4100000
Processed 4200000
Processed 4300000
Processed 4400000
Processed 4500000
Processed 4600000
Processed 4700000
Processed 4800000
Processed 4900000
Processed 5000000
Processed 5100000
Processed 5200000
Processed 5300000
Processed 5400000
Processed 5500000
Process

In [39]:
candidate_tweets = df_recent_tweets[df_recent_tweets.index.isin(candidate_tweet_indices)]
candidate_retweeted_tweets = df_recent_tweets[df_recent_tweets.index.isin(candidate_retweeted_tweet_indices)]

In [40]:
print("Candidate tweets", len(candidate_tweets))
print("Candidate retweeted tweets", len(candidate_retweeted_tweets))
print("Tweeting candidates", candidate_tweets["user"].nunique())

Candidate tweets 2440
Candidate retweeted tweets 806
Tweeting candidates 285


In [34]:
candidate_tweets.head()

Unnamed: 0,datastore_id,urls,hasMedia,hashtags,retweet_count,quote_count,user,text,quote_tweet,timestamp,...,author_active_status,retweets_by_suspended,quotes_by_suspended,retweets_crawled,quotes_crawled,retweets_by_l,retweets_by_r,retweets_suspended_ratio,quote_coverage,retweet_coverage
2598,1327181796337967105,[https://twitter.com/i/web/status/132718179633...,False,[],2504,253,364214133,"Dear @Jack:\n\nOn some of my tweets, you’ve pl...",,2020-11-13T09:29:32Z,...,active,365,29,2177,232,2,1739,0.167662,0.916996,0.869409
9781,1327204881992650752,[https://twitter.com/i/web/status/132720488199...,False,[],297,44,4381665136,"With the way these elections are going, and th...",,2020-11-13T11:01:16Z,...,active,83,6,271,37,4,241,0.306273,0.840909,0.912458
24907,1327234252635844608,[https://twitter.com/i/web/status/132723425263...,False,[],0,0,700359998,"Yes, and time more people spoke the truth. Ins...",1.3259442181209088e+18,2020-11-13T12:57:59Z,...,active,0,0,0,0,0,0,0.0,0.0,0.0
27187,1327237398317723648,[https://twitter.com/i/web/status/132723739831...,False,[MailInBallots],0,0,28731208,@jonathanoosting @BridgeMichigan How do you kn...,,2020-11-13T13:10:29Z,...,inactive,0,0,0,0,0,0,0.0,0.0,0.0
38539,1327251165713752066,[https://twitter.com/i/web/status/132725116571...,False,[realdonaldtrump],0,0,3394405942,Days after this election with the obvious frau...,1.326970781209944e+18,2020-11-13T14:05:11Z,...,active,0,0,0,0,0,0,0.0,0.0,0.0


In [89]:
if candidate_shares:
    with open("./candidate_shares.pickle", "wb") as f:
        pickle.dump({
            "candidate_shares": candidate_shares["candidate_shares"],
            "candidate_map": candidate_mapping,
            "candidate_tweets": candidate_tweets.rename(columns=rename_dict),
            "candidate_retweeted_tweets": candidate_retweeted_tweets.rename(columns=rename_dict),
            "df_candidate_users": df_candidate_users
        }, f)

In [86]:
candidate_tweets.head()

Unnamed: 0,datastore_id,urls,hasMedia,hashtags,retweet_count_metadata,quote_count_metadata,user,text,quote_tweet,timestamp,...,user_active_status,retweet_count_by_suspended_users,quote_count_by_suspended_users,retweet_count_streamed,quotes_crawled,retweet_count_by_detractors,retweet_count_by_promoters,retweets_suspended_ratio,quote_coverage,retweet_coverage
2598,1327181796337967105,[https://twitter.com/i/web/status/132718179633...,False,[],2504,253,364214133,"Dear @Jack:\n\nOn some of my tweets, you’ve pl...",,2020-11-13T09:29:32Z,...,active,365,29,2177,232,2,1739,0.167662,0.916996,0.869409
9781,1327204881992650752,[https://twitter.com/i/web/status/132720488199...,False,[],297,44,4381665136,"With the way these elections are going, and th...",,2020-11-13T11:01:16Z,...,active,83,6,271,37,4,241,0.306273,0.840909,0.912458
24907,1327234252635844608,[https://twitter.com/i/web/status/132723425263...,False,[],0,0,700359998,"Yes, and time more people spoke the truth. Ins...",1.3259442181209088e+18,2020-11-13T12:57:59Z,...,active,0,0,0,0,0,0,0.0,0.0,0.0
27187,1327237398317723648,[https://twitter.com/i/web/status/132723739831...,False,[MailInBallots],0,0,28731208,@jonathanoosting @BridgeMichigan How do you kn...,,2020-11-13T13:10:29Z,...,inactive,0,0,0,0,0,0,0.0,0.0,0.0
38539,1327251165713752066,[https://twitter.com/i/web/status/132725116571...,False,[realdonaldtrump],0,0,3394405942,Days after this election with the obvious frau...,1.326970781209944e+18,2020-11-13T14:05:11Z,...,active,0,0,0,0,0,0,0.0,0.0,0.0


In [46]:
df_users = pd.read_pickle("./df_users_final_with_metrics.pickle")

In [81]:
df_candidate_users = df_users[df_users.index.isin(candidate_mapping.keys())].rename(columns=user_rename_dict)

In [68]:
df_candidate_users[df_candidate_users.active_status == "suspended"]

Unnamed: 0_level_0,created_at,friends_count,name,verified,followers_count,location,handle,data_source,active_status,cluster,...,quotes_by_cluster_3,retweets_by_cluster_4,quotes_by_cluster_4,retweets_by_suspended,quotes_by_suspended,retweets_crawled,quotes_crawled,retweets_by_l,retweets_by_r,retweets_suspended_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25073877,2009-03-18T13:46:38Z,50,Donald J. Trump,True,87364085,"Washington, DC",realDonaldTrump,old,suspended,1,...,3556,32117,2326,291398,28156,1560373,231142,6257,1225369,0.186749
169074496,2010-07-21T13:47:41Z,157,🇺🇸 Trump2020 🇺🇸,False,4144,Hyde HQ,rfhyde1,old,suspended,2,...,0,0,0,33,0,63,2,0,63,0.52381
16740433,2008-10-14T16:08:50Z,73563,DeAnna Lorraine 🇺🇸,True,393461,"Quarantined, CA",DeAnna4Congress,old,suspended,2,...,0,0,0,4,0,14,0,0,13,0.285714


In [99]:
df_candidate_users["party"] =df_candidate_users.apply(lambda u: candidate_mapping[u.name]["party"], axis=1)
df_candidate_users["position"] = df_candidate_users.apply(lambda u: candidate_mapping[u.name]["position"], axis=1)
df_candidate_users["state"] = df_candidate_users.apply(lambda u: candidate_mapping[u.name]["state"], axis=1)
df_candidate_users["candidate_name"] = df_candidate_users.apply(lambda u: candidate_mapping[u.name]["candidate_name"], axis=1)
df_candidate_users["handle"] = df_candidate_users.apply(lambda u: "@" + u["handle"], axis=1)

In [101]:
df_candidate_users[[
    "candidate_name", "handle", "party", "user_community", "position", "state", 
    "followers_count", "user_active_status", "retweet_count_by_detractors", "retweet_count_by_promoters", "retweet_count_by_suspended_users"]].to_pickle("../interface/data/candidate_users.pickle")

In [104]:
df_candidate_users["user_community"].value_counts()

0    151
1     85
2     53
Name: user_community, dtype: Int64