In [1]:
from notebook_utils import setup

setup()

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import json

In [6]:
new_data_map = {}

def set_user_data_from_object(user_data, object):
    user_data["id"] = str(object["id"])
    user_data["created_at"] = object["created_at"]
    user_data["friends_count"] = object["friends_count"]
    user_data["name"] = object["name"]
    user_data["verified"] = object["verified"]
    user_data["followers_count"] = object["followers_count"]
    user_data["location"] = object["location"]
    user_data["handle"] = object["screen_name"]

def load_user_from_json(json_str):
    user_object = json.loads(json_str)
    user_data = {}
    set_user_data_from_object(user_data, user_object)
    new_data_map[str(user_object["id"])] = user_data

    if (len(new_data_map) % 100000 == 0):
        print("processed", len(new_data_map))

with open("../data/fetch_users/users_retweets.jsonl", "r") as f:
    for line in f.readlines():
        load_user_from_json(line)

with open("../data/fetch_users/users_retweets_2.jsonl", "r") as f:
    for line in f.readlines():
        load_user_from_json(line)

with open("../data/fetch_users/users_tweets.jsonl", "r") as f:
    for line in f.readlines():
        load_user_from_json(line)

processed 100000
processed 200000
processed 300000
processed 400000
processed 500000
processed 600000
processed 700000
processed 800000
processed 900000
processed 1000000
processed 1100000
processed 1200000
processed 1300000
processed 1400000
processed 1500000
processed 1600000
processed 1700000
processed 1800000
processed 1900000
processed 2000000
processed 2100000
processed 2200000
processed 2300000


In [None]:
df_users_with_clustering = pd.read_pickle('./df_users_with_clustering.pickle')

In [7]:
df_users_with_clustering.shape[0]

1388621

In [8]:
user_ids_with_data = set(new_data_map.keys()).union(set(df_users_with_clustering.index))

print(len(user_ids_with_data))

2488725


In [9]:
inactive_user_ids = set()
suspended_user_ids = set()
error_active_user_ids = set()
failed_requests = set()

with open("../data/fetch_users/inactive_user_reason_all.jsonl", "r") as f:
    for line in f.readlines():
        json_row = json.loads(line)
        user_id = json_row["user_id"]
        if (user_id != ""):
            if (json_row["code"] == 63):
                suspended_user_ids.add(json_row["user_id"])
            elif (json_row["code"] == 50):
                inactive_user_ids.add(json_row["user_id"])
            elif (json_row["code"] == -1):
                error_active_user_ids.add(json_row["user_id"])
            elif ("Failed to send request" in json_row["reason"] or "status code = 503" in json_row["reason"]):
                failed_requests.add(json_row["user_id"])
            else:
                raise "Unknown code"

In [11]:
all_user_ids = user_ids_with_data.union(inactive_user_ids, suspended_user_ids, error_active_user_ids)
total_count = len(all_user_ids)
print("All users", total_count)
print("User ids with data: {} ({:,.2f}%)".format(len(user_ids_with_data), len(user_ids_with_data) / total_count * 100))
print("Inactive user ids: {} ({:,.2f}%)".format(len(inactive_user_ids), len(inactive_user_ids) / total_count * 100))
print("Suspended user ids: {} ({:,.2f}%)".format(len(suspended_user_ids), len(suspended_user_ids) / total_count * 100))
print()
print("Missing:")
print("Active", len(error_active_user_ids - inactive_user_ids - user_ids_with_data - suspended_user_ids))
print("Failed requests", len(failed_requests - user_ids_with_data - inactive_user_ids - suspended_user_ids))

All users 2559018
User ids with data: 2488725 (97.25%)
Inactive user ids: 115868 (4.53%)
Suspended user ids: 99884 (3.90%)

Missing:
Active 0
Failed requests 0


In [30]:
df_users_with_clustering[]

Unnamed: 0_level_0,protected,friends,created_at,name,friends_count,verified,followers_count,location,followed_cnts,handle,url,clustering_directed_unweighted,clustering_directed_weighted,clustering_undirected_unweighted
datastore_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1334881524664193027,False,,2020-12-04T15:25:47Z,venton talley,8,False,3,,0,TalleyVenton,,-1,-1,-1
1334881805024043010,False,,2020-12-04T15:28:30Z,üá∫üá∏Freedom Fighterüá∫üá∏,1028,False,41,MAGA Country,0,RiSeAgAiN888,,35043,2,0
1334882729859682310,False,,2020-12-04T15:30:26Z,Hypocritical Liberal Wannabe TB Aidan,0,False,2,,0,LiberalTb,,-1,-1,-1
1334883015265226752,False,,2020-12-04T15:32:10Z,Conversing Post,128,False,2,,0,ConversingPost,,-1,-1,-1
1334883045988474886,False,,2020-12-04T15:31:56Z,Ajmira sultana,62,False,17,,0,SultanaAjmira,,-1,-1,-1


In [31]:
def init_user_data(user_id):
    return {
        "id": user_id
    }

def set_user_data_from_df_row(user_data, row):
    user_data["created_at"] = row["created_at"]
    user_data["friends_count"] = row["friends_count"]
    user_data["name"] = row["name"]
    user_data["verified"] = row["verified"]
    user_data["followers_count"] = row["followers_count"]
    user_data["location"] = row["location"]
    user_data["handle"] = row["handle"]

In [32]:
active_user_ids = all_user_ids - suspended_user_ids - inactive_user_ids

In [33]:
import pickle 
with open("./closeness.pickle", "rb") as f:
    closeness = pickle.load(f)
with open("./graph_with_communities.pickle", "rb") as f:
    graph_with_communities = pickle.load(f)

In [36]:
user_graph_data = {}

def get_closeness(node_id, closeness_type):
    if node_id in closeness[closeness_type]:
        return closeness[closeness_type][node_id]

for node_id, data in graph_with_communities.nodes(data=True):
    user_graph_data[data["user_id"]] = {
        "community": data["community"],
        "l_closeness": get_closeness(node_id, "l_closeness"),
        "r_closeness": get_closeness(node_id, "r_closeness"),
        "node_id": node_id
    }

len(user_graph_data)

1697944

In [37]:
old_data_ids = set(df_users_with_clustering.index)

In [38]:
user_data_map = {}
for i, user_id in enumerate(all_user_ids):
    # Update user if data is not available
    if user_id not in new_data_map:
        user_data = init_user_data(user_id)
        if (user_id in old_data_ids):
            set_user_data_from_df_row(user_data, df_users_with_clustering.loc[user_id])
            user_data["data_source"] = "old"
    else:
        user_data = new_data_map[user_id]
        user_data["data_source"] = "new"
    
    if ("old_data" in user_data):
        del user_data["old_data"]

    if (user_id in suspended_user_ids):
        user_data["active_status"] = "suspended"
    elif (user_id in inactive_user_ids):
        user_data["active_status"] = "inactive"
    elif (user_id in active_user_ids):
        user_data["active_status"] = "active"
    else:
        user_data["active_status"] = "unknown"

    if (user_id in user_graph_data):
        user_data["cluster"] = user_graph_data[user_id]["community"]
        user_data["l_closeness"] = user_graph_data[user_id]["l_closeness"]
        user_data["r_closeness"] = user_graph_data[user_id]["r_closeness"]

    user_data_map[user_id] = user_data
    if (i % 100000 == 0):
        print("Processed", i)

Processed 0
Processed 100000
Processed 200000
Processed 300000
Processed 400000
Processed 500000
Processed 600000
Processed 700000
Processed 800000
Processed 900000
Processed 1000000
Processed 1100000
Processed 1200000
Processed 1300000
Processed 1400000
Processed 1500000
Processed 1600000
Processed 1700000
Processed 1800000
Processed 1900000
Processed 2000000
Processed 2100000
Processed 2200000
Processed 2300000
Processed 2400000
Processed 2500000


In [3]:
df_users_final = pd.read_pickle("./df_users_final.pickle")

In [39]:
df_users_final = pd.DataFrame.from_dict(user_data_map, orient="index").set_index("id").astype({
    "followers_count": 'Int64', 
    "cluster": 'Int64', 
    "friends_count": "Int64"
})

In [40]:
df_users_final.to_pickle("./df_users_final.pickle")

In [4]:
# df_users_final = df_users_final.drop("old_data", axis=1)
print(df_users_final.info())
df_users_final.head()

<class 'pandas.core.frame.DataFrame'>
Index: 2559018 entries, 3288305608 to 1313243498
Data columns (total 12 columns):
 #   Column           Dtype  
---  ------           -----  
 0   created_at       object 
 1   friends_count    Int64  
 2   name             object 
 3   verified         object 
 4   followers_count  Int64  
 5   location         object 
 6   handle           object 
 7   data_source      object 
 8   active_status    object 
 9   cluster          Int64  
 10  l_closeness      float64
 11  r_closeness      float64
dtypes: Int64(3), float64(2), object(7)
memory usage: 261.1+ MB
None


Unnamed: 0_level_0,created_at,friends_count,name,verified,followers_count,location,handle,data_source,active_status,cluster,l_closeness,r_closeness
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3288305608,Mon May 18 11:23:04 +0000 2015,626,kastro J.B üî¥,False,157,"London, England",BeainiKastro,new,active,1.0,,
1333163755295924224,Sun Nov 29 21:39:55 +0000 2020,93,ConfederacyofDunces,False,9,"Nashville, TN",RobVM3,new,active,,,
1325878139537354752,Mon Nov 09 19:10:57 +0000 2020,135,KBR,False,18,,KBR02093679,new,active,,,
304050894,Mon May 23 21:07:18 +0000 2011,77,Giuseppe,False,69,Western Air Temple,PizzaEquality,new,active,0.0,,
1305445745328361472,Mon Sep 14 09:58:30 +0000 2020,22,Carolina Belardinelli,False,2,,CarolinaBelard2,new,active,1.0,,


In [28]:
suspended_users = df_users_final[df_users_final.active_status == "suspended"]
cluster_suspended_distribution = suspended_users["cluster"].value_counts(dropna=False).reset_index()
print(cluster_suspended_distribution)

promoter_cluster = cluster_suspended_distribution[cluster_suspended_distribution["index"].isin([1, 2, 3, 4])]

promoter_cluster["cluster"].sum() / cluster_suspended_distribution[cluster_suspended_distribution["index"].isin([1, 2, 3, 4, 0])]["cluster"].sum()

  index  cluster
0     2    46031
1  <NA>    25956
2     1    18368
3     0     8659
4     3      513
5     4      357


0.8828725246185478

In [42]:
print(df_users_final["cluster"].value_counts(dropna=False))
print(df_users_final["data_source"].value_counts(dropna=False))

NaN    861074
0      860976
1      437783
2      342184
3       33587
4       23414
Name: cluster, dtype: Int64
new    2335974
old     152751
NaN      70293
Name: data_source, dtype: int64


In [144]:
df_users_final.head()

Unnamed: 0_level_0,created_at,friends_count,name,verified,followers_count,location,handle,data_source,active_status,cluster,l_closeness,r_closeness
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1088252070216355840,Thu Jan 24 01:48:07 +0000 2019,1105,Âíå„ÅÆÁßòË®£,False,969,"Osaka City, Osaka",wawoshirasu,new,active,3.0,,
820306084690677760,Sat Jan 14 16:26:25 +0000 2017,2308,Ally1984,False,2254,"Glasgow, Scotland",wyliedunn1,new,active,,,
984900693315108864,Fri Apr 13 21:06:38 +0000 2018,81,The Middle Finger,False,27,Mars,ThemiddleFinge3,new,active,1.0,,
746943161839165449,Sun Jun 26 05:48:22 +0000 2016,28,fightforfreedomstandwithhk,False,7,,HKdemocracy_now,new,active,1.0,,
824061162136436736,Wed Jan 25 01:07:46 +0000 2017,718,Jesse Holder,False,45,"Newburgh, IN",Dont_interupt_m,new,active,0.0,,


In [145]:
print(df_users_final["active_status"].value_counts(dropna=False))
print(df_users_final["data_source"].value_counts(dropna=False))

active       2343266
inactive      115868
suspended      99884
Name: active_status, dtype: int64
new    2335974
old     152751
NaN      70293
Name: data_source, dtype: int64


In [146]:
def agg_activity(agg_type):
    def inner(series):
        status_list = series.tolist()
        status_list = filter(lambda status: status == agg_type, status_list)
        return len(list(status_list))
    return inner

grouped_by_cluster = df_users_final.groupby(['cluster']).agg(
    users=("handle", "count"),
    inactive=("active_status", agg_activity("inactive")),
    suspended=("active_status", agg_activity("suspended")),
    l_centrality_count=("l_closeness", "count"),
    l_closeness_min=("l_closeness", "min"),
    l_closeness_max=("l_closeness", "max"),
    l_closeness_mean=("l_closeness", "mean"),
    r_centrality_count=("r_closeness", "count"),
    r_closeness_min=("r_closeness", "min"),
    r_closeness_max=("r_closeness", "max"),
    r_closeness_mean=("r_closeness", "mean"),
)



grouped_by_cluster["suspended_fraction"] = grouped_by_cluster["suspended"] / grouped_by_cluster["users"]
grouped_by_cluster["inactive_fraction"] = grouped_by_cluster["inactive"] / grouped_by_cluster["users"]

grouped_by_cluster

Unnamed: 0_level_0,users,inactive,suspended,l_centrality_count,l_closeness_min,l_closeness_max,l_closeness_mean,r_centrality_count,r_closeness_min,r_closeness_max,r_closeness_mean,suspended_fraction,inactive_fraction
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,852558,6206,8659,10000,0.074668,0.185138,0.090308,0,,,,0.010156,0.007279
1,400313,40528,18368,0,,,,2060,0.197023,0.454827,0.226651,0.045884,0.101241
2,326514,29821,46031,0,,,,7911,0.197006,0.289314,0.207839,0.140977,0.091331
3,33033,319,513,0,,,,13,0.198591,0.210705,0.203503,0.01553,0.009657
4,22961,383,357,0,,,,16,0.197144,0.233668,0.203623,0.015548,0.01668


In [147]:
grouped_by_cluster[["users", "suspended_fraction", "inactive_fraction"]]

Unnamed: 0_level_0,users,suspended_fraction,inactive_fraction
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,852558,0.010156,0.007279
1,400313,0.045884,0.101241
2,326514,0.140977,0.091331
3,33033,0.01553,0.009657
4,22961,0.015548,0.01668


In [None]:
df_users_final[df_users_final["active_status"] == "suspended"].sort_values("followers_count", ascending=False)[:10][["handle", "followers_count", "cluster", "verified"]]

In [None]:
df_users_final.to_pickle("./df_users_final.pickle")

In [50]:
for cluster in range(5):
    print("Top users in cluster:", str(cluster))
    print(df_users_final[df_users_final["cluster"] == cluster]
        .nlargest(10, 'followers_count')
        [["handle", "followers_count", "active_status"]]
    )

for cluster in range(5):
    print("Top suspended users by cluster:", str(cluster))
    print(df_users_final[(df_users_final["cluster"] == cluster) & (df_users_final["active_status"] == "suspended")]
        .nlargest(10, 'followers_count')
        [["handle", "followers_count", "active_status"]]
    )

Top users in cluster: 0
                    handle  followers_count active_status
id                                                       
428333              cnnbrk         60214681        active
759251                 CNN         51983509        active
807095             nytimes         48772873        active
5402612        BBCBreaking         47117376        active
742143            BBCWorld         30503250        active
1339835893  HillaryClinton         30291658        active
5988062       TheEconomist         25319296        active
1652541            Reuters         22886431        active
16303106     StephenAtHome         19227333        active
3108351                WSJ         18406098        active
Top users in cluster: 1
                     handle  followers_count active_status
id                                                        
25073877    realDonaldTrump         87364085     suspended
1367531             FoxNews         20136456        active
52544275        Ivan

In [55]:
for centrality in ["l_closeness", "r_closeness"]:
    print("Top users by closeness centrality:", str(centrality))
    print(df_users_final.nlargest(20, centrality).reset_index()
        [["handle", "followers_count", "active_status", centrality]]
    )

Top users by closeness centrality: l_closeness
             handle  followers_count active_status  l_closeness
0         AriBerman           180501        active     0.185138
1     JohnFetterman           322436        active     0.178555
2        jaketapper          3158331        active     0.177355
3      kylegriffin1          1050332        active     0.176881
4       justinamash           481423        active     0.173159
5        TimAlberta           137115        active     0.172945
6            mkraju           505720        active     0.172216
7      BrendanKeefe            53865        active     0.170961
8         alanfeuer            31391        active     0.169220
9        jimsciutto           463176        active     0.167267
10        bluestein           106843        active     0.167150
11          nytimes         48772873        active     0.166577
12          nytmike           237693        active     0.164705
13        bradheath           128410        active     0.