In [1]:
import pandas as pd
from pathlib import Path
import networkx as nx
from collections import defaultdict
from collections import Counter
from tqdm import tqdm
import pickle as pk
import numpy as np
from datetime import datetime as dt
from geopy.distance import great_circle
import matplotlib as mpl
mpl.rcParams['agg.path.chunksize'] = 100000000000000000
import matplotlib.pyplot as plt
from random import sample
from random import choices

## BUILD LIST of EDGES FROM FOLLOWEES OF EACH PANEL MEMBER

In [2]:
#### Locate the per-user followee files
# Every entry of the friends_data directory is collected; the stem of each file name
# is used as the identifier of one panel member.
FOLLOWER_DATA = Path("/net/data-backedup/twitter-voters/friends_collect/friends_data/")
fnames = [entry for entry in FOLLOWER_DATA.glob("*")]
PANEL_MEMBERS = {entry.stem for entry in fnames}


In [3]:
#### Build the network
# el:            ego id -> followed accounts of that ego that are themselves panel members
# all_followers: ego id -> number of lines in the ego's file (each line is treated as one
#                followed account, whether or not it is a panel member)
el = defaultdict(list)
all_followers = defaultdict(int)
# Iterate the list directly: the enumerate index was never used, and wrapping the list in
# enumerate() hides its length from tqdm, so the bar could show neither percentage nor ETA.
for fname in tqdm(fnames):
    ego = fname.stem
    n_followed = 0  # stays 0 when the file has no lines
    with open(fname, "r") as fin:
        for n_followed, line in enumerate(fin, start=1):
            alter = line.strip()
            if alter in PANEL_MEMBERS:
                el[ego].append(alter)
    all_followers[ego] = n_followed

1373158it [40:35, 563.78it/s]


## READ VOTER FILE and FILTER to REGISTERED VOTERS

In [4]:
#### load voter file information


# Voter-file columns to read, keyed by their original names, with the dtype each is parsed as
SCHEMA = dict(
    twProfileID="string",
    vf_source_state="string",
    vf_county_code="string",
    tsmart_census_id="string",
    tsmart_dma="string",
    voterbase_gender="string",
    voterbase_race="string",
    voterbase_dob="string",
    voterbase_age="Int64",
    tsmart_partisan_score=np.float64,
    vf_party="string",
    voterbase_registration_status="string",
    tsmart_latitude=np.float64,
    tsmart_longitude=np.float64,
    tsmart_presidential_general_turnout_score=np.float64,
)

# Rename map as (original voter-file column, short name used in this notebook) pairs
NAMES = dict([
    ("twProfileID", "twProfileID"),
    ("vf_source_state", "state"),
    ("vf_county_code", "county"),
    ("tsmart_census_id", "fips_code"),
    ("tsmart_dma", "dma"),
    ("voterbase_gender", "gender"),
    ("voterbase_race", "race"),
    ("voterbase_dob", "dob"),
    ("voterbase_age", "age"),
    ("tsmart_partisan_score", "party_score"),
    ("vf_party", "party_reg"),
    ("voterbase_registration_status", "registered"),
    ("tsmart_latitude", "latitude"),
    ("tsmart_longitude", "longitude"),
    ("tsmart_presidential_general_turnout_score", "turnout_score"),
])

# Read only the SCHEMA columns, switch to the short column names, keep the rows whose
# registration status equals "Registered", and index the result by twProfileID.
voter_file = (
    pd.read_csv(
        "/net/data-backedup/twitter-voters/TSmart-cleaner-Oct2017-rawFormat.tsv",
        header=0,
        delimiter='\t',
        usecols=SCHEMA.keys(),
        dtype=SCHEMA,
    )
    .rename(columns=NAMES)
    .loc[lambda frame: frame.registered == "Registered"]
    .set_index("twProfileID")
)


In [5]:
# Distinct index values (twProfileID) of the registered-voter table, for fast membership tests
voter_file_userids = {user_id for user_id in voter_file.index}

Filter follower lists to registered voters in the voter file

In [6]:
# Keep only the panel members that also appear among the registered voters
PANEL_MEMBERS = PANEL_MEMBERS & voter_file_userids
len(PANEL_MEMBERS)

1216842

In [17]:
### 1) keep only egos that are registered voters in the voter file (i.e. in PANEL_MEMBERS),
###    and keep only their followees that are in PANEL_MEMBERS too (duplicates removed)
el = {ego: list(set(followees).intersection(PANEL_MEMBERS))
      for ego, followees in el.items() if ego in PANEL_MEMBERS}
### 2) only now drop egos without any followee. This check has to come after step 1:
###    applied before it (as previously) it can never remove an ego whose followees were
###    all filtered out, and such egos then enter the graph with an empty followee list.
el = {ego: followees for ego, followees in el.items() if len(followees) > 0}

Build network and get basic descriptives

In [19]:
# Directed graph: every key of `el` is added as a node and every (ego, followee) pair
# as an edge pointing from the ego to the followee.
G = nx.DiGraph()
G.add_nodes_from(el)
G.add_edges_from((ego, followee) for ego, followees in el.items() for followee in followees)

In [20]:
# Count the nodes of G that have no incident edge at all
sum(1 for isolate in nx.isolates(G))

0

In [21]:
# Number of nodes in the follow graph
G.number_of_nodes()

1051258

In [22]:
# Number of directed edges in the follow graph
G.number_of_edges()

17547086

In [23]:
# Keep the per-user follow counts only for users that are nodes of the graph
nodes = set(G)
all_followers = dict((user, count) for user, count in all_followers.items() if user in nodes)

In [24]:
# Share of all recorded follow ties of the graph's users that are edges of the graph
all_followers_l = [*all_followers.values()]
share_used = G.number_of_edges() / sum(all_followers_l)
print("Proportion of all follower ties in network used:", share_used)

Proportion of all follower ties in network used: 0.03328561103928392


## Pre-process Voter file data

Age and Race

In [25]:
def adjust_age(row, ref_date=dt(2020, 11, 15), file_year=2017):
    """Return a voter's age in completed years on ``ref_date``.

    Parameters
    ----------
    row : object with ``dob`` and ``age`` attributes (e.g. a DataFrame row)
        ``dob`` is a date of birth in ``YYYYMMDD`` form (string or number);
        ``age`` is the age recorded in the voter file. Either may be missing.
    ref_date : datetime, default 2020-11-15
        Date on which the age is evaluated.
    file_year : int, default 2017
        Year the recorded ``age`` refers to; used only when ``dob`` is missing.

    Returns
    -------
    int or None
        Age computed from ``dob`` when available; otherwise the recorded ``age``
        shifted by ``ref_date.year - file_year``; ``None`` when both are missing.

    Raises
    ------
    AssertionError
        If ``dob`` does not have exactly 8 digits.
    ValueError
        If ``dob`` is not a valid ``YYYYMMDD`` calendar date.
    """
    _dob = row.dob
    age = row.age
    if _dob is not None and not pd.isna(_dob):
        dob = str(int(_dob))
        assert len(dob) == 8, "dob must be in YYYYMMDD form"
        born = dt.strptime(dob, "%Y%m%d")
        # Exact calendar arithmetic: subtract one year if the birthday has not yet been
        # reached in the reference year. Dividing elapsed days by 365.25 is only an
        # approximation and miscounts across non-leap century years such as 1900.
        birthday_reached = (ref_date.month, ref_date.day) >= (born.month, born.day)
        return ref_date.year - born.year - (0 if birthday_reached else 1)
    elif age is not None and not pd.isna(age):
        # No date of birth: age the recorded value by the years elapsed since the file.
        return int(age + (ref_date.year - file_year))
    return None

# Preserve the age recorded in the voter file exactly once, and always derive the adjusted
# age from that preserved copy, so that re-running this cell does not adjust an already
# adjusted age a second time.
if "age_old" not in voter_file.columns:
    voter_file["age_old"] = voter_file["age"]
voter_file["age"] = voter_file.assign(age=voter_file["age_old"]).apply(adjust_age, axis=1)

# Coarse age groups (left-closed bins). The last bin is open-ended to match its "65+"
# label; with the former upper edge of 100, ages of 100 and above were set to missing.
voter_file["binned_age"] = pd.cut(voter_file.age, [0, 18, 30, 50, 65, np.inf], right = False,
                                  labels = ["<18", "18-29", "30-49", "50-64", "65+"]).astype("string")

# Five-year age groups "21-25", "26-30", ..., "111-115" (left-closed bins);
# ages below 21 or from 116 upwards fall outside every bin and are left missing.
age_edges = [21 + x for x in range(0, 100, 5)]
bins = [str(lo) + "-" + str(hi - 1) for lo, hi in zip(age_edges[:-1], age_edges[1:])]
voter_file["binned_age_2"] = pd.cut(voter_file.age, age_edges, right = False,
                                  labels = bins).astype("string")


# Harmonise race labels: "Uncoded" -> "Unknown", "Native American" -> "Other"
voter_file["race"] = voter_file["race"].replace("Uncoded", "Unknown")
voter_file.loc[voter_file.race == "Native American", "race"] = "Other"



Partisanship variables

In [26]:
# Three-way party identification from the partisan score (left-closed bins):
# [0, 35) -> Republican, [35, 64.999) -> Independent, [64.999, 101) -> Democrat
voter_file["party_id"] = pd.cut(voter_file.party_score, [0,35,64.999,101], right = False, 
                                labels = ["Republican", "Independent", "Democrat"]).astype("string")

# Deciles of the partisan score, labelled "0-10", ..., "90-100".
deciles = np.linspace(0,1,11)
deciles_lab = [str(int(round(x,12)*100))+"-"+str(int(round(x,12)*100)+10) for x in deciles[0:(len(deciles)-1)]]
# With right=False every bin is [lo, hi), and the last edge is the sample maximum, so
# rows holding the maximum score would fall outside every bin and become missing.
# Nudging the last edge to the next representable float keeps them in the top decile.
decile_edges = np.array(voter_file.party_score.quantile(deciles), dtype = float)
decile_edges[-1] = np.nextafter(decile_edges[-1], np.inf)
voter_file["party_score_deciles"] = pd.cut(voter_file.party_score, 
                                       decile_edges,
                                       right = False, labels =deciles_lab,
                                      ).astype("string")

# Fixed 5-point bins "0-5", ..., "95-100". Same top-edge issue: a score of exactly 100
# must land in "95-100" rather than being dropped, so the last edge is nudged upwards
# (`lims` itself is left unchanged).
lims = np.linspace(0,100,21)
lims_lab = [str(int(x))+"-"+str(int(x)+5) for x in lims[0:(len(lims)-1)]]
lims_edges = lims.copy()
lims_edges[-1] = np.nextafter(lims_edges[-1], np.inf)
voter_file["party_score_bin"] = pd.cut(voter_file.party_score, 
                         lims_edges,
                         right = False,
                         labels = lims_lab
                        ).astype("string")


# Recode the registered party. Missing registrations are first turned into '' so that
# all comparisons below yield plain True/False.
voter_file["party_reg"] = voter_file.party_reg.fillna('')
tab = pd.crosstab(voter_file.state, voter_file.party_reg, dropna = False)

# States in which the voter file has at least one registered Republican and one Democrat
has_both_parties = (tab["Republican"] > 0) & (tab["Democrat"] > 0)
states_reg = set(tab.loc[has_both_parties].index)

reg_raw = voter_file.party_reg
in_reg_state = voter_file.state.fillna('').isin(states_reg)

# The masks are tried in this order; the first one that matches decides the label.
is_democrat = (reg_raw == "Democrat").astype(bool)
is_republican = (reg_raw == "Republican").astype(bool)
is_independent = (
    (reg_raw == "Independent")
    | (reg_raw == "No Party")
    | ((reg_raw == "Unaffiliated") & in_reg_state)
).astype(bool)
is_missing = (~in_reg_state | (reg_raw == '') | (reg_raw == "Unknown")).astype(bool)

voter_file["party_reg"] = np.select(
    [is_democrat, is_republican, is_independent, is_missing],
    ["Democrat", "Republican", "Independent", pd.NA],
    default = "Other"
)

Get pop density from census data

In [29]:
# Population per unit of land area for each GIDTR; rows with any missing value are dropped.
dens_df = pd.read_csv("/home/quint/twitter_network/Aux_Data/pdb2021trv3_us.csv", usecols = ["GIDTR", "LAND_AREA", "Tot_Population_ACS_15_19",
                                                       "URBANIZED_AREA_POP_CEN_2010","URBAN_CLUSTER_POP_CEN_2010","RURAL_POP_CEN_2010"],
                     dtype = {"GIDTR": "string", "LAND_AREA" : np.float64, "Tot_Population_ACS_15_19":"Int64",
                             "URBANIZED_AREA_POP_CEN_2010": "Int64", "URBAN_CLUSTER_POP_CEN_2010": "Int64", "RURAL_POP_CEN_2010":"Int64"},
                     encoding='latin1')
dens_df = dens_df.dropna()
dens_df["density"] = dens_df.Tot_Population_ACS_15_19 / dens_df.LAND_AREA 

# Look the density up by the first 11 characters of each voter's fips_code
# (presumably the census-tract identifier matching GIDTR - TODO confirm); no match -> NaN.
mapping = dens_df.set_index("GIDTR")["density"].to_dict()
voter_file["tract_density"] = voter_file["fips_code"].str[:11].apply(lambda x: mapping.get(x, np.nan))

deciles = np.linspace(0,1,11)
deciles_lab = [str(int(round(x,12)*100))+"-"+str(int(round(x,12)*100)+10) for x in deciles[0:(len(deciles)-1)]]

# With right=False every bin is [lo, hi), and the last edge is the sample maximum, so
# voters in the densest tract would fall outside every bin and become missing.
# Nudging the last edge to the next representable float keeps them in "90-100".
density_edges = np.array(voter_file["tract_density"].quantile(deciles), dtype = float)
density_edges[-1] = np.nextafter(density_edges[-1], np.inf)
voter_file["density_deciles"] = pd.cut(voter_file["tract_density"], 
                  density_edges,
                       right = False, labels =deciles_lab,
                      ).astype("string")

# Display the (un-nudged) decile boundaries
voter_file["tract_density"].quantile(deciles).values

array([0.00000000e+00, 1.11342586e+02, 4.24980764e+02, 1.00837655e+03,
       1.78039702e+03, 2.61269500e+03, 3.60747126e+03, 4.92961487e+03,
       6.97600000e+03, 1.30810811e+04, 5.52750000e+05])

Get urban rural classification

In [30]:
def _ruca_label(code):
    """Map a Classification_revised value to its urbanicity label."""
    if code < 4:
        return "metropolitan"
    if code < 7:
        return "micropolitan"
    if code < 99:
        return "small_town/rural"
    return "not_coded"


RUCA_df = pd.read_csv(
    "../Aux_Data/ruca2010standardheads.csv",
    usecols = ["FIPS", "Classification", "Classification_revised"],
    dtype = {"FIPS": "string", "Classification": "Int64", "Classification_revised": np.float64},
)
RUCA_df = RUCA_df.dropna()
RUCA_df["RUCA_urbanicity"] = RUCA_df["Classification_revised"].map(_ruca_label)

# Attach the label through the first 11 characters of each voter's fips_code; no match -> NaN
mapping3 = RUCA_df.set_index("FIPS")["RUCA_urbanicity"].to_dict()
tract_ids = voter_file["fips_code"].str[:11]
voter_file["ruca_urbanicity"] = tract_ids.apply(lambda x: mapping3.get(x, np.nan)).astype("category")


Get the number of election-related tweets per user

In [31]:
# Per-user tweet counts, indexed by userid so that they can be joined onto the voter file
pol_df = pd.read_csv(
    "/home/quint/twitter_network/Aux_Data/Political_users.tsv",
    sep = "\t",
    dtype = {"userid": "string", "Pol_tweets": "Int64"},
).set_index("userid")

# Left join on the index; voters without a row in pol_df get a count of 0
voter_file = voter_file.join(pol_df)
voter_file["Pol_tweets"] = voter_file["Pol_tweets"].fillna(0)

## Get descriptives of sample

In [32]:
# Percentage distribution (missing values included) of each categorical variable
for var in ("party_id", "party_reg", "gender", "binned_age", "ruca_urbanicity"):
    shares = voter_file[var].value_counts(normalize=True, dropna = False) * 100
    print(shares.round())

Democrat       53.0
Republican     34.0
Independent    14.0
Name: party_id, dtype: Float64
<NA>           43.0
Democrat       24.0
Independent    16.0
Republican     15.0
Other           1.0
Name: party_reg, dtype: float64
Female     54.0
Male       42.0
Unknown     3.0
Name: gender, dtype: Float64
30-49    44.0
18-29    29.0
50-64    19.0
65+       7.0
<NA>      1.0
Name: binned_age, dtype: Float64
metropolitan        88.0
micropolitan         7.0
small_town/rural     5.0
NaN                  0.0
not_coded            0.0
Name: ruca_urbanicity, dtype: float64


In [33]:
# Percentage distribution of race (missing values included), to one decimal place
(voter_file.race.value_counts(normalize=True, dropna = False) * 100).round(1)

Caucasian           83.0
African-American     7.4
Hispanic             4.6
Unknown              2.5
Asian                1.9
Other                0.6
Name: race, dtype: Float64

Add voter-file and geographic attributes to the network built above

### Set voter file data as node attributes

In [36]:
# Node attribute name -> voter-file values (indexed by twProfileID) to store under it.
# The entries are applied in the order listed.
node_attributes = {
    "party_id": voter_file["party_id"],
    "party_reg": voter_file["party_reg"],
    "party_score": voter_file["party_score"],
    "party_score_decile": voter_file["party_score_deciles"],
    "party_score_bin": voter_file["party_score_bin"],
    "lat": voter_file["latitude"],
    "lon": voter_file["longitude"],
    "reg": voter_file["registered"],
    "race": voter_file["race"],
    "age": voter_file["age"],
    "binned_age": voter_file["binned_age"],
    "binned_age_2": voter_file["binned_age_2"],
    "gender": voter_file["gender"],
    "state": voter_file["state"],
    "county": voter_file["county"],
    "fips_code": voter_file["fips_code"],
    "tract_density": voter_file["tract_density"],
    "density_deciles": voter_file["density_deciles"],
    "ruca_urbanicity": voter_file["ruca_urbanicity"],
    "fips": voter_file["fips_code"].str[:11],  # first 11 characters of the fips code
    "pol_tweets": voter_file["Pol_tweets"],
}
for attr_name, values in node_attributes.items():
    nx.set_node_attributes(G, values.to_dict(), attr_name)


Add distance between nodes as edge attribute

In [37]:
lat_d = nx.get_node_attributes(G,"lat")
lon_d = nx.get_node_attributes(G,"lon")

# (source, target) -> great-circle distance in km between the two users' coordinates,
# or None when any of the four coordinates is missing
attr = {}
for src, dst in tqdm(G.edges()):
    endpoints = (lat_d[src], lon_d[src], lat_d[dst], lon_d[dst])
    if any(pd.isna(value) for value in endpoints):
        attr[(src, dst)] = None
    else:
        attr[(src, dst)] = great_circle(endpoints[:2], endpoints[2:]).km

nx.set_edge_attributes(G, attr, "distance")

100%|██████████████████████████████████████████████████████████████████████████████| 17547086/17547086 [09:52<00:00, 29602.59it/s]


Add radiation pop as attribute (requires running Radiation_model first)

In [38]:
# Load the precomputed per-edge values and store them as the "radiation_pop" edge attribute.
# The file is opened in a `with` block so the handle is closed right after loading.
# NOTE(review): unpickling can execute arbitrary code - only load pickles you produced yourself.
with open("/home/quint/twitter_network/Pickles/radiation_pop.pk", "rb") as fin:
    tie_pop = pk.load(fin)
nx.set_edge_attributes(G, tie_pop, "radiation_pop")

### Save Network

In [39]:
# Write the directed graph to disk. The `with` block closes the file as soon as the dump
# finishes, so the data is flushed without relying on garbage collection of the handle.
with open("/home/quint/twitter_network/Pickles/follower_graph.pk", "wb") as fout:
    pk.dump(G, fout)

### Build reciprocal network and save it

In [41]:
# Share of the directed edges that are reciprocated
reciprocity = nx.overall_reciprocity(G)
print(reciprocity)

# Undirected graph that keeps an edge only where both directions exist in G,
# then drop the nodes left without any edge
H = G.to_undirected(reciprocal = True)
nodes_filter = [node for node, degree in H.degree() if degree == 0]
H.remove_nodes_from(nodes_filter)

0.4376111224393612


In [42]:
# Write the reciprocal graph to disk. The `with` block closes the file as soon as the dump
# finishes, so the data is flushed without relying on garbage collection of the handle.
with open("/home/quint/twitter_network/Pickles/follower_graph_reciprocal.pk", "wb") as fout:
    pk.dump(H, fout)