In [103]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
import seaborn as sns
from scipy.stats import gaussian_kde
from scipy.integrate import quad
from matplotlib.patches import Patch

# Location of the tweets CSV on the original author's machine.
# Point this at your own copy of the file before running.
csv_path = r"C:\Users\justi\OneDrive\Desktop\BU RISE\github\data_twitter\clean_tweets_directed.csv"
df = pd.read_csv(csv_path, sep=",")

# Row count of the loaded frame (displayed as the cell output).
len(df)

3655957

In [104]:
# Per-user edge tally: number of rows where the user is the sender plus the
# number of rows where the user is the recipient. Users present on only one
# side get 0 for the other side via fill_value.
from_counts = df['From-User-Id'].value_counts()
to_counts = df['To-User-Id'].value_counts()
total_edge_counts = from_counts.add(to_counts, fill_value=0).astype(int)

# Attach the sender's total edge count to every row.
df['User_Edge_Count'] = df['From-User-Id'].map(total_edge_counts)

# "Big influencers" are the users whose edge count is strictly above the
# 95th-percentile edge count.
edge_threshold = total_edge_counts.quantile(0.95)
top_percentile_users = total_edge_counts.loc[total_edge_counts.gt(edge_threshold)]
big_influencers = set(top_percentile_users.index)

print(f"Number of users in top: {len(top_percentile_users)}")
print(f"Top percentile edge threshold: {edge_threshold}")

# Row-level flags: is the sender / the recipient a big influencer?
for id_col, flag_col in (('From-User-Id', 'is_big_influencer'),
                         ('To-User-Id', 'to_big_influencer')):
    df[flag_col] = df[id_col].isin(top_percentile_users.index)

Number of users in top: 87706
Top percentile edge threshold: 10.0


In [105]:
# Bridging users = users that identify with an ideology, and who interact
# both with users that align with their own ideology and with users who don't.

df['PartyName'] = df['PartyName'].str.title()

# One affiliation per sender: the most frequent Implied_Political_Affiliation
# across that sender's rows (the first value returned by mode() on ties).
affil_map = (
    df.groupby('From-User-Id')['Implied_Political_Affiliation']
      .agg(lambda labels: labels.mode().iloc[0])
)

# Recipient's affiliation, looked up in the sender-derived map. Recipients
# with no entry in affil_map get NaN here.
df['To-Ideology'] = df['To-User-Id'].map(affil_map)

# Keep only rows whose recipient is labelled Democrat or Republican.
partisan_recipient = df['To-Ideology'].isin(['Democrat', 'Republican'])
df_filtered = df.loc[partisan_recipient]

# Sender x recipient-ideology table; each cell is the number of rows with a
# non-null Score for that pair, 0 where there are none.
interactions = pd.pivot_table(
    df_filtered,
    index='From-User-Id',
    columns='To-Ideology',
    values='Score',
    aggfunc='count',
    fill_value=0,
)

# Sender's own affiliation alongside the counts.
interactions['User_Affil'] = affil_map.loc[interactions.index]

# Minimum cross-party share required of big influencers (adjustable).
bridge_percent_threshold = 0.10

def has_enough_opposite_interactions(row, influencers=None, threshold=None):
    """Decide whether one user's interaction counts qualify them as bridging.

    Parameters
    ----------
    row : pandas.Series
        One row of the interaction table, as passed by
        ``DataFrame.apply(..., axis=1)``. The keys 'Democrat' and
        'Republican' are read with a default of 0, 'User_Affil' must be
        present, and ``row.name`` is used as the user id.
    influencers : container of user ids, optional
        Users treated as big influencers. When omitted, the notebook-level
        ``big_influencers`` is used, so existing ``apply`` calls keep working.
    threshold : float, optional
        Minimum cross-party share required of a big influencer. When
        omitted, the notebook-level ``bridge_percent_threshold`` is used.

    Returns
    -------
    bool
        False when the user has no Democrat/Republican interactions or an
        affiliation other than 'Democrat' / 'Republican'. Otherwise:
        big influencers need ``cross / total >= threshold``; everyone else
        needs at least one cross-party interaction.
    """
    dem_count = row.get('Democrat', 0)
    rep_count = row.get('Republican', 0)
    total_interactions = dem_count + rep_count
    if total_interactions == 0:
        return False  # no ideological interactions at all

    # The "opposite" side depends on the user's own affiliation.
    affil = row['User_Affil']
    if affil == 'Democrat':
        cross_count = rep_count
    elif affil == 'Republican':
        cross_count = dem_count
    else:
        return False  # skip inconclusive users

    # Fall back to the notebook globals only when the caller did not supply
    # explicit values; this keeps the function usable outside the notebook
    # and immune to later re-assignment of those globals.
    if influencers is None:
        influencers = big_influencers
    if threshold is None:
        threshold = bridge_percent_threshold

    if row.name in influencers:
        # Big influencers are held to a proportional rule.
        return cross_count / total_interactions >= threshold
    # Everyone else: a single cross-party interaction is enough.
    return cross_count >= 1

# Candidates must have at least one interaction with each side.
bridging_mask = interactions['Democrat'].gt(0) & interactions['Republican'].gt(0)

# Apply the per-user rule to the candidates only; work on a copy so the
# added 'Valid' column does not touch the interaction table.
valid_bridging = interactions.loc[bridging_mask].copy()
valid_bridging['Valid'] = valid_bridging.apply(has_enough_opposite_interactions, axis=1)

bridging_users = valid_bridging.loc[valid_bridging['Valid']].index.tolist()

# Row-level flags: is the sender / the recipient a bridging user?
for id_col, flag_col in (('From-User-Id', 'Is_Bridging_User'),
                         ('To-User-Id', 'Is_Recipient_Bridging')):
    df[flag_col] = df[id_col].isin(bridging_users)

print(f"Number of valid bridging users: {len(bridging_users)}")

Number of valid bridging users: 143651


In [106]:
# Distinct senders whose rows are flagged as both bridging and big influencer.
sender_is_both = df['Is_Bridging_User'] & df['is_big_influencer']
both_users = df.loc[sender_is_both, 'From-User-Id'].unique()
print(f"Number of unique users who are both: {len(both_users)}")

Number of unique users who are both: 53572


In [107]:
# Sets of unique user ids: users above the edge threshold, users found to be
# bridging, and the users that belong to both groups.
big_influencers = set(top_percentile_users.index)
bridgers = set(bridging_users)
both_big_and_bridging = big_influencers.intersection(bridgers)

# Share of big influencers that are also bridging users.
rate_big_infl_bridging = len(both_big_and_bridging) / len(big_influencers)
print(
    f"{len(both_big_and_bridging):,} of {len(big_influencers):,} "
    f"big influencers are bridging users "
    f"({rate_big_infl_bridging:.2%})."
)


53,572 of 87,706 big influencers are bridging users (61.08%).


In [108]:
# Every distinct sender, minus the big influencers.
# NOTE(review): all_users is built from senders only, whereas big_influencers
# is drawn from total_edge_counts (senders and recipients). The two rates
# printed below therefore use different base populations -- confirm that
# this is intended.
all_users = set(df['From-User-Id'].unique())
non_big = all_users.difference(big_influencers)

# Bridging rate among non-big users.
rate_non_big_bridging = len(non_big.intersection(bridgers)) / len(non_big)

print(f"Bridging rate   – big  influencers: {rate_big_infl_bridging:.2%}")
print(f"Bridging rate   – non‑big users   : {rate_non_big_bridging:.2%}")


Bridging rate   – big  influencers: 61.08%
Bridging rate   – non‑big users   : 6.56%


In [109]:
# Rebuild the two populations over every user that has at least one edge.
big_influencers = set(top_percentile_users.index)
all_users = set(total_edge_counts.index)
non_big_users = all_users.difference(big_influencers)

# Big influencers: edge counts, then mean and median.
big_user_edges = total_edge_counts.loc[list(big_influencers)]
avg_edges_big = big_user_edges.mean()
median_edges_big = big_user_edges.median()

# Everyone else: same statistics.
non_big_user_edges = total_edge_counts.loc[list(non_big_users)]
avg_edges_non_big = non_big_user_edges.mean()
median_edges_non_big = non_big_user_edges.median()

# Report means first, then medians.
print(f"Average edge count – big influencers   : {avg_edges_big:.2f}")
print(f"Average edge count – non‑big users     : {avg_edges_non_big:.2f}")

print(f"Median edge count  - big influencers   : {median_edges_big:.2f}")
print(f"Median edge count  - non-big users     : {median_edges_non_big:.2f}")

Average edge count – big influencers   : 43.88
Average edge count – non‑big users     : 2.04
Median edge count  - big influencers   : 17.00
Median edge count  - non-big users     : 1.00
