# Echo chambers

Find groups of users which often send out the same tweets

In [2]:
import os
import pandas as pd
import re
import sys

from IPython.display import clear_output

In [3]:
DATADIR = "../data/text/"

In [5]:
def squeal(text=None):
    clear_output(wait=True)
    if not text is None: print(text)

def remove_url_suffix(tweet):
    return(re.sub("\s*https?://.*$","",tweet,flags=re.IGNORECASE))

def remove_rt_prefix(tweet):
    return(re.sub("^RT[^:]*:\s*","",tweet))

def read_tweets(file_pattern):
    files = os.listdir(DATADIR)
    users_per_tweet = {}
    tweets_per_user = {}
    nbr_of_tweets = 0
    for file_name in files:
        if re.search(file_pattern,file_name):
            df = pd.read_csv(DATADIR+file_name)
            nbr_of_tweets += len(df)
            for i in range(0,len(df)):
                user = df.iloc[i]["user"]
                text = remove_rt_prefix(remove_url_suffix(df.iloc[i]["text"])).strip()
                if not text in users_per_tweet: users_per_tweet[text] = [user]
                elif not user in users_per_tweet[text]: users_per_tweet[text].append(user)
                if not user in tweets_per_user: tweets_per_user[user] = 1
                else: tweets_per_user[user] += 1
    return(users_per_tweet,tweets_per_user,nbr_of_tweets)

In [6]:
def get_user_pairs(users_per_tweet):
    user_pairs = {}
    for user_group in users_per_tweet.values():
        for i in range(1,len(user_group)):
            for j in range(i+1,len(user_group)):
                pair = " ".join([user_group[i],user_group[j]])
                if not pair in user_pairs: user_pairs[pair] = 1
                else: user_pairs[pair] += 1
    return(user_pairs)

In [7]:
def show_top_user_pairs(user_pairs):
    for pair in sorted(user_pairs.keys(),key=lambda p:user_pairs[p],reverse=True)[:20]:
        print(f"{user_pairs[pair]} {pair}")
    return()

In [8]:
THRESHOLD = 10

def make_user_groups(user_pairs):
    user_groups = {}
    for pair in user_pairs:
        if user_pairs[pair] >= THRESHOLD:
            user1,user2 = pair.split()
            if not user1 in user_groups and not user2 in user_groups:
                user_groups[user1] = (user1,user2)
                user_groups[user2] = (user1,user2)
            elif user1 in user_groups and not user2 in user_groups:
                user_groups[user1] = tuple(set(user_groups[user1]+(user2,)))
                for e in user_groups[user1]: user_groups[e] = user_groups[user1]
                user_groups[user2] = user_groups[user1]
            elif not user1 in user_groups and user2 in user_groups:
                user_groups[user2] = tuple(set(user_groups[user2]+(user1,)))
                for e in user_groups[user2]: user_groups[e] = user_groups[user2]
                user_groups[user1] = user_groups[user2]
            else:
                user_groups[user1] = tuple(set(user_groups[user1]+user_groups[user2]))
                for e in user_groups[user1]: user_groups[e] = user_groups[user1]
                for e in user_groups[user2]: user_groups[e] = user_groups[user1]
    return(user_groups)

In [9]:
def show_user_groups(user_groups):
    seen = {}
    for user in user_groups:
        if not user in seen:
            print(len(user_groups[user]),[u for u in sorted(user_groups[user],key=lambda u:u.lower())])
            for u in user_groups[user]: seen[u] = True

In [10]:
def get_largest_group(user_groups):
    largest_group = []
    for user in user_groups:
        if len(user_groups[user]) > len(largest_group): 
            largest_group = user_groups[user]
    return(largest_group)

def compute_overlap(group1,group2):
    return(len(set(group1) & set(group2)))

def get_nbr_of_tweets(user_group,tweets_per_user):
    nbr_of_tweets = 0
    for user in user_group: nbr_of_tweets += tweets_per_user[user]
    return(nbr_of_tweets)

In [None]:
MONTH = "202010"

previous_user_group = []
for day in range(1,32):
    date = MONTH+str(day).zfill(2)
    users_per_tweet,tweets_per_user,nbr_of_tweets = read_tweets(date)
    user_pairs = get_user_pairs(users_per_tweet)
    user_groups = make_user_groups(user_pairs)
    largest_user_group = get_largest_group(user_groups)
    largest_user_group_tweet_count = get_nbr_of_tweets(largest_user_group,tweets_per_user)
    if len(previous_user_group) > 0:
        print(date,len(largest_user_group),compute_overlap(previous_user_group,largest_user_group),
              largest_user_group_tweet_count,nbr_of_tweets)
    previous_user_group = largest_user_group

20201002 393 245 38794 4660
20201003 418 225 38764 40105
20201004 383 233 38516 38335
20201005 480 258 49555 46327
20201006 356 243 40521 44316