In [1]:
import numpy as np
import pandas as pd
import json
import gzip
import glob
import os
import pandas as pd
import stat_utils as su

In [2]:
# Root data directory (relative path).
path = '../../data'

# Per-platform sub-directories underneath the data root.
twitter_path = path + '/Twitter'
reddit_path = path + '/Reddit'

# Topic labels; the per-topic loops below iterate them in this order.
topics = ['midterms', 'russia', 'childcare']

# Data for table A3: Initial data collection and conversation retrieval

Note that this notebook only goes through the Reddit numbers, as the Twitter numbers in this table can be found in `code/data_cleaning/twitter_sample_convos.ipynb`.

# Reddit

### Reddit seed comments

In [3]:
# Load the Reddit seed comments (tab-separated; the first column is the index).
df = pd.read_csv(f'{reddit_path}/seed_comments.txt', sep='\t', index_col=0)

# Lookup from keyword to topic, supplied by the project's stat_utils helper.
term_to_topic = su.load_keywords(verbose=False)

# Label every seed comment with the topic of its keyword. Plain indexing means
# an unknown keyword raises a lookup error instead of yielding a missing topic.
df['topic'] = df['keyword'].apply(lambda kw: term_to_topic[kw])

# Tally all topics in a single pass rather than rebuilding a Python list from
# the whole column once per topic.
seed_topic_counts = df['topic'].value_counts()

r_total = 0

print('Estimated topics is seed posts:')
for topic in topics:
    # .get(..., 0) keeps a topic with no seed comments at zero.
    count = int(seed_topic_counts.get(topic, 0))
    r_total += count
    print(f'  {topic}: {count}')

print(f'{r_total} seed Reddit comments')

Estimated topics is seed posts:
  midterms: 239275
  russia: 199917
  childcare: 284146
723338 seed Reddit comments


## Reddit Searched

In [5]:
# Threads that were searched but could not be retrieved, read into a single
# `thread_id` column (the file has no header row).
unretriviable_file = reddit_path + '/unretrievable_threads.txt'

unfound = pd.read_csv(
    unretriviable_file,
    sep='\t',
    names=['thread_id'],
)

# Recorded run: 23510
print(f'{unfound.shape[0]} Reddit conversations searched but not retrieved')

23510 Reddit conversations searched but not retrieved


In [6]:
# Directory of retrieved Reddit conversations; ids are taken from file names.
reddit_convo_path = f'{reddit_path}/conversation_tables'

# The conversation id is the file name up to its first '.'.
# os.path.basename is used instead of splitting on '/' so the id is still
# extracted correctly when glob returns OS-specific path separators.
found = {
    os.path.basename(filename).split('.')[0]
    for filename in glob.glob(f'{reddit_convo_path}/*')
}

# 118309
print(f'{len(found)} reddit conversations retrieved')

118309 reddit conversations retrieved


In [7]:
# Every searched thread is either retrieved (`found`) or listed as
# unretrievable (`unfound`); the union of both id sets is the searched set.
unfound_ids = set(unfound['thread_id'])
r_searched = found | unfound_ids

# Recorded run: 137274
print(f'{len(r_searched)} reddit conversations searched')

137274 reddit conversations searched


In [8]:
# Keep seed comments whose thread was searched, one row per thread.
# drop_duplicates keeps the first row of each thread, so a thread is counted
# under the topic of its first seed comment in the frame.
sub = df[df['thread_id'].isin(r_searched)].drop_duplicates(subset='thread_id')

# Count topics once per frame instead of converting each column to a Python
# list twice per topic.
searched_by_topic = sub['topic'].value_counts()
seeds_by_topic = df['topic'].value_counts()

print('Reddit conversations searched')
for topic in topics:
    search_count = int(searched_by_topic.get(topic, 0))
    seed_count = int(seeds_by_topic.get(topic, 0))

    # Share of this topic's seed comments whose thread was searched.
    print(f'   {topic}: {search_count} ({search_count/seed_count:.2%} of seeds)')

print(f'{len(r_searched)} total converations searched')
# Denominator: number of distinct threads among all seed comments.
print(f'{len(r_searched)/len(set(df.thread_id)):.2%} of seed Reddit conversations searched')

Reddit conversations searched
   midterms: 47661 (19.92% of seeds)
   russia: 18243 (9.13% of seeds)
   childcare: 71369 (25.12% of seeds)
137274 total converations searched
39.25% of seed Reddit conversations searched


## Reddit Retrieved

In [9]:
# Keep seed comments whose thread was retrieved, one row per thread
# (first seed comment of each thread decides its topic).
sub2 = df[df['thread_id'].isin(found)].drop_duplicates(subset='thread_id')

r_found = len(found)

# `sub` is the searched-thread frame built in the "Reddit Searched" cell.
# Count topics once per frame instead of building a list per topic.
searched_topic_counts = sub['topic'].value_counts()
retrieved_topic_counts = sub2['topic'].value_counts()

print('Reddit conversations retrieved')
for topic in topics:
    search_count = int(searched_topic_counts.get(topic, 0))
    found_count = int(retrieved_topic_counts.get(topic, 0))

    # Share of this topic's searched threads that were actually retrieved.
    print(f'   {topic}: {found_count} ({found_count/search_count:.2%} of searched)')


print(f'{r_found} total Reddit converations retrieved')
print(f'{r_found/len(r_searched):.2%} of searched Reddit conversations retrieved')

Reddit conversations retrieved
   midterms: 38054 (79.84% of searched)
   russia: 14143 (77.53% of searched)
   childcare: 66111 (92.63% of searched)
118309 total Reddit converations retrieved
86.18% of searched Reddit conversations retrieved


# Twitter

### Twitter seed posts

In [10]:
seed_path = f'{twitter_path}/seed_comments'

# Running tallies. Every topic starts at zero so the report below still works
# when the directory contains no files.
counts = {'total': 0}
for topic in topics:
    counts[topic] = 0

# Each file is read into `seed_df` (not `df`) so the Reddit seed frame loaded
# earlier in the notebook is not overwritten by this loop.
for filename in glob.glob(f'{seed_path}/*'):
    seed_df = pd.read_csv(filename, sep='\t')
    counts['total'] += len(seed_df)

    for topic in topics:
        # Literal substring match: a row counts toward every topic whose name
        # appears in its `topic` field, so per-topic counts may sum to more
        # than the total. Rows with a missing topic count toward no topic.
        mentions_topic = seed_df['topic'].str.contains(topic, regex=False, na=False)
        counts[topic] += int(mentions_topic.sum())

print('Estimated topics is seed posts:')

for topic in topics:
    print(f'  {topic}: {counts[topic]}')

print(f'{counts["total"]} seed Twitter comments')

Estimated topics is seed posts:
  midterms: 5778350
  russia: 13868950
  childcare: 796982
19364162 seed Twitter comments


In [12]:
# Reddit seed comments plus Twitter seed comments.
all_seed_posts = r_total + counts['total']

print('Total seed posts across platforms')
print(all_seed_posts)

Total seed posts across platforms
20087500


In [13]:
# Keyword-based topic assignment of the Twitter seed conversations.
# The JSON file maps each topic to a list of conversation ids:
#   {topic: [convo_ids]}
topic_convos_file = twitter_path + '/twitter_seed_topic_convo.json'

with open(topic_convos_file, 'r') as fp:
    raw_json = fp.read()

topic_convos = json.loads(raw_json)

## Twitter searched

In [14]:
# Conversations that were searched but produced no conversation.
no_convo_file = twitter_path + '/no_convos.txt'

# Ids are read as strings into a single `thread_id` column, then reduced to a
# set of unique ids.
no_convo = set(
    pd.read_csv(
        no_convo_file,
        sep='\t',
        names=['thread_id'],
        dtype='str',
    )['thread_id']
)

# Recorded run: 1362875
print(f'{len(no_convo)} Twitter conversations searched but no conversation resulted')

1362875 Twitter conversations searched but no conversation resulted


In [15]:
# Directory of retrieved Twitter conversations; ids are taken from file names.
twitter_convo_path = f'{twitter_path}/conversation_tables'

# The conversation id is the file name up to its first '.'.
# os.path.basename is used instead of splitting on '/' so the id is still
# extracted correctly when glob returns OS-specific path separators.
collected = {
    os.path.basename(filename).split('.')[0]
    for filename in glob.glob(f'{twitter_convo_path}/*')
}

In [16]:
# A Twitter conversation was searched if it either returned nothing
# (`no_convo`) or was collected.
searched = no_convo | collected
n_searched = len(searched)

# NOTE(review): `total` sums the per-topic intersections, so a conversation
# listed under several topics is counted once per topic, and searched ids that
# belong to no topic are not counted; it need not equal len(searched).
total = 0

for topic, cids in topic_convos.items():
    topic_searched = searched.intersection(cids)
    n_topic = len(topic_searched)
    total = total + n_topic

    # Percentage is relative to all searched conversations.
    print(f'  {topic}: {n_topic} ({n_topic/n_searched:.2%}) conversations')

print(f'{total} Twitter conversations searched')

  midterms: 196894 (13.33%) conversations
  russia: 636112 (43.07%) conversations
  childcare: 28809 (1.95%) conversations
861815 Twitter conversations searched


In [17]:
# Reddit searched threads plus the per-topic Twitter searched total.
# Recorded run: 999089
searched_all_platforms = len(r_searched) + total

print('Total conversations searched across platforms')
print(searched_all_platforms)

Total conversations searched across platforms
999089


## Twitter retrieved

In [18]:
# Per-topic count of collected Twitter conversations. As with the searched
# total, `t_found` is the sum of the per-topic intersections.
t_found = 0
n_collected = len(collected)

for topic, cids in topic_convos.items():
    topic_collected = collected.intersection(cids)
    n_topic = len(topic_collected)
    t_found = t_found + n_topic

    # Percentage is relative to all collected conversations.
    print(f'  {topic}: {n_topic} ({n_topic/n_collected:.2%}) conversations')

print(f'{t_found} Twitter conversations retrieved')
# `total` is the per-topic searched total computed in the "Twitter searched" cell.
print(f'{t_found/total:.2%} of searched Twitter conversations retrieved')

  midterms: 23972 (21.03%) conversations
  russia: 51720 (45.38%) conversations
  childcare: 3086 (2.71%) conversations
78778 Twitter conversations retrieved
9.14% of searched Twitter conversations retrieved


In [19]:
# Retrieved Reddit conversations plus retrieved Twitter conversations.
# Recorded run: 197087
retrieved_all_platforms = r_found + t_found

print('Total conversations retrieved across platforms')
print(retrieved_all_platforms)

Total conversations retrieved across platforms
197087


# Handcoded posts

In [20]:
# Hand-coded (labeled) posts live directly under the data root.
labeled_corpus_file = path + '/labeled_corpus.txt'

In [21]:
# Read the labeled corpus with the post id, topic label and text forced to str.
# Note: this rebinds `df`, which previously held seed-comment data.
coded_dtypes = {'pid': str, 'final_topic': str, 'clean_text': str}
df = pd.read_csv(labeled_corpus_file, sep='\t', dtype=coded_dtypes)

# Recorded run: 508695
n_coded = len(df)
print(f'{n_coded} posts coded')

508695 posts coded


### Reddit

In [23]:
# Tab-separated file with the text of all Reddit posts.
reddit_corpus_file = reddit_path + '/all_reddit_text.txt'

In [40]:
# Pin the id columns to str, matching how the Twitter corpus and the labeled
# corpus are read. The labeled corpus has `pid` as str and is merged with this
# frame on `pid`; letting pandas infer the type over a file this large could
# yield numeric or mixed-type ids that fail to match.
reddit = pd.read_csv(reddit_corpus_file, sep='\t',
                     dtype={'pid': str, 'conversation_id': str})
print(len(reddit))

32786519


In [26]:
# Attach each hand-coded post to its Reddit conversation. The inner join keeps
# only coded posts whose pid appears in the Reddit corpus.
coded_topics = df[['pid', 'final_topic']]
reddit_convo_ids = reddit[['pid', 'conversation_id']]
reddit_coded = pd.merge(coded_topics, reddit_convo_ids, on='pid', how='inner')

In [27]:
# Series.unique() yields topics in order of first appearance, so the report is
# printed in the same order on every run (iterating a set of strings is not
# order-stable across interpreter sessions).
for topic in reddit_coded['final_topic'].unique():
    # Named `topic_rows` rather than `sub` so the searched-thread frame used by
    # the "Reddit Retrieved" cell is not overwritten.
    topic_rows = reddit_coded[reddit_coded['final_topic'] == topic]

    print(f'****{topic} (Reddit) ****')
    print(f'  {len(topic_rows)} total comments')
    print(f'  {len(set(topic_rows.conversation_id))} total conversations')

****US midterm elections (Reddit) ****
  690 total comments
  23 total conversations
****Childcare/parenting (Reddit) ****
  225 total comments
  23 total conversations
****Russia/Ukraine war (Reddit) ****
  1860 total comments
  38 total conversations
****Off_topic (Reddit) ****
  36953 total comments
  293 total conversations


### Twitter

In [41]:
# Tab-separated file with the text of all Twitter posts.
twitter_corpus_file = twitter_path + '/all_twitter_text.txt'

# Post and conversation ids are read as strings.
twitter_id_dtypes = {'pid': str, 'conversation_id': str}
twitter = pd.read_csv(twitter_corpus_file, sep='\t', dtype=twitter_id_dtypes)

# Recorded run: 21962965
print(twitter.shape[0])

21962965


In [29]:
# Attach each hand-coded post to its Twitter conversation. The inner join keeps
# only coded posts whose pid appears in the Twitter corpus.
coded_topics = df[['pid', 'final_topic']]
twitter_convo_ids = twitter[['pid', 'conversation_id']]
twitter_coded = pd.merge(coded_topics, twitter_convo_ids, on='pid', how='inner')

In [30]:
# Series.unique() yields topics in order of first appearance, so the report is
# printed in the same order on every run (iterating a set of strings is not
# order-stable across interpreter sessions).
for topic in twitter_coded['final_topic'].unique():
    # Named `topic_rows` rather than `sub` so the searched-thread frame used by
    # the "Reddit Retrieved" cell is not overwritten.
    topic_rows = twitter_coded[twitter_coded['final_topic'] == topic]

    print(f'****{topic} (Twitter) ****')
    print(f'  {len(topic_rows)} total comments')
    print(f'  {len(set(topic_rows.conversation_id))} total conversations')

****US midterm elections (Twitter) ****
  12506 total comments
  20 total conversations
****Childcare/parenting (Twitter) ****
  3136 total comments
  23 total conversations
****Russia/Ukraine war (Twitter) ****
  22189 total comments
  15 total conversations
****Off_topic (Twitter) ****
  431152 total comments
  207 total conversations
