In [47]:
import numpy as np
import json
import gzip
import glob
import os
import random
import pandas as pd

# Root of the data directory, given as a relative path (resolved against the
# working directory the notebook is run from).
path = '../../../../data'
# Every Twitter input and output path used below is built from this folder.
twitter_path= f'{path}/twitter'

# Examine topic distribution in seed tweets

In [9]:
seed_files = f'{twitter_path}/raw/seed_summaries'

# Aggregates accumulated over every seed-summary file:
#   topic_counts : topic -> number of seed tweets tagged with that topic
#   tweet_count  : total number of rows (tweets) read
#   topic_convos : topic -> set of conversation ids with a tweet on that topic
#   convo_topics : conversation id -> set of topics seen in that conversation
topic_counts = dict()
tweet_count = 0
topic_convos = dict()
convo_topics = dict()

for filename in glob.glob(f'{seed_files}/*'):
    # ids are read as strings so they are not coerced to numbers
    df = pd.read_csv(filename,
                     sep='\t',
                     dtype={'tweet_id':str,
                            'conversation_id':str})

    # conversations
    # A row with no topic is parsed as NaN, which cannot be split; leave such
    # rows out of the topic attribution (value_counts below skips them too).
    tagged = df.dropna(subset=['topic'])

    # Walk the two columns directly: iterrows() builds a Series per row and
    # is far too slow for files with millions of tweets.
    for topic_field, cid in zip(tagged['topic'], tagged['conversation_id']):
        # a tweet may carry several topics separated by '|'
        for topic in topic_field.split('|'):
            topic = topic.strip()

            # add to topic_convos
            topic_convos.setdefault(topic, set()).add(cid)

            # add to convo_topics
            convo_topics.setdefault(cid, set()).add(topic)

    # tweets
    tweet_count += len(df)

    # Count each distinct topic string once, then credit every topic it names.
    for key, value in df['topic'].value_counts().items():
        for term in key.split('|'):
            term = term.strip()
            topic_counts[term] = topic_counts.get(term, 0) + int(value)

# Tweets
# Shares can add up to more than 100% because one tweet may have several topics.
print(f'{tweet_count} total seed tweets.')

for topic, count in topic_counts.items():
    print(f'  {topic}: {count} ({count/tweet_count:.2%}) tweets') 
    
# Conversations
total_convos = len(convo_topics)
print(f'\n{total_convos} unique conversation:')

for topic, cids in topic_convos.items():
    print(f'  {topic}: {len(cids)} ({len(cids)/total_convos :.2%}) conversations')

19364162 total seed tweets.
  russia: 13868950 (71.62%) tweets
  midterms: 5778350 (29.84%) tweets
  childcare: 796982 (4.12%) tweets

17394543 unique conversation:
  midterms: 5315288 (30.56%) conversations
  russia: 12318476 (70.82%) conversations
  childcare: 764480 (4.39%) conversations


In [10]:
# Persist both lookup tables as JSON. The set values are turned into lists
# first because JSON has no set type.

# --- topic -> conversation ids ---
outfile = f'{twitter_path}/reference/twitter_seed_topic_convo.json'
topic_convos_json = {topic: list(cids) for topic, cids in topic_convos.items()}

print('Writing to topic-to-convo dict to file...')
with open(outfile, 'w') as fp:
    fp.write(json.dumps(topic_convos_json))
print('\tTopic-to-convo dict written to file')

# --- conversation id -> topics ---
outfile = f'{twitter_path}/reference/twitter_seed_convo_topic.json'
convos_topic_json = {cid: list(topics) for cid, topics in convo_topics.items()}

print('Writing convo-to-topic dict to file...')
with open(outfile, 'w') as fp:
    fp.write(json.dumps(convos_topic_json))
print('\tConvo-to-topic dict written to file')

Writing to topic-to-convo dict to file...
	Topic-to-convo dict written to file
Writing convo-to-topic dict to file...
	Convo-to-topic dict written to file


# Check conversations searched with errors

In [19]:
no_convo_path = f'{twitter_path}/raw/conversation_errors'
no_convos_file = f'{twitter_path}/reference/no_convos.txt'

# Files that could not be read or parsed. They are still skipped (best effort),
# but recorded and reported below instead of being swallowed silently.
failed_files = []

# Append mode: ids written by earlier runs are kept. Duplicates are harmless
# because the file is read back into a set.
with open(no_convos_file, 'a') as out_fp:
    for filename in glob.glob(f'{no_convo_path}/*'):
        # the conversation id is the part of the file name before the first '_'
        convo_id = os.path.basename(filename).split('_')[0]

        try:
            with gzip.open(filename, 'r') as fp:
                data = json.loads(fp.read())

            # a response whose meta block reports zero results is recorded
            # as a conversation id that returned nothing
            if 'meta' in data and data['meta']['result_count'] == 0:
                out_fp.write(convo_id + '\n')

        except Exception as err:
            # Exception (not a bare except) so that Ctrl-C still interrupts the cell
            failed_files.append((filename, repr(err)))

if failed_files:
    print(f'{len(failed_files)} error files could not be processed; first few:')
    for failed_name, failure in failed_files[:5]:
        print(f'  {failed_name}: {failure}')

In [21]:
# Load the ids whose search returned no conversation (one id per line).
with open(no_convos_file, 'r') as fp:
    no_convos = {line.strip() for line in fp}

print(f'{len(no_convos)} conversation ids returned no conversations')

# check distribution by topic
# (each share is relative to all empty searches, not to the topic's own size)
for topic, cids in topic_convos.items():
    topic_no_convo = no_convos & cids
    n_hits = len(topic_no_convo)
    share = n_hits / len(no_convos)

    print(f'  {topic}: {n_hits} ({share:.2%}) conversations')

1362875 conversation ids returned no conversations
  midterms: 172929 (12.69%) conversations
  russia: 584404 (42.88%) conversations
  childcare: 25726 (1.89%) conversations


# Check collected conversations

In [34]:
collected_convo_path = f'{twitter_path}/conversation_tables'

# One file per collected conversation; the conversation id is the file name
# up to its first '.'.
collected = {
    table_path.split('/')[-1].partition('.')[0]
    for table_path in glob.glob(f'{collected_convo_path}/*')
}

print(f'{len(collected)} conversations collected.')

# check distribution by topic
# (each share is relative to all collected conversations)
for topic, cids in topic_convos.items():
    topic_collected = collected & cids
    n_hits = len(topic_collected)
    share = n_hits / len(collected)

    print(f'  {topic}: {n_hits} ({share:.2%}) conversations')

113966 conversations collected.
  midterms: 23972 (21.03%) conversations
  russia: 51720 (45.38%) conversations
  childcare: 3086 (2.71%) conversations


# Full breakdown of search by topic

In [43]:
# topic -> list of conversation ids that have not been searched yet
remaining_dict = dict()

print(f'{total_convos} conversation in seed tweets.\n')

for topic, topic_cids in topic_convos.items():
    n = len(topic_cids)

    # Split this topic's conversations by search outcome.
    with_data = topic_cids & collected        # searched, conversation found
    without_data = topic_cids & no_convos     # searched, nothing returned
    searched = with_data | without_data       # searched at all
    unsearched = topic_cids - searched        # still to be searched

    print(f'**** {topic} ****')
    print(f'   Total conversations: {n}')
    print(f'   Found with data {len(with_data)} ({len(with_data)/n:.2%})')
    print(f'   Found with no data {len(without_data)} ({len(without_data)/n:.2%})')
    print(f'   {len(searched)} ({len(searched)/n:.2%}) conversations already searched.')
    print(f'   {len(unsearched)} ({len(unsearched)/n:.2%}) conversations remaining to be searched for topic {topic}')

    # save ids to search (as a list so the dict can be dumped to JSON)
    remaining_dict[topic] = list(unsearched)

17394543 conversation in seed tweets.

**** midterms ****
   Total conversations: 5315288
   Found with data 23972 (0.45%)
   Found with no data 172929 (3.25%)
   196894 (3.70%) conversations already searched.
   5118394 (96.30%) conversations remaining to be searched for topic midterms
**** russia ****
   Total conversations: 12318476
   Found with data 51720 (0.42%)
   Found with no data 584404 (4.74%)
   636112 (5.16%) conversations already searched.
   11682364 (94.84%) conversations remaining to be searched for topic russia
**** childcare ****
   Total conversations: 764480
   Found with data 3086 (0.40%)
   Found with no data 25726 (3.37%)
   28809 (3.77%) conversations already searched.
   735671 (96.23%) conversations remaining to be searched for topic childcare


In [45]:
# Dump the topic -> unsearched-conversation-ids mapping as JSON.
not_searched_path = f'{twitter_path}/reference/convos_not_searched.json'

with open(not_searched_path, 'w') as fp:
    fp.write(json.dumps(remaining_dict))

print('Conversation ids which have not been searched written to file')

Conversation ids which have not been searched written to file


# Sample additional conversations to search

In [50]:
# Share of each topic's conversations that should have been searched in total
# once the additional sample is done.
SAMPLE_FRACTION = 0.05
# Fixed seed so that re-running the cell draws the same conversation ids.
SAMPLE_SEED = 42
rng = random.Random(SAMPLE_SEED)

# ids selected for searching, pooled over all topics
search = set()

for topic, convo_list in topic_convos.items():
    
    # ideal final sample size
    final_sample_size = int(len(convo_list) * SAMPLE_FRACTION) + 1
    
    #### searched conversations
    # conversations found
    overlap_data = convo_list.intersection(collected)
    
    # conversations searched but not found
    overlap_no_data = convo_list.intersection(no_convos)
    
    searched_ids = overlap_data.union(overlap_no_data)
    
    # remaining sample needed
    sample_size = final_sample_size - len(searched_ids)

    if sample_size > 0:
        # Unsearched conversations for this topic, derived here so the cell
        # does not rely on remaining_dict having been built beforehand.
        # Sorted because set iteration order for strings can differ between
        # interpreter runs, which would defeat the fixed seed.
        remaining = sorted(convo_list.difference(searched_ids), key=str)

        # never ask for more ids than are available
        sample = rng.sample(remaining, min(sample_size, len(remaining)))

        search.update(sample)

        print(f'Searching {len(sample)} conversations for {topic}')
    else:
        print(f'No additional conversations needed for {topic}')

# A conversation can belong to several topics, so this total can be smaller
# than the sum of the per-topic sample sizes.
print(f'\nSearching {len(search)} conversations overall.')

# write to file (sorted so the file content is stable between runs)
with open(f'{twitter_path}/reference/twitter_convos_to_search.txt', 'w') as fp:
    for cid in sorted(search, key=str):
        fp.write(cid + '\n')
        
print('Conversation ids written to file.')

Searching 68871 conversations for midterms
No additional conversations needed for russia
Searching 9416 conversations for childcare

Searching 78279 conversations overall.
Conversation ids written to file.
