From these 2000 tweets found in a generic search, I looked at the most frequently appearing user IDs on the assumption that these were people who were in some way or another connected with the HIV/AIDS Twitter community, and therefore would be most in touch with the important keywords, hashtags, etc. I collected the on-topic tweets from these users for my keyword-findig corpus.

#### import the things

In [4]:
from datetime import datetime
import pandas as pd
import json
import os
import numpy as np
import glob2
from collections import Counter

import data

In [3]:
root_dir = os.path.join(data.__path__[0], "raw")

#### import jsons and extract data from tweets from step 1

In [6]:
json_files = glob2.glob(os.path.join(root_dir, '*30day*'))

In [13]:
jsons = []
for file in json_files:
    with open(file) as f:
        j = json.load(f)
        jsons.append(j)

In [18]:
for j in jsons:
    print(len(j))

501
501
501
501


In [21]:

users_30day_dict = {}
k = 0
for n, jd in enumerate(jsons):
    for key in jd.keys():
        temp = {}
        if 'query' not in key:
            temp['user_id'] = jd[key]['user']['id']
            temp['follower_count'] = jd[key]['user']['followers_count']
            temp['verified'] = jd[key]['user']['verified']
            temp['num_tweets'] = jd[key]['user']['statuses_count']
            users_30day_dict[k] = temp
            k += 1

In [22]:
len(users_30day_dict)

2000

#### create data frame with some general user data

In [23]:
user_30day_df = pd.DataFrame.from_dict(users_30day_dict).T
user_30day_df.head()

Unnamed: 0,user_id,follower_count,verified,num_tweets
0,604651118,740,False,26077
1,1028001153517268994,1113,False,1909
2,1086461753599262720,344,False,935
3,2651353567,443,False,318
4,216750349,5323,False,427794


In [24]:
user_30day_df['follower_count'] = user_30day_df['follower_count'].astype(int, errors='ignore')
user_30day_df['verified'] = user_30day_df['verified'].astype(bool, errors='ignore')
user_30day_df['num_tweets'] = user_30day_df['num_tweets'].astype(int, errors='ignore')

In [25]:
user_30day_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   user_id         2000 non-null   object
 1   follower_count  2000 non-null   int32 
 2   verified        2000 non-null   bool  
 3   num_tweets      2000 non-null   int32 
dtypes: bool(1), int32(2), object(1)
memory usage: 48.8+ KB


#### extract the frequent posters

In [35]:
user_counts = user_30day_df['user_id'].value_counts()
user_counts[:20]

1055712964039168002    31
116559291              27
355696940              17
263981499              16
1372554174             14
1057379674022727680    13
715280940675817472     13
28794698               12
1346984867356172292    12
16638543               12
1169994069940080641    11
35814608               10
39020280               10
931657800              10
1258092054510555136    10
148477475              10
1010757472238342145     8
125752710               8
171923367               8
25105436                8
Name: user_id, dtype: int64

In [60]:
high_posters = list(user_counts.iloc[np.where(user_counts >=10)].index)
high_posters = [str(hp) for hp in high_posters]
high_posters

['1055712964039168002',
 '116559291',
 '355696940',
 '263981499',
 '1372554174',
 '1057379674022727680',
 '715280940675817472',
 '28794698',
 '1346984867356172292',
 '16638543',
 '1169994069940080641',
 '35814608',
 '39020280',
 '931657800',
 '1258092054510555136',
 '148477475']

#### extract the tweets made by the high posters & clean up the data a bit

In [72]:
tweets_30day_dict = {}
k = 0
for n, jd in enumerate(jsons):
    for key in jd.keys():
        temp = {}
        if 'query' not in key:
            if (str(jd[key]['user']['id']) in high_posters)==True:
                temp['user_id'] = jd[key]['user']['id']
                temp['tweet'] = jd[key]['text']
                temp['tweet_id'] = jd[key]['id']
                tweets_30day_dict[k] = temp
                k += 1

In [78]:
user_counts.iloc[np.where(user_counts >=10)].sum()

228

In [79]:
len(tweets_30day_dict.keys())

228

In [81]:
tweets_30day_df = pd.DataFrame(tweets_30day_dict).T
tweets_30day_df.head()

Unnamed: 0,user_id,tweet,tweet_id
0,715280940675817472,RT @BATLabUNC: #DYK that #theSouth accounts fo...,1360007586179915787
1,931657800,RT @QueerCultureIre: If you enjoyed our #LGBTH...,1359993487689719810
2,715280940675817472,RT @CDC_HIVAIDS: CDC's updated fact sheet summ...,1359991518803730435
3,1372554174,RT @achievetgthrtx: With #Valentines right aro...,1359989178440822785
4,1055712964039168002,Meth Overdose Deaths Rise Rapidly Across All R...,1359985644416753669


In [85]:
tweets_30day_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 228 entries, 0 to 227
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   228 non-null    object
 1   tweet     228 non-null    object
 2   tweet_id  228 non-null    object
dtypes: object(3)
memory usage: 7.1+ KB


In [83]:
len(tweets_30day_df.tweet_id.unique())

152

In [91]:
tweets_30day_df[tweets_30day_df.duplicated(subset='tweet_id', keep=False)==True].sort_values('tweet_id')

Unnamed: 0,user_id,tweet,tweet_id
116,1055712964039168002,Uganda: Ugandan Wins International Award in HI...,1359555239586779138
192,1055712964039168002,Uganda: Ugandan Wins International Award in HI...,1359555239586779138
191,1055712964039168002,Cone-shaped HIV-1 capsids are transported thro...,1359562781960372226
115,1055712964039168002,Cone-shaped HIV-1 capsids are transported thro...,1359562781960372226
114,1258092054510555136,Learning you’re living with #HIV can be overw...,1359564036115816449
...,...,...,...
119,39020280,RT @TakuMukiwa: Follow @LuckmorePamhid1 for a ...,1359627688160489484
42,39020280,🤣OMG so much about this is funny. Thank you fa...,1359628911542489089
118,39020280,🤣OMG so much about this is funny. Thank you fa...,1359628911542489089
117,1346984867356172292,@MassDPH #DisabilityHasNoAgeRequirement \nBunc...,1359643355374845954


In [92]:
tweets_30day_df.drop_duplicates(subset=['tweet_id', 'tweet'], keep='first', inplace=True, ignore_index=False)

In [95]:
tweets_30day_df[tweets_30day_df.duplicated(subset='tweet_id', keep=False)==True]

Unnamed: 0,user_id,tweet,tweet_id


In [97]:
tweets_30day_df.sort_values('user_id')

Unnamed: 0,user_id,tweet,tweet_id
103,16638543,RT @TheGraceProjec2: Did you know in 2019 Blac...,1359578312167809037
51,16638543,SisterLove is planning a week of programming t...,1359615589917032454
52,16638543,Black #Trans Women are Women! We must recogniz...,1359614991742164992
54,16638543,We must recognize the communities we serve not...,1359613245041102853
98,16638543,RT @SisterSong_WOC: A2) We #CelebrateBlackWome...,1359580156277448713
...,...,...,...
92,1346984867356172292,RT @LupieLady08: @MassGov #VaccinateTheMostVul...,1359581561566412810
91,1346984867356172292,@MassGovernor VaccinateTheMostVulnerable \n#Va...,1359581974264963083
89,1346984867356172292,@bostonherald Yet the sickest in this dumpster...,1359582886211551232
84,1346984867356172292,@MassGovernor You’ve COMPLETELY IGNORED THE #D...,1359584625807523853


#### extract just the text of the tweets

In [99]:
tweets = []
for row in tweets_30day_df.index:
    tweets.append(tweets_30day_df.at[row, 'tweet'])

In [102]:
tweet_file = os.path.join(root_dir, "high_count_user_tweets.json")
with open(tweet_file, 'w') as f:
    json.dump(tweets, f)