Using the on-topic tweets from my high-volume users, I performed a series of basic processing steps to extract the most commonly used words and hashtags. These are the terms I used in my search to obtain my actual data. 

#### import the things

In [1]:

import pandas as pd
import json
import os
import numpy as np
import glob2
import data

import spacy
import re
import en_core_web_trf
import en_core_web_md
from collections import Counter

In [2]:
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load("en_core_web_md") 
# nlp = en_core_web_trf.load()

#### get common words, hashtags, mentions

In [3]:
#load tweets saved in previous step
fl = os.path.join(data.__path__[0], r"raw\high_count_user_tweets.json")
with open(fl, 'r') as f:
    tweets = json.load(f)

In [4]:
tweets_str = " ".join(tweets)

In [5]:
custom_stop_words = ['rt']
for word in custom_stop_words:
    STOP_WORDS.add(word)

In [6]:
def preprocess_tweets(tweet_str):
    """takes a 1d list of strings
    uses some code from datacamp's NLP course""" 
    t_lower = tweet_str.lower().strip() #lowercase everything
    hashtags = re.findall(r"#(\w+)", t_lower) #pull out hashtags
    mentions = re.findall(r"@(\w+)", t_lower) #pull out mentions
    t_words = [word for word in t_lower.split(' ') if word.isalpha()]
    t_str =  " ".join(t_words)

#     [t_lower.remove(ht) for ht in hashtags]
#     [t_lower.remove(mt) for mt in mentions]  
    
    t_doc = nlp(t_str) #create nlp object
    t_lems = [t.lemma_ for t in t_doc] #get lemmas
    t_clean = [t_lem for t_lem in t_lems if t_lem not in STOP_WORDS] #remove stop words, non-alpha words     

    
#     if custom_stops is not None:
#         [t_clean.remove(stop) for stop in custom_stops if stop in t_clean]
  
    return t_doc, t_clean, hashtags, mentions

In [7]:
t_doc, t_clean, hashtags, mentions = preprocess_tweets(tweets_str)

In [8]:
h_count = Counter(hashtags)
h_count.most_common(5)

[('hiv', 77),
 ('plhiv', 16),
 ('endhivstigma', 14),
 ('celebrateblackwomen', 13),
 ('trans', 9)]

In [9]:
m_count = Counter(mentions)
m_count.most_common(5)

[('blackaids', 10),
 ('uspwn', 9),
 ('sexhistorian', 8),
 ('cdc_hivaids', 5),
 ('hspn4', 4)]

In [10]:
w_count = Counter(t_clean)
w_count.most_common(10)

[('black', 39),
 ('woman', 26),
 ('hiv', 21),
 ('live', 13),
 ('people', 11),
 ('new', 8),
 ('resource', 8),
 ('movement', 8),
 ('work', 7),
 ('right', 7)]

In [11]:
clean_doc = nlp(" ".join(t_clean))

In [12]:
pos = [(w.text, w.pos_) for w in clean_doc]

In [13]:
pos_count = Counter([p[1] for p in pos])

In [14]:
pos_count

Counter({'VERB': 162,
         'ADJ': 163,
         'NOUN': 491,
         'ADV': 30,
         'INTJ': 7,
         'X': 9,
         'NUM': 1,
         'PRON': 7,
         'PROPN': 3,
         'AUX': 3,
         'ADP': 2,
         'DET': 1})

#### load data from high posters to include interaction stats, clean up data a bit

In [15]:
data_fol = os.path.join(data.__path__[0],"raw")

jsons = glob2.glob(os.path.join(data_fol, "*30day*"))

tweets_dict = {}
for n, file in enumerate(jsons):
    with open(file) as f:
        j = json.load(f)
        for k in j.keys():
            key = "{}{}".format(n, k)
            tweets_dict[key] = j[k]

In [16]:
qu = []
for key in tweets_dict.keys():
    if 'query' in key:
        qu.append(key)
        
for k in qu:
    del tweets_dict[k]

In [18]:
tweets_dict['00'].keys()

dict_keys(['created_at', 'id', 'id_str', 'text', 'source', 'truncated', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'retweeted_status', 'is_quote_status', 'quote_count', 'reply_count', 'retweet_count', 'favorite_count', 'entities', 'favorited', 'retweeted', 'filter_level', 'lang', 'matching_rules'])

In [19]:
tweets_df = pd.DataFrame(index=range(len(tweets_dict.keys())))

for n, key in enumerate(tweets_dict.keys()):
    tweets_df.at[n, 't_id'] = tweets_dict[key]['id']
    tweets_df.at[n, 'u_id'] = tweets_dict[key]['user']['id']
    tweets_df.at[n, 'u_name'] = tweets_dict[key]['user']['screen_name']
    tweets_df.at[n, 'n_followers'] = tweets_dict[key]['user']['followers_count']
    tweets_df.at[n, 'verified'] = tweets_dict[key]['user']['verified']
    tweets_df.at[n, 'datetime'] = tweets_dict[key]['created_at']
    tweets_df.at[n, 'n_replies'] = tweets_dict[key]['reply_count']
    tweets_df.at[n, 'n_rts'] = tweets_dict[key]['retweet_count']
    tweets_df.at[n, 'n_faves'] = tweets_dict[key]['favorite_count']
    tweets_df.at[n, 'text'] = tweets_dict[key]['text']

In [20]:
tweets_df.head()

Unnamed: 0,t_id,u_id,u_name,n_followers,verified,datetime,n_replies,n_rts,n_faves,text
0,1.360015e+18,604651100.0,CindersJj,740.0,False,Thu Feb 11 23:56:41 +0000 2021,0.0,0.0,0.0,RT @EarlOfSidmouth: Everyone who’s ended up wi...
1,1.360015e+18,1.028001e+18,channelchek,1113.0,False,Thu Feb 11 23:56:40 +0000 2021,0.0,0.0,0.0,Artificial Intelligence used in machine learni...
2,1.360014e+18,1.086462e+18,retrovi_fighter,344.0,False,Thu Feb 11 23:54:27 +0000 2021,0.0,0.0,0.0,RT @CDC_HIVAIDS: CDC's updated fact sheet summ...
3,1.360014e+18,2651354000.0,ltmatthews,443.0,False,Thu Feb 11 23:54:04 +0000 2021,2.0,1.0,22.0,Congratulations 🎉 @pchitneni on her @HarvardCF...
4,1.360013e+18,216750300.0,Jpofgwynedd,5323.0,False,Thu Feb 11 23:50:04 +0000 2021,0.0,0.0,0.0,RT @EarlOfSidmouth: Everyone who’s ended up wi...


In [21]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   t_id         2000 non-null   float64
 1   u_id         2000 non-null   float64
 2   u_name       2000 non-null   object 
 3   n_followers  2000 non-null   float64
 4   verified     2000 non-null   object 
 5   datetime     2000 non-null   object 
 6   n_replies    2000 non-null   float64
 7   n_rts        2000 non-null   float64
 8   n_faves      2000 non-null   float64
 9   text         2000 non-null   object 
dtypes: float64(6), object(4)
memory usage: 156.4+ KB


In [23]:
num_cols = ['n_followers', 'n_replies', 'n_rts', 'n_faves',]
for col in num_cols:
    tweets_df[col] = tweets_df[col].astype(int)

for col in [ 't_id', 'u_id']:
    tweets_df[col] = tweets_df[col].astype(float)
    
tweets_df['verified'] = tweets_df['verified'].astype('bool')
tweets_df['datetime'] = pd.to_datetime(tweets_df['datetime'])

In [24]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   t_id         2000 non-null   float64            
 1   u_id         2000 non-null   float64            
 2   u_name       2000 non-null   object             
 3   n_followers  2000 non-null   int32              
 4   verified     2000 non-null   bool               
 5   datetime     2000 non-null   datetime64[ns, UTC]
 6   n_replies    2000 non-null   int32              
 7   n_rts        2000 non-null   int32              
 8   n_faves      2000 non-null   int32              
 9   text         2000 non-null   object             
dtypes: bool(1), datetime64[ns, UTC](1), float64(2), int32(4), object(2)
memory usage: 111.5+ KB


In [28]:
tweets_df.drop_duplicates(subset=['t_id', 'text'], keep='first', inplace=True, ignore_index=False)

#### sort data by different interaction types and find common terms

In [27]:
tweets_df.sort_values(['n_replies', 'n_rts', 'n_faves'], ascending=False)

Unnamed: 0,t_id,u_id,u_name,n_followers,verified,datetime,n_replies,n_rts,n_faves,text
1640,1.359248e+18,1.300349e+08,DrRanj,142444,True,2021-02-09 21:09:37+00:00,15,69,1675,WOW! Our @thismorning item on #HIV testing tod...
62,1.359996e+18,9.122911e+07,EarlOfSidmouth,2261,False,2021-02-11 22:40:06+00:00,7,17,62,Everyone who’s ended up with #hiv has a story ...
947,1.359563e+18,6.115853e+08,Bonn1eGreer,97419,False,2021-02-10 18:00:41+00:00,5,21,62,Know anybody with #HIV?\nI did...a lot of peop...
1447,1.359563e+18,6.115853e+08,Bonn1eGreer,97418,False,2021-02-10 18:00:41+00:00,5,21,62,Know anybody with #HIV?\nI did...a lot of peop...
289,1.359954e+18,6.039860e+08,sexhistorian,2966,False,2021-02-11 19:52:51+00:00,5,1,36,For the next 3 years I'm researching #HIV-affe...
...,...,...,...,...,...,...,...,...,...,...
1993,1.359162e+18,2.642924e+08,sbrentsimpson,3180,False,2021-02-09 15:26:07+00:00,0,0,0,"RT @TashaBrooklynNY: Thank you, @theaidsmemori..."
1994,1.359161e+18,8.090747e+08,ColumbiaSIG,762,False,2021-02-09 15:25:11+00:00,0,0,0,"RT @DrClauStoicescu: 📢 In a new paper, we foun..."
1995,1.359161e+18,1.478306e+09,Edinenno,11,False,2021-02-09 15:24:15+00:00,0,0,0,RT @CDC_HIVAIDS: A CDC study found very low ra...
1996,1.359161e+18,9.851388e+17,watashi_NoUso,2148,False,2021-02-09 15:24:12+00:00,0,0,0,RT @CDC_HIVAIDS: A CDC study found very low ra...


In [34]:
top_10_faves = tweets_df.sort_values(['n_faves', 'n_rts',  'n_replies', ], ascending=False)[:10]
top_10_rt = tweets_df.sort_values(['n_rts',  'n_replies', 'n_faves', ], ascending=False)[:10]
top_10_replies = tweets_df.sort_values(['n_replies', 'n_faves', 'n_rts', ], ascending=False)[:10]

In [75]:
top_u_names = np.append(np.append(top_10_faves.u_name.values, top_10_rt.u_name.values), top_10_replies.u_name.values)

In [76]:
Counter(top_u_names).most_common(10)

[('sexhistorian', 6),
 ('DrRanj', 3),
 ('Bonn1eGreer', 3),
 ('EarlOfSidmouth', 3),
 ('Winnie_Byanyima', 3),
 ('Debbie_abrahams', 2),
 ('BR999', 2),
 ('positivevibesza', 1),
 ('AccidentalHiv', 1),
 ('CedawPT', 1)]

#### sorted by faves

In [41]:
faves_str = " ".join(top_10_faves.text.values)
faves_doc, faves_clean, faves_hashtags,faves_mentions = preprocess_tweets(faves_str)

In [49]:
Counter(faves_hashtags).most_common(10)

[('hiv', 6),
 ('itsasin', 1),
 ('smallpox', 1),
 ('polio', 1),
 ('womeninscience', 1),
 ('hivhometest', 1),
 ('hivtestweek', 1),
 ('uequalsu', 1)]

#### sorted by retweets

In [47]:
rt_str = " ".join(top_10_rt.text.values)
rt_doc, rt_clean, rt_hashtags,rt_mentions = preprocess_tweets(rt_str)

In [50]:
Counter(rt_hashtags).most_common(10)

[('hiv', 6),
 ('womeninscience', 2),
 ('itsasin', 1),
 ('smallpox', 1),
 ('polio', 1),
 ('sdg', 1),
 ('cedaw', 1),
 ('sexualhealth', 1),
 ('openaccess', 1),
 ('freeaccess', 1)]

#### sorted by replies

In [45]:
replies_str = " ".join(top_10_replies.text.values)
replies_doc, replies_clean, replies_hashtags,replies_mentions = preprocess_tweets(replies_str)

In [51]:
Counter(replies_hashtags).most_common(10)

[('hiv', 5),
 ('smallpox', 1),
 ('polio', 1),
 ('itsasin', 1),
 ('hivhometest', 1),
 ('hivtestweek', 1),
 ('tellmeaboutit', 1)]

#### common terms from full doc

In [52]:
h_count.most_common(10)

[('hiv', 77),
 ('plhiv', 16),
 ('endhivstigma', 14),
 ('celebrateblackwomen', 13),
 ('trans', 9),
 ('nbhaad', 8),
 ('uequalsu', 7),
 ('vaccinatethemostvulnerable', 6),
 ('researchmatters', 5),
 ('biotech', 5)]

In [88]:
#grab all common hashtags if they show up more than once in the full tweet doc
common_ht_all = [key for key in h_count.keys() if h_count[key] > 1]

In [55]:
#grab all hashtags that showed up more than once in the top engagement lists
all_top_hashtags = rt_hashtags + replies_hashtags + faves_hashtags
common_ht_top = [key for key in all_top_ht_count.keys() if all_top_ht_count[key] > 1]

In [92]:
#find the overlap between them
common_ht = set(common_ht_all + common_ht_top)
common_ht

{'adherence',
 'aids',
 'alcohol',
 'biotech',
 'celebrateblackwomen',
 'covid19',
 'disclose',
 'endaids',
 'endhivepidemic',
 'endhivstigma',
 'hiv',
 'hivhometest',
 'hivstigma',
 'hivtestweek',
 'hivtreatment',
 'itsasin',
 'lgbt',
 'nbhaad',
 'opioid',
 'plhiv',
 'plwhiv',
 'polio',
 'pr',
 'prep',
 'researchmatters',
 'sciencenotstigma',
 'smallpox',
 'stigma',
 'tbt',
 'trans',
 'transwomen',
 'uequalsu',
 'uniquelyandunapologeticallyblack',
 'vaccinatethemostvulnerable',
 'vaccinatethemostvulneranle',
 'valentines',
 'virology',
 'women',
 'womeninscience'}

In [128]:
#manually drop tags that are too generic or relate more strongly to other topics
ht_to_drop = ['adherence', 'alcohol', 'biotech', 'celebrateblackwomen', 'covid19', 'itsasin', 'lgbt', 'opioid',
             'pr', 'polio', 'researchmatters', 'smallpox', 'tbt', 'trans', 'transwomen', 'uniquelyandunapologeticallyblack', 
             'valentines', 'virology', 'women', 'womeninscience', 'nbhaad', 'stigma']

In [129]:
for ht in ht_to_drop:
    try:
        common_ht.remove(ht)
    except:
        pass

In [130]:
common_ht

{'aids',
 'disclose',
 'endaids',
 'endhivepidemic',
 'endhivstigma',
 'hiv',
 'hivhometest',
 'hivstigma',
 'hivtestweek',
 'hivtreatment',
 'plhiv',
 'plwhiv',
 'prep',
 'sciencenotstigma',
 'uequalsu',
 'vaccinatethemostvulnerable',
 'vaccinatethemostvulneranle'}

In [143]:
#save common hashtags to pickle
pd.to_pickle(common_ht, os.path.join(data.__path__[0], r"compiled/common_hashtags.pkl"))