In [71]:
import pandas as pd
import numpy as np # gensim depends on numpy
import re
import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

# The Recovery Group

In [73]:
# 1. Extract the rows with is_recovery = 1
sample = pd.read_csv('recovery_samples_discosure_subs - recovery_samples.csv')
selected_columns = sample[['text', 'is_recovery']]
df_rec = selected_columns.copy()
df_rec.dropna(subset = ['is_recovery'], inplace = True)
df_rec.dropna(subset = ['text'], inplace = True)
df_rec.drop(columns = ['is_recovery'], inplace = True)
df_rec.reset_index(drop = True, inplace = True) 
# reset the index, if not, there will be a gap caused by dropping na values

## Unigram

In [75]:
# 2.1 Define patterns to remove punctuations, numbers, new lines, multiple spaces
pattern_punc = "[^\w\s]"
pattern_num = "[0-9]"
pattern_newline = "\n"
pattern_mulspace = "\s+"
pattern_longline = "___"

In [76]:
# 2.2 Remove punctuation patterns
for i in range(len(df_rec)):
        # lowercase
        df_rec['text'][i] = df_rec['text'][i].lower()
        # remove puncuations
        df_rec['text'][i] = re.sub(pattern_punc, '', df_rec['text'][i])
        # remove numbers
        df_rec['text'][i] = re.sub(pattern_num, '', df_rec['text'][i])
        # remove new line (\n)
        df_rec['text'][i] = re.sub(pattern_newline, ' ', df_rec['text'][i])
        # remove multiple space
        df_rec['text'][i] = re.sub(pattern_mulspace, ' ', df_rec['text'][i])  
        # remove long line
        df_rec['text'][i] = re.sub(pattern_longline, ' ', df_rec['text'][i]) 

In [77]:
# 3.1 Tokenization, put all rows into a list
text_list = df_rec['text'].tolist()

In [78]:
# 3.2 Tokenization using the split function 
word_list = []
for i in text_list:
    if type(i) != str:
        continue
    else:
        words = i.split()
        # word_list.append(words)
        for word in words:
            word_list.append(word)
# Method 2: nltk.word_tokenize('here is your sentence')

In [79]:
len(word_list)

19760

In [80]:
# 4.1 Edit the stop words list
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Add some stop words 
add_stop = ['im', 'one', 'also', 'ive', 'etc', 'hi']
stop_words_re = stop_words + add_stop
# remove punctuations from stop words
for i in range(len(stop_words_re)):
    stop_words_re[i] = re.sub(pattern_punc, '', stop_words_re[i])
    word_list_nostop = []

In [81]:
# 4.2 Remove stop words
for word in word_list:
    if word not in stop_words_re:
        word_list_nostop.append(word)

In [82]:
unigram = word_list_nostop 

In [83]:
# 5. Extract top 50 frequent unigram of the recovery group
from collections import Counter
Counter = Counter(unigram)
most_common_unigram= Counter.most_common(50)
print(most_common_unigram)

[('q', 95), ('like', 93), ('people', 83), ('believe', 53), ('would', 46), ('know', 45), ('even', 45), ('time', 44), ('really', 43), ('conspiracy', 43), ('trump', 43), ('world', 42), ('think', 41), ('going', 41), ('qanon', 38), ('right', 35), ('much', 35), ('things', 34), ('want', 33), ('never', 32), ('back', 32), ('lot', 29), ('could', 29), ('way', 27), ('still', 27), ('started', 27), ('covid', 27), ('get', 26), ('everything', 26), ('help', 26), ('say', 25), ('feel', 25), ('true', 24), ('something', 24), ('reality', 24), ('us', 23), ('got', 23), ('family', 23), ('new', 22), ('stuff', 22), ('well', 22), ('government', 22), ('actually', 21), ('made', 21), ('believed', 21), ('story', 21), ('far', 21), ('thing', 20), ('qs', 20), ('years', 20)]


##  Bigram & Trigram (w/ stop words)

In [84]:
import gensim
from gensim.models.phrases import Phrases, Phraser
from gensim import corpora

In [85]:
# 1. Generate a list with no punctuation, but have stop words
## The reason that keeps stop words for now is that sequence of words matter to express meaning
text_split = []
for i in range(len(text_list)):
    text_split.append(text_list[i].split())
# text_split is a list of list of string tokens

In [86]:
# 2. Create a dictionary
dct = corpora.Dictionary(text_split)
corpus = [dct.doc2bow(line) for line in text_split]

In [87]:
# 3. Build the bigram and trigram model
bigram = gensim.models.phrases.Phrases(text_split, min_count=3, threshold=10)
trigram = gensim.models.phrases.Phrases(bigram[text_split], threshold=5)

In [88]:
# see how the bigram looks 
# print(bigram[text_split[1]])

In [89]:
# 4.1 Generate a flat list of unigram
# Define a function to flatten the lists within list
def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist] 
# put unigrams in a flat list 
flat_uni = flatten(text_split)

In [90]:
# 4.2 put bigram in a flat list
bigrams_list = []
for i in range(len(text_split)):
    bigrams_list.append(bigram[text_split[i]])
flat_uni_bi = flatten(bigrams_list)

In [91]:
# 4.3 create a list that only contains bigram, no unigram
flat_bi = []
flat_bi = [item for item in flat_uni_bi if item not in flat_uni]

In [92]:
# 5. Extract top 50 frequent bigram of the recovery group
from collections import Counter
Counter = Counter(flat_bi)
most_common_bigram = Counter.most_common(50)
print(most_common_bigram)

[('to_be', 37), ('i_had', 32), ('i_dont', 30), ('a_lot', 29), ('i_am', 27), ('going_to', 21), ('want_to', 20), ('have_been', 16), ('the_whole', 14), ('a_few', 14), ('im_not', 13), ('to_say', 13), ('part_of', 12), ('they_are', 12), ('the_same', 12), ('as_well', 11), ('deep_state', 11), ('ive_been', 11), ('there_are', 11), ('conspiracy_theories', 10), ('my_story', 10), ('could_be', 10), ('who_are', 10), ('my_life', 10), ('thank_you', 10), ('kind_of', 10), ('people_who', 9), ('rabbit_hole', 9), ('i_realized', 9), ('if_you', 9), ('wanted_to', 9), ('in_order', 9), ('going_on', 9), ('looking_for', 9), ('my_family', 9), ('used_to', 8), ('as_much', 8), ('would_be', 8), ('my_mind', 8), ('people_like', 8), ('my_parents', 8), ('will_be', 8), ('know_what', 8), ('dont_know', 8), ('feel_like', 7), ('sort_of', 7), ('a_year', 7), ('thanks_to', 7), ('made_me', 7), ('my_mom', 7)]


In [93]:
# 6. trigram

In [94]:
# put trigram in a flat list
trigrams_list = []
for i in range(len(text_split)):
    trigrams_list.append(trigram[bigram[text_split[i]]])
flat_uni_bi_tri = flatten(trigrams_list)

In [95]:
#  create a list that only contains trigrams, no bigram, no unigram
flat_tri = []
flat_tri_temp = []
flat_tri_temp = [item for item in flat_uni_bi_tri if item not in flat_bi]
flat_tri = [item for item in flat_tri_temp if item not in flat_uni]

In [96]:
len(flat_tri)

458

In [97]:
# 5. Extract top 50 frequent trigram of the recovery group
from collections import Counter
Counter = Counter(flat_tri)
most_common_trigram = Counter.most_common(50)
print(most_common_trigram)

[('i_was', 67), ('but_i', 41), ('at_the', 27), ('it_was', 26), ('the_world', 25), ('as_a', 21), ('it_is', 21), ('a_lot_of', 21), ('out_of', 20), ('i_just', 17), ('i_think', 17), ('some_of', 15), ('back_to', 14), ('the_most', 13), ('about_it', 13), ('the_government', 11), ('for_me', 10), ('of_these', 10), ('the_deep_state', 9), ('the_election', 9), ('he_was', 8), ('to_take', 8), ('i_cant', 8), ('which_is', 8), ('in_order_to', 7), ('as_much_as', 6), ('i_did', 6)]


# The Non-recovery Group w/ stop words

In [98]:
# 1. Extract the rows with is_recovery being NaN
df_non = selected_columns.copy()
df_non.drop(df_non.index[df_non['is_recovery'] == 1.0], inplace=True)

df_non.drop(columns = ['is_recovery'], inplace = True)
df_non.dropna(subset = ['text'], inplace = True)

df_non.reset_index(drop = True, inplace = True) 

In [99]:
df_non.shape

(202, 1)

In [100]:
# 2.1 Define patterns to remove punctuations, numbers, new lines, multiple spaces
pattern_punc = "[^\w\s]"
pattern_num = "[0-9]"
pattern_newline = "\n"
pattern_mulspace = "\s+"
pattern_longline = "___"

In [101]:
# 2.2 Remove punctuation patterns
for i in range(len(df_non)):
        # remove numbers
        df_non['text'][i] = re.sub(pattern_num, '', df_non['text'][i])
        # lowercase
        df_non['text'][i] = df_non['text'][i].lower()
        # remove puncuations
        df_non['text'][i] = re.sub(pattern_punc, '', df_non['text'][i])
        # remove new line (\n)
        df_non['text'][i] = re.sub(pattern_newline, ' ', df_non['text'][i])
        # remove multiple space
        df_non['text'][i] = re.sub(pattern_mulspace, ' ', df_non['text'][i])  
        # remove long line
        df_non['text'][i] = re.sub(pattern_longline, ' ', df_non['text'][i]) 

In [102]:
# 3.1 Tokenization, put all rows into a list
text_list_non = df_non['text'].tolist()

In [103]:
# 3.2 Tokenization using the split function 
word_list_non = []
for i in text_list_non:
    if type(i) != str:
        continue
    else:
        words = i.split()
        # word_list.append(words)
        for word in words:
            word_list_non.append(word)
# Method 2: nltk.word_tokenize('here is your sentence')

In [104]:
len(word_list_non)

25223

In [105]:
# 4.1 Remove stop words
word_list_non_nostop = []
for word in word_list_non:
    if word not in stop_words_re:
        word_list_non_nostop.append(word)

In [106]:
len(word_list_non_nostop)

12138

In [107]:
unigram_non = word_list_non_nostop

In [108]:
# 5. Extract top 50 frequent unigram of the non-recovery group
from collections import Counter
Counter = Counter(unigram_non)
most_common_unigram_non= Counter.most_common(50)
print(most_common_unigram_non)

[('qanon', 149), ('people', 107), ('like', 107), ('q', 88), ('would', 86), ('know', 82), ('please', 77), ('want', 61), ('share', 59), ('get', 56), ('even', 56), ('anything', 55), ('time', 52), ('conspiracy', 50), ('someone', 47), ('use', 46), ('news', 46), ('follow', 46), ('thread', 45), ('cause', 44), ('feel', 43), ('help', 43), ('videos', 43), ('remember', 43), ('keep', 41), ('really', 40), ('interesting', 40), ('related', 40), ('believe', 38), ('pictures', 38), ('links', 38), ('rules', 38), ('conversations', 38), ('civil', 38), ('anyone', 37), ('still', 36), ('podcasts', 36), ('research', 34), ('make', 33), ('first', 33), ('say', 33), ('back', 33), ('think', 33), ('trump', 32), ('going', 32), ('always', 31), ('could', 31), ('theories', 30), ('much', 30), ('see', 30)]


## Bigram & Trigram - non-recovery

In [112]:
len(text_list_non)

202

In [113]:
# 1. Generate a list with no punctuation, but have stop words
## The reason that keeps stop words for now is that sequence of words matter to express meaning
text_split_non = []
for i in range(len(text_list_non)):
    text_split_non.append(text_list_non[i].split())
# text_split_non is a list of list of string tokens

In [114]:
# 2. Create a dictionary
dct_non = corpora.Dictionary(text_split_non)
corpus_non = [dct_non.doc2bow(line) for line in text_split_non]

In [115]:
# 3. Build the bigram and trigram model
bigram_non = gensim.models.phrases.Phrases(text_split_non, min_count=3, threshold=5)
trigram_non = gensim.models.phrases.Phrases(bigram_non[text_split_non], threshold=5)

In [116]:
# see how the bigram looks 
print(bigram_non[text_split_non[1]])

['i_have', 'created', 'this', 'subreddit', 'for_those', 'who_have', 'escaped', 'qanon', 'feel_free', 'to_share', 'your', 'stories', 'and', 'your', 'struggles', 'how_did', 'you', 'get', 'into_q', 'how_did', 'you', 'get', 'out', 'this_is', 'the', 'sistersubreddit', 'of', 'rqanoncasualties', 'i_hope', 'this', 'community', 'will_be', 'as', 'helpful', 'to', 'you', 'as', 'that', 'community', 'has_been', 'for_those', 'who_have', 'lost', 'family', 'and', 'friends', 'to', 'q']


In [117]:
# 4.1 Generate a flat list of unigram
# put unigrams in a flat list 
flat_uni_non = flatten(text_split_non)

In [118]:
# 4.2 put bigram in a flat list
bigrams_list_non = []
for i in range(len(text_split_non)):
    bigrams_list_non.append(bigram_non[text_split_non[i]])
flat_uni_bi_non = flatten(bigrams_list_non)

In [119]:
# 4.3 create a list that only contains bigram, no unigram
flat_bi_non = []
flat_bi_non = [item for item in flat_uni_bi_non if item not in flat_uni_non]

In [120]:
# 5. Extract top 50 frequent bigram of the non-recovery group
from collections import Counter
Counter = Counter(flat_bi_non)
most_common_bigram_non = Counter.most_common(50)
print(most_common_bigram_non)

[('i_am', 72), ('can_be', 52), ('this_is', 39), ('to_qanon', 39), ('and_our', 38), ('share_anything', 37), ('to_follow', 37), ('use_this', 36), ('thread_to', 36), ('interesting_related', 36), ('cause_this', 36), ('pictures_news', 36), ('links_podcasts', 36), ('videos_etc', 36), ('please_remember', 36), ('our_rules', 36), ('and_keep', 36), ('conversations_civil', 36), ('if_you', 34), ('i_was', 34), ('but_i', 31), ('i_have', 27), ('it_was', 27), ('thank_you', 26), ('you_can', 25), ('i_dont', 25), ('i_would', 24), ('out_of', 23), ('we_are', 23), ('want_to', 22), ('i_will', 22), ('would_be', 22), ('they_are', 21), ('people_who', 21), ('im_a', 21), ('some_of', 21), ('i_want', 19), ('to_hear', 19), ('as_a', 19), ('will_be', 18), ('has_been', 18), ('a_lot', 18), ('to_get', 18), ('someone_who', 17), ('going_to', 17), ('trying_to', 16), ('conspiracy_theories', 16), ('have_been', 16), ('to_do', 16), ('believe_in', 15)]


In [121]:
# 6. trigram

In [122]:
# put trigram in a flat list
trigrams_list_non = []
for i in range(len(text_split_non)):
    trigrams_list_non.append(trigram_non[bigram_non[text_split_non[i]]])
flat_uni_bi_tri_non = flatten(trigrams_list_non)

In [123]:
#  create a list that only contains trigrams, no bigram, no unigram
flat_tri_non = []
flat_tri_temp_non = []
flat_tri_temp_non = [item for item in flat_uni_bi_tri_non if item not in flat_bi_non]
flat_tri_non = [item for item in flat_tri_temp_non if item not in flat_uni_non]

In [306]:
# 5. Extract top 50 frequent trigram of the recovery group
from collections import Counter
Counter = Counter(flat_tri_non)
most_common_trigram_non = Counter.most_common(50)
print(most_common_trigram_non)

[('use_this_thread_to', 36), ('share_anything_interesting_related', 36), ('to_qanon_and_our', 36), ('cause_this_can_be', 36), ('pictures_news_links_podcasts', 36), ('videos_etc_please_remember', 36), ('to_follow_our_rules', 36), ('and_keep_conversations_civil', 36), ('to_be', 32), ('it_is', 19), ('i_can', 16), ('so_i', 16), ('i_just', 13), ('i_want_to', 12), ('to_speak_to', 11), ('my_name_is', 9), ('a_lot_of', 9), ('some_of_the', 9), ('people_who_are', 8), ('we_can', 8), ('if_you_are', 7), ('to_hear_from', 7), ('feel_free_to', 6), ('email_me_at', 6), ('working_on_a', 6), ('i_would_like', 6), ('i_could', 6), ('the_rabbit_hole', 6), ('if_this_is', 6), ('we_want_to_hear', 6)]


##  Bigram (w/o stop words) - Recovery

In [328]:
# 1. Generate a list with no punctuation, but have stop words
## The reason that keeps stop words for now is that sequence of words matter to express meaning
text_split = []
for i in range(len(text_list)):
    text_split.append(text_list[i].split())
# text_split is a list of list of string tokens

In [329]:
# 2. Remove stop words
# 2.1 Edit the stop words list
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Add some stop words 
add_stop = ['im', 'one', 'also', 'ive', 'etc', 'hi']
stop_words_re = stop_words + add_stop
# remove punctuations from stop words
for i in range(len(stop_words_re)):
    stop_words_re[i] = re.sub(pattern_punc, '', stop_words_re[i])

In [330]:
# 2.2. remove stop_words_re (list of string) from text_split (a list of list of string)
# save a copy of text_split
text_split_no = text_split.copy()
# removal
for i in range(len(text_split_no)):
    for word in text_split_no[i]:
        if word in stop_words_re:
            text_split_no[i].remove(word)

In [331]:
# 3. Create a dictionary
dct = corpora.Dictionary(text_split_no)
corpus = [dct.doc2bow(line) for line in text_split_no]

In [337]:
# 4. Build the bigram and trigram model
bigram_nostop = gensim.models.phrases.Phrases(text_split_no, min_count=3, threshold=10)
trigram_nostop = gensim.models.phrases.Phrases(bigram_nostop[text_split_no], threshold=5)

In [338]:
# 5.1 Generate a flat list of unigram
# Define a function to flatten the lists within list
def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist] 
# put unigrams in a flat list 
flat_uni = flatten(text_split)

In [339]:
# 5.2 put bigram in a flat list
bigrams_list_nostop = []
for i in range(len(text_split_no)):
    bigrams_list_nostop.append(bigram_nostop[text_split_no[i]])
flat_uni_bi = flatten(bigrams_list_nostop)

In [340]:
# 5.3 create a list that only contains bigram, no unigram
flat_bi_nostop = []
flat_bi_nostop = [item for item in flat_uni_bi if item not in flat_uni]

In [342]:
# 6. Extract top 50 frequent bigram of the recovery group
from collections import Counter
Counter = Counter(flat_bi_nostop)
most_common_bigram_nostop = Counter.most_common(50)
print(most_common_bigram_nostop)

[('a_lot', 18), ('i_dont', 13), ('deep_state', 11), ('conspiracy_theories', 10), ('rabbit_hole', 9), ('people_like', 9), ('i_am', 9), ('but_i', 8), ('my_parents', 8), ('anyone_else', 7), ('feel_like', 7), ('want_to', 7), ('the_same', 7), ('podesta_emails', 6), ('would_be', 6), ('my_mind', 6), ('social_media', 6), ('my_life', 6), ('mainstream_media', 6), ('dont_know', 6), ('q_anon', 5), ('mental_health', 5), ('this_cult', 5), ('make_sense', 5), ('years_old', 5), ('might_be', 5), ('to_be', 5), ('moms_side', 5), ('a_person', 5), ('far_right', 5), ('both_sides', 5), ('even_though', 4), ('conspiracy_theory', 4), ('when_trump', 4), ('many_people', 4), ('never_really', 4), ('qs_claims', 4), ('sealed_indictments', 4), ('campaigns_like', 4), ('everything_connected', 4), ('great_awakening', 4), ('economic_elites', 4), ('internally_consistent', 4), ('q_drops', 4), ('collective_sensemaking', 4), ('they_are', 4), ('it_seems', 4), ('have_lost', 4), ('health_crisis', 2), ('dont_want', 2)]


In [343]:
recovery_counts = dict(Counter)

##  Bigram (w/o stop words) - Non-recovery

In [344]:
# 0. Load the data of non-recovery group - Extract the rows with is_recovery being NaN
df_non = selected_columns.copy()
df_non.drop(df_non.index[df_non['is_recovery'] == 1.0], inplace=True)
df_non.drop(columns = ['is_recovery'], inplace = True)
df_non.dropna(subset = ['text'], inplace = True)
df_non.reset_index(drop = True, inplace = True) 

In [345]:
df_non.shape

(202, 1)

In [346]:
# 0.1 Define patterns to remove punctuations, numbers, new lines, multiple spaces
pattern_punc = "[^\w\s]"
pattern_num = "[0-9]"
pattern_newline = "\n"
pattern_mulspace = "\s+"
pattern_longline = "___"

In [347]:
# 0.2 Remove punctuation patterns
for i in range(len(df_non)):
        # remove numbers
        df_non['text'][i] = re.sub(pattern_num, '', df_non['text'][i])
        # lowercase
        df_non['text'][i] = df_non['text'][i].lower()
        # remove puncuations
        df_non['text'][i] = re.sub(pattern_punc, '', df_non['text'][i])
        # remove new line (\n)
        df_non['text'][i] = re.sub(pattern_newline, ' ', df_non['text'][i])
        # remove multiple space
        df_non['text'][i] = re.sub(pattern_mulspace, ' ', df_non['text'][i])  
        # remove long line
        df_non['text'][i] = re.sub(pattern_longline, ' ', df_non['text'][i]) 

In [348]:
# 1.1 Tokenization, put all rows into a list
text_list_non = df_non['text'].tolist()

In [349]:
# 2.1. Generate a list with no punctuation, but have stop words
text_split_non = []
for i in range(len(text_list_non)):
    text_split_non.append(text_list_non[i].split())
# text_split_non is a list of list of string tokens

In [350]:
# 2.2. Remove stop words
# 2.2.1 Edit the stop words list
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Add some stop words 
add_stop = ['im', 'one', 'also', 'ive', 'etc', 'hi']
stop_words_re = stop_words + add_stop
# remove punctuations from stop words
for i in range(len(stop_words_re)):
    stop_words_re[i] = re.sub(pattern_punc, '', stop_words_re[i])

In [351]:
# 2.2.2 remove stop_words_re (list of string) from text_split (a list of list of string)
# save a copy of text_split
non_text_split_no = text_split_non.copy()
# removal
for i in range(len(non_text_split_no)):
    for word in non_text_split_no[i]:
        if word in stop_words_re:
            non_text_split_no[i].remove(word)

In [352]:
len(non_text_split_no)

202

In [353]:
# 3. Create a dictionary
dct_non = corpora.Dictionary(non_text_split_no)
corpus_non = [dct.doc2bow(line) for line in non_text_split_no]

In [354]:
# 4. Build the bigram and trigram model
bigram_non = gensim.models.phrases.Phrases(non_text_split_no, min_count=3, threshold=5)
trigram_non = gensim.models.phrases.Phrases(bigram_non[non_text_split_no], threshold=5)

In [355]:
# 5.1 Generate a flat list of unigram
# Define a function to flatten the lists within list
def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist] 
# put unigrams in a flat list 
non_flat_uni = flatten(text_split_non)

In [356]:
# 5.2 put bigram in a flat list
non_bigrams_list = []
for i in range(len(non_text_split_no)):
    non_bigrams_list.append(bigram_non[non_text_split_no[i]])
non_flat_uni_bi = flatten(non_bigrams_list)

In [357]:
# 5.3 create a list that only contains bigram, no unigram
non_flat_bi = []
non_flat_bi = [item for item in non_flat_uni_bi if item not in non_flat_uni]

In [392]:
# 6. Extract top 50 frequent bigram of the non-recovery group
from collections import Counter
Counter_non = Counter(non_flat_bi)
most_common_bigram_non = Counter_non.most_common(50)
print(most_common_bigram_non)

[('share_anything', 37), ('use_thread', 36), ('interesting_related', 36), ('qanon_cause', 36), ('can_pictures', 36), ('news_links', 36), ('podcasts_videos', 36), ('please_remember', 36), ('follow_our', 36), ('rules_keep', 36), ('conversations_civil', 36), ('would_like', 17), ('conspiracy_theories', 16), ('thank_you', 14), ('i_am', 13), ('the_time', 11), ('i_have', 10), ('i_was', 10), ('i_want', 9), ('a_lot', 9), ('inclusion_criteria', 9), ('feel_free', 8), ('i_dont', 8), ('the_theory', 8), ('the_mod', 8), ('want_hear', 7), ('so_much', 7), ('even_though', 7), ('the_world', 7), ('we_will', 7), ('feel_like', 7), ('qanon_conspiracy', 6), ('conspiracy_theory', 6), ('media_requests', 6), ('id_love', 6), ('please_send', 6), ('the_same', 6), ('loved_ones', 6), ('the_rabbit', 6), ('am_looking', 6), ('know_someone', 6), ('max_erdemandi', 6), ('those_have', 5), ('help_people', 5), ('qanon_followers', 5), ('the_new', 5), ('this_sub', 5), ('this_thread', 5), ('please_feel', 5), ('people_are', 5)]


### Bigrams w/o stop words, all posts (rec & non-rec)

In [359]:
sample = pd.read_csv('recovery_samples_discosure_subs - recovery_samples.csv')
all_post = sample[['text']] # include both recovery and non-recovery groups
df_all = all_post.copy()
df_all.dropna(subset = ['text'], inplace = True) # remove rows with empty text columns
df_all.reset_index(drop = True, inplace = True) 
# reset the index, if not, there will be a gap caused by dropping na values

In [360]:
df_all.shape

(242, 1)

In [361]:
# 2.1 Define patterns to remove punctuations, numbers, new lines, multiple spaces
pattern_punc = "[^\w\s]"
pattern_num = "[0-9]"
pattern_newline = "\n"
pattern_mulspace = "\s+"
pattern_longline = "___"

In [362]:
# 2.2 Remove punctuation patterns
for i in range(len(df_all)):
        # lowercase
        df_all['text'][i] = df_all['text'][i].lower()
        # remove puncuations
        df_all['text'][i] = re.sub(pattern_punc, '', df_all['text'][i])
        # remove numbers
        df_all['text'][i] = re.sub(pattern_num, '', df_all['text'][i])
        # remove new line (\n)
        df_all['text'][i] = re.sub(pattern_newline, ' ', df_all['text'][i])
        # remove multiple space
        df_all['text'][i] = re.sub(pattern_mulspace, ' ', df_all['text'][i])  
        # remove long line
        df_all['text'][i] = re.sub(pattern_longline, ' ', df_all['text'][i]) 

In [363]:
# 3.1 Tokenization, put all rows into a list
all_text_list = df_all['text'].tolist()

In [364]:
# 3.2 Tokenization using the split function 
all_word_list = []
for i in all_text_list:
    if type(i) != str:
        continue
    else:
        words = i.split()
        # word_list.append(words)
        for word in words:
            all_word_list.append(word)
# Method 2: nltk.word_tokenize('here is your sentence')

In [365]:
# 4.1 Edit the stop words list
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Add some stop words 
add_stop = ['im', 'one', 'also', 'ive', 'etc', 'hi']
stop_words_re = stop_words + add_stop
# remove punctuations from stop words
for i in range(len(stop_words_re)):
    stop_words_re[i] = re.sub(pattern_punc, '', stop_words_re[i])

In [366]:
all_word_list_nostop = []
# 4.2 Remove stop words
for word in all_word_list:
    if word not in stop_words_re:
        all_word_list_nostop.append(word)

In [367]:
unigram_all = all_word_list_nostop

In [368]:
# 5. Extract top 50 frequent unigram of the recovery group
from collections import Counter
Counter_all = Counter(unigram_all)
most_common_unigram_all= Counter_all.most_common(50)
print(most_common_unigram_all)

[('like', 200), ('people', 190), ('qanon', 187), ('q', 183), ('would', 132), ('know', 127), ('even', 101), ('time', 96), ('want', 94), ('conspiracy', 93), ('believe', 91), ('really', 83), ('please', 83), ('get', 82), ('trump', 75), ('think', 74), ('going', 73), ('help', 69), ('feel', 68), ('much', 65), ('news', 65), ('back', 65), ('anything', 64), ('world', 64), ('share', 64), ('still', 63), ('someone', 61), ('things', 60), ('could', 60), ('right', 59), ('say', 58), ('never', 54), ('anyone', 53), ('follow', 53), ('us', 53), ('way', 53), ('videos', 53), ('family', 52), ('got', 51), ('first', 51), ('use', 51), ('remember', 50), ('something', 50), ('lot', 49), ('see', 49), ('go', 48), ('story', 48), ('thread', 48), ('said', 47), ('always', 47)]


### bigram

In [369]:
import gensim
from gensim.models.phrases import Phrases, Phraser
from gensim import corpora

In [370]:
# 1. Generate a list with no punctuation, but have stop words
text_split_all = []
for i in range(len(all_text_list)):
    text_split_all.append(all_text_list[i].split())
# text_split is a list of list of string tokens

In [371]:
# 2. Remove stop words
# 2.1 Edit the stop words list
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Add some stop words 
add_stop = ['im', 'one', 'also', 'ive', 'etc', 'hi']
stop_words_re = stop_words + add_stop
# remove punctuations from stop words
for i in range(len(stop_words_re)):
    stop_words_re[i] = re.sub(pattern_punc, '', stop_words_re[i])

In [372]:
# 2.2. remove stop_words_re (list of string) from text_split (a list of list of string)
# save a copy of text_split
text_split_no_all = text_split_all.copy()
# removal
for i in range(len(text_split_no_all)):
    for word in text_split_no_all[i]:
        if word in stop_words_re:
            text_split_no_all[i].remove(word)

In [373]:
# 3. Create a dictionary
dct_all = corpora.Dictionary(text_split_no_all)
corpus_all = [dct.doc2bow(line) for line in text_split_no_all]

In [374]:
# 4. Build the bigram and trigram model
bigram_all = gensim.models.phrases.Phrases(text_split_no_all, min_count=3, threshold=5)
trigram_all = gensim.models.phrases.Phrases(bigram_all[text_split_no_all], threshold=5)

In [375]:
# 5.1 Generate a flat list of unigram
# Define a function to flatten the lists within list
def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist] 
# put unigrams in a flat list 
flat_uni_all = flatten(text_split_no_all)

In [376]:
# 5.2 put bigram in a flat list
bigrams_list_all = []
for i in range(len(text_split_no_all)):
    bigrams_list_all.append(bigram_all[text_split_no_all[i]])
flat_uni_bi_all = flatten(bigrams_list_all)

In [377]:
# 5.3 create a list that only contains bigram, no unigram
flat_bi_all = []
flat_bi_all = [item for item in flat_uni_bi_all if item not in flat_uni_all]

In [378]:
# 6. Extract top 50 frequent bigram of the recovery group
from collections import Counter
Counter_all = Counter(flat_bi_all)
most_common_bigram_all = Counter_all.most_common(50)
print(most_common_bigram_all)

[('share_anything', 37), ('use_thread', 36), ('interesting_related', 36), ('qanon_cause', 36), ('can_pictures', 36), ('news_links', 36), ('podcasts_videos', 36), ('please_remember', 36), ('follow_our', 36), ('rules_keep', 36), ('conversations_civil', 36), ('a_lot', 27), ('conspiracy_theories', 27), ('i_was', 24), ('the_world', 24), ('i_dont', 21), ('i_am', 21), ('would_like', 18), ('i_have', 16), ('thank_you', 16), ('deep_state', 15), ('i_want', 14), ('the_same', 13), ('but_i', 12), ('the_first', 12), ('even_though', 11), ('conspiracy_theory', 11), ('feel_like', 11), ('rabbit_hole', 11), ('would_be', 11), ('my_parents', 11), ('anyone_else', 10), ('the_whole', 10), ('i_had', 10), ('people_like', 10), ('it_is', 10), ('i_feel', 9), ('it_was', 9), ('i_just', 9), ('my_life', 9), ('this_is', 9), ('people_are', 9), ('mainstream_media', 9), ('the_new', 9), ('the_theory', 9), ('inclusion_criteria', 9), ('i_cant', 8), ('my_mind', 8), ('many_people', 8), ('social_media', 8)]


In [379]:
base_counts = dict(Counter_all)

### Save base_counts to tsv file

In [380]:
df_base = pd.DataFrame.from_dict(base_counts, orient='index').reset_index()

In [381]:
df_base = df_base.rename(columns={'index':'bigram', 0:'count'})

In [382]:
df_base.to_csv("base_counts.tsv", sep="\t", index=False, header=False)

### Save recovery_counts to tsv file

In [387]:
df_recovery = pd.DataFrame.from_dict(recovery_counts, orient='index').reset_index()

In [388]:
df_recovery = df_recovery.rename(columns={'index':'bigram', 0:'count'})

In [391]:
df_recovery.to_csv("recovery_counts.tsv", sep="\t", index=False, header=False)