In [1]:
import pandas as pd
import numpy as np # gensim depends on numpy
import re
import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

# The Recovery Group

In [2]:
# 1. Extract the rows with is_recovery = 1
sample = pd.read_csv('recovery_samples_discosure_subs - recovery_samples.csv')
selected_columns = sample[['text', 'is_recovery']]
# Keep only rows explicitly flagged is_recovery == 1 that also have text.
# Filtering on the flag value (rather than on "is not NaN") makes this group the
# exact complement of the non-recovery group, which drops rows where is_recovery == 1.
# The chain builds a new frame, so re-running the cell always starts from the raw data.
df_rec = (
    selected_columns[selected_columns['is_recovery'] == 1]
    .dropna(subset=['text'])
    .drop(columns=['is_recovery'])
    # reset the index; otherwise the dropped rows leave gaps in the labels
    .reset_index(drop=True)
)

## Unigram

In [3]:
# 2.1 Define patterns to remove punctuation, numbers, new lines, multiple spaces
# Raw strings stop the regex escapes (\w, \s) from being parsed as invalid
# Python string escapes, which newer interpreters warn about.
pattern_punc = r"[^\w\s]"   # any character that is neither a word character nor whitespace
pattern_num = r"[0-9]"      # a single digit
pattern_newline = "\n"      # a literal newline character
pattern_mulspace = r"\s+"   # a run of one or more whitespace characters
pattern_longline = "___"    # three underscores in a row ("_" is a word character, so pattern_punc skips it)

In [4]:
# 2.2 Normalise the text column: lowercase, then strip punctuation, digits,
# newlines, repeated whitespace and "___" separator runs.
# The whole column is rewritten in one vectorised assignment. The previous
# element-wise `df_rec['text'][i] = ...` was chained indexing, which triggers
# SettingWithCopyWarning and does not write back under pandas copy-on-write.
df_rec['text'] = (
    df_rec['text']
    .str.lower()                                     # lowercase
    .str.replace(pattern_punc, '', regex=True)       # remove punctuation
    .str.replace(pattern_num, '', regex=True)        # remove numbers
    .str.replace(pattern_newline, ' ', regex=True)   # new line (\n) -> space
    .str.replace(pattern_mulspace, ' ', regex=True)  # collapse multiple spaces
    .str.replace(pattern_longline, ' ', regex=True)  # long line "___" -> space
)

In [5]:
# 3.1 Tokenization prep: collect every cleaned post into a plain Python list
text_list = df_rec.loc[:, 'text'].tolist()

In [6]:
# 3.2 Tokenization: split every post on whitespace and pool the tokens in one
# flat list (entries that are not plain strings are skipped)
word_list = [
    token
    for post in text_list
    if type(post) == str
    for token in post.split()
]
# Method 2: nltk.word_tokenize('here is your sentence')

In [7]:
# total number of tokens in the recovery group (stop words not yet removed)
len(word_list)

19760

In [8]:
# 4.1 Edit the stop words list
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Add some stop words
add_stop = ['im', 'one', 'also', 'ive', 'etc', 'hi']
# Remove punctuation from the stop words with the same pattern that was applied
# to the text, so they match the cleaned tokens.
stop_words_re = [re.sub(pattern_punc, '', stop_word) for stop_word in stop_words + add_stop]
# Output list for the stop-word filtering step. It is initialised exactly once here;
# previously the assignment sat inside the stop-word loop and ran on every iteration.
word_list_nostop = []

In [9]:
# 4.2 Remove stop words
# The list is rebuilt from scratch, so re-running this cell cannot append the same
# tokens a second time. A set gives constant-time membership tests.
stop_word_lookup = set(stop_words_re)
word_list_nostop = [word for word in word_list if word not in stop_word_lookup]

In [10]:
# `unigram` is a second name for the same list object as word_list_nostop (no copy is made)
unigram = word_list_nostop 

In [11]:
# 5. Extract top 50 frequent unigram of the recovery group
from collections import Counter
# The tally gets its own name: assigning it to `Counter` replaced the class with an
# instance, so Counter(...) could not be called again without re-importing it.
unigram_counts = Counter(unigram)
most_common_unigram = unigram_counts.most_common(50)
print(most_common_unigram)

[('q', 95), ('like', 93), ('people', 83), ('believe', 53), ('would', 46), ('know', 45), ('even', 45), ('time', 44), ('really', 43), ('conspiracy', 43), ('trump', 43), ('world', 42), ('think', 41), ('going', 41), ('qanon', 38), ('right', 35), ('much', 35), ('things', 34), ('want', 33), ('never', 32), ('back', 32), ('lot', 29), ('could', 29), ('way', 27), ('still', 27), ('started', 27), ('covid', 27), ('get', 26), ('everything', 26), ('help', 26), ('say', 25), ('feel', 25), ('true', 24), ('something', 24), ('reality', 24), ('us', 23), ('got', 23), ('family', 23), ('new', 22), ('stuff', 22), ('well', 22), ('government', 22), ('actually', 21), ('made', 21), ('believed', 21), ('story', 21), ('far', 21), ('thing', 20), ('qs', 20), ('years', 20)]


## Bigram & Trigram

In [12]:
import gensim
from gensim.models.phrases import Phrases, Phraser
from gensim import corpora

In [13]:
# 1. Tokenise each post separately (punctuation already removed, stop words kept)
## Stop words stay in for now because the sequence of words matters to express meaning
text_split = [post.split() for post in text_list]
# text_split is a list of lists of string tokens

In [14]:
# 2. Create a dictionary from the tokenised posts, then convert each post with doc2bow
dct = corpora.Dictionary(text_split)
corpus = list(map(dct.doc2bow, text_split))

In [15]:
# 3. Build the bigram and trigram model
# The second model is trained on the output of the first one
bigram = Phrases(text_split, min_count=3, threshold=5)
bigram_applied = bigram[text_split]
trigram = Phrases(bigram_applied, threshold=5)

In [16]:
# see how the bigram looks 
# print(bigram[text_split[1]])

In [17]:
# 4.1 Generate a flat list of unigram
# Define a function to flatten the lists within list
def flatten(list_of_list):
    """Return one flat list holding the items of every sub-list, in order."""
    flat_items = []
    for sublist in list_of_list:
        flat_items.extend(sublist)
    return flat_items
# put unigrams in a flat list: every token of every post (stop words included)
flat_uni = flatten(text_split)

In [18]:
# 4.2 put bigram in a flat list
# Run each post through the bigram model, then pool the results
bigrams_list = [bigram[post_tokens] for post_tokens in text_split]
flat_uni_bi = flatten(bigrams_list)

In [19]:
# 4.3 create a list that only contains bigram, no unigram
# A token coming out of the bigram model is kept when it never occurs as a plain
# token. Membership is tested against a set: scanning the whole unigram list for
# every token made this step quadratic in the corpus size.
unigram_vocab = set(flat_uni)
flat_bi = [item for item in flat_uni_bi if item not in unigram_vocab]

In [20]:
# 5. Extract top 50 frequent bigram of the recovery group
from collections import Counter
# Keep the class name `Counter` intact by binding the tally to its own name.
bigram_counts = Counter(flat_bi)
most_common_bigram = bigram_counts.most_common(50)
print(most_common_bigram)

[('i_was', 64), ('but_i', 43), ('to_be', 37), ('i_had', 31), ('a_lot', 29), ('at_the', 29), ('i_dont', 29), ('it_was', 26), ('i_am', 26), ('the_world', 24), ('as_a', 22), ('going_to', 21), ('it_is', 21), ('want_to', 20), ('out_of', 20), ('i_just', 17), ('i_think', 17), ('have_been', 16), ('some_of', 15), ('the_whole', 14), ('back_to', 14), ('a_few', 14), ('im_not', 13), ('the_most', 13), ('to_say', 13), ('i_do', 13), ('when_i', 12), ('about_it', 12), ('i_feel', 11), ('as_well', 11), ('the_government', 11), ('they_are', 11), ('ive_been', 11), ('there_are', 11), ('for_me', 10), ('my_story', 10), ('the_deep', 10), ('could_be', 10), ('who_are', 10), ('of_these', 10), ('my_life', 10), ('the_same', 10), ('thank_you', 10), ('kind_of', 10), ('like_this', 9), ('people_who', 9), ('rabbit_hole', 9), ('if_you', 9), ('wanted_to', 9), ('in_order', 9)]


In [21]:
# 6. trigram

In [22]:
# put trigram in a flat list
# Each post goes through the bigram model first and then the trigram model
trigrams_list = [trigram[bigram[post_tokens]] for post_tokens in text_split]
flat_uni_bi_tri = flatten(trigrams_list)

In [23]:
#  create a list that only contains trigrams, no bigram, no unigram
# Keep tokens that are neither a first-pass bigram nor a plain unigram.
# Sets replace the repeated scans of the full bigram/unigram lists.
# NOTE(review): the printed results include two-word phrases (e.g. 'i_would'),
# so pairs first joined by the second-pass model pass this filter -- confirm
# whether those should be counted as trigrams.
known_bigrams = set(flat_bi)
known_unigrams = set(flat_uni)
flat_tri_temp = [item for item in flat_uni_bi_tri if item not in known_bigrams]
flat_tri = [item for item in flat_tri_temp if item not in known_unigrams]

In [24]:
# number of tokens left after filtering out unigrams and first-pass bigrams
len(flat_tri)

92

In [25]:
# 5. Extract top 50 frequent trigram of the recovery group
from collections import Counter
# Keep the class name `Counter` intact by binding the tally to its own name.
trigram_counts = Counter(flat_tri)
most_common_trigram = trigram_counts.most_common(50)
print(most_common_trigram)

[('a_lot_of', 21), ('i_would', 10), ('to_do', 10), ('the_deep_state', 9), ('i_never', 8), ('i_know', 8), ('in_order_to', 7), ('at_the_time', 7), ('as_much_as', 6), ('a_part_of', 6)]


# The Non-recovery Group

In [26]:
# 1. Extract the rows with is_recovery being NaN
# Every row not flagged is_recovery == 1 is kept, then rows without text are
# discarded and the index is renumbered from 0.
df_non = (
    selected_columns.loc[selected_columns['is_recovery'] != 1.0]
    .drop(columns=['is_recovery'])
    .dropna(subset=['text'])
    .reset_index(drop=True)
)

In [27]:
# (rows, columns) of the non-recovery frame; only the text column remains
df_non.shape

(202, 1)

In [28]:
# 2.1 Define patterns to remove punctuation, numbers, new lines, multiple spaces
# Raw strings stop the regex escapes (\w, \s) from being parsed as invalid
# Python string escapes, which newer interpreters warn about.
pattern_punc = r"[^\w\s]"   # any character that is neither a word character nor whitespace
pattern_num = r"[0-9]"      # a single digit
pattern_newline = "\n"      # a literal newline character
pattern_mulspace = r"\s+"   # a run of one or more whitespace characters
pattern_longline = "___"    # three underscores in a row ("_" is a word character, so pattern_punc skips it)

In [29]:
# 2.2 Normalise the text column: strip digits, lowercase, then strip punctuation,
# newlines, repeated whitespace and "___" separator runs.
# The whole column is rewritten in one vectorised assignment. The previous
# element-wise `df_non['text'][i] = ...` was chained indexing, which triggers
# SettingWithCopyWarning and does not write back under pandas copy-on-write.
df_non['text'] = (
    df_non['text']
    .str.replace(pattern_num, '', regex=True)        # remove numbers
    .str.lower()                                     # lowercase
    .str.replace(pattern_punc, '', regex=True)       # remove punctuation
    .str.replace(pattern_newline, ' ', regex=True)   # new line (\n) -> space
    .str.replace(pattern_mulspace, ' ', regex=True)  # collapse multiple spaces
    .str.replace(pattern_longline, ' ', regex=True)  # long line "___" -> space
)

In [30]:
# 3.1 Tokenization prep: collect every cleaned post into a plain Python list
text_list_non = df_non.loc[:, 'text'].tolist()

In [31]:
# 3.2 Tokenization: split every post on whitespace and pool the tokens in one
# flat list (entries that are not plain strings are skipped)
word_list_non = [
    token
    for post in text_list_non
    if type(post) == str
    for token in post.split()
]
# Method 2: nltk.word_tokenize('here is your sentence')

In [32]:
# total number of tokens in the non-recovery group (stop words not yet removed)
len(word_list_non)

25223

In [33]:
# 4.1 Remove stop words
# Filter against a set so each membership test is constant-time rather than a
# scan of the whole stop-word list.
stop_word_lookup_non = set(stop_words_re)
word_list_non_nostop = [word for word in word_list_non if word not in stop_word_lookup_non]

In [34]:
# number of non-recovery tokens left after stop-word removal
len(word_list_non_nostop)

12138

In [35]:
# `unigram_non` is a second name for the same list object as word_list_non_nostop (no copy is made)
unigram_non = word_list_non_nostop

In [36]:
# 5. Extract top 50 frequent unigram of the non-recovery group
from collections import Counter
# The tally gets its own name: assigning it to `Counter` replaced the class with an
# instance, so Counter(...) could not be called again without re-importing it.
unigram_counts_non = Counter(unigram_non)
most_common_unigram_non = unigram_counts_non.most_common(50)
print(most_common_unigram_non)

[('qanon', 149), ('people', 107), ('like', 107), ('q', 88), ('would', 86), ('know', 82), ('please', 77), ('want', 61), ('share', 59), ('get', 56), ('even', 56), ('anything', 55), ('time', 52), ('conspiracy', 50), ('someone', 47), ('use', 46), ('news', 46), ('follow', 46), ('thread', 45), ('cause', 44), ('feel', 43), ('help', 43), ('videos', 43), ('remember', 43), ('keep', 41), ('really', 40), ('interesting', 40), ('related', 40), ('believe', 38), ('pictures', 38), ('links', 38), ('rules', 38), ('conversations', 38), ('civil', 38), ('anyone', 37), ('still', 36), ('podcasts', 36), ('research', 34), ('make', 33), ('first', 33), ('say', 33), ('back', 33), ('think', 33), ('trump', 32), ('going', 32), ('always', 31), ('could', 31), ('theories', 30), ('much', 30), ('see', 30)]


## Bigram & Trigram

In [37]:
# number of posts in the non-recovery group
len(text_list_non)

202

In [38]:
# 1. Tokenise each post separately (punctuation already removed, stop words kept)
## Stop words stay in for now because the sequence of words matters to express meaning
text_split_non = [post.split() for post in text_list_non]
# text_split_non is a list of lists of string tokens

In [39]:
# 2. Create a dictionary from the tokenised posts, then convert each post with doc2bow
dct_non = corpora.Dictionary(text_split_non)
corpus_non = list(map(dct_non.doc2bow, text_split_non))

In [40]:
# 3. Build the bigram and trigram model
# The second model is trained on the output of the first one
bigram_non = Phrases(text_split_non, min_count=3, threshold=5)
bigram_applied_non = bigram_non[text_split_non]
trigram_non = Phrases(bigram_applied_non, threshold=5)

In [41]:
# see how the bigram looks: print the second post (index 1) after the bigram
# model has joined the word pairs it detected with "_"
print(bigram_non[text_split_non[1]])

['i_have', 'created', 'this', 'subreddit', 'for_those', 'who_have', 'escaped', 'qanon', 'feel_free', 'to_share', 'your', 'stories', 'and', 'your', 'struggles', 'how_did', 'you', 'get', 'into_q', 'how_did', 'you', 'get', 'out', 'this_is', 'the', 'sistersubreddit', 'of', 'rqanoncasualties', 'i_hope', 'this', 'community', 'will_be', 'as', 'helpful', 'to', 'you', 'as', 'that', 'community', 'has_been', 'for_those', 'who_have', 'lost', 'family', 'and', 'friends', 'to', 'q']


In [42]:
# 4.1 Generate a flat list of unigram
# put unigrams in a flat list: every token of every post (stop words included),
# using the flatten() helper defined earlier in the notebook
flat_uni_non = flatten(text_split_non)

In [43]:
# 4.2 put bigram in a flat list
# Run each post through the bigram model, then pool the results
bigrams_list_non = [bigram_non[post_tokens] for post_tokens in text_split_non]
flat_uni_bi_non = flatten(bigrams_list_non)

In [44]:
# 4.3 create a list that only contains bigram, no unigram
# A token coming out of the bigram model is kept when it never occurs as a plain
# token. Membership is tested against a set: scanning the whole unigram list for
# every token made this step quadratic in the corpus size.
unigram_vocab_non = set(flat_uni_non)
flat_bi_non = [item for item in flat_uni_bi_non if item not in unigram_vocab_non]

In [45]:
# 5. Extract top 50 frequent bigram of the non-recovery group
from collections import Counter
# Keep the class name `Counter` intact by binding the tally to its own name.
bigram_counts_non = Counter(flat_bi_non)
most_common_bigram_non = bigram_counts_non.most_common(50)
print(most_common_bigram_non)

[('i_am', 72), ('can_be', 52), ('this_is', 39), ('to_qanon', 39), ('and_our', 38), ('share_anything', 37), ('to_follow', 37), ('use_this', 36), ('thread_to', 36), ('interesting_related', 36), ('cause_this', 36), ('pictures_news', 36), ('links_podcasts', 36), ('videos_etc', 36), ('please_remember', 36), ('our_rules', 36), ('and_keep', 36), ('conversations_civil', 36), ('if_you', 34), ('i_was', 34), ('but_i', 31), ('i_have', 27), ('it_was', 27), ('thank_you', 26), ('you_can', 25), ('i_dont', 25), ('i_would', 24), ('out_of', 23), ('we_are', 23), ('want_to', 22), ('i_will', 22), ('would_be', 22), ('they_are', 21), ('people_who', 21), ('im_a', 21), ('some_of', 21), ('i_want', 19), ('to_hear', 19), ('as_a', 19), ('will_be', 18), ('has_been', 18), ('a_lot', 18), ('to_get', 18), ('someone_who', 17), ('going_to', 17), ('trying_to', 16), ('conspiracy_theories', 16), ('have_been', 16), ('to_do', 16), ('believe_in', 15)]


In [46]:
# 6. trigram

In [47]:
# put trigram in a flat list
# Each post goes through the bigram model first and then the trigram model
trigrams_list_non = [trigram_non[bigram_non[post_tokens]] for post_tokens in text_split_non]
flat_uni_bi_tri_non = flatten(trigrams_list_non)

In [48]:
#  create a list that only contains trigrams, no bigram, no unigram
# Keep tokens that are neither a first-pass bigram nor a plain unigram.
# Sets replace the repeated scans of the full bigram/unigram lists.
# NOTE(review): the printed results include two-word phrases (e.g. 'to_be') and
# four-word phrases (e.g. 'use_this_thread_to'), so this filter is not limited
# to three-word phrases -- confirm whether that is intended.
known_bigrams_non = set(flat_bi_non)
known_unigrams_non = set(flat_uni_non)
flat_tri_temp_non = [item for item in flat_uni_bi_tri_non if item not in known_bigrams_non]
flat_tri_non = [item for item in flat_tri_temp_non if item not in known_unigrams_non]

In [49]:
# 5. Extract top 50 frequent trigram of the non-recovery group
from collections import Counter
# Keep the class name `Counter` intact by binding the tally to its own name.
trigram_counts_non = Counter(flat_tri_non)
most_common_trigram_non = trigram_counts_non.most_common(50)
print(most_common_trigram_non)

[('use_this_thread_to', 36), ('share_anything_interesting_related', 36), ('to_qanon_and_our', 36), ('cause_this_can_be', 36), ('pictures_news_links_podcasts', 36), ('videos_etc_please_remember', 36), ('to_follow_our_rules', 36), ('and_keep_conversations_civil', 36), ('to_be', 32), ('it_is', 19), ('i_can', 16), ('so_i', 16), ('i_just', 13), ('i_want_to', 12), ('to_speak_to', 11), ('my_name_is', 9), ('a_lot_of', 9), ('some_of_the', 9), ('people_who_are', 8), ('we_can', 8), ('if_you_are', 7), ('to_hear_from', 7), ('feel_free_to', 6), ('email_me_at', 6), ('working_on_a', 6), ('i_would_like', 6), ('i_could', 6), ('the_rabbit_hole', 6), ('if_this_is', 6), ('we_want_to_hear', 6)]
