In [1]:
import json
import csv
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


with open('QAnon-posts.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

extracted_data = []
for post in data['posts']:
    if 'text' in post and 'post_metadata' in post and 'time' in post['post_metadata']:
        extracted_data.append((post['text'], post['post_metadata']['time']))

with open('extracted_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['text', 'time']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for text, time in extracted_data:
        writer.writerow({'text': text, 'time': time})


In [2]:
df = pd.read_csv('extracted_data.csv')
df_sorted = df.sort_values(by ='time')
df = df_sorted

In [7]:
def preprocess_text(tweet):
    tweet = re.sub(r'\r|\n', ' ', tweet.lower())
    tweet = re.sub(r"(?:\@|https?\://)\S+", "", tweet)
    tweet = re.sub(r'[^\x00-\x7f]', '', tweet)
    banned_list = string.punctuation
    table = str.maketrans('', '', banned_list)
    tweet = tweet.translate(table)
    tweet = ' '.join('' if ('$' in word) or ('&' in word) else word for word in tweet.split())
    tweet = re.sub(r"\s\s+", " ", tweet)
    tweet = re.sub(r'\d+', '', tweet)
    tokens = word_tokenize(tweet)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tweet = ' '.join(tokens)
    tweet = re.sub(r'\b(\w+)((\w)\3{2,})(\w*)\b', r'\1\3\4', tweet)
    tweet = re.sub(r'[\?\.\!]+(?=[\?\.\!])', '', tweet)
    tweet = ' '.join(tweet.split())
    tweet = re.sub(r'(?:http[s]?://)?(?:www\.)?(?:bit\.ly|goo\.gl|t\.co|tinyurl\.com|tr\.im|is\.gd|cli\.gs|u\.nu|url\.ie|tiny\.cc|alturl\.com|ow\.ly|bit\.do|adoro\.to)\S+', '', tweet)
    tweet = tweet.strip()
    return tweet

df['text'] = df['text'].apply(preprocess_text)

text_vals={}
num = 1
for i in range(200,df.shape[0],200):
    text_vals[num] = df['text'][i-200:i]
    num+=1


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrices = {}
for interval_num, interval_text in text_vals.items():
    tfidf_matrices[interval_num] = tfidf_vectorizer.fit_transform(interval_text)

feature_names = tfidf_vectorizer.get_feature_names_out()

common_words_per_interval = {}
for interval_num, tfidf_matrix in tfidf_matrices.items():
    max_tfidf_indices = tfidf_matrix.max(axis=0).toarray().ravel().argsort()[-10:]
    common_words_per_interval[interval_num] = [feature_names[idx] for idx in max_tfidf_indices if idx < len(feature_names)]

for interval_num, common_words in common_words_per_interval.items():
    print(f"Interval {interval_num}:")
    for word in common_words:
        print(word)
    print()


Interval 1:
boxprevent
carried
and
date
jesus
benavides
camp
huber
abolish
le

Interval 2:
average
carle
checkmarks
harbor
inaction
device
generate
civilian
dan
cast

Interval 3:
part
formed
displace
billion
minor
department
kc
bathroom
book
mm

Interval 4:
bedroom
crane
attackedalteredreformatted
illinois
herself
intimidation
ensure
adopting
anons
illicit

Interval 5:
betrayed
ever
kimmerling
captain
apology
difficult
guardian
guise
delegate
application

Interval 6:
gu
conformobey
fvey
controlled
group
buhr
financially
female
happiness
amen

Interval 7:
fact
based
sec
journalist
huber
loss
joseph
antifa
presence
betrays

Interval 8:
butler
been
nazi
project
my
offering
prince
bailout
knowing
data

Interval 9:
hospital
lisa
avoid
increasing
exhibiting
making
hillary
pit
extend
leading

Interval 10:
grant
attend
cfi
karli
barsoomian
influence
deserter
incline
authority
care

Interval 11:
msm
many
insurgent
nsa
number
control
bidens
help
admitting
flag

Interval 12:
midnight
dominance
ec

In [17]:
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Define a function to preprocess text and extract BERT embeddings for each word
def preprocess_and_extract_embeddings(texts):
    input_ids_list = []
    word_embeddings_list = []
    for text in texts:
        # Tokenize the text and convert to token IDs
        input_ids = tokenizer.encode(text, add_special_tokens=False, truncation=True, max_length=512)
        
        # Check if the length of input_ids is less than 5
        if len(input_ids) < 5:
            continue  # Skip this text
        
        # Convert token IDs to tensor and cast to Long data type
        input_ids_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0)  # Batch size 1
        
        # Forward pass through the model to get hidden states
        with torch.no_grad():
            outputs = model(input_ids_tensor)
            last_hidden_state = outputs.last_hidden_state
        
        # Get BERT embeddings for each word
        word_embeddings = last_hidden_state.squeeze(0).numpy()
        
        # Remove special tokens ([CLS] and [SEP]) and corresponding embeddings
        word_embeddings = word_embeddings[1:-1]
        
        input_ids_list.append(input_ids)
        word_embeddings_list.append(word_embeddings)
    
    # Concatenate input IDs and word embeddings
    input_ids_concatenated = np.concatenate(input_ids_list)
    word_embeddings_concatenated = np.concatenate(word_embeddings_list)
    
    return input_ids_concatenated, word_embeddings_concatenated

def identify_most_relevant_words(text, num_clusters=5):
    # Preprocess text and extract BERT embeddings for each word
    input_ids, word_embeddings = preprocess_and_extract_embeddings(text)
    
    # Apply k-means clustering
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(word_embeddings)
    
    # Get centroids of each cluster
    cluster_centroids = kmeans.cluster_centers_
    
    # Map each word to its corresponding cluster label
    word_clusters = {word: label for word, label in zip(tokenizer.tokenize(' '.join(text)), cluster_labels)}  # Concatenate texts
    
    # Identify the most relevant words in each cluster (centroid)
    relevant_words = []
    for centroid in cluster_centroids:
        # Find the nearest word embedding to the centroid
        nearest_idx = ((word_embeddings - centroid) ** 2).sum(axis=1).argmin()
        nearest_word = tokenizer.convert_ids_to_tokens([input_ids[nearest_idx]])[0]
        relevant_words.append(nearest_word)
    
    return relevant_words



def identify_most_relevant_words_per_interval(intervals, num_clusters=5):
    most_relevant_words_per_interval = {}
    for interval_num, interval_text in intervals.items():
        most_relevant_words = identify_most_relevant_words(interval_text, num_clusters)
        most_relevant_words_per_interval[interval_num] = most_relevant_words
    return most_relevant_words_per_interval

most_relevant_words_per_interval = identify_most_relevant_words_per_interval(text_vals)
for interval_num, words in most_relevant_words_per_interval.items():
    print(f"Interval {interval_num}:")
    print(words)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Interval 1:
['of', 'certain', 'protect', 'surgery', 'do']
Interval 2:
['pig', 'code', '##yes', 'new', '##tan']
Interval 3:
['no', 'important', 'anyone', 'id', 'thought']
Interval 4:
['forget', 'this', 'everyone', '##gram', 'relevant']
Interval 5:
['will', 'father', 'will', 'to', 'think']
Interval 6:
['importance', 'interest', 'why', 'in', 'foundation']
Interval 7:
['rogue', 'did', 'program', 'si', 'support']
Interval 8:
['peter', 'to', 'power', '##eria', 'julie']
Interval 9:
['control', 'thinking', '##sa', '##pot', 'public']
Interval 10:
['prem', 'q', 'mil', 'there', 'do']
Interval 11:
['wa', 'q', 'of', 'projection', 'a']
Interval 12:
['we', '##rp', 'only', 'from', 'running']
Interval 13:
['to', 'the', 'q', 'eliminate', 'fe']
Interval 14:
['spin', 'before', 'food', 'what', 'evil']
Interval 15:
['##au', '##j', 'wa', 'state', 'uk']
Interval 16:
['coup', 'to', 'their', 'knowing', 'well']
Interval 17:
['session', 'directly', 'result', 'experience', 'jc']
Interval 18:
['you', 'warfare', 'se

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def lda_topic_modeling(text_data):
    # Initialize CountVectorizer to convert text data into a bag-of-words representation
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(text_data)

    # Define number of topics
    n_topics = 10  # You can adjust this value based on the optimal number of topics found during hyperparameter tuning

    # Initialize LDA model
    lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)

    # Fit LDA model to the data
    lda_model.fit(X)

    return lda_model

# Apply topic modeling to each time window
topic_models = {}
for window, texts in text_vals.items():
    # Preprocess text data for the current time window
    preprocessed_texts = texts.apply(preprocess_text)

    # Apply LDA topic modeling
    lda_model = lda_topic_modeling(preprocessed_texts)

    # Store the LDA model for the current time window
    topic_models[window] = lda_model


In [12]:
def get_most_relevant_topics(lda_model, vectorizer, n_words=10):
    # Get the topic-term distribution matrix
    topic_term_matrix = lda_model.components_

    # Get the feature names (words) from the CountVectorizer
    feature_names = vectorizer.get_feature_names_out()

    # Initialize a dictionary to store the most relevant topics
    relevant_topics = {}

    # Iterate through each topic
    for topic_idx, topic_terms in enumerate(topic_term_matrix):
        # Get the top N words for the current topic
        top_words_indices = topic_terms.argsort()[-n_words:][::-1]
        top_words = [feature_names[idx] for idx in top_words_indices]

        # Store the top words for the current topic
        relevant_topics[f'Topic {topic_idx + 1}'] = top_words

    return relevant_topics

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Apply CountVectorizer to all text data to get the bag-of-words representation
X_all = vectorizer.fit_transform(df['text'])

# Iterate through each time window and extract the most relevant topics
relevant_topics_per_window = {}
for window, texts in text_vals.items():
    # Preprocess text data for the current time window
    preprocessed_texts = texts.apply(preprocess_text)

    # Apply CountVectorizer to the preprocessed text data
    X_window = vectorizer.transform(preprocessed_texts)

    # Retrieve the trained LDA model for the current time window
    lda_model = topic_models[window]

    # Extract the most relevant topics for the current time window
    relevant_topics = get_most_relevant_topics(lda_model, vectorizer)

    # Store the relevant topics for the current time window
    relevant_topics_per_window[window] = relevant_topics

# Print the most relevant topics for each time window
for window, topics in relevant_topics_per_window.items():
    print(f"Time Window {window}:")
    for topic, top_words in topics.items():
        print(f"- {topic}: {', '.join(top_words)}")
    print()


Time Window 1:
- Topic 1: denies, admissible, chinarussia, compared, advise, clarification, calendar, crystal, blood, dem
- Topic 2: calendar, disclosuresreports, disaster, deleteinstall, demean, dilute, controlling, countered, disclosing, denies
- Topic 3: deleteinstall, clue, agent, denies, calendar, disaster, actspub, brush, disclosing, disclosuresreports
- Topic 4: deleteinstall, denies, calendar, clue, actspub, disaster, demean, disclosuresreports, aggressive, conceal
- Topic 5: denies, actspub, disguised, alwaleed, aggressive, deleteinstall, bxzja, brush, administrative, blizzard
- Topic 6: deleteinstall, disruption, denies, actspub, disaster, brush, disclosuresreports, dilute, disclosing, calendar
- Topic 7: alqaradawi, active, aag, buckle, customized, at, discovered, alarm, appellate, cripple
- Topic 8: deleteinstall, denies, disaster, actspub, collins, calendar, dem, betterment, borderwall, bought
- Topic 9: blm, disconcerting, actspub, denies, demean, advocate, clickonpreview

In [15]:
def interpret_topics(relevant_topics_per_window):
    common_crime_terms = set()  # Store common crime-related terms across all time windows

    # Iterate through each time window and interpret the topics
    for window, topics in relevant_topics_per_window.items():
        print(f"Time Window {window}:")
        crime_related_topics = []  # Store crime-related topics for the current time window

        # Print and analyze each topic
        for topic, top_words in topics.items():
            print(f"- {topic}: {', '.join(top_words)}")
            crime_terms = [word for word in top_words]
            crime_related_topics.append(topic)
            common_crime_terms.update(crime_terms)

        # Print crime-related topics for the current time window
        if crime_related_topics:
            for topic in crime_related_topics:
                print(f"  - {topic}")

        print()

    # Print common crime-related terms across all time windows
    if common_crime_terms:
        print("Common Crime-related Terms:")
        print(', '.join(common_crime_terms))
    else:
        print("No common crime-related terms found across all time windows")

# Call the function to interpret topics and identify common crime-related terms
interpret_topics(relevant_topics_per_window)


Time Window 1:
- Topic 1: denies, admissible, chinarussia, compared, advise, clarification, calendar, crystal, blood, dem
- Topic 2: calendar, disclosuresreports, disaster, deleteinstall, demean, dilute, controlling, countered, disclosing, denies
- Topic 3: deleteinstall, clue, agent, denies, calendar, disaster, actspub, brush, disclosing, disclosuresreports
- Topic 4: deleteinstall, denies, calendar, clue, actspub, disaster, demean, disclosuresreports, aggressive, conceal
- Topic 5: denies, actspub, disguised, alwaleed, aggressive, deleteinstall, bxzja, brush, administrative, blizzard
- Topic 6: deleteinstall, disruption, denies, actspub, disaster, brush, disclosuresreports, dilute, disclosing, calendar
- Topic 7: alqaradawi, active, aag, buckle, customized, at, discovered, alarm, appellate, cripple
- Topic 8: deleteinstall, denies, disaster, actspub, collins, calendar, dem, betterment, borderwall, bought
- Topic 9: blm, disconcerting, actspub, denies, demean, advocate, clickonpreview