In [None]:
import os
import torch

# Export the installed torch build string as the TORCH environment variable so
# the shell-based pip commands in the next cell can pick the matching wheels.
torch_version = torch.__version__
os.environ['TORCH'] = torch_version
print(torch_version)

1.12.1+cu113


In [None]:
%%capture
# Install the PyG companion wheels from the index page that matches the torch
# build exported as the TORCH environment variable in the first cell, then
# PyG itself from the GitHub main branch, plus the metric/summary helpers.
# NOTE(review): none of these installs are version-pinned, so a fresh run may
# resolve to different releases than the ones this notebook was written against.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
!pip install torchmetrics
!pip install torchsummary

In [None]:
!pip install bertopic
!pip uninstall joblib
!pip install --upgrade joblib==1.1.0

In [None]:
import copy, pickle, ast
import networkx as nx
import numpy as np
import torch
import pandas as pd
import os.path as osp
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report
import torch_geometric.transforms as T
import torch.nn.functional as F
from torch_geometric.nn import GATv2Conv, Linear, to_hetero, to_hetero_with_bases, SAGEConv
from torch_geometric.data import HeteroData, InMemoryDataset, download_url
from torch_geometric.utils import negative_sampling, to_networkx
from torch_geometric.loader import GraphSAINTRandomWalkSampler, HGTLoader
from torch.utils.tensorboard import SummaryWriter
from torch.nn import Parameter, Embedding
from torchmetrics import Accuracy, F1Score
from torchsummary import summary
from bertopic import BERTopic
from collections import Counter

import matplotlib.pyplot as plt

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
FOLDER_PATH="/content/drive/MyDrive/Knowledge Graphs 2022/"
%cd $FOLDER_PATH

/content/drive/.shortcut-targets-by-id/18wZgIc4f2VuuQ34uv_vRrBAOHvMWSEaq/Knowledge Graphs 2022


In [None]:
# Load the pickled tweet-embedding dictionary and report how many entries it has.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open("KG embedding/tweet_embedding.pickle", "rb") as embedding_file:
  tweet_embedding_dict = pickle.load(embedding_file)

print(len(tweet_embedding_dict))

923385


In [None]:
all_tweets_df = pd.read_csv('data/all_tweets_50_topics.csv', index_col=0)
all_tweets_df

In [None]:
topic_model = BERTopic.load("topic modeling/qanon_tweets_50_topics.model")

# Class index k of the classifier maps to BERTopic topic id k. Build the ids in
# explicit ascending order: the previous `list(set(np.arange(50)))` depended on
# set iteration order, which Python does not guarantee.
topic_ids = list(range(50))
idx2topic = dict(enumerate(topic_ids))

# The first custom label is skipped so that topic_labels[k] lines up with topic
# id k (presumably entry 0 is the label of the outlier topic -1 - TODO confirm).
topic_labels = topic_model.custom_labels_[1:]
topic_labels

In [None]:
topic_model.visualize_hierarchy(topics=topic_ids, custom_labels=True, color_threshold=1.35)

In [None]:
# Lookup tables from the integer node index used in the graph back to the
# original tweet / user identifiers.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("KG embedding/idx2tweetId.pickle", 'rb') as tweet_map_file:
  idx2tweetId = pickle.load(tweet_map_file)

with open("KG embedding/idx2userId.pickle", 'rb') as user_map_file:
  idx2userId = pickle.load(user_map_file)

In [None]:
with open("KG embedding/dataset/data_dict_node_classification_50_topics.pickle", "rb") as f:
  data_dict = pickle.load(f)
data = HeteroData(data_dict)
data

In [None]:
data['tweet']

In [None]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Order matters: self loops and reverse edges are added first, the graph is
# moved to `device`, and only then are the train/val/test node masks drawn.
# NOTE(review): the random split is not seeded, so masks differ between runs.
preprocessing_steps = [
    T.AddSelfLoops(),
    T.ToUndirected(),
    T.ToDevice(device),
    T.RandomNodeSplit(num_val=0.1, num_test=0.2),  # 70/10/20 split
]
transform = T.Compose(preprocessing_steps)

transformed_data = transform(data)
transformed_data

In [None]:
# sum(transformed_data['tweet'].val_mask)
transformed_data['tweet'].idx

In [None]:
# Seed nodes must come from the same object that is being sampled. The masks are
# created by RandomNodeSplit on `transformed_data`; reading them from the raw
# `data` object only works when the transforms happen to mutate it in place.
tweet_store = transformed_data['tweet']
loader_kwargs = dict(num_samples=[1024]*4, shuffle=True, batch_size=1024, num_workers=2)

train_loader = HGTLoader(transformed_data, input_nodes=('tweet', tweet_store.train_mask), **loader_kwargs)
val_loader = HGTLoader(transformed_data, input_nodes=('tweet', tweet_store.val_mask), **loader_kwargs)
test_loader = HGTLoader(transformed_data, input_nodes=('tweet', tweet_store.test_mask), **loader_kwargs)

# Peek at one mini-batch to sanity-check the sampled sub-graph.
sampled_data = next(iter(train_loader))
print(sampled_data)

In [None]:
class GNN(torch.nn.Module):
    """Two GATv2 layers followed by a linear classifier.

    Input sizes are given as -1 so they are inferred lazily on the first
    forward pass. The parameter names of ``forward`` are kept as ``x`` and
    ``edge_index`` because the module is converted with
    ``to_hetero_with_bases`` elsewhere in this notebook.

    Args:
        hidden_channels: Width of the second GATv2 layer; the first layer is
            twice as wide.
        num_classes: Number of output classes.
    """

    def __init__(self, hidden_channels, num_classes):
        super().__init__()
        lazy_in_channels = (-1, -1)
        self.conv1 = GATv2Conv(lazy_in_channels, 2 * hidden_channels, add_self_loops=False)
        self.conv2 = GATv2Conv(lazy_in_channels, hidden_channels, add_self_loops=False)
        self.lin = Linear(-1, num_classes)

    def forward(self, x, edge_index):
        """Return per-node log-probabilities over the classes."""
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        hidden = self.conv2(hidden, edge_index)
        # Dropout is applied directly to the second conv output (no activation).
        hidden = F.dropout(hidden, p=0.6, training=self.training)
        logits = self.lin(hidden)
        return F.log_softmax(logits, dim=1)

In [None]:
num_classes = 50

In [None]:
@torch.no_grad()
def init_params(model):
  """Run one training mini-batch through ``model`` without gradients.

  The model is built with lazily-sized layers (-1 input channels), so a single
  forward pass is needed before its parameters can be handed to an optimizer.
  Reads the module-level ``train_loader`` and ``device``.
  """
  first_batch = next(iter(train_loader)).to(device)
  model(first_batch.x_dict, first_batch.edge_index_dict)

def train(model, optimizer):
  """Train ``model`` for one pass over the module-level ``train_loader``.

  Only the first ``batch_size`` tweet nodes of each sampled batch (the seed
  nodes) contribute to the loss. Labels are read as the argmax over the last
  dimension of ``y``.

  Returns:
    The NLL loss averaged over all seed nodes seen in this pass.
  """
  model.train()

  seen = 0
  loss_sum = 0
  for batch in tqdm(train_loader):
    optimizer.zero_grad()
    batch = batch.to(device)
    n_seed = batch['tweet'].batch_size

    tweet_out = model(batch.x_dict, batch.edge_index_dict)['tweet']
    labels = batch['tweet'].y[:n_seed].argmax(dim=-1)
    loss = F.nll_loss(tweet_out[:n_seed], labels)

    loss.backward()
    optimizer.step()

    seen += n_seed
    loss_sum += float(loss) * n_seed

  return loss_sum / seen

@torch.no_grad()
def test(model, loader):
  """Evaluate ``model`` on ``loader`` and return per-topic metrics.

  Args:
    model: Heterogeneous GNN returning a dict of per-node-type outputs.
    loader: Mini-batch loader whose seed nodes are tweets.

  Returns:
    dict mapping each topic label to a dict with sklearn's ``precision``,
    ``recall``, ``f1-score`` and ``support`` plus ``accuracy``, which here is
    the confusion-matrix diagonal divided by the row sum (correct predictions
    over true members of that class). Summary rows are removed.
  """
  model.eval()
  preds, targets = [], []
  for batch in tqdm(loader):
    batch = batch.to(device)
    batch_size = batch['tweet'].batch_size
    pred = model(batch.x_dict, batch.edge_index_dict)['tweet'][:batch_size].argmax(dim=-1)
    target = batch['tweet'].y[:batch_size].argmax(dim=-1)
    preds.append(pred)
    targets.append(target)

  preds = torch.cat(preds).cpu().numpy()
  targets = torch.cat(targets).cpu().numpy()

  # Pass the full label set explicitly. Without it sklearn derives the classes
  # from the data, so a class absent from this pass makes `target_names`
  # mismatch and shrinks the confusion matrix below num_classes rows.
  class_ids = np.arange(num_classes)
  class_names = [topic_labels[idx2topic[k]] for k in class_ids]
  report = classification_report(targets, preds, labels=class_ids, target_names=class_names, digits=4, output_dict=True)
  matrix = confusion_matrix(targets, preds, labels=class_ids)

  # Classes with no true samples get accuracy 0.0 instead of a 0/0 NaN.
  support = matrix.sum(axis=1)
  per_class_acc = np.divide(matrix.diagonal(), support, out=np.zeros(num_classes, dtype=float), where=support > 0)

  # Drop the summary rows; depending on the sklearn version and the labels
  # present, the overall row is called either 'accuracy' or 'micro avg'.
  for summary_key in ('accuracy', 'micro avg', 'macro avg', 'weighted avg'):
    report.pop(summary_key, None)

  for k, name in enumerate(class_names):
    report[name]['accuracy'] = per_class_acc[k]
  return report

@torch.no_grad()
def top_k_acc(model, loader, n):
  """Compute top-``n`` accuracy of ``model`` over the tweet seed nodes of ``loader``.

  A prediction counts as correct when the true class is among the ``n``
  highest-scoring classes. The count is computed directly with ``torch.topk``
  rather than ``torchmetrics.Accuracy(top_k=n)``: torchmetrics is installed
  unpinned in this notebook and newer releases require a ``task`` argument,
  which makes the bare constructor call fail.

  Args:
    model: Heterogeneous GNN returning a dict of per-node-type outputs.
    loader: Mini-batch loader whose seed nodes are tweets.
    n: Number of top-scoring classes to consider.

  Returns:
    0-dim float tensor (on the model's device) with the fraction of correct
    seed nodes.
  """
  model.eval()
  total_examples = total_correct = 0
  for batch in tqdm(loader):
    batch = batch.to(device)
    batch_size = batch['tweet'].batch_size
    out = model(batch.x_dict, batch.edge_index_dict)['tweet'][:batch_size]
    target = batch['tweet'].y[:batch_size].argmax(dim=-1)

    top_n = out.topk(n, dim=-1).indices
    hits = (top_n == target.unsqueeze(-1)).any(dim=-1)

    total_examples += batch_size
    total_correct = total_correct + hits.sum()

  return total_correct / total_examples


@torch.no_grad()
def predict(model, loader, n):
  """Predict the top-``n`` topic labels for every tweet seed node in ``loader``.

  Args:
    model: Heterogeneous GNN returning a dict of per-node-type outputs.
    loader: Mini-batch loader whose seed nodes are tweets.
    n: Number of highest-scoring topics to return per tweet.

  Returns:
    Tuple ``(pred_labels, target_labels, tweet_ids)``:
      pred_labels: list of length-``n`` lists of topic labels, best first.
      target_labels: list of single-element lists holding the label at the
        argmax of ``y``.
      tweet_ids: original tweet ids looked up through ``idx2tweetId``.
  """
  model.eval()
  pred_idx = []
  target_idx = []
  tweet_indices = []
  for batch in tqdm(loader):
    batch = batch.to(device)
    batch_size = batch['tweet'].batch_size

    out = model(batch.x_dict, batch.edge_index_dict)['tweet'][:batch_size]
    target = batch['tweet'].y[:batch_size]

    # Reduce to class indices per batch and move them to the CPU right away:
    # top-k is row-wise, so this equals doing it on the concatenated output
    # but avoids keeping every score matrix on the device.
    pred_idx.append(torch.topk(out, n, dim=1).indices.cpu())
    target_idx.append(torch.topk(target, 1, dim=1).indices.cpu())
    tweet_indices.append(batch['tweet'].idx[:batch_size].cpu())

  # Convert once to plain Python lists instead of calling .item() per element.
  pred_idx = torch.cat(pred_idx, dim=0).tolist()
  target_idx = torch.cat(target_idx, dim=0).tolist()
  tweet_indices = torch.cat(tweet_indices).tolist()

  pred_labels = [[topic_labels[idx2topic[c]] for c in row] for row in pred_idx]
  target_labels = [[topic_labels[idx2topic[c]] for c in row] for row in target_idx]

  tweet_ids = [idx2tweetId[int(i)] for i in tweet_indices]

  return pred_labels, target_labels, tweet_ids

In [None]:
writer = SummaryWriter('KG embedding/runs/node classification/09-12-13:50')

In [None]:
#@title
def train_gnn(config):
  """Train a fresh model with the given hyper-parameters and track validation accuracy.

  Args:
    config: dict with keys ``"lr"`` (Adam learning rate) and ``"epoch"``
      (number of epochs to train).

  Returns:
    List with one top-1 validation accuracy per epoch, as returned by
    ``top_k_acc``.

  Side effects:
    Logs loss and accuracies to the module-level ``writer`` and prints one
    line per epoch. Every call logs under the same tags, so successive runs
    share the same TensorBoard curves.
  """
  acc = []
  # Build the hetero model from the metadata of the graph it is trained on
  # (`transformed_data`, which the loaders sample from) so the edge types added
  # by the preprocessing transforms are always covered.
  model = to_hetero_with_bases(GNN(hidden_channels=64, num_classes=num_classes), transformed_data.metadata(), num_bases=2).to(device)
  init_params(model)
  optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
  for epoch in range(1, config["epoch"]+1):
    loss = train(model, optimizer)
    train_acc = top_k_acc(model, train_loader, 1)
    val_acc = top_k_acc(model, val_loader, 1)
    top_3_acc = top_k_acc(model, val_loader, 3)
    top_5_acc = top_k_acc(model, val_loader, 5)
    writer.add_scalar('training loss', loss, epoch)
    writer.add_scalar('training accuracy', train_acc, epoch)
    writer.add_scalar('validation accuracy', val_acc, epoch)
    writer.add_scalar('top 3 accuracy', top_3_acc, epoch)
    writer.add_scalar('top 5 accuracy', top_5_acc, epoch)

    print(f"Epoch {epoch:02d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}, Top 3 Acc: {top_3_acc:.4f}, Top 5 Acc: {top_5_acc:.4f}")
    acc.append(val_acc)

  return acc

In [None]:
# Grid search: for each epoch budget, train once per learning rate and keep the
# per-epoch validation accuracy curve. accs[j][i] is the curve for the j-th
# epoch budget and the i-th learning rate.
accs = []
for epoch in [10, 20]:
  accs_epoch = []
  for lr in np.linspace(0.001, 0.01, 8):
    print(f"INFO: current hyperparameters: lr={lr}, epoch={epoch}")
    acc = train_gnn({"lr": lr, "epoch": epoch})
    val_curve = torch.stack(acc).cpu()
    accs_epoch.append(val_curve.numpy())
    print(f"INFO: the average accuracy: {torch.mean(val_curve)}")
    print()
  accs.append(accs_epoch)


In [None]:
# One panel per epoch budget, one dash-dotted line per learning rate.
learning_rates = np.linspace(0.001, 0.01, 8)
plt.figure(figsize=(20, 8))
for panel in (1, 2):
  plt.subplot(1, 2, panel)
  curves = accs[panel - 1]
  for lr_pos in range(8):
    plt.plot(list(curves[lr_pos]), label=learning_rates[lr_pos], linestyle='-.')
  plt.legend()

In [None]:
# Final training run with the learning rate chosen from the sweep.
writer = SummaryWriter('KG embedding/runs/node classification/09-12-15:02')
# Use the metadata of the graph the loaders sample from, so the model covers
# the edge types added by the preprocessing transforms.
model = to_hetero_with_bases(GNN(hidden_channels=64, num_classes=num_classes), transformed_data.metadata(), num_bases=2).to(device)
init_params(model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# The loop previously used range(1, 20), i.e. 19 epochs, while train_gnn()
# runs range(1, epochs + 1). Spell the budget out so both agree on 20.
NUM_EPOCHS = 20

accs = []
top_3_accs = []
top_5_accs = []
for epoch in range(1, NUM_EPOCHS + 1):
  loss = train(model, optimizer)
  train_acc = top_k_acc(model, train_loader, 1)
  val_acc = top_k_acc(model, val_loader, 1)
  top_3_acc = top_k_acc(model, val_loader, 3)
  top_5_acc = top_k_acc(model, val_loader, 5)
  accs.append(val_acc.item())
  top_3_accs.append(top_3_acc.item())
  top_5_accs.append(top_5_acc.item())
  writer.add_scalar('training loss', loss, epoch)
  writer.add_scalar('training accuracy', train_acc, epoch)
  writer.add_scalar('validation accuracy', val_acc, epoch)
  writer.add_scalar('top 3 accuracy', top_3_acc, epoch)
  writer.add_scalar('top 5 accuracy', top_5_acc, epoch)

  print(f"Epoch {epoch:02d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}, Top 3 Acc: {top_3_acc:.4f}, Top 5 Acc: {top_5_acc:.4f}")

In [None]:
print(top_k_acc(model, test_loader, 1))
print(top_k_acc(model, test_loader, 3))
print(top_k_acc(model, test_loader, 5))

In [None]:
# Evaluate ten times (neighbour sampling makes each pass slightly different)
# and average every per-topic metric over the ten reports.
reports = [test(model, test_loader) for _ in range(10)]

metric_names = ['accuracy', 'f1-score', 'precision', 'recall', 'support']
avg_report = {}
for topic in reports[0].keys():
  avg_report[topic] = {
      metric: sum(single_run[topic][metric] for single_run in reports)/10
      for metric in metric_names
  }

pd.DataFrame.from_dict(avg_report)

In [None]:
# Side-by-side bars of averaged accuracy and F1 per topic, with a dashed red
# line at the 0.7 threshold used below to pick the "accurate" topics.
avg_acc = [stats['accuracy'] for stats in avg_report.values()]
avg_f1 = [stats['f1-score'] for stats in avg_report.values()]

barwidth = 0.4
br = np.arange(1, len(avg_acc)+1)
br2 = [pos + barwidth for pos in br]  # F1 bars sit right next to the accuracy bars

plt.figure(figsize=(16, 8))
plt.bar(br, avg_acc, width=barwidth, label='Accuracy')
plt.bar(br2, avg_f1, width=barwidth, label='F1-score')
plt.xticks(br, rotation=45)
plt.hlines(y=0.7, xmin=0.5, xmax=num_classes+1.5, color='r', ls='--')
plt.legend()

In [None]:
# Topics whose averaged F1-score reaches at least 0.7.
accurate_topics = [
    topic
    for topic, metrics in avg_report.items()
    if metrics['f1-score'] >= 0.7
]

accurate_topics

In [None]:
torch.save(model.state_dict(), "KG embedding/ndoe_class_50_model_weights_9_12.pth")

In [None]:
writer.flush()
writer.close()

In [None]:
# !pip3 install tensorboard
%load_ext tensorboard

In [None]:
%tensorboard --logdir=/content/drive/MyDrive/Knowledge\ Graphs\ 2022/KG\ embedding/runs/node\ classification/09-12-15:02

In [None]:
preds, targets, tweet_ids = predict(model, test_loader, 3)

In [None]:
# Print confidently-labelled tweets whose ground truth is the first topic.
# Rows are fetched through the index instead of a boolean scan of the whole
# frame per tweet; keeping the first row per id mirrors the old `.values[0]`.
tweet_lookup = all_tweets_df[~all_tweets_df.index.duplicated(keep='first')]
for idx, tweet_id in enumerate(tweet_ids):
  if tweet_id not in tweet_lookup.index:
    # The old code raised IndexError on `.values[0]` for an unknown id.
    continue
  tweet = tweet_lookup.loc[tweet_id]
  if (tweet['topic_probability'] >= 0.5) and (targets[idx] == [topic_labels[0]]):
    print(f"tweet: {tweet['text']}")
    print(f"prediction: {preds[idx]}")
    print(f"ground truth: {targets[idx]}")
    print()

In [None]:
# Read one user id per line. splitlines() plus the blank-line filter works with
# or without a trailing newline; the old `split('\n')[:-1]` silently dropped the
# last user when the file did not end in a newline.
with open('bot detection/potential_bot_users.list', 'r') as f:
  bot_user_list = [int(line) for line in f.read().splitlines() if line.strip()]

# Remove every tweet written by a suspected bot account. The copy makes the
# result an independent frame, so later in-place edits do not hit a view.
bot_tweets_id_list = all_tweets_df[all_tweets_df['user_id'].isin(bot_user_list)].index.tolist()
all_tweets_df = all_tweets_df[~all_tweets_df.index.isin(bot_tweets_id_list)].copy()

### Prediction

In [None]:
# model.load_state_dict(torch.load("KG embedding/ndoe_class_50_model_weights_9_12.pth"))
# model.eval()

In [None]:
# accurate_topics = ['the vaccine|get vaccinated|to get',
#  'yes yes|yes yes yes|yes he',
#  'titus ray|great idea|ray thrillers',
#  'the virus|to china|the chinese',
#  'wear mask|to wear|the cdc',
#  'stories via|press is out|press is',
#  'fake news|just completed minutes|meditation with',
#  'an idiot|she is|who cares',
#  'just posted|just posted photo|posted photo']

In [None]:
# Replace the index maps with the ones that belong to the prediction graph.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("KG embedding/idx2tweetId_prediction.pickle", 'rb') as tweet_map_file:
  idx2tweetId = pickle.load(tweet_map_file)

with open("KG embedding/idx2userId_prediction.pickle", 'rb') as user_map_file:
  idx2userId = pickle.load(user_map_file)

In [None]:
with open("KG embedding/dataset/data_dict_node_classification_50_topics_prediction.pickle", "rb") as f:
  data_dict = pickle.load(f)
data = HeteroData(data_dict)
data

In [None]:
data['tweet']

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Same preprocessing as for training, minus the random node split.
transform = T.Compose([T.AddSelfLoops(), T.ToUndirected(), T.ToDevice(device)])
data = transform(data)

# Every tweet is a prediction target, so the mask is all True.
num_tweets = data["tweet"].idx.shape[0]
data["tweet"].test_mask = torch.ones(num_tweets, dtype=torch.bool, device=device)
data

In [None]:
predict_data_loader = HGTLoader(data, num_samples=[1024]*4, shuffle=True, batch_size=1024, input_nodes=('tweet', data['tweet'].test_mask), num_workers=2)
sampled_data = next(iter(predict_data_loader))
print(sampled_data)

In [None]:
preds, targets, tweet_ids = predict(model, predict_data_loader, 1)

100%|██████████| 902/902 [02:22<00:00,  6.31it/s]


In [None]:
tweets_no_topics = all_tweets_df[all_tweets_df['topic'] == -1]
tweets_no_topics

In [None]:
# Assign a topic to tweets that had none (-1) when the model's top prediction
# is one of the topics it classifies reliably.
# Index / set / dict membership replaces a boolean scan of the whole frame and
# a linear `list.index` call per tweet.
unlabelled_index = tweets_no_topics.index
accurate_topic_set = set(accurate_topics)
label_to_topic = {}
for topic_idx, label in enumerate(topic_labels):
  label_to_topic.setdefault(label, topic_idx)  # first occurrence, like list.index

recovered_tweets = {}
for tweet_id, pred in tqdm(zip(tweet_ids, preds), total=len(tweet_ids)):
  top_label = pred[0]
  if (tweet_id in unlabelled_index) and (top_label in accurate_topic_set):
    recovered_tweets[tweet_id] = label_to_topic[top_label]

In [None]:
Counter(recovered_tweets.values())

In [None]:
# Tweet counts per completed topic before and after topic recovery.
# The labels live in their own list: assigning them to `topic_labels` replaced
# the global 50-entry label list that test()/predict() index into.
# NOTE: `all_tweets_df_recovered_topics` is loaded in a cell further down, so
# this cell only runs after that one has been executed.
completed_topic_ids = [0, 13, 24, 31, 39, 44, 48]
completed_topic_labels = ['the election|voter fraud|to vote',
 'wear mask|to wear|wearing mask',
 'yes yes|yes yes yes|yes he',
 'yasss its|yasss its time|its time for',
 'the truth|truth is|thank you',
 'the follow|for the follow|biz get paid',
 'to win|gun control|just entered']
for topic_id, topic_label in zip(completed_topic_ids, completed_topic_labels):
  print(f"Topic with keywords: {topic_label}")
  print(f"before completion: {len(all_tweets_df[all_tweets_df['topic'] == topic_id])}")
  print(f"after completion: {len(all_tweets_df_recovered_topics[all_tweets_df_recovered_topics['topic'] == topic_id])}")

In [None]:
# Share of tweets with a topic before and after recovery, then the absolute gain.
total_tweets = len(all_tweets_df)
labelled_before = len(all_tweets_df[all_tweets_df['topic'] != -1])
labelled_after = len(all_tweets_df_recovered_topics[all_tweets_df_recovered_topics['topic'] != -1])

print(labelled_before/total_tweets)
print(labelled_after/total_tweets)
print(labelled_after - labelled_before)

In [None]:
len(recovered_tweets.values())

In [None]:
# Write the recovered topics back in one vectorised assignment instead of
# mutating the frame row by row while iterating over it with iterrows().
recovered_mask = all_tweets_df.index.isin(list(recovered_tweets))
if recovered_mask.any():
  all_tweets_df.loc[recovered_mask, 'topic'] = [
      recovered_tweets[tweet_id] for tweet_id in all_tweets_df.index[recovered_mask]
  ]
    

In [None]:
all_tweets_df[all_tweets_df['topic'] == -1]

In [None]:
# Drop the topic probability column and persist the frame with recovered topics.
# NOTE: re-running this cell fails, because the column is already gone.
all_tweets_df = all_tweets_df.drop(columns="topic_probability")
all_tweets_df.to_csv("data/tweets_w_recovered_50_topics.csv")

In [None]:
# Ids of all tweets whose predicted labels contain the first topic.
# The previous loop also looked each tweet up in `all_tweets_df` with a full
# boolean scan, but the result was only used by commented-out prints.
target_label = topic_labels[0]
selected_tweets = [
    tweet_id
    for tweet_id, pred in zip(tweet_ids, preds)
    if target_label in pred
]

print(len(selected_tweets))

In [None]:
selected_df = all_tweets_df[all_tweets_df.index.isin(selected_tweets)]
selected_df = selected_df.fillna('')

In [None]:
all_tweets_df_recovered_topics = pd.read_csv("data/tweets_w_recovered_50_topics.csv", index_col=0)
all_tweets_df_recovered_topics

In [None]:
# Pie chart: tweets without a topic (-1) versus tweets with any topic.
# The counts are taken straight from the column. The old code built a set of
# topic ids and assumed -1 was its last element, which set ordering does not
# guarantee; it also overwrote the global `topic_ids`.
topic_column = all_tweets_df['topic']
n_without_topic = int((topic_column == -1).sum())
n_with_topic = int((topic_column.notna() & (topic_column != -1)).sum())

plt.rcParams['font.size'] = '30'
fig1, ax1 = plt.subplots(figsize=(10, 10))
labels = ['topic -1', 'topic 1 ~ 50']
counts = [n_without_topic, n_with_topic]
ax1.pie(counts, labels=labels, autopct='%1.1f%%',
        shadow=False, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

In [None]:
# Tweets that carry a topic after recovery / in the frame currently held in
# `all_tweets_df`, and the total number of tweets in that frame.
has_topic_after = all_tweets_df_recovered_topics['topic'] != -1
has_topic_before = all_tweets_df['topic'] != -1

n_total_topics = int(has_topic_after.sum())
n_original_topics = int(has_topic_before.sum())
n_topics = all_tweets_df.shape[0]

In [None]:
# Pie chart splitting all tweets into: still without topic, topic from the
# original model, and topic added by the completion step.
still_unlabelled = n_topics - n_total_topics
newly_completed = n_total_topics - n_original_topics

new_labels = ['topic -1', 'original', 'completed']
new_counts = [still_unlabelled, n_original_topics, newly_completed]

fig1, ax1 = plt.subplots(figsize=(10, 10))
ax1.pie(new_counts, labels=new_labels, autopct='%1.1f%%',
        shadow=False, startangle=90)
ax1.axis('equal')  # keeps the pie circular

plt.show()

In [None]:
all_tweets_df = pd.read_csv('data/all_tweets_50_topics.csv', index_col=0)
all_tweets_df

In [None]:
# Show up to ten tweets that had no topic originally and were assigned topic 0,
# each followed by all tweets of the same user.
# `all_tweets_df` was just reloaded from the full CSV, whereas the recovered
# CSV was written after the bot tweets were removed. Ids missing from the
# recovered frame are skipped; the old `.at[idx, ...]` raised KeyError on them.
max_examples = 10
recovered_topic = all_tweets_df_recovered_topics['topic']
outlier_ids = all_tweets_df.index[all_tweets_df['topic'] == -1]

count = 0
for idx in outlier_ids:
  if count == max_examples:
    break
  if idx not in recovered_topic.index:
    continue
  if recovered_topic.at[idx] != 0:
    continue
  print(all_tweets_df_recovered_topics.at[idx, 'text'])
  print(topic_labels[recovered_topic.at[idx]])
  display(all_tweets_df_recovered_topics[all_tweets_df_recovered_topics['user_id'] == all_tweets_df_recovered_topics.at[idx, 'user_id']])
  count += 1

In [None]:
# For every user, count the tweets that had no topic originally (-1) but have
# one in the recovered frame.
user_recovered_topics = dict()
for idx, row in tqdm(all_tweets_df_recovered_topics.iterrows(), total=len(all_tweets_df_recovered_topics)):
  newly_labelled = all_tweets_df.at[idx, 'topic'] == -1 and all_tweets_df_recovered_topics.at[idx, 'topic'] != -1
  if not newly_labelled:
    continue
  user = all_tweets_df.at[idx, 'user_id']
  user_recovered_topics[user] = user_recovered_topics.get(user, 0) + 1

##### example 1

In [None]:
# Users ordered by how many of their tweets received a recovered topic.
sorted_user_recovered_topics = sorted(user_recovered_topics.items(), key=lambda x: x[1], reverse=True)

# One user id per line; splitlines() plus the blank-line filter keeps the last
# id even when the file has no trailing newline (`split('\n')[:-1]` did not).
with open("bot detection/potential_bot_users.list", 'r') as f:
  bot_users = [int(line) for line in f.read().splitlines() if line.strip()]
bot_user_set = set(bot_users)  # constant-time membership for the filter below

sorted_user_recovered_topics = [(user, frequency) for user, frequency in sorted_user_recovered_topics if user not in bot_user_set]

# Example user: the most active one with at most 30 recovered tweets.
filtered_users = [(user, frequency) for user, frequency in sorted_user_recovered_topics if frequency <= 30]
filtered_tweets = []
for idx, row in all_tweets_df[all_tweets_df['user_id'] == filtered_users[0][0]].iterrows():
  if (row['topic'] == -1) and (all_tweets_df_recovered_topics.at[idx, 'topic'] != -1):
      filtered_tweets.append(idx)

# Named `recovered_example` rather than `test` so the test() evaluation function
# is not overwritten; .copy() makes the column assignment act on an independent
# frame instead of a slice of the recovered frame.
recovered_example = all_tweets_df_recovered_topics[all_tweets_df_recovered_topics.index.isin(filtered_tweets)].copy()
recovered_example['original_topic'] = all_tweets_df[all_tweets_df.index.isin(filtered_tweets)]['topic']
recovered_example

##### example 2

In [None]:
# Second example user from `filtered_users`: collect the tweets that had no
# topic originally but have one in the recovered frame.
filtered_tweets = []
for idx, row in all_tweets_df[all_tweets_df['user_id'] == filtered_users[1][0]].iterrows():
  if (row['topic'] == -1) and (all_tweets_df_recovered_topics.at[idx, 'topic'] != -1):
      filtered_tweets.append(idx)

# Named `recovered_example` rather than `test` so the test() evaluation function
# is not overwritten; .copy() makes the column assignment act on an independent
# frame instead of a slice of the recovered frame.
recovered_example = all_tweets_df_recovered_topics[all_tweets_df_recovered_topics.index.isin(filtered_tweets)].copy()
recovered_example['original_topic'] = all_tweets_df[all_tweets_df.index.isin(filtered_tweets)]['topic']
recovered_example

##### example 3

In [None]:

# Third example: this time pick from users with at least 30 recovered tweets
# (the second entry of that list).
filtered_users = [(user, frequency) for user, frequency in sorted_user_recovered_topics if frequency >= 30]
filtered_tweets = []
for idx, row in all_tweets_df[all_tweets_df['user_id'] == filtered_users[1][0]].iterrows():
  if (row['topic'] == -1) and (all_tweets_df_recovered_topics.at[idx, 'topic'] != -1):
      filtered_tweets.append(idx)

# Named `recovered_example` rather than `test` so the test() evaluation function
# is not overwritten; .copy() makes the column assignment act on an independent
# frame instead of a slice of the recovered frame.
recovered_example = all_tweets_df_recovered_topics[all_tweets_df_recovered_topics.index.isin(filtered_tweets)].copy()
recovered_example['original_topic'] = all_tweets_df[all_tweets_df.index.isin(filtered_tweets)]['topic']
recovered_example

In [None]:
print(len(all_tweets_df[all_tweets_df['topic'] == 0]))
print(len(all_tweets_df_recovered_topics[all_tweets_df_recovered_topics['topic'] == 0]))

10559
57855


In [None]:
import re

# Any run of non-letters becomes a single space; leading/trailing whitespace is
# removed by the split/join round trip.
non_letters = re.compile("[^a-zA-Z]+")

def _letters_only(text):
  """Return ``text`` with every non-letter run collapsed to one space."""
  return " ".join(non_letters.sub(" ", text).split())

# Tweets assigned to topic 0 after recovery; missing values become '' so the
# regex always receives a string.
topic_0_tweets = all_tweets_df_recovered_topics[all_tweets_df_recovered_topics['topic'] == 0].fillna('')
topic_0_tweets['cleaned_text'] = [_letters_only(text) for text in topic_0_tweets['cleaned_text']]

# Keep one row per distinct cleaned text; these are the documents for BERTopic.
topic_0_tweets = topic_0_tweets.drop_duplicates(subset='cleaned_text')
docs = topic_0_tweets['cleaned_text'].tolist()
print(len(docs))

In [None]:
topic_model = BERTopic(language='multilingual', n_gram_range=(2, 3), min_topic_size=50, nr_topics=20, calculate_probabilities=True)
topics, probs = topic_model.fit_transform(docs)

In [None]:
topic_model.set_topic_labels(topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, word_length=None, separator='|'))
topic_model.visualize_hierarchy(custom_labels=True, top_n_topics=20)

In [None]:
topic_model.visualize_barchart(n_words=10, top_n_topics=20)

In [None]:
topics_over_time = topic_model.topics_over_time(docs, topic_0_tweets['time'].tolist())
topic_model.visualize_topics_over_time(topics_over_time)