In [None]:
%%capture
# Install BERTopic into the runtime; the %%capture magic on the first line hides pip's output.
!pip install bertopic

In [None]:
import pickle, ast
import networkx as nx
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm
import json

from bertopic import BERTopic

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder; the file paths used below are relative to it.
%cd /content/drive/MyDrive/Knowledge\ Graphs\ 2022/

/content/drive/.shortcut-targets-by-id/18wZgIc4f2VuuQ34uv_vRrBAOHvMWSEaq/Knowledge Graphs 2022


In [None]:
path = "KG construction/"

# NOTE(review): pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open(path+'all_entities.pickle', 'rb') as entities_file:
    all_entities = pickle.load(entities_file)

with open(path+'all_relations.pickle', 'rb') as relations_file:
    all_relations = pickle.load(relations_file)

# For every entity category: positional index -> entity value.
ind2entity = {
    category: dict(enumerate(all_entities[category]))
    for category in all_entities
}

# "<running index>\t<entity>" lines; users are numbered first, hashtags continue the count.
entities = [
    f'{position}\t{member}'
    for position, member in enumerate(
        member
        for category in ['user_ids', 'hashtags']
        for member in all_entities[category]
    )
]

# NOTE(review): the ids used here are 0, 1, 3 (no 2) and do not line up with the
# relation_types mapping defined further down in this notebook — confirm this is intended.
relations = ['0\treplied_to', '1\thashtagged', '3\tmentioned']

# Collects unique "<source>\t<relation>\t<target>" strings; filled by the following cells.
big_set = set()

In [None]:
user_lookup = ind2entity['user_ids']
tweet_lookup = ind2entity['tweet_ids']

# user -> tweet edges followed by tweet -> user edges, both translated from indices to values.
edges = [(user_lookup[rel[0]], tweet_lookup[rel[1]]) for rel in all_relations['user->tweet']]
edges += [(tweet_lookup[rel[0]], user_lookup[rel[1]]) for rel in all_relations['tweet->user']]

G = nx.DiGraph(edges)

for mentioned_user in all_entities['user_ids']:
  # Sources of the edges that point at this user.
  mentioning_tweets = [source for source, _ in G.in_edges(mentioned_user)]
  for tweet in mentioning_tweets:
    # The source of the tweet's first incoming edge becomes the subject of the triple.
    incoming = list(G.in_edges(tweet))
    big_set.add(f'{incoming[0][0]}\tmentioned\t{mentioned_user}')

In [None]:
tweet_lookup = ind2entity['tweet_ids']
user_lookup = ind2entity['user_ids']

# (tweet, tweet) pairs from the tweet->tweet relation, translated from indices to values.
edges = [(tweet_lookup[rel[0]], tweet_lookup[rel[1]]) for rel in all_relations['tweet->tweet']]

# tweet -> user lookup built from the user->tweet relation; when a tweet appears more
# than once, the last pair wins.
tweet_to_user_edges = {}
for rel in all_relations['user->tweet']:
  tweet_to_user_edges[tweet_lookup[rel[1]]] = user_lookup[rel[0]]

# Replace both tweets of every pair by their users and record the triple.
for first_tweet, second_tweet in edges:
  big_set.add(f'{tweet_to_user_edges[first_tweet]}\treplied_to\t{tweet_to_user_edges[second_tweet]}')

In [None]:
# Graph with user -> tweet edges and tweet -> hashtag edges, translated from indices to values.
edges = [(ind2entity['user_ids'][rel[0]], ind2entity['tweet_ids'][rel[1]]) for rel in all_relations['user->tweet']] + \
        [(ind2entity['tweet_ids'][rel[0]], ind2entity['hashtags'][rel[1]]) for rel in all_relations['tweet->hashtag']]

G = nx.DiGraph(edges)

for hashtag in all_entities['hashtags']:
  # Skip hashtags that are not nodes of the graph. Handing a missing *string* to
  # G.in_edges() does not fail: networkx treats it as a container of nodes and walks
  # its characters, which could attribute the tweets of other (one-character)
  # hashtags to this one.
  if hashtag not in G:
    continue
  # Predecessors of a hashtag are the sources of its incoming edges (the tweets).
  for tweet in G.predecessors(hashtag):
    # The first predecessor of the tweet becomes the subject of the triple
    # (IndexError if the tweet has no incoming edge, as before).
    tweet_sources = list(G.predecessors(tweet))
    big_set.add(f'{tweet_sources[0]}\thashtagged\t{hashtag}')

In [None]:
# Split every "<source>\t<relation>\t<target>" string into its tab-separated fields.
# NOTE: this rebinds all_relations (until now the object loaded from all_relations.pickle),
# so the cells above cannot be re-run without loading the pickle again first.
all_relations = [triple.split('\t') for triple in big_set]

# Print the size of the split list and of the source set.
for collection in (all_relations, big_set):
  print(len(collection))

732373
732373


In [None]:
# Integer code of every relation type (position in the list is the code).
relation_types = {
  name: code
  for code, name in enumerate(['replied_to', 'mentioned', 'hashtagged', 'discussed'])
}
# Integer code of every node type (position in the list is the code).
node_types = {
  name: code
  for code, name in enumerate(['user', 'entity', 'hashtag', 'topic'])
}

In [None]:
# One link record per triple: fields 0 and 2 are the endpoints, field 1 names the relation
# and is stored as its integer code.
links = [
  {"source": fields[0], "target": fields[2], "group": relation_types[fields[1]]}
  for fields in all_relations
]

In [None]:
# Read one bot user id per line. Blank lines are skipped rather than blindly dropping the
# last split element, so the final id is kept even when the file has no trailing newline.
with open("bot detection/systematic_bot_users.list", 'r') as f:
  bot_users = [int(line) for line in f if line.strip()]

In [None]:
all_tweets_df = pd.read_csv('data/tweets_w_recovered_topics.csv')

# Every distinct value of the user_id column.
tweet_authors = set(all_tweets_df['user_id'])

# Ids listed in all_entities['user_ids'] that never occur in the user_id column
# (computed before the bot ids are removed).
entity = set(all_entities['user_ids']) - tweet_authors

# The remaining users once the ids in bot_users are taken out.
users = tweet_authors.difference(bot_users)

print(len(users))
print(len(entity))

11451
257131


In [None]:
tweets_w_interaction = pd.read_csv('social network/tweets_w_interaction.csv')
# Turn every cell of the 'interaction' column into a Python object via ast.literal_eval.
tweets_w_interaction['interaction'] = tweets_w_interaction['interaction'].apply(ast.literal_eval)

# Flatten the per-row collections into a single list.
interaction_entities = []
for interactions in tweets_w_interaction['interaction']:
  interaction_entities.extend(interactions)

# Keep the values that occur more than 10 times.
interaction_counts = Counter(interaction_entities)
filtered_interaction_entities = [
  entity_id for entity_id, freq in interaction_counts.items() if freq > 10
]
print(len(filtered_interaction_entities))

16034


In [None]:
# Read one user id per line. Blank lines are skipped rather than blindly dropping the
# last split element, so the final id is kept even when the file has no trailing newline.
with open("social network/top_comm_users.list", 'r') as f:
  total_users = [int(line) for line in f if line.strip()]

# Keep only the users / interaction entities that are also listed in the file.
users = users.intersection(total_users)
filtered_interaction_entities = set(filtered_interaction_entities).intersection(total_users)
print(len(users))
print(len(filtered_interaction_entities))

3014
2494


In [None]:
nodes = []
candidate_ids = all_entities['user_ids']
for candidate in tqdm(candidate_ids, total=len(candidate_ids)):
  # Membership in users takes precedence over filtered_interaction_entities;
  # ids found in neither collection produce no node.
  if candidate in users:
    group = node_types["user"]
  elif candidate in filtered_interaction_entities:
    group = node_types["entity"]
  else:
    continue
  nodes.append({"id": str(candidate), "group": group})

100%|██████████| 268663/268663 [00:00<00:00, 1842327.84it/s]


In [None]:
# Number of nodes collected so far.
print(len(nodes))

5368


In [None]:
# Parse the 'hashtags' column with ast.literal_eval. The column is overwritten in place, so
# values that are no longer strings (e.g. when this cell is re-run) are passed through
# unchanged instead of being handed to ast.literal_eval again, which would raise on them.
all_tweets_df['hashtags'] = all_tweets_df['hashtags'].apply(
  lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

# Flatten the per-row collections into one list.
raw_hashtags = [tag for hashtags in all_tweets_df['hashtags'] for tag in hashtags]

# Keep the hashtags that occur more than 10 times.
filtered_hashtags = [tag for tag, freq in Counter(raw_hashtags).items() if freq > 10]
print(len(filtered_hashtags))

3607


In [None]:
# Look hashtags up in a set: testing membership against the filtered_hashtags list
# rescans the whole list for every one of the hashtags in the loop.
frequent_hashtags = set(filtered_hashtags)
for tag in tqdm(all_entities['hashtags'], total=len(all_entities['hashtags'])):
  if tag in frequent_hashtags:
    nodes.append({"id": tag, "group": node_types["hashtag"]})

100%|██████████| 66167/66167 [00:03<00:00, 17164.72it/s]


In [None]:
# Number of nodes after the hashtag nodes were appended.
print(len(nodes))

8950


In [None]:
# The "id" value of every node, in node order.
filtered_entities = [entry['id'] for entry in nodes]
print(len(filtered_entities))

8950


In [None]:
# A set makes each endpoint check O(1); scanning the filtered_entities list for both
# endpoints of every link made this loop extremely slow.
kept_ids = set(filtered_entities)
# Code of the 'hashtagged' relation (instead of a hard-coded 2).
hashtagged_group = relation_types['hashtagged']

filtered_links = []
filtered_again_hashtags = []
for link in tqdm(links, total=len(links)):
  # Keep a link only when both of its endpoints survived the node filtering.
  if (link['source'] in kept_ids) and (link['target'] in kept_ids):
    filtered_links.append(link)
    # Remember the hashtags that are still the target of at least one kept link.
    if link['group'] == hashtagged_group:
      filtered_again_hashtags.append(link['target'])

print(len(filtered_links))

  5%|▍         | 33094/732373 [00:05<01:56, 5988.55it/s]

In [None]:
# Collapse the collected hashtag targets into a set of distinct values and report how many remain.
filtered_again_hashtags = set(filtered_again_hashtags)
print(len(filtered_again_hashtags))

In [None]:
# Start the node list over; this time group codes are stored as strings and every node has a label.
nodes = []
for user in tqdm(all_entities['user_ids'], total=len(all_entities['user_ids'])):
  # Membership in users takes precedence; ids found in neither collection produce no node.
  if user in users:
    kind = "user"
  elif user in filtered_interaction_entities:
    kind = "entity"
  else:
    continue
  nodes.append({"id": str(user), "group": str(node_types[kind]), 'label': ''})

hashtag_group = str(node_types["hashtag"])
for tag in tqdm(all_entities['hashtags'], total=len(all_entities['hashtags'])):
  if tag not in filtered_again_hashtags:
    continue
  # Hashtag nodes are labelled with a leading '#'.
  nodes.append({"id": str(tag), "group": hashtag_group, 'label': f'#{tag}'})

In [None]:
# Number of nodes in the rebuilt list.
print(len(nodes))

In [None]:
# One bucket per relation code; links whose group is none of these three are ignored.
relations_by_group = {
  relation_types['replied_to']: [],
  relation_types['mentioned']: [],
  relation_types['hashtagged']: [],
}

for link in filtered_links:
  bucket = relations_by_group.get(link['group'])
  if bucket is not None:
    # Only the endpoints are kept, both as strings.
    bucket.append({'source': str(link['source']), 'target': str(link['target'])})

replied_to_relations = relations_by_group[relation_types['replied_to']]
mentioned_relations = relations_by_group[relation_types['mentioned']]
hashtagged_relations = relations_by_group[relation_types['hashtagged']]

for bucket in (replied_to_relations, mentioned_relations, hashtagged_relations):
  print(len(bucket))

In [None]:
# Name the columns explicitly: with an empty relation list the frame would otherwise have
# no columns at all, and the CSV written from it would be an empty file that
# pd.read_csv cannot load again.
relation_columns = ['source', 'target']
replied_to_df = pd.DataFrame(replied_to_relations, columns=relation_columns)
mentioned_df = pd.DataFrame(mentioned_relations, columns=relation_columns)
hashtagged_df = pd.DataFrame(hashtagged_relations, columns=relation_columns)

In [None]:
# Load the saved BERTopic model from disk.
topic_model = BERTopic.load("topic modeling/qanon_new_40_topics.model")
# merge: 8 + 39, 13 + 23 + 24 + 40 + 41, -1, 2, 21, 28, 38
# Ids 0..49 minus the ids listed in the set below, then a position -> id lookup.
# NOTE(review): topic_ids / idx2topic are not referenced by the other cells visible in this
# notebook, and list(set(...)) does not guarantee an order — confirm they are still needed.
topic_ids = list(set(np.arange(50)).difference({39, 23, 24, 40, 41, 2, 21, 28, 38}))
idx2topic = {i: topic_ids[i] for i in range(len(topic_ids))}

# Generate three-word labels joined by ' | ' and store them on the model object.
topic_model.custom_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, word_length=None, separator=' | ')
# Drop the first label — presumably the one of outlier topic -1, so that topic_labels[t]
# is the label of topic t; TODO confirm against the model.
topic_labels = topic_model.custom_labels[1:]

# Topic ids used for the "discussed" relations and the topic nodes.
meaningful_topics = [0, 4, 6, 7, 10, 12, 14, 15, 17, 19, 22, 25, 26, 29, 31, 32, 33, 35, 36, 42, 43, 44]

In [None]:
tweets_w_topics = pd.read_csv('data/tweets_w_recovered_topics.csv')

# Rows whose user is one of the retained users and whose topic is one of the meaningful topics.
keep_rows = tweets_w_topics['user_id'].isin(users) & tweets_w_topics['topic'].isin(meaningful_topics)

# Distinct (user_id, topic) pairs; the first occurrence of each pair is kept.
user_topic_pairs = tweets_w_topics.loc[keep_rows, ['user_id', 'topic']].drop_duplicates(keep='first')

# Replace the topic id by its label and renumber the rows from 0.
discussed_df = user_topic_pairs.assign(
  topic_label=user_topic_pairs['topic'].apply(lambda x: topic_labels[x])
)
discussed_df = discussed_df.reset_index(drop=True)[['user_id', 'topic_label']]
discussed_df.columns = ['source', 'target']
discussed_df

In [None]:
topic_group = str(node_types['topic'])

# One node per meaningful topic; the topic label serves as both id and label.
# NOTE: this appends to the existing nodes list, so re-running the cell adds the topics again.
nodes.extend(
  {"id": str(topic_labels[topic_id]), "group": topic_group, "label": str(topic_labels[topic_id])}
  for topic_id in meaningful_topics
)

print(len(nodes))

In [None]:
# Table with one row per node record in nodes.
# NOTE: the name entity_df is rebound further down in this notebook to the entity-only subset.
entity_df = pd.DataFrame.from_records(nodes)

In [None]:
output_folder = "KG construction/csv files/largest_community/"

# File name -> frame; the files are written in this order, without the index column.
frames_by_filename = {
  "replied_to_relations.csv": replied_to_df,
  "mentioned_relations.csv": mentioned_df,
  "hashtagged_relations.csv": hashtagged_df,
  "discussed_relations.csv": discussed_df,
  "entities.csv": entity_df,
}
for filename, frame in frames_by_filename.items():
  frame.to_csv(output_folder + filename, index=False)

In [None]:
# Read the file that was just written back in and display it as a sanity check.
test = pd.read_csv(output_folder + "discussed_relations.csv")
test

Unnamed: 0,source,target
0,371356914,an idiot | she is | who cares
1,371356914,proud of | talking about | congratulations con...
2,371356914,the insurrection act | insurrection act | the ...
3,371356914,free speech | mike lindell | freedom of
4,371356914,the truth | truth truth | truth is
...,...,...
15756,4862259560,biden is | biden and | white house
15757,4862259560,public schools | high school | public school
15758,4862259560,an idiot | she is | who cares
15759,4862259560,proud of | talking about | congratulations con...


In [None]:
output_folder = "KG construction/csv files/largest_community/"

# Load the previously written CSV files back into data frames, in the order listed.
csv_names = (
  "replied_to_relations.csv",
  "mentioned_relations.csv",
  "hashtagged_relations.csv",
  "discussed_relations.csv",
  "entities.csv",
)
replied_to_df, mentioned_df, hashtagged_df, discussed_df, entities_df = [
  pd.read_csv(output_folder + csv_name) for csv_name in csv_names
]

In [None]:
def _ids_in_group(group_name):
  """Return the 'id' column of the nodes of one type as a one-column frame indexed from 0."""
  in_group = entities_df['group'] == node_types[group_name]
  return entities_df.loc[in_group, ['id']].reset_index(drop=True)


user_df = _ids_in_group('user')
entity_df = _ids_in_group('entity')
topic_df = _ids_in_group('topic')
hashtag_df = _ids_in_group('hashtag')

# The index is written as an extra first column (no index=False here).
# NOTE(review): "entities.csv" is also the file entities_df was read from (the full node
# table with the 'group' column). Writing the entity-only frame under the same name
# replaces that table, so the loading cell cannot be re-run afterwards — confirm the
# file name before changing it, since consumers of these files are not visible here.
for filename, frame in (
  ("users.csv", user_df),
  ("entities.csv", entity_df),
  ("hashtags.csv", hashtag_df),
  ("topics.csv", topic_df),
):
  frame.to_csv(output_folder + filename)

In [None]:
# dataset = {"nodes": nodes, "links": filtered_links}
# with open("KG construction/data.json", 'w') as f:
#   json.dump(dataset, f)