In [1]:
import os
import torch
# Export the installed torch version as the TORCH environment variable so the
# shell commands below can pick the matching wheel index through ${TORCH}.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# torch-scatter and torch-sparse are fetched from the data.pyg.org wheel index
# for this torch build; torch-geometric is installed straight from its Git
# repository. None of the three installs is version-pinned.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

1.12.1+cu113
[K     |████████████████████████████████| 7.9 MB 2.8 MB/s 
[K     |████████████████████████████████| 3.5 MB 2.7 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


In [2]:
import numpy as np
import pandas as pd
import pickle
import os.path as osp
import torch_geometric
from torch_geometric.data import HeteroData, InMemoryDataset, download_url
import torch_geometric.transforms as T
import torch.nn.functional as F
from torch_geometric.utils import negative_sampling
from tqdm import tqdm

In [3]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; force_remount refreshes a mount left
# over from an earlier session instead of reusing it.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point, force_remount=True)

Mounted at /content/drive


In [4]:
FOLDER_PATH="/content/drive/MyDrive/Knowledge Graphs 2022/"
%cd $FOLDER_PATH

/content/drive/.shortcut-targets-by-id/18wZgIc4f2VuuQ34uv_vRrBAOHvMWSEaq/Knowledge Graphs 2022


### Load Data

In [5]:
def _load_pickle(path):
  """Return the object stored in the pickle file at ``path``.

  NOTE(review): pickle.load executes arbitrary code from the file; only use it
  on files produced by this project.
  """
  with open(path, "rb") as handle:
    return pickle.load(handle)


# Embedding lookups; later cells use the keys as ids and stack the values into
# feature matrices.
tweet_embedding_dict = _load_pickle("KG embedding/tweet_embedding.pickle")
user_embedding_dict = _load_pickle("KG embedding/user_embedding.pickle")

In [6]:
# Entities and relations produced by the KG construction step.
with open('KG construction/all_entities.pickle', 'rb') as entity_file:
    all_entities = pickle.load(entity_file)

with open('KG construction/all_relations.pickle', 'rb') as relation_file:
    all_relations = pickle.load(relation_file)

# For every entity kind, map position -> entity id.
ind2entity = {kind: dict(enumerate(all_entities[kind])) for kind in all_entities}

tweet_by_position = ind2entity['tweet_ids']
user_by_position = ind2entity['user_ids']

# Relations are stored as position pairs; translate both ends back to real ids.
tweet2tweet = [
    (tweet_by_position[rel[0]], tweet_by_position[rel[1]])
    for rel in all_relations['tweet->tweet']
]
user2tweet = [
    (user_by_position[rel[0]], tweet_by_position[rel[1]])
    for rel in all_relations['user->tweet']
]
tweet2user = [
    (tweet_by_position[rel[0]], user_by_position[rel[1]])
    for rel in all_relations['tweet->user']
]

In [7]:
# Tweet table; the 'id', 'topic' and 'topic_probability' columns are used below.
tweets_csv_path = 'data/all_tweets_50_topics.csv'
all_tweets_df = pd.read_csv(tweets_csv_path)

In [18]:
# Export mode switch.
#   True  -> prediction dataset: every tweet is kept and topic -1 is relabelled 0;
#            output files get the '_prediction' suffix.
#   False -> training dataset: only tweets with topic != -1 and
#            topic_probability >= 0.5 are kept.
prediction_dataset_flag = True ## set to True when preparing the prediction dataset, False when preparing the training dataset

In [19]:
# Columns that define a tweet -> topic edge and its weight.
topic_columns = ['id', 'topic', 'topic_probability']

if prediction_dataset_flag:
  # Keep every tweet; the outlier topic -1 is relabelled 0 so that each tweet
  # still gets a valid class index. `.assign` builds a new frame, which avoids
  # the SettingWithCopyWarning raised by assigning into a column slice.
  filtered_all_tweets = all_tweets_df[topic_columns].assign(
      topic=lambda frame: frame['topic'].replace(-1, 0)
  )
else:
  # Training data: drop outliers (topic -1) and low-confidence assignments.
  has_confident_topic = (all_tweets_df['topic'] != -1) & (all_tweets_df['topic_probability'] >= 0.5)
  filtered_all_tweets = all_tweets_df.loc[has_confident_topic, topic_columns]

# One (tweet id, topic, topic_probability) record per row.
tweet2topic = list(filtered_all_tweets.to_records(index=False))
filtered_all_tweets

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,id,topic,topic_probability
0,1420959115879145474,0,0.000000
1,1420589578541428739,0,0.000000
2,1420533067400105985,0,0.845134
3,1420373718828679174,0,0.000000
4,1420373378653773829,0,0.000000
...,...,...,...
923380,1397974274367963136,0,0.605857
923381,1397972983067693064,0,0.000000
923382,1397972391708475392,0,0.000000
923383,1397813255523995648,5,0.871033


### Node Classification Dataset

In [10]:
# Feature matrices: row i holds the embedding of the i-th key of each dict.
tweet_embeddings = torch.from_numpy(np.stack(list(tweet_embedding_dict.values()))).float()
user_embeddings = torch.from_numpy(np.stack(list(user_embedding_dict.values()))).float()

tweet_id_list = list(tweet_embedding_dict.keys())
# Sorted so that a topic's class index never depends on set iteration order;
# the training and prediction exports must agree on this mapping.
topic_id_list = sorted(set(filtered_all_tweets['topic'].tolist()))
user_id_list = list(user_embedding_dict.keys())

tweetId2idx = {tweet_id: row for row, tweet_id in enumerate(tweet_id_list)}
topicId2idx = {topic_id: row for row, topic_id in enumerate(topic_id_list)}
userId2idx = {user_id: row for row, user_id in enumerate(user_id_list)}

# Tweets that have both an embedding and a known topic.
tweet_ids = {
    tweet
    for tweet, topic, _ in tweet2topic
    if (topic in topicId2idx) and (tweet in tweetId2idx)
}

# Restrict the tweet feature matrix to those tweets, then renumber them 0..n-1.
filtered_tweet_id_list = list(set([tweet_id for tweet_id, _, _ in tweet2topic]).intersection(tweet_ids))
tweet_embeddings = tweet_embeddings[[tweetId2idx[tweet_id] for tweet_id in filtered_tweet_id_list], :]

tweetId2idx = {tweet_id: row for row, tweet_id in enumerate(filtered_tweet_id_list)}

# Edge lists as [source index, target index, weight]. An edge is kept only if
# both endpoints survived the filtering above; structural edges get weight 1.0.
filtered_tweet2tweet = [
    [tweetId2idx[tweet1], tweetId2idx[tweet2], 1.0]
    for tweet1, tweet2 in tweet2tweet
    if (tweet1 in tweetId2idx) and (tweet2 in tweetId2idx)
]

filtered_user2tweet = [
    [userId2idx[user], tweetId2idx[tweet], 1.0]
    for user, tweet in user2tweet
    if (user in userId2idx) and (tweet in tweetId2idx)
]

filtered_tweet2user = [
    [tweetId2idx[tweet], userId2idx[user], 1.0]
    for tweet, user in tweet2user
    if (user in userId2idx) and (tweet in tweetId2idx)
]

# Topic edges carry the third field of each tweet2topic record as their weight.
filtered_tweet2topic = [
    [tweetId2idx[tweet], topicId2idx[topic], weight]
    for tweet, topic, weight in tweet2topic
    if (topic in topicId2idx) and (tweet in tweetId2idx)
]

print(tweet_embeddings.shape)
print(user_embeddings.shape)

print(len(filtered_tweet2tweet))
print(len(filtered_user2tweet))
print(len(filtered_tweet2user))
print(len(filtered_tweet2topic))

torch.Size([138852, 300])
torch.Size([26041, 300])
179
131269
116931
138852


In [11]:
# Topics have no learned embedding here, so every topic node gets a constant
# all-ones feature vector. The width is taken from the tweet features rather
# than hard-coded, so the node types keep matching feature sizes.
topic_feature_dim = tweet_embeddings.shape[1]
dummy_topic_embeddings = torch.ones((len(topic_id_list), topic_feature_dim))
dummy_topic_embeddings.shape

torch.Size([50, 300])

In [12]:
data = HeteroData()

# Node stores: features in `x`, and in `idx` the row index each node had when
# the graph was assembled.
node_tables = (
    ("tweet", tweet_embeddings, tweetId2idx),
    ("topic", dummy_topic_embeddings, topicId2idx),
    ("user", user_embeddings, userId2idx),
)
for node_type, node_features, id_to_row in node_tables:
  data[node_type].x = node_features
  data[node_type].idx = torch.Tensor(list(id_to_row.values())).float()

# Edge stores: each filtered list holds [source, target, weight] triples.
edge_tables = (
    (("tweet", "replied_to", "tweet"), filtered_tweet2tweet),
    (("user", "tweeted", "tweet"), filtered_user2tweet),
    (("tweet", "mentioned", "user"), filtered_tweet2user),
    (("tweet", "in_topic", "topic"), filtered_tweet2topic),
)
for edge_type, edge_triples in edge_tables:
  # The pairs are transposed into a 2 x num_edges tensor.
  endpoint_pairs = np.array([[rel[0], rel[1]] for rel in edge_triples])
  data[edge_type].edge_index = torch.from_numpy(endpoint_pairs.T)

for edge_type, edge_triples in edge_tables:
  # One weight per edge, kept as a column vector.
  data[edge_type].edge_attr = torch.Tensor([[rel[2]] for rel in edge_triples]).float()

data

HeteroData(
  [1mtweet[0m={
    x=[138852, 300],
    idx=[138852]
  },
  [1mtopic[0m={
    x=[50, 300],
    idx=[50]
  },
  [1muser[0m={
    x=[26041, 300],
    idx=[26041]
  },
  [1m(tweet, replied_to, tweet)[0m={
    edge_index=[2, 179],
    edge_attr=[179, 1]
  },
  [1m(user, tweeted, tweet)[0m={
    edge_index=[2, 131269],
    edge_attr=[131269, 1]
  },
  [1m(tweet, mentioned, user)[0m={
    edge_index=[2, 116931],
    edge_attr=[116931, 1]
  },
  [1m(tweet, in_topic, topic)[0m={
    edge_index=[2, 138852],
    edge_attr=[138852, 1]
  }
)

In [13]:
# Apply the LargestConnectedComponents transform to the homogeneous view of the
# graph, then convert the result back to a heterogeneous graph.
homogeneous_graph = data.to_homogeneous()
largest_component = T.LargestConnectedComponents()(homogeneous_graph)
data = largest_component.to_heterogeneous()
data

HeteroData(
  [1mtweet[0m={
    x=[138852, 300],
    idx=[138852]
  },
  [1mtopic[0m={
    x=[50, 300],
    idx=[50]
  },
  [1muser[0m={
    x=[23381, 300],
    idx=[23381]
  },
  [1m(tweet, replied_to, tweet)[0m={
    edge_index=[2, 179],
    edge_attr=[179, 1]
  },
  [1m(user, tweeted, tweet)[0m={
    edge_index=[2, 131269],
    edge_attr=[131269, 1]
  },
  [1m(tweet, mentioned, user)[0m={
    edge_index=[2, 116931],
    edge_attr=[116931, 1]
  },
  [1m(tweet, in_topic, topic)[0m={
    edge_index=[2, 138852],
    edge_attr=[138852, 1]
  }
)

In [14]:
# Turn the tweet -> topic edges into a one-hot label matrix on the tweet nodes.
edge_indices = data['tweet', 'topic'].edge_index
num_tweets = data['tweet'].x.shape[0]
num_topics = data['topic'].x.shape[0]

# Walk the *edges* (not the tweets): the two counts are not guaranteed to match.
# Going through a dict keeps "last edge wins" if a tweet appears more than once
# and leaves unique indices for the tensor assignment below.
topic_of_tweet = dict(zip(edge_indices[0].tolist(), edge_indices[1].tolist()))

# Tweets without a topic edge keep the default label 0.
labels = torch.zeros(num_tweets, dtype=torch.int64)
labelled_rows = torch.tensor(list(topic_of_tweet.keys()), dtype=torch.int64)
labels[labelled_rows] = torch.tensor(list(topic_of_tweet.values()), dtype=torch.int64)

# Fix the class count explicitly; otherwise one_hot sizes the matrix from the
# largest label present, which shrinks it if the last topic is unused.
labels = F.one_hot(labels, num_classes=num_topics).float()
data['tweet'].y = labels

# Topics now live in `y`, so drop the topic edges and nodes from the graph.
del data['tweet', 'topic']
del data['topic']
data

HeteroData(
  [1mtweet[0m={
    x=[138852, 300],
    idx=[138852],
    y=[138852, 50]
  },
  [1muser[0m={
    x=[23381, 300],
    idx=[23381]
  },
  [1m(tweet, replied_to, tweet)[0m={
    edge_index=[2, 179],
    edge_attr=[179, 1]
  },
  [1m(user, tweeted, tweet)[0m={
    edge_index=[2, 131269],
    edge_attr=[131269, 1]
  },
  [1m(tweet, mentioned, user)[0m={
    edge_index=[2, 116931],
    edge_attr=[116931, 1]
  }
)

In [15]:
# Output file name depends on the export mode chosen earlier.
if prediction_dataset_flag:
  prediction_suffix = '_prediction'
else:
  prediction_suffix = ''
dataset_file_name = f"KG embedding/dataset/data_dict_node_classification_50_topics{prediction_suffix}.pickle"

# Persist the graph as a plain dict.
with open(dataset_file_name, "wb") as sink:
  pickle.dump(data.to_dict(), sink, protocol=pickle.HIGHEST_PROTOCOL)

# Read it straight back and rebuild the graph from the file contents.
with open(dataset_file_name, "rb") as source:
  data_dict = pickle.load(source)
data = HeteroData(data_dict)

print(f"dataset filename is {dataset_file_name}")
data

dataset filename is KG embedding/dataset/data_dict_node_classification_50_topics.pickle


HeteroData(
  [1mtweet[0m={
    x=[138852, 300],
    idx=[138852],
    y=[138852, 50]
  },
  [1muser[0m={
    x=[23381, 300],
    idx=[23381]
  },
  [1m(tweet, replied_to, tweet)[0m={
    edge_index=[2, 179],
    edge_attr=[179, 1]
  },
  [1m(user, tweeted, tweet)[0m={
    edge_index=[2, 131269],
    edge_attr=[131269, 1]
  },
  [1m(tweet, mentioned, user)[0m={
    edge_index=[2, 116931],
    edge_attr=[116931, 1]
  }
)

In [None]:
# Reverse lookups: row index -> original tweet / user id.
idx2tweetId = dict((idx, tweet_id) for tweet_id, idx in tweetId2idx.items())
idx2userId = dict((idx, user_id) for user_id, idx in userId2idx.items())

# Stored next to the embeddings, with the same suffix as the dataset file.
lookup_exports = (
    (f"KG embedding/idx2tweetId{prediction_suffix}.pickle", idx2tweetId),
    (f"KG embedding/idx2userId{prediction_suffix}.pickle", idx2userId),
)
for export_path, lookup in lookup_exports:
  with open(export_path, 'wb') as sink:
    pickle.dump(lookup, sink, protocol=pickle.HIGHEST_PROTOCOL)