In [None]:
# Download the large English spaCy model and load it into this session.
import spacy.cli
spacy.cli.download("en_core_web_lg")
# NOTE(review): this global `nlp` is not referenced again in the visible cells;
# get_document_embeddings() loads the model itself through en_core_web_lg.load().
nlp = spacy.load("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [None]:
!pip install dgl
!pip install torch_geometric
!pip install networkx==2.4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import os
import re
import string
import networkx as nx
import nltk

import tqdm
import en_core_web_lg
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

import spacy
from time import time
import dgl
from scipy import sparse


import torch
from torch_geometric.data import Data, HeteroData
from datetime import datetime


DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


In [None]:
!nvidia-smi

/bin/bash: nvidia-smi: command not found


In [None]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

# Load tweet file

In [None]:
p_part1 = '68841_tweets_multiclasses_filtered_0722_part1.npy'
p_part2 = '68841_tweets_multiclasses_filtered_0722_part2.npy'


# Concatenating The Dataset

In [None]:
# Load both halves of the tweet dump and stack them row-wise into a single array.
# SECURITY: allow_pickle=True unpickles arbitrary Python objects while loading;
# only use it on .npy files that come from a trusted source.
df_np_part1 = np.load(p_part1, allow_pickle=True)
df_np_part2 = np.load(p_part2, allow_pickle=True)
df_np = np.concatenate((df_np_part1, df_np_part2), axis = 0)
print("[INFO] Data Loaded...")


# Wrap the raw array in a DataFrame, giving every field a named column.
df = pd.DataFrame(data=df_np, columns=["event_id", "tweet_id", "text", "user_id", "created_at", "user_loc",\
    "place_type", "place_full_name", "place_country_code", "hashtags", "user_mentions", "image_urls", "entities", 
    "words", "filtered_words", "sampled_words"])

print("[INFO] Data converted to dataframe...")



FileNotFoundError: ignored

In [None]:
print(df.shape)

In [None]:
df.head(10)

In [None]:
df.text = df.text.str.strip()


In [None]:
df["text"]

In [None]:
df.dtypes

In [None]:
df.isnull().sum()

# Defining preprocessing functions for TEXT

In [None]:

def clean_text(text):
    """Normalise raw text before tokenisation.

    The input is converted with ``str()`` and lower-cased, then every
    ``http...`` run of non-space characters, every ASCII punctuation
    character and every digit is removed, in that order. Whitespace is
    left untouched.

    Returns the cleaned string.
    """
    lowered = str(text).lower()
    without_urls = re.sub(r'http\S+', '', lowered)
    # Translation table that deletes every character in string.punctuation.
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = without_urls.translate(punctuation_table)
    return re.sub(r'\d+', '', without_punctuation)

def tokenize_text(text):
    """Split ``text`` into tokens with a fresh NLTK ``TweetTokenizer`` and return the token list."""
    return TweetTokenizer().tokenize(text)

def lemmatize_tokens(tokens):
    """Return a new list holding the WordNet lemma of every token in ``tokens``."""
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))

def remove_stopwords(tokens):
    """Return the tokens that are not in NLTK's English stop-word list, in their original order."""
    english_stopwords = frozenset(stopwords.words('english'))
    kept_tokens = []
    for candidate in tokens:
        if candidate in english_stopwords:
            continue
        kept_tokens.append(candidate)
    return kept_tokens

In [None]:
# Run the text column through the four preprocessing stages in order:
# cleaning -> tokenising -> lemmatising -> stop-word removal.
# After this cell every `text` entry is a list of tokens, not a string.
df['text'] = (
    df['text']
    .apply(clean_text)
    .apply(tokenize_text)
    .apply(lemmatize_tokens)
    .apply(remove_stopwords)
)


In [None]:
df.head()

# Convert Date and Time to Numeric

In [None]:
# sort data in DATAFRAME 1 by time
# reset_index() (without drop=True) keeps the pre-sort row labels in a new 'index' column;
# construct_incremental_dataset_0922 later reads that column to select rows of the feature matrix.
# NOTE(review): that lookup is only correct if the feature matrix rows follow the pre-sort
# row order — confirm against the cells that build the features.
df = df.sort_values(by='created_at').reset_index()
# append date
# assumes every 'created_at' value exposes .date() (datetime / pandas Timestamp) — TODO confirm
df['date'] = [d.date() for d in df['created_at']]


In [None]:
# df['created_at'] = pd.to_datetime(df['created_at'])
#
# # convert datetime column to numeric format
# df['created_at'] = pd.to_numeric(df['created_at'].dt.strftime('%Y%m%d%H%M%S'))

In [None]:
df.head()

# Load message file

In [None]:
load_path = 'all_df_words_ents_mids.npy'

In [None]:
# SECURITY: allow_pickle=True unpickles arbitrary Python objects while loading;
# only use it on .npy files that come from a trusted source.
# NOTE: this rebinds df_np, which previously held the concatenated tweet array.
df_np = np.load(load_path, allow_pickle=True)

print("[INFO] Data Loaded...")

# Wrap the raw array in a DataFrame, giving every field a named column.
df1 = pd.DataFrame(data=df_np, \
    columns=['document_ids', 'sentence_ids', 'sentences', 'event_type_ids', 'words', 'unique_words', 'entities', 'message_ids'])

print("[INFO] Data converted to dataframe...")


In [None]:
print(df1.shape)

In [None]:
df1.head()

# Preprocessing

In [None]:
df1.isnull().sum()

In [None]:
df1.dtypes

# Convert IDs to Numeric

In [None]:
# Create a dictionary that maps unique string values of 'document_ids' to integers
doc_id_map = {  doc_id:i for i, doc_id in enumerate(df1['document_ids'].unique())}

# Map the 'document_ids' column to integer values
df1['document_ids'] = df1['document_ids'].map(doc_id_map)

# Convert the 'document_ids' column to integer type
df1['document_ids'] = df1['document_ids'].astype(int)

In [None]:
# Convert the 'sentence_ids' column to integer type
df1['sentence_ids'] = df1['sentence_ids'].astype(int)

In [None]:
# Create a dictionary that maps each unique 'message_ids' value to an integer
# (integers follow the order of first appearance in the column)
msg_id_map = {msg_id:i for i, msg_id in enumerate(df1['message_ids'].unique())}

# Map the 'message_ids' column to integer values
df1['message_ids'] = df1['message_ids'].map(msg_id_map)

# Convert the 'message_ids' column to integer type
df1['message_ids'] = df1['message_ids'].astype(int)

In [None]:
# Run the sentences column through the four preprocessing stages in order:
# cleaning -> tokenising -> lemmatising -> stop-word removal.
# After this cell every `sentences` entry is a list of tokens, not a string.
df1['sentences'] = (
    df1['sentences']
    .apply(clean_text)
    .apply(tokenize_text)
    .apply(lemmatize_tokens)
    .apply(remove_stopwords)
)


In [None]:
df1.head()

# FILTERING TWEETS

In [None]:
df.tail()

In [None]:
!pip install transformers

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

# Check if a GPU is available and use it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained sentiment analysis model and tokenizer
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

# Create the sentiment analysis pipeline
sentiment_analyzer = TextClassificationPipeline(model=model, tokenizer=tokenizer, task='sentiment-analysis')

In [None]:
sentiment_analyzer(df.text[68839])[0]['score']

In [None]:
messages =  df.text

In [None]:
# Compute the sentiment scores for each message
batch_size = 64
sentiment_scores = []
for i in tqdm.tqdm(range(0, len(messages), batch_size)):
    # The preprocessing cell turned every `text` entry into a list of tokens. Join the
    # tokens back into a sentence; calling str() on the list would hand the model the
    # Python repr ("['tok', 'tok']") instead of the message text.
    batch = [
        ' '.join(map(str, message)) if isinstance(message, (list, tuple)) else str(message)
        for message in messages.iloc[i:i + batch_size]
    ]
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
        probabilities = torch.softmax(logits, dim=-1).cpu().numpy()

    # Keep the score of the positive sentiment class (column 1) for every message in the batch.
    sentiment_scores.extend(probabilities[:, 1])


# Add sentiment scores to the DataFrame
df['sentiment_scores'] = sentiment_scores

In [None]:
# df.to_csv("processed_tweets_data.csv", index=False)

In [None]:
new_DF = pd.read_csv("processed_tweets_data.csv")
new_DF['sentiment_scores'].tail()

In [None]:
threshold = 0.5

filtered_df = new_DF[new_DF['sentiment_scores'] >= threshold]
filtered_df.shape

In [None]:
print("TOTAL MESSAGES: ", len(new_DF))
print("FILTERED MESSAGES:", len(filtered_df))

In [None]:
# # Set batch size and preallocate list for sentiment scores
# batch_size = 256
# sentiment_labels = []

# # Process messages in batches
# for i in tqdm.tqdm(range(0, len(messages), batch_size)):
#     batch = messages[i:i + batch_size].apply(str).tolist()
#     results = sentiment_analyzer(batch)
#     for result in results:
#         sentiment_labels.append(result['label'])

# # Add sentiment scores to the DataFrame
# df['sentiment_labels'] = sentiment_labels

In [None]:
# Rebind instead of dropping in place: filtered_df is a boolean-mask selection of new_DF, so an
# in-place drop triggers pandas' SettingWithCopyWarning. errors='ignore' keeps the cell
# re-runnable once the column is already gone.
filtered_df = filtered_df.drop(columns=['sentiment_scores'], errors='ignore')

# Generating the initial message features

In [None]:
def get_document_embeddings(df, nlp=None):
    """Embed every document as the vector spaCy reports for its joined words.

    The embedding of each document is the average of the pre-trained
    embeddings of all the words in it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``filtered_words`` column whose entries are iterables of strings.
    nlp : callable, optional
        An already-loaded spaCy pipeline (anything that maps a string to an
        object with a ``.vector`` attribute). When omitted, ``en_core_web_lg``
        is loaded, which is slow; pass an existing pipeline to avoid reloading
        the model on every call.

    Returns
    -------
    numpy.ndarray
        One row per document, stacked along axis 0.

    Raises
    ------
    ValueError
        From ``np.stack`` when ``df`` has no rows.
    """
    if nlp is None:
        nlp = en_core_web_lg.load()
    # The words of a document are joined with single spaces before being passed to the pipeline.
    vectors = [nlp(' '.join(words)).vector for words in df.filtered_words]
    return np.stack(vectors, axis=0)



def extract_time_features(timestamp_str):
    """Encode a timestamp such as '2012-10-11 07:19:34' as a list of two floats.

    The argument is converted with ``str()`` and parsed with
    ``datetime.fromisoformat``. The offset from the OLE zero date
    (1899-12-30) is then split into:

    * whole days, divided by 100000;
    * the remaining seconds, divided by 86400 (the number of seconds in a day).

    Returns ``[day_feature, second_feature]``.
    """
    ole_time_zero = datetime(1899, 12, 30)
    elapsed = datetime.fromisoformat(str(timestamp_str)) - ole_time_zero
    day_feature = float(elapsed.days) / 100000.
    second_feature = float(elapsed.seconds) / 86400
    return [day_feature, second_feature]

def get_time_features(df):
    """Encode the ``created_at`` value of every row of ``df`` with ``extract_time_features``.

    Returns a numpy array built from the per-row feature lists, in row order.
    """
    encoded_rows = list(map(extract_time_features, df['created_at']))
    return np.asarray(encoded_rows)

In [None]:
filtered_df.head()

In [None]:
document_features = get_document_embeddings(df)
print("Document features generated.")

document_features[0]

In [None]:
document_features[:10]

In [None]:
document_features.shape

In [None]:
time_features = get_time_features(df)

print("Time features generated.")

In [None]:
time_features[:10]

In [None]:
time_features.shape

In [None]:
# Concatenate the document features and time features into a single numpy array
combined_features = np.concatenate((document_features, time_features), axis=1)
print("Concatenated document features and time features.")
combined_features.shape

In [None]:
# Save the concatenated features as a numpy array file
save_file_path =  'Spacy_Full_features_69612_0709_spacy_lg_zero_multiclasses_filtered.npy'
np.save(save_file_path, combined_features)
print("Initial features saved.")


In [None]:
# Load the concatenated features from the saved numpy array file
loaded_features = np.load(save_file_path)
print("Initial features loaded.")
print(loaded_features.shape)

# CONSTRUCTING GRAPHS

In [None]:
def construct_graph_from_df(df, G=None):
    """Add the tweets in ``df`` to a heterogeneous graph and return the graph.

    For every row, one tweet node ``'t_<tweet_id>'`` is created and connected to:

    * one ``'u_<id>'`` node per entry of ``user_mentions`` plus the author ``user_id``;
    * one node per entry of ``entities`` (used as-is, without a prefix);
    * one ``'w_<word>'`` node per entry of ``sampled_words``.

    Each node is tagged with a boolean attribute naming its type
    (``tweet_id``, ``user_id``, ``entity`` or ``word``); only the attribute's
    presence matters to later lookups, not its value.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``tweet_id``, ``user_id``, ``user_mentions``,
        ``entities`` and ``sampled_words``.
    G : graph, optional
        Existing graph to extend in place. A new ``nx.Graph`` is created when omitted.

    Returns
    -------
    The graph that was passed in (or the newly created one).
    """
    if G is None:
        G = nx.Graph()
    for _, row in df.iterrows():
        tid = 't_' + str(row['tweet_id'])
        G.add_node(tid)
        G.nodes[tid]['tweet_id'] = True  # right-hand side value is irrelevant for the lookup

        # Mentioned users and the author are all linked to the tweet.
        user_ids = list(row['user_mentions'])
        user_ids.append(row['user_id'])
        user_ids = ['u_' + str(each) for each in user_ids]
        G.add_nodes_from(user_ids)
        for each in user_ids:
            # Public node view instead of the private G._node mapping.
            G.nodes[each]['user_id'] = True

        # Entities are used as node keys unchanged (no 'e_' prefix).
        entities = row['entities']
        G.add_nodes_from(entities)
        for each in entities:
            G.nodes[each]['entity'] = True

        words = ['w_' + each for each in row['sampled_words']]
        G.add_nodes_from(words)
        for each in words:
            G.nodes[each]['word'] = True

        edges = []
        edges += [(tid, each) for each in user_ids]
        edges += [(tid, each) for each in entities]
        edges += [(tid, each) for each in words]
        G.add_edges_from(edges)

    return G


In [None]:
# convert a heterogeneous social graph G to a homogeneous message graph following eq. 1 of the paper,
# and store the sparse binary adjacency matrix of the homogeneous message graph.


def _announce(log, text):
    """Print ``text`` and record it, newline-terminated, in the ``log`` list."""
    print(text)
    log.append(text + '\n')


def _report_elapsed(log, start, indent='\t'):
    """Print and record the minutes elapsed since ``start``; return that number."""
    mins = (time() - start) / 60
    print(indent + 'Done. Time elapsed: ', mins, ' mins\n')
    log.append(indent + 'Done. Time elapsed: ' + str(mins) + ' mins\n')
    return mins


def to_dgl_graph_v3(G, save_path=None):
    """Collapse a heterogeneous tweet graph into a homogeneous tweet-tweet graph.

    Nodes are grouped by the attribute they carry (``tweet_id``, ``user_id``,
    ``entity``, ``word``). For each of the three non-tweet types the
    tweet-to-type block of the adjacency matrix is multiplied by its transpose,
    giving a tweet-tweet commuting matrix. The three commuting matrices are
    summed and cast to bool, and a DGL graph is built from the result.

    Parameters
    ----------
    G : networkx graph
        Graph whose nodes carry the type attributes listed above.
    save_path : str, optional
        Prefix that file names are appended to directly, so it should end with
        a path separator. When given, each commuting matrix is written to
        ``s_m_tid_<type>_tid.npz`` and dropped from memory, then reloaded for
        the final sum; the boolean adjacency matrix is written to
        ``s_bool_A_tid_tid.npz``. When ``None`` everything stays in memory and
        nothing is written.

    Returns
    -------
    (all_mins, message) : tuple
        ``all_mins`` is the total conversion time in minutes; ``message`` is a
        text log of the steps and their timings. The DGL graph itself is only
        used for the node/edge counts and is not returned.
    """
    log = []
    _announce(log, 'Start converting heterogeneous networkx graph to homogeneous dgl graph.')
    all_start = time()

    _announce(log, '\tGetting a list of all nodes ...')
    start = time()
    all_nodes = list(G.nodes)
    _report_elapsed(log, start)

    _announce(log, '\tGetting adjacency matrix ...')
    start = time()
    # Dense adjacency matrix; rows/columns follow the order of all_nodes.
    A = nx.to_numpy_matrix(G, nodelist=all_nodes)
    _report_elapsed(log, start)

    _announce(log, '\tGetting lists of nodes of various types ...')
    start = time()
    tid_nodes = list(nx.get_node_attributes(G, 'tweet_id').keys())
    userid_nodes = list(nx.get_node_attributes(G, 'user_id').keys())
    word_nodes = list(nx.get_node_attributes(G, 'word').keys())
    entity_nodes = list(nx.get_node_attributes(G, 'entity').keys())
    del G
    _report_elapsed(log, start)

    _announce(log, '\tConverting node lists to index lists ...')
    start = time()
    # One dictionary lookup per node instead of list.index(), which rescanned
    # all_nodes for every node and made this step quadratic.
    position_of = {node: position for position, node in enumerate(all_nodes)}
    indices_tid = [position_of[x] for x in tid_nodes]
    indices_userid = [position_of[x] for x in userid_nodes]
    indices_word = [position_of[x] for x in word_nodes]
    indices_entity = [position_of[x] for x in entity_nodes]
    del tid_nodes, userid_nodes, word_nodes, entity_nodes, position_of, all_nodes
    _report_elapsed(log, start)

    # (name used in log lines, name used in file names, column indices)
    meta_paths = (
        ('user', 'userid', indices_userid),
        ('ent', 'entity', indices_entity),
        ('word', 'word', indices_word),
    )
    commuting = {}  # only filled directly when save_path is None
    for number, (short, long_name, column_indices) in enumerate(meta_paths, start=1):
        head = '\tStart constructing tweet-' + short + '-tweet commuting matrix ...'
        sub = '\t\t\tStart constructing tweet-' + short + ' matrix ...'
        print(head)
        print(sub)
        # The trailing space after the newline is part of the established log format.
        log.append(head + '\n' + sub + '\n ')
        start = time()
        # Rows = tweets, columns = nodes of the current type.
        dense_block = A[np.ix_(indices_tid, column_indices)]
        if number == len(meta_paths):
            A = None  # last block extracted: release the dense adjacency matrix
        _report_elapsed(log, start, '\t\t\t')

        _announce(log, '\t\t\tConverting to sparse matrix ...')
        start = time()
        tweet_to_type = sparse.csr_matrix(dense_block)
        del dense_block
        _report_elapsed(log, start, '\t\t\t')

        _announce(log, '\t\t\tTransposing ...')
        start = time()
        type_to_tweet = tweet_to_type.transpose()
        _report_elapsed(log, start, '\t\t\t')

        _announce(log, '\t\t\tCalculating tweet-' + short + ' * ' + short + '-tweet ...')
        start = time()
        tweet_to_tweet = tweet_to_type * type_to_tweet  # sparse matrix product
        _report_elapsed(log, start, '\t\t\t')

        _announce(log, '\t\t\tSaving ...')
        start = time()
        if save_path is not None:
            sparse.save_npz(save_path + "s_m_tid_" + long_name + "_tid.npz", tweet_to_tweet)
            print("Sparse binary " + long_name + " commuting matrix saved.")
        else:
            commuting[long_name] = tweet_to_tweet
        del tweet_to_tweet, tweet_to_type, type_to_tweet
        _report_elapsed(log, start, '\t\t\t')

    # ----------------------compute tweet-tweet adjacency matrix----------------------
    _announce(log, '\tComputing tweet-tweet adjacency matrix ...')
    start = time()
    if save_path is not None:
        for long_name in ('userid', 'entity', 'word'):
            commuting[long_name] = sparse.load_npz(save_path + "s_m_tid_" + long_name + "_tid.npz")
            print("Sparse binary " + long_name + " commuting matrix loaded.")

    s_A_tid_tid = commuting.pop('userid') + commuting.pop('entity')
    # Any non-zero entry means the two tweets share at least one user, entity or word.
    s_bool_A_tid_tid = (s_A_tid_tid + commuting.pop('word')).astype('bool')
    del s_A_tid_tid
    _report_elapsed(log, start, '\t\t\t')

    all_mins = (time() - all_start) / 60
    print('\tOver all time elapsed: ', all_mins, ' mins\n')
    log.append('\tOver all time elapsed: ' + str(all_mins) + ' mins\n')

    if save_path is not None:
        sparse.save_npz(save_path + "s_bool_A_tid_tid.npz", s_bool_A_tid_tid)
        print("Sparse binary adjacency matrix saved.")
        s_bool_A_tid_tid = sparse.load_npz(save_path + "s_bool_A_tid_tid.npz")
        print("Sparse binary adjacency matrix loaded.")

    # create corresponding dgl graph
    # NOTE(review): building a graph through the DGLGraph constructor may be deprecated in
    # the installed DGL release — confirm against the version in use.
    dgl_graph = dgl.DGLGraph(s_bool_A_tid_tid)
    print('We have %d nodes.' % dgl_graph.number_of_nodes())
    print('We have %d edges.' % dgl_graph.number_of_edges())
    print()
    log.append('We have ' + str(dgl_graph.number_of_nodes()) + ' nodes.'
               + 'We have ' + str(dgl_graph.number_of_edges()) + ' edges.\n')

    return all_mins, ''.join(log)

In [None]:
def _save_labels_and_features(block_df, features, path):
    """Write labels.npy and features.npy for the tweets of ``block_df`` under ``path``."""
    labels = [int(each) for each in block_df['event_id'].values]
    np.save(path + 'labels.npy', np.asarray(labels))
    print("Labels saved.")
    # The 'index' column addresses rows of `features`. A 64-bit integer type is required:
    # casting to np.int8 wrapped every index above 127 and selected the wrong rows.
    row_ids = block_df['index'].values.astype(np.int64).tolist()
    np.save(path + 'features.npy', features[row_ids, :])
    print("Features saved.")


def construct_incremental_dataset_0922(df, save_path, features, test=True):
    """Split the tweets by date into message blocks and export one graph per block.

    Block 0 contains the tweets of the first seven distinct dates; every later
    date except the last one becomes its own block (1, 2, ...). For each block a
    heterogeneous graph is built with ``construct_graph_from_df`` from that
    block's tweets only, converted with ``to_dgl_graph_v3``, and the block's
    labels and features are written next to the matrices in
    ``save_path + '<block number>/'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``date``, ``event_id`` and ``index`` (row positions
        into ``features``) plus everything ``construct_graph_from_df`` reads.
    save_path : str
        Prefix for the per-block directories; concatenated directly, so it
        should end with a path separator. Directories are created if missing
        and existing files in them are overwritten.
    features : numpy.ndarray
        Feature matrix, one row per tweet, addressed by ``df['index']``.
    test : bool, default True
        When true, block 0 is truncated to its first 500 tweets and every later
        block to its first 100 tweets.

    Returns
    -------
    (message, data_split, all_graph_mins) : tuple
        Text log, number of tweets per block, and conversion time in minutes
        per block.
    """
    # Block sizes used when `test` is true.
    test_ini_size = 500
    test_incr_size = 100

    # save data splits for training/validate/test mask generation
    data_split = []
    # save time spent for the heterogeneous -> homogeneous conversion of each graph
    all_graph_mins = []
    message = ""

    # extract distinct dates
    distinct_dates = df.date.unique()  # 2012-11-07
    print("Number of distinct dates: ", len(distinct_dates))
    print()
    message += "Number of distinct dates: "
    message += str(len(distinct_dates))
    message += "\n"

    # split data by dates and construct graphs
    # first week -> initial graph (20254 tweets)
    print("Start constructing initial graph ...")
    message += "\nStart constructing initial graph ...\n"
    ini_df = df.loc[df['date'].isin(distinct_dates[:7])]  # tweets of the first 7 dates
    if test:
        ini_df = ini_df[:test_ini_size]  # first test_ini_size tweets
    G = construct_graph_from_df(ini_df)
    path = save_path + '0/'
    # makedirs also creates a missing parent and tolerates a re-run of the notebook.
    os.makedirs(path, exist_ok=True)
    grap_mins, graph_message = to_dgl_graph_v3(G, save_path=path)
    message += graph_message
    print("Initial graph saved")
    message += "Initial graph saved\n"
    # record the total number of tweets
    data_split.append(ini_df.shape[0])
    # record the time spent for graph conversion
    all_graph_mins.append(grap_mins)
    # extract and save the labels and features of corresponding tweets
    _save_labels_and_features(ini_df, features, path)
    message += "Labels saved.\n"
    message += "Features saved.\n\n"

    # subsequent days -> insert tweets day by day (skip the last day because it only contains one tweet)
    for i in range(7, len(distinct_dates) - 1):
        block_number = str(i - 6)
        print("Start constructing graph ", block_number, " ...")
        message += "\nStart constructing graph "
        message += block_number
        message += " ...\n"
        incr_df = df.loc[df['date'] == distinct_dates[i]]
        if test:
            incr_df = incr_df[:test_incr_size]

        # All/Relevant Message Strategy: keeping all the messages when constructing the graphs
        # (for the Relevant Message Strategy, the unrelated messages will be removed from the graph later on).
        # G = construct_graph_from_df(incr_df, G)

        # Latest Message Strategy: construct graph using only the data of the day
        G = construct_graph_from_df(incr_df)

        path = save_path + block_number + '/'
        os.makedirs(path, exist_ok=True)
        grap_mins, graph_message = to_dgl_graph_v3(G, save_path=path)
        message += graph_message
        print("Graph ", block_number, " saved")
        message += "Graph "
        message += block_number
        message += " saved\n"
        # record the total number of tweets
        data_split.append(incr_df.shape[0])
        # record the time spent for graph conversion
        all_graph_mins.append(grap_mins)
        # extract and save the labels and features of corresponding tweets
        _save_labels_and_features(incr_df, features, path)
        message += "Labels saved.\n"
        message += "Features saved.\n"

    return message, data_split, all_graph_mins

In [None]:
len(loaded_features)

In [None]:
# load features
# each row holds the spaCy document embedding followed by the two time features

# loaded_features = np.load('features_69612_0709_spacy_lg_zero_multiclasses_filtered.npy')
save_path = 'data/'
# construct_incremental_dataset_0922 creates one sub-directory per graph under save_path,
# so make sure the base directory exists first.
os.makedirs(save_path, exist_ok=True)

# generate test graphs, features, and labels
message, data_split, all_graph_mins = construct_incremental_dataset_0922(df, save_path, loaded_features, True)
with open("spacy_full_node_edge_statistics.txt", "w") as text_file:
    text_file.write(message)

np.save('spacy_full_data_split.npy', np.asarray(data_split))
print("Data split: ", data_split)
np.save('spacy_full_all_graph_mins.npy', np.asarray(all_graph_mins))
print("Time spent on heterogeneous -> homogeneous graph conversions: ", all_graph_mins)

In [None]:
# # Load the homogenous message passing graph
# graph = nx.read_gpickle("data_filtered/0/fea")

In [None]:
# No `graph` object is created anywhere earlier in the notebook, so rebuild the homogeneous
# message graph of the initial block from the boolean adjacency matrix that to_dgl_graph_v3
# saved under save_path + '0/'.
# NOTE(review): confirm that the initial block is the graph meant to be plotted here.
adjacency = sparse.load_npz(os.path.join(save_path, '0', 's_bool_A_tid_tid.npz'))
graph = nx.from_scipy_sparse_matrix(adjacency)

# Define a node size function based on the node degree
node_sizes = [d * 10 for n, d in graph.degree()]

# Plot the graph with custom node color and size
nx.draw(graph, with_labels=False, node_color='blue', node_size=node_sizes)
plt.show()