In [1]:
import sys
sys.path.append('../libs/')

In [2]:
from file_utils import create_dir
from data_process import *
from network_sampling import *

In [3]:
import os
import random

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Register `progress_apply` on pandas objects so the row-wise applies further
# down can show a progress bar.
# NOTE(review): `tqdm` is not imported explicitly in this notebook; it
# presumably arrives through one of the star imports above - confirm.
tqdm.pandas()

# Seed the global random generators once so that Restart & Run All is
# repeatable. pandas `.sample()` without `random_state` draws from NumPy's
# global generator, so the shuffles below become deterministic. The sampling
# helpers are only covered if they also draw from these global generators
# (their code is not visible from here).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [5]:
# %load_ext autoreload
# %autoreload 2

In [6]:
def global_find_shortest_path(source=None, target=None):
    '''
    Return the shortest path length between two users in the module-level graph `Gcc`.

    The graph is read from the global `Gcc` instead of being passed in, so every
    call measures distances on the same graph and the function can be used as a
    plain two-argument callback (e.g. inside a row-wise apply).

    source - source user
    target - target user

    Returns the path length reported by networkx, or -1 when either user is not
    a node of `Gcc` or when no path connects the two users. Any other error
    (interrupts, programming mistakes) is allowed to propagate.
    '''
    try:
        return nx.shortest_path_length(Gcc, source=source, target=target)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # Only the two "there is no answer" cases are mapped to the sentinel;
        # a bare except here would also hide KeyboardInterrupt and real bugs.
        return -1

### Load Retweet Network

Format of Retweet Network Dataframe:
- FromUser: ID of user who retweets
- ToUser: ID of user who got retweeted
- count: Number of retweets from FromUser to ToUser

In [7]:
# Retweet edge list: one row per (FromUser, ToUser) pair with its retweet count
path = "../dummy_data/network/retweet.csv"
net = pd.read_csv(filepath_or_buffer=path)

In [8]:
net.head()

Unnamed: 0,FromUser,ToUser,count
0,user76,user107,1.0
1,user1241,user483,1.0
2,user308,user1595,1.0
3,user189,user609,1.0
4,user820,user9,1.0


### Filter out interactions

Options:
- original: consider original retweet network
- active: consider retweet network of users with more than 1 retweet
- strong: consider retweet network of users with edge weight more than 1

In [9]:
# Select which interactions to keep; `keep` is one of the options listed above
# ("original", "active", "strong").
# NOTE(review): `t=2` is presumably the threshold used by the "active"/"strong"
# options and unused for "original" - confirm against keep_retweet_interactions.
net= keep_retweet_interactions(net, keep="original", t=2)

### Extract GCC of Network

In [10]:
# Build the graph used for all later sampling and distance computations.
# With verbose=True the helper prints the full network size and the GCC size.
Gcc = get_gcc_network(net, verbose=True)

Network size:
Graph with 1625 nodes and 993 edges

GCC size:
Graph with 50 nodes and 49 edges


### Load Tweet Texts

In [11]:
# Unlabelled tweets: author (nodeUserID), tweet ID (nodeID), topic and raw text
path = "../dummy_data/text/tweets.csv"
text_df = pd.read_csv(filepath_or_buffer=path)

In [12]:
text_df.head()

Unnamed: 0,nodeUserID,nodeID,topic,full_text
0,user624,text12,1,Made-up text from topic1 text12
1,user344,text39,1,Made-up text from topic1 text39
2,user357,text70,2,Made-up text from topic2 text70
3,user1364,text2,1,Made-up text from topic1 text2
4,user624,text81,2,Made-up text from topic2 text81


In [13]:
# Hand-annotated tweets: same columns as the tweet file plus a `stance` label
path = "../dummy_data/text/annotated_tweets.csv"
annotated_df = pd.read_csv(filepath_or_buffer=path)

In [14]:
annotated_df.head()

Unnamed: 0,nodeUserID,nodeID,topic,full_text,stance
0,user802,text101,3,Annotated text example,anti
1,user1375,text102,3,Annotated text example,pro
2,user259,text103,3,Annotated text example,pro
3,user585,text104,3,Annotated text example,anti
4,user1501,text105,3,Annotated text example,anti


### Text Preprocessing

In [15]:
# Keep only the first row seen for each tweet ID, then renumber the rows
text_df = text_df.drop_duplicates(subset=['nodeID'], keep='first').reset_index(drop=True)

In [16]:
# Derive the cleaned character string for both corpora with the same cleaner,
# so the two frames can later be compared on `clean_str`
for frame in (text_df, annotated_df):
    frame["clean_str"] = frame["full_text"].apply(clean_text)

In [17]:
# Remove URLs from tweets
# `clean_text` is the version of the tweet that is later attached to the
# sampled pairs (via map_text); `clean_str` is only used for filtering/deduping.
text_df['clean_text'] = text_df['full_text'].apply(just_text)

In [18]:
# Keep only tweets whose cleaned string has at least 20 characters
# (rows with a missing `clean_str` fail the comparison and are dropped too)
text_df = text_df.loc[text_df['clean_str'].str.len()>=20].reset_index(drop=True)

In [19]:
# Remove duplicated tweets: keep the first tweet for each distinct cleaned string.
# The match on `clean_str` is exact; it is "fuzzy" only to the extent that
# clean_text normalises the raw text. The index is not renumbered here.
text_df = text_df.drop_duplicates(subset=['clean_str']) # Fuzzy duplicate detection

In [20]:
# Remove from text_df every tweet whose cleaned string also occurs in the
# annotated set, so annotated examples cannot leak into the sampled pairs
overlap = annotated_df.merge(text_df, on='clean_str', how='inner')
drop_tweets = overlap['nodeID_y'].unique()  # `_y` columns come from text_df
keep_mask = ~text_df['nodeID'].isin(drop_tweets)
text_df = text_df.loc[keep_mask].reset_index(drop=True)

In [21]:
# Lookup tables keyed by tweet ID: cleaned display text and topic label
id_column = text_df["nodeID"]
map_text = dict(zip(id_column, text_df["clean_text"]))
map_topic = dict(zip(id_column, text_df["topic"]))

### Split Text Dataframes

In [22]:
# Users that are nodes of the GCC
network_users = set(Gcc.nodes())

# Users that both wrote at least one remaining tweet and are nodes of the GCC
posting_users = set(text_df["nodeUserID"])
engaged_users = list(posting_users & network_users)

print("Number of users in the network:", len(network_users))
print("Number of engaged users:", len(engaged_users))

Number of users in the network: 50
Number of engaged users: 34


In [23]:
# Tweets written by users who are also nodes of the GCC
posts_inside_network = text_df["nodeUserID"].isin(set(engaged_users))
df_engaged = text_df.loc[posts_inside_network, ["nodeUserID", "nodeID", "topic"]].reset_index(drop=True)
df_engaged.head()

Unnamed: 0,nodeUserID,nodeID,topic
0,user624,text12,1
1,user344,text39,1
2,user357,text70,2
3,user1364,text2,1
4,user624,text81,2


In [24]:
# Tweets written by users who post but are not nodes of the GCC
posts_inside_network = text_df["nodeUserID"].isin(set(engaged_users))
df_self = text_df.loc[~posts_inside_network, ["nodeUserID", "nodeID", "topic"]].reset_index(drop=True)
df_self.head()

Unnamed: 0,nodeUserID,nodeID,topic
0,self user624,self text12,1
1,self user344,self text39,1
2,self user357,self text70,2
3,self user1364,self text2,1
4,self user624,self text81,2


### Split Users into Train, Val and Test

In [25]:
# Return a list of users and corresponding topic labels 
# Done separately for users inside the network (X, y) and for users who only
# post (X_self, y_self) so the two groups are split independently below.
X, y = generate_topic_label_per_user(df_engaged)
X_self, y_self = generate_topic_label_per_user(df_self)

##### Sampling parameters

In [26]:
# Share of users assigned to the train, validation and test splits
train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15

# Ratio for the second-stage split of the users left over after the train
# split: the test share relative to the combined val + test share
test_size = test_ratio / (test_ratio + val_ratio)

In [27]:
# First-stage split: carve out the train users; the remainder (new_X / new_y)
# is divided into validation and test in the next step.
X_train, new_X, y_train, new_y = generate_train_test_split(
    X,
    y,
    size_ratio=train_ratio,
)

# Same split for the users who post but are not in the network
X_train_self, new_X_self, y_train_self, new_y_self = generate_train_test_split(
    X_self,
    y_self,
    size_ratio=train_ratio,
)

In [28]:
# Second-stage split: divide the leftover users into validation and test.
# NOTE(review): in the first-stage call `size_ratio=train_ratio` sizes the
# FIRST returned split (train). If that holds here, `test_size` sizes the
# validation split rather than the test split. The two coincide while
# val_ratio == test_ratio - confirm against generate_train_test_split before
# changing the ratios.
X_val, X_test, y_val, y_test = generate_train_test_split(
    new_X,
    new_y,
    size_ratio=test_size,
)

# Same split for the users who post but are not in the network
X_val_self, X_test_self, y_val_self, y_test_self = generate_train_test_split(
    new_X_self,
    new_y_self,
    size_ratio=test_size,
)

In [29]:
# Report, per split, how many users are inside vs. outside the network
for split_name, in_network, not_in_network in (
    ("train", X_train, X_train_self),
    ("val", X_val, X_val_self),
    ("test", X_test, X_test_self),
):
    print(f"Number of users in {split_name} split: {len(in_network)} in network and {len(not_in_network)} not in network")

Number of users in train split: 23 in network and 23 not in network
Number of users in val split: 5 in network and 5 not in network
Number of users in test split: 6 in network and 6 not in network


### Randomly Sample User Pairs

##### Parameters for split size

In [30]:
n = 200 # total number of examples

# Pair budget per split: 70% train, 15% validation, 15% test of the n pairs
n_train, n_val, n_test = (round(n * share) for share in (0.7, 0.15, 0.15))

print(n_train, n_val, n_test)

140 30 30


##### Compute weighted probability of topics from the train split

In [31]:
# All train users: those inside the network followed by those who only post.
# NOTE(review): `+` concatenates only if both are lists - presumably what
# generate_train_test_split returns; confirm.
train_users = X_train + X_train_self

In [32]:
# Topic weights are computed from the train users only, so the validation and
# test users do not influence the weights used for stratification.
topic_weights = compute_topic_probability(text_df, user_list=train_users)

##### Sampling parameters for user pairs

In [34]:
# Settings handed unchanged to get_pairs_from_topics for every split.
params = {
    # Parameters for distant user pairs
    "n_users":100, # Number of random users to sample at each iteration 
    "n_pairs":100, # Number of pairs to sample from n_users combinations
    
    # Parameters for close user pairs
    "max_onehop_pairs":2, # The number of 1-hop neighbors to draw for each seed user
    "max_twohop_pairs":2, # The number of 2-hop neighbors to draw for each seed user
    # NOTE(review): the description below repeats the "2-hop neighbors" wording
    # of max_twohop_pairs; presumably this caps how many neighbors are explored
    # per seed user - confirm against get_pairs_from_topics.
    "max_neighbors":7, # The number of 2-hop neighbors to explore for each seed user

    "frac_one_hop":0.5, # Fraction of 1-hop neighbors out of close pairs size 
    "close_pairs_split":0.45, # Fraction of pairs from close connections out of all connections
    "topic_weights":topic_weights # topic weights for stratification
}

##### Sample Train Split

In [35]:
%%time
df_close_train, df_distant_train = get_pairs_from_topics(Gcc, 
                                                         text_df, 
                                                         users=X_train, 
                                                         network_users=network_users,  
                                                         params=params, 
                                                         n=n_train, 
                                                         non_network_users=X_train_self)

Processing Topic: 1
# 0-hop pairs:14


100%|████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 10770.41it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 3797.06it/s]


# 1-hop pairs:0
# 2-hop pairs:20
# Distant pairs:52
----------------------------------------------------------------------------------------------------

Processing Topic: 2
# 0-hop pairs:7


100%|████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 16905.70it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 5901.65it/s]

# 1-hop pairs:0
# 2-hop pairs:10
# Distant pairs:25
----------------------------------------------------------------------------------------------------

CPU times: user 202 ms, sys: 14.5 ms, total: 216 ms
Wall time: 209 ms





In [36]:
len(df_close_train), len(df_distant_train)

(51, 77)

##### Sample Val Split

In [37]:
%%time
df_close_val, df_distant_val = get_pairs_from_topics(Gcc, 
                                                     text_df, 
                                                     users=X_val, 
                                                     network_users=network_users,  
                                                     params=params, 
                                                     n=n_val, 
                                                     non_network_users=X_val_self)

Processing Topic: 1
# 0-hop pairs:3


100%|███████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 8709.10it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 4110.45it/s]


# 1-hop pairs:0
# 2-hop pairs:4
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
# Distant pairs:6
----------------------------------------------------------------------------------------------------

Processing Topic: 2
# 0-hop pairs:1


100%|██████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 11554.56it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 5318.22it/s]

# 1-hop pairs:0
# 2-hop pairs:3
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
# Distant pairs:0
----------------------------------------------------------------------------------------------------

CPU times: user 85.3 ms, sys: 8.72 ms, total: 94 ms
Wall time: 85.5 ms





In [38]:
len(df_close_val), len(df_distant_val)

(11, 6)

##### Sample Test Split

In [39]:
%%time
df_close_test, df_distant_test = get_pairs_from_topics(Gcc, 
                                                       text_df,  
                                                       users=X_test, 
                                                       network_users=network_users,  
                                                       params=params, 
                                                       n=n_test, 
                                                       non_network_users=X_test_self)

Processing Topic: 1
# 0-hop pairs:3


100%|███████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 7423.55it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 4211.99it/s]


# 1-hop pairs:0
# 2-hop pairs:5
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
# Distant pairs:5
----------------------------------------------------------------------------------------------------

Processing Topic: 2
# 0-hop pairs:1


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 11915.64it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 5769.33it/s]

# 1-hop pairs:0
# 2-hop pairs:3
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
Distant pairs added to candidate pool: 0...
# Distant pairs:3
----------------------------------------------------------------------------------------------------

CPU times: user 88.7 ms, sys: 26.2 ms, total: 115 ms
Wall time: 107 ms





### Compute Shortest Path Lengths

In [40]:
%%time
df_distant_train["sp"] = df_distant_train.progress_apply(lambda x: global_find_shortest_path(x["source"],x["target"]), axis=1)

100%|█████████████████████████████████████████████████████████████████████████████████████████| 77/77 [00:00<00:00, 7652.57it/s]

CPU times: user 13.2 ms, sys: 1.98 ms, total: 15.2 ms
Wall time: 13.7 ms





In [41]:
%%time
df_distant_val["sp"] = df_distant_val.progress_apply(lambda x: global_find_shortest_path(x["source"],x["target"]), axis=1)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 2257.02it/s]

CPU times: user 6.1 ms, sys: 925 µs, total: 7.03 ms
Wall time: 5.96 ms





In [42]:
%%time
df_distant_test["sp"] = df_distant_test.progress_apply(lambda x: global_find_shortest_path(x["source"],x["target"]), axis=1)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 3579.90it/s]

CPU times: user 5.31 ms, sys: 1.26 ms, total: 6.57 ms
Wall time: 5.39 ms





### Prepare Data Splits

In [43]:
# Stack the close pairs on top of the distant pairs for each split,
# renumbering the rows from 0
train, val, test = (
    pd.concat([close_pairs, distant_pairs], ignore_index=True)
    for close_pairs, distant_pairs in (
        (df_close_train, df_distant_train),
        (df_close_val, df_distant_val),
        (df_close_test, df_distant_test),
    )
)

In [44]:
train.shape, val.shape, test.shape

((128, 5), (17, 5), (20, 5))

In [45]:
# Shuffle the rows of each split (train first, then val, then test) so close
# and distant pairs are interleaved, and renumber the rows from 0
train, val, test = (
    split.sample(frac=1).reset_index(drop=True)
    for split in (train, val, test)
)

In [46]:
# Attach the cleaned text of both tweets of every pair, looked up by tweet ID
for split in (train, val, test):
    split["tweet1"] = split["source_tweet"].map(map_text)
    split["tweet2"] = split["target_tweet"].map(map_text)

In [47]:
# Attach the topic label of both tweets of every pair, looked up by tweet ID
for split in (train, val, test):
    split["tweet1_topic"] = split["source_tweet"].map(map_topic)
    split["tweet2_topic"] = split["target_tweet"].map(map_topic)

### Save Data Splits

In [None]:
save_dir = "../dummy_data/data_splits"

# Make sure the output folder exists before anything is written into it;
# `to_csv` does not create missing parent directories. `exist_ok=True` keeps
# the cell safe to re-run.
os.makedirs(save_dir, exist_ok=True)

In [None]:
# Save data splits
save_train = os.path.join(save_dir, "train.csv")
save_val = os.path.join(save_dir, "val.csv")
save_test = os.path.join(save_dir, "test.csv")

# Existing files are never overwritten. The skip is reported so that a stale
# file left over from an earlier run is not mistaken for freshly saved data.
for split_df, save_path in ((train, save_train), (val, save_val), (test, save_test)):
    if os.path.exists(save_path):
        print("skipped (already exists)", save_path)
        continue
    split_df.to_csv(save_path, index=False)
    print("saved", save_path)