In [None]:
import numpy as np
import pandas as pd

In [None]:
# Directory that holds the collected Twitter CSV files. Every file this notebook
# reads or writes is named "<trend>_<suffix>.csv" inside this directory.
path = './twitter_data'
# File-name prefix that selects which data set to process.
# NOTE(review): '???' looks like a placeholder — set a real prefix before running.
trend = '???'

In [None]:
# Load the user records for the selected trend and print the column summary.
user_data_org = pd.read_csv(f'{path}/{trend}_user_data.csv')
user_data_org.info()

In [None]:
# Preview the first two rows of the raw user table.
user_data_org.head(2)

In [None]:
# Keep only the columns needed to build the user nodes.
# No de-duplication is done here: drop_duplicates() returns a new frame, so a bare
# call whose result is not assigned has no effect. Rows are left exactly as loaded
# because a later cell compares user_data's row count with the tweet table and
# copies `user_screen_name` across by index; duplicate nodes are removed when the
# node lists are combined.
user_data = user_data_org.filter(['user_id','user_screen_name','followers_count', 'friends_count','user_verified'])
user_data.head(2)

In [None]:
# Node-size heuristic: a base value plus bonuses for verification, followers and friends.
def calc_weight(row):
    """Return the node size for one user record.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['user_verified']``, ``row['followers_count']`` and
        ``row['friends_count']`` (for example a DataFrame row).

    Returns
    -------
    int
        50 as the base size, plus 100 if ``user_verified`` is truthy, plus a
        follower bonus (0/25/50/75/100) and a friend bonus (0/50/100/150/200).
    """
    def tier_bonus(count, tiers, top_bonus):
        # The first tier whose upper bound exceeds the count decides the bonus;
        # a count that is below none of the bounds receives the top bonus.
        for upper_bound, bonus in tiers:
            if count < upper_bound:
                return bonus
        return top_bonus

    # (exclusive upper bound, bonus) pairs, in ascending order.
    follower_tiers = ((50, 0), (100, 25), (500, 50), (1000, 75))
    friend_tiers = ((25, 0), (100, 50), (500, 100), (1000, 150))

    size = 150 if row['user_verified'] else 50
    size += tier_bonus(row['followers_count'], follower_tiers, 100)
    size += tier_bonus(row['friends_count'], friend_tiers, 200)
    return size

In [None]:
# Compute the display size of every user row with calc_weight.
user_data['size'] = user_data.apply(calc_weight, axis=1)
user_data.tail(2)

In [None]:
# First node list: one row per user record, reduced to the node name and its size.
node_list_1 = (
    user_data
    .drop(columns=['user_id','followers_count','friends_count','user_verified'])
    .rename(columns={'user_screen_name':'name'})
)
node_list_1.head(2)

In [None]:
# Load the collected tweets for the selected trend and print the column summary.
tweet_data_org = pd.read_csv(f'{path}/{trend}_tweets.csv')
tweet_data_org.info()

In [None]:
# Preview the first two rows of the raw tweet table.
# No de-duplication is done here: drop_duplicates() returns a new frame, so a bare
# call whose result is not assigned has no effect. The table is left row-for-row
# as loaded because a later cell may align it with user_data by row count and
# index; duplicate edges are removed when the edge lists are combined.
tweet_data_org.head(2)

In [None]:
# Select the columns that describe who interacted with whom in each tweet.
interaction_columns = ['tweet_id','in_reply_to_name','user_mentions_name','user_id']
if 'user_screen_name' in tweet_data_org.columns:
    tweet_data = tweet_data_org.filter(interaction_columns + ['user_screen_name'])
else:
    # The tweet file carries no author screen name, so take it from user_data.
    tweet_data = tweet_data_org.filter(interaction_columns)
    if tweet_data.shape[0] == user_data.shape[0]:
        # Same number of rows: copy the names across (aligned on the row index).
        tweet_data['user_screen_name'] = user_data['user_screen_name']
    elif 'user_id' in tweet_data.columns and 'user_id' in user_data.columns:
        # Row counts differ: look each author up by user_id instead of by row.
        # The lookup index must be unique, hence the de-duplication on user_id.
        names_by_id = (
            user_data
            .drop_duplicates(subset='user_id')
            .set_index('user_id')['user_screen_name']
        )
        tweet_data['user_screen_name'] = tweet_data['user_id'].map(names_by_id)
    else:
        # Fail here with a clear message; continuing without the column would
        # only surface later as an unrelated attribute error.
        raise ValueError(
            "Unmatched dimension: tweets and user data have different row counts "
            "and no shared 'user_id' column to look up user_screen_name"
        )
tweet_data.tail(2)

In [None]:
# Keep only tweets that reply to or mention someone, i.e. drop the rows in which
# in_reply_to_name and user_mentions_name are both NaN. The two columns are named
# explicitly so the result does not depend on how many other columns the frame
# has or on whether those other columns contain missing values.
tweet_data = tweet_data.dropna(subset=['in_reply_to_name', 'user_mentions_name'], how='all')
tweet_data.head(2)

In [None]:
# Edge-weight heuristic: 15 points for a mention and 15 points for a reply.
def check_mentions_replies(row):
    """Return the interaction weight of one tweet row.

    Parameters
    ----------
    row : object with attributes ``user_screen_name``, ``user_mentions_name``
        and ``in_reply_to_name`` (for example a DataFrame row).

    Returns
    -------
    int
        0, 15 or 30. A mention scores 15 when it is present and names someone
        other than the author. A reply scores 15 when it is present, targets
        someone other than the author, and targets someone other than the
        mentioned name.
    """
    mentioned = row.user_mentions_name
    mention_counts = pd.notna(mentioned) and mentioned != row.user_screen_name

    replied_to = row.in_reply_to_name
    reply_counts = (
        pd.notna(replied_to)
        and replied_to != row.user_screen_name
        and replied_to != mentioned
    )

    return 15 * bool(mention_counts) + 15 * bool(reply_counts)

In [None]:
# Score every tweet row with check_mentions_replies.
tweet_data['weight'] = tweet_data.apply(check_mentions_replies, axis=1)
tweet_data.head(2)

In [None]:
def _interaction_edge(row):
    """Return the (author, target) pair for one tweet row.

    The mentioned user is the target when a mention is present and names someone
    other than the author; otherwise the reply target is used. A self-mention is
    skipped so that a tweet whose weight comes from the reply is not recorded as
    an author-to-author self-loop.
    """
    author = row.user_screen_name
    mentioned = row.user_mentions_name
    if pd.notna(mentioned) and mentioned != author:
        return (author, mentioned)
    return (author, row.in_reply_to_name)

# One edge per tweet: (author, mentioned user) or (author, replied-to user).
tweet_data['edge'] = tweet_data.apply(_interaction_edge, axis=1)
tweet_data.head(2)

In [None]:
# First edge list: the (author, target) pair and weight of every tweet that scored above zero.
has_weight = tweet_data['weight'] > 0
edge_data_1 = tweet_data.loc[has_weight].filter(['edge','weight'])
edge_data_1.head(2)

In [None]:
# Load the retweeter records for the selected trend and print the column summary.
retweeters_org = pd.read_csv(f'{path}/{trend}_retweeters.csv')
retweeters_org.info()

In [None]:
# Work on a de-duplicated copy; the loaded frame retweeters_org is left unchanged.
retweeters = retweeters_org.drop_duplicates()
retweeters.head(2)

In [None]:
# Second node list: every retweeter becomes a node with the fixed size 50.
node_list_2 = (
    retweeters
    .filter(['retweeter_name'])
    .rename(columns={'retweeter_name': 'name'})
    .assign(size=50)
)
node_list_2.head(2)

In [None]:
# Every retweet becomes an (original author, retweeter) edge with the fixed weight 25.
# assign() builds a new frame rather than writing columns into `retweeters`, which
# is the result of drop_duplicates() and may trigger pandas' SettingWithCopyWarning
# when modified directly. The pairs are built column-wise with zip, which avoids a
# Python-level call per row and also works when the frame has no rows.
retweeters = retweeters.assign(
    weight=25,
    edge=list(zip(retweeters['orig_user_name'], retweeters['retweeter_name'])),
)
retweeters.tail(2)

In [None]:
# Second edge list: the (original author, retweeter) pair and weight of every retweet
# whose weight is above zero.
positive_weight = retweeters['weight'] > 0
edge_data_2 = retweeters.loc[positive_weight].filter(['edge','weight'])
edge_data_2.head(2)

In [None]:
# Load the friend records for the selected trend and print the column summary.
friends_org = pd.read_csv(f'{path}/{trend}_friends.csv')
friends_org.info()

In [None]:
# Third node list: every friend becomes a node with the fixed size 50.
# friends_org itself is replaced by its de-duplicated version because the
# friend edge list is also built from it.
friends_org = friends_org.drop_duplicates()
node_list_3 = (
    friends_org
    .filter(['friend_name'])
    .rename(columns={'friend_name': 'name'})
    .assign(size=50)
)
node_list_3.head(2)

In [None]:
# Third edge list: one (user, friend) edge per friend record, fixed weight 15.
# The pairs are built column-wise with zip, which avoids a Python-level call per
# row and also works when the frame has no rows.
friends_edges = friends_org.filter(['orig_user_name','friend_name'])
friends_edges = friends_edges.assign(
    weight=15,
    edge=list(zip(friends_edges['orig_user_name'], friends_edges['friend_name'])),
)
edge_data_3 = friends_edges.filter(['edge','weight'])
edge_data_3.tail(2)

In [None]:
# Load the follower records for the selected trend and print the column summary.
followers_org = pd.read_csv(f'{path}/{trend}_followers.csv')
followers_org.info()

In [None]:
# Fourth node list: every follower becomes a node with the fixed size 50.
# followers_org itself is replaced by its de-duplicated version because the
# follower edge list is also built from it.
followers_org = followers_org.drop_duplicates()
node_list_4 = (
    followers_org
    .filter(['follower_name'])
    .rename(columns={'follower_name': 'name'})
    .assign(size=50)
)
node_list_4.head(2)

In [None]:
# Fourth edge list: one (user, follower) edge per follower record, fixed weight 10.
# The pairs are built column-wise with zip, which avoids a Python-level call per
# row and also works when the frame has no rows.
followers_edges = followers_org.filter(['orig_user_name','follower_name'])
followers_edges = followers_edges.assign(
    weight=10,
    edge=list(zip(followers_edges['orig_user_name'], followers_edges['follower_name'])),
)
edge_data_4 = followers_edges.filter(['edge','weight'])
edge_data_4.tail(2)

In [None]:
# Stack the four node lists (users, retweeters, friends, followers) into one table.
# The same name can appear several times at this point.
node_list = pd.concat([node_list_1, node_list_2,node_list_3, node_list_4])
node_list.info()

In [None]:
# Reduce the combined list to one row per node name.
# After sorting by name and then size, the last row of each name carries the
# largest size, so keep='last' retains the largest size per node.
node_list = node_list.drop_duplicates()
node_list_final = (
    node_list
    .sort_values(by=['name', 'size'])
    .drop_duplicates(subset=['name'], keep='last')
)
node_list_final.info()

In [None]:
# Stack the four edge lists (mentions/replies, retweets, friends, followers) into one table.
# The same edge can appear several times at this point.
edge_data = pd.concat([edge_data_1,edge_data_2,edge_data_3,edge_data_4])
edge_data.info()

In [None]:
# Reduce the combined list to one row per edge.
# After sorting by edge and then weight, the last row of each edge carries the
# largest weight, so keep='last' retains the largest weight per edge.
edge_data = edge_data.drop_duplicates()
edge_data_final = (
    edge_data
    .sort_values(by=['edge', 'weight'])
    .drop_duplicates(subset=['edge'], keep='last')
)
edge_data_final.info()

In [None]:
# Write the final node and edge tables next to the input files, without the
# DataFrame index. The 'edge' column holds Python tuples, so it is written in
# its string form.
node_list_final.to_csv(f'{path}/{trend}_node_list.csv', index = False)
edge_data_final.to_csv(f'{path}/{trend}_edge_data.csv', index = False)