In [1]:
import pandas as pd
import numpy as np
import re
import json
import networkx as nx
from networkx.algorithms import bipartite 
import matplotlib.pyplot as plt

In [2]:
# load tweet topic data
tweet_topic = pd.read_csv("../../../data/tweet_topic.csv")

# create topic dictionary
topic_assign = pd.read_csv("../../../data/topic_dictionary.csv")
topic_dict = dict(zip(topic_assign['id_topic'], topic_assign['label']))

# load all tweet data
tweets = pd.read_csv("../../../data/tweets.csv")
tweets['hashtag'] = tweets['full_text'].apply(lambda x: re.findall(r'#(\w+)', x))
tweets["is_reply"] = [int(~np.isnan(tweet)) for tweet in tweets["in_reply_twitter_id"]]
tweets = tweets.rename(columns = {'id':'id_tweet'})

# load users
users = pd.read_csv("../../../data/users.csv")
user_dict = dict(zip(users['userid'],users['username']))
tweet_user_dict = dict(zip(tweets['id_tweet'],tweets['user_id']))

# load hashtags
tweet_hashtag = pd.read_csv("../../../data/tweet_hashtag.csv")
hashtag = pd.read_csv("../../../data/hashtag.csv")

In [3]:
# load the full topic assignment from the lda
# topics are already processed and are between 1 and 10.
with open('../../../ColombianPoliticsSentiment/final_lda_model/topic_assign_full.json') as f:
    tweet_topics = [[(int(i[0]),int(i[1]),float(i[2])) for i in t] for t in json.load(f)]

In [4]:
# filter topics with score > 20 and normalise scores
filt_topics = [[j for j in b if j[2] > 0.2] for b in tweet_topics]

# define edges between all tweets and topics 
edges = []
for idx, t in enumerate(filt_topics):
    score_sum = sum([i[2] for i in t])
    edges += [(i[0],str("t_"+str(i[1])),i[2]/score_sum) for i in t]

In [5]:
tweet_ids = tweets['id_tweet'][tweets['is_reply'] == 0]

In [6]:
# subset edges for original tweets only
edges_tweets_only = [i for i in edges if i[0] in tweet_ids]

In [7]:
# create the network between tweets and topics only for original tweets
G = nx.Graph()
G.add_weighted_edges_from(edges_tweets_only)

In [35]:
# add main topic attribute to each node
# create dictionary between tweet and respective main topic
tweet_to_topic_dict = dict(zip(tweet_topic['id_tweet'],tweet_topic['id_topic']))
adapt_topic_dict = dict(zip(["t_"+str(j) for j in range(1,11)],list(topic_dict.values())))

# add attributes
for i in G.nodes:
    try: 
        G.nodes[i]['topic'] = adapt_topic_dict[i]
        G.nodes[i]['main_topic'] = topic_dict[tweet_to_topic_dict[i]]
        G.nodes[i]['author'] = user_dict[tweet_user_dict[i]]
    except:
        pass

t_1


In [49]:
# get all tweet ids to create the nodes
tweet_nodes = list(np.unique([j[0] for j in edges_tweets_only]))
# project network on tweets
tweet_proj = bipartite.weighted_projected_graph(G,nodes = tweet_nodes)

In [50]:
print(len(tweet_proj.nodes()))
print(len(tweet_proj.edges()))

1341
340638


In [51]:
# save networks
nx.write_graphml(tweet_proj,"projected_tweet_topic_network.graphml")
nx.write_graphml(G,"tweet_topic_network.graphml")