In [1]:
# Import of libraries
import pandas as pd
import numpy as np
import datetime
import time
import json
import re
import glob
import os

# Definition of Functions

In [2]:
# Function returns dataframe consisting of the separate files in specified directory

def import_data_from_folder(path):
    """Load every ``*.json`` file found directly in ``path`` into one DataFrame.

    Parameters
    ----------
    path : str
        Directory that is searched (non-recursively) for ``*.json`` files.

    Returns
    -------
    pandas.DataFrame
        The rows of all files stacked on top of each other, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If ``path`` contains no ``*.json`` file (e.g. wrong working directory).
    """
    # glob returns matches in arbitrary order; sorting makes the row order
    # of the combined frame reproducible between runs and machines.
    json_files = sorted(glob.glob(os.path.join(path, "*.json")))

    # Fail with a message that names the folder instead of pandas' generic
    # "No objects to concatenate".
    if not json_files:
        raise ValueError(f"No *.json files found in {path!r} - nothing to import")

    frames = [pd.read_json(json_file) for json_file in json_files]

    # ignore_index=True renumbers the rows directly, so no helper 'index'
    # column has to be created and dropped afterwards.
    return pd.concat(frames, ignore_index=True)

In [3]:
# Function returns a list containing all hashtags in the given list of JSON objects

def extract_hashtags(hashtag_ls:list):
    """Collect the ``'tag'`` value of every hashtag object in ``hashtag_ls``.

    Objects without a ``'tag'`` key contribute an empty string, so the result
    always has the same length as the input.
    """
    tags = []
    for hashtag in hashtag_ls:
        tags.append(hashtag.get('tag', ''))
    return tags

In [4]:
# Function cleans twitter_query string 

def clean_twitter_query(query:str):
    """Return ``query`` with every occurrence of ``from:`` removed."""
    from_operator = re.compile('from:')
    return from_operator.sub('', query)

In [5]:
# #DEPRECATED
# # Calculating the Nodes for the network
# def get_griffin_nodes(tweet_df:pd.DataFrame):

#     nodes_ls = []

#     for index, user in data.iterrows():
#         #print(user.get('public_metrics').get())
#         node = {}
        
#         # Author_id is only used for later joining
#         node['author_id'] =  user.get('id','')
#         node['id'] = user.get('username','')
#         node['name'] = user.get('name','')

#         # Favourites Counts is not provided by Twitter API V2
#         node['favourites_count'] = 0 # TODO: still needs to be added; the Twitter API does not return it
#         node['followers_count'] = user.get("public_metrics",{}).get('followers_count','')
#         node['friends_count'] =  user.get("public_metrics",{}).get('following_count','')
#         node['listed_count']= user.get("public_metrics",{}).get('listed_count','')
#         node['statuses_count'] = user.get("public_metrics",{}).get('tweet_count','')
#         node['twitter_query'] = re.sub('from:','',user.get("twitter_query",''))

#         nodes_ls.append(node)

#     nodes_df = pd.DataFrame(nodes_ls)
    
#     return nodes_df

In [6]:
# Calculating the Nodes for the network
# Calculating the Nodes for the network
def get_griffin_nodes(data:pd.DataFrame):
    """Build the Griffin node table (one row per user) from the user data.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'id', 'username', 'name', 'public_metrics'
        and 'twitter_query'. Every 'public_metrics' entry is flattened with
        ``pd.json_normalize`` and has to supply 'followers_count',
        'following_count', 'listed_count' and 'tweet_count'.

    Returns
    -------
    pandas.DataFrame
        An independent frame with the columns 'author_id', 'id', 'name',
        'favourites_count', 'followers_count', 'friends_count',
        'listed_count', 'statuses_count' and 'twitter_query', one row per
        'author_id'. ``data`` itself is not modified.
    """
    # reset_index gives a new frame with a clean 0..n-1 index. That avoids
    # writing into a slice of `data` and guarantees that the rows line up
    # with the json_normalize result below, whatever index `data` carried.
    nodes_df = data[[
        'id',
        'username',
        'name',
        'public_metrics',
        'twitter_query'
    ]].reset_index(drop=True)

    # Cleaning up 'twitter_query' column
    nodes_df['twitter_query'] = nodes_df['twitter_query'].apply(clean_twitter_query)

    # Extracting public_metrics out of the user dataframe. Passing a plain
    # list always yields a 0..n-1 index, independent of the pandas version.
    public_metrics_df = pd.json_normalize(nodes_df['public_metrics'].tolist())
    nodes_df = nodes_df.merge(public_metrics_df, how='inner', left_index=True, right_index=True)

    # Adding favourites_count (filled with 0 as a placeholder)
    nodes_df['favourites_count'] = 0

    # Renaming the columns
    nodes_df = nodes_df.rename(columns={
        'id': 'author_id',
        'username': 'id',
        'following_count': 'friends_count',
        'tweet_count': 'statuses_count',
    })

    # Removing duplicates
    # Assumption: ordering by statuses (total tweet) count puts the most
    # recent record of a user last (ascending order), so keep='last' retains it.
    nodes_df = (
        nodes_df
        .sort_values(by=['author_id', 'statuses_count'], ascending=True)
        .drop_duplicates(subset='author_id', keep='last')
        .reset_index(drop=True)
    )

    # Reordering the columns; .copy() hands back an independent frame so that
    # callers can modify the result without a SettingWithCopyWarning.
    return nodes_df[[
        'author_id',
        'id',
        'name',
        'favourites_count',
        'followers_count',
        'friends_count',
        'listed_count',
        'statuses_count',
        'twitter_query'
    ]].copy()

In [7]:
def clean_nodes(df):
    """Strip unwanted characters from the 'name' column of ``df``.

    Removes escaped and real tab/newline/carriage-return characters, double
    and single quotes and the sequence ' &amp;', then turns every ';' into ','.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'name' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient chaining.
    """
    # (regex pattern, replacement) pairs, applied in this order.
    # ' &amp;' must be handled before ';' is rewritten, otherwise the entity
    # would no longer match.
    substitutions = [
        (r"\\t|\\n|\\r", ""),   # literal backslash sequences
        ("\t|\n|\r", ""),       # actual control characters
        ('"', ""),
        ("'", ""),
        (" &amp;", ""),
        (";", ","),
    ]

    cleaned = df['name']
    for pattern, replacement in substitutions:
        cleaned = cleaned.replace(to_replace=pattern, value=replacement, regex=True)

    # Assign the result back explicitly: an in-place replace on df['name']
    # acts on a temporary object and is not guaranteed to update df itself.
    df['name'] = cleaned
    return df

In [8]:
def clean_edges(df):
    """Clean the text and name columns of an edge frame and parse 'created_at'.

    * 'text': removes escaped and real tab/newline/carriage-return characters,
      double quotes and '&amp;' entities, then turns every ';' into ','.
    * 'src_name' / 'dst_name': removes single quotes and turns ';' into ','.
    * 'created_at': converted with ``pd.to_datetime`` using the format
      '%Y-%m-%dT%H:%M:%S.%f'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'text', 'src_name', 'dst_name' and
        'created_at'. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient chaining.
    """
    def substitute(column, substitutions):
        # Apply the (regex pattern, replacement) pairs one after another and
        # assign the result back explicitly; an in-place replace on
        # df[column] is not guaranteed to update df itself.
        cleaned = df[column]
        for pattern, replacement in substitutions:
            cleaned = cleaned.replace(to_replace=pattern, value=replacement, regex=True)
        df[column] = cleaned

    # clean twitter text from unwanted characters
    # The '&amp;' patterns have to run BEFORE ';' is rewritten to ',':
    # afterwards the entity reads '&amp,' and would never match.
    substitute('text', [
        (r"\\t|\\n|\\r", ""),   # literal backslash sequences
        ("\t|\n|\r", ""),       # actual control characters
        ('"', ""),
        (" &amp;", ""),
        ("&amp;", ""),
        (";", ","),
    ])

    # clean source- and destination-name from unwanted characters
    name_substitutions = [("'", ""), (";", ",")]
    substitute('src_name', name_substitutions)
    substitute('dst_name', name_substitutions)

    # format the datetime to be griffin readable
    df['created_at'] = pd.to_datetime(df["created_at"], format="%Y-%m-%dT%H:%M:%S.%f")
    return df

# Importing Data

In [9]:
# Importing all files into dataframes

tweet_df = import_data_from_folder("../Data/Tweets/")
user_df = import_data_from_folder("../Data/Users/")
#ref_tweet_df = import_data_from_folder("../Data/Retweets/")
#media_df = import_data_from_folder("../Data/Media/")
#place_df = import_data_from_folder("../Data/Place/")


ValueError: No objects to concatenate

# Calculation of Griffin Nodes

In [None]:
nodes_df = get_griffin_nodes(user_df)
#nodes_df.drop(columns='author_id', inplace=True)


# Tweet Data Cleaning (for calculating Edges)

In [None]:
# Extract Tweet type and discard all retweets --> For constructing the network, retweets only lead to a high degree
# Computationally the calculation of the network is still very very costly
tweet_df = tweet_df.explode(column='referenced_tweets')
tweet_df

In [None]:
def is_retweet(ref_tweet:dict):
    """Tell whether a referenced-tweet entry marks a plain retweet.

    Parameters
    ----------
    ref_tweet : dict or missing value
        One entry of the 'referenced_tweets' column. Anything that is not a
        dict (None, NaN, ...) is treated as "no referenced tweet".

    Returns
    -------
    bool
        True only if ``ref_tweet`` is a dict whose 'type' is 'retweeted'.
    """
    # A missing entry can show up as NaN, which is a *truthy* float, so a
    # plain truthiness test would let it through and then fail on `.get`.
    if not isinstance(ref_tweet, dict):
        return False

    return ref_tweet.get('type') == 'retweeted'

In [None]:
# Flag each row by passing its referenced tweet to is_retweet
tweet_df['retweet'] = tweet_df['referenced_tweets'].apply(is_retweet)
#tweet_df

In [None]:
# Drop all (actual) Retweets from tweet_df
# Data set then contains Replies, Quoted Tweets (commented Retweets) and regular Tweets
# .copy() makes tweet_cleaned_df independent of tweet_df: the following cells
# modify it in place, which would otherwise raise SettingWithCopyWarning.
tweet_cleaned_df = tweet_df.loc[tweet_df['retweet'].eq(False)].copy()
#tweet_cleaned_df

# Calculation of Griffin Edges

Apparently there are some Tweets without entities (mentions AND hashtags). Those can be dropped, because they would be displayed as single points in the network or as reference to the twitter search query.

Note: We also filtered out / do not calculate the connections between nodes and the twitter_search_query. These would be filtered out in the first steps in Griffin anyway.

In [None]:
# Dropping Tweets without any entities / mentions
# Reassign instead of inplace=True: tweet_cleaned_df was created by filtering
# tweet_df, and modifying such a slice in place triggers SettingWithCopyWarning.
tweet_cleaned_df = tweet_cleaned_df.dropna(subset=['entities'])
len(tweet_cleaned_df)

In [None]:
# Drop duplicate tweets (based on same author_id and text), keeping the first occurrence
# Reassign instead of inplace=True so the cell never mutates a filtered slice in place.
tweet_cleaned_df = tweet_cleaned_df.drop_duplicates(subset=['author_id', 'text'], keep='first')
tweet_cleaned_df.info()

## Extracting Mentions and Hashtags

In [None]:
# Extract mentions and hashtags list from entities
tweet_cleaned_df['mentions'] = tweet_cleaned_df['entities'].apply(lambda entity: entity.get('mentions'))
tweet_cleaned_df['hashtags_ls'] = tweet_cleaned_df['entities'].apply(lambda entity: entity.get('hashtags'))
tweet_cleaned_df.head()

In [None]:
# Dropping all rows which don't contain any mentions, because they will appear as points in the network
# Reassign instead of inplace=True; passing `subset` as a list works on every pandas version.
tweet_cleaned_df = tweet_cleaned_df.dropna(axis=0, subset=['mentions'])
len(tweet_cleaned_df)

In [None]:
# Expand all mentions: one row per entry of the 'mentions' list, renumbered from 0
tweet_cleaned_df = (
    tweet_cleaned_df
    .explode(column='mentions')
    .reset_index(drop=True)
)
len(tweet_cleaned_df)

In [None]:
# Extract username and ids of mentioned users into Griffin edge attributes dst, dst_screen_name, dst_id_str
tweet_cleaned_df['dst'] = tweet_cleaned_df['mentions'].apply(lambda mention: mention.get('username',''))
tweet_cleaned_df['dst_screen_name'] = tweet_cleaned_df['mentions'].apply(lambda mention: mention.get('username',''))
tweet_cleaned_df['dst_id_str'] = tweet_cleaned_df['mentions'].apply(lambda mention: mention.get('id',''))

In [None]:
# Extract hashtags from tweets into list for each Tweet
tweet_cleaned_df['hashtags'] = tweet_cleaned_df[tweet_cleaned_df['hashtags_ls'].notna()]['hashtags_ls'].apply(lambda ls: extract_hashtags(ls))

## Dropping irrelevant columns (for Griffin Edges)

In [None]:
# Drop irrelevant columns for Griffin
# errors='ignore': several of these columns only exist when at least one
# imported tweet carried that field, so a missing one must not abort the cell.
irrelevant_columns = [
    'entities',
    'id',
    'referenced_tweets',
    'lang',
    'public_metrics',
    'conversation_id',
    'in_reply_to_user_id',
    'attachments',
    'geo',
    'withheld',
    'mentions',
    'hashtags_ls',
    'retweet',
]
edges_df = (
    tweet_cleaned_df
    .drop(columns=irrelevant_columns, errors='ignore')
    .rename(columns={'author_id': 'src_id_str'})
)

# 'int64' instead of int: user ids need 64 bits, and plain int can map to a
# 32-bit integer depending on platform and numpy version.
edges_df['dst_id_str'] = edges_df['dst_id_str'].astype('int64')
#edges_df.head(5)

In [None]:
edges_df.info()

## Merging Nodes (src and dst) to create griffin edges

In [None]:
# Create Dataframe containing the required fields for SRC nodes,
# with every column renamed to its src_* counterpart
src_column_names = {
    'author_id': 'src_id_str',
    'id': 'src_screen_name',
    'name': 'src_name',
    'favourites_count': 'src_favourites_count',
    'followers_count': 'src_followers_count',
    'friends_count': 'src_friends_count',
    'listed_count': 'src_listed_count',
    'statuses_count': 'src_statuses_count',
}

src = nodes_df[list(src_column_names)].copy()
src.columns = [src_column_names[column] for column in src.columns]


In [None]:
# Merge dataframes to create SRC columns

edges_src = edges_df.merge(src, how='left', left_on='src_id_str', right_on='src_id_str')
edges_src

In [None]:
# Create Dataframe containing the required fields for DST nodes
dst = (
    nodes_df[['author_id', 'name']]
    .rename(columns={
        'author_id': 'dst_id_str',
        'name': 'dst_name',
    })
)

# 'int64' instead of int: user ids need 64 bits, and plain int can map to a
# 32-bit integer depending on platform and numpy version.
dst['dst_id_str'] = dst['dst_id_str'].astype('int64')

In [None]:
# Merge dataframes to create DST columns

edges_df = edges_src.merge(dst, how='inner', left_on='dst_id_str', right_on='dst_id_str')
edges_df

In [None]:
# Add Unix Time Stamp Column (whole seconds since the epoch)
# Timestamp.timestamp() gives the POSIX time of the value itself, whereas
# time.mktime(x.timetuple()) re-interprets the wall-clock fields as *local*
# time and shifts every value by the UTC offset of the machine running this.
edges_df['time'] = edges_df['created_at'].apply(lambda created: int(created.timestamp()))

In [None]:
# Add src column
edges_df['src'] = edges_df['src_screen_name']

In [None]:
# Reorder Columns for Griffin Format
# The selection must be assigned back: a bare `edges_df[[...]]` expression is
# only displayed and leaves edges_df, and with it the export, in the old order.
# .copy() keeps later in-place cleaning from acting on a slice.
griffin_edge_columns = [
    'src_favourites_count',
    'src_followers_count',
    'src_friends_count',
    'src_listed_count',
    'src_statuses_count',
    'src_name',
    'src_screen_name',
    'src_id_str',
    'text',
    'created_at',
    'twitter_query',
    'hashtags',
    'dst_name',
    'dst_screen_name',
    'dst_id_str',
    'time',
    'src',
    'dst',
]
edges_df = edges_df[griffin_edge_columns].copy()
edges_df.head()


In [None]:
# Exporting Edges and Nodes
# Files contain all
#   Nodes (all user data)
#   Edges (just Tweets, Retweets are filtered out in the above steps)

# Drop 'author_id' only from the exported copy: nodes_df['author_id'] is still
# needed further down to select the filtered nodes, and a non-in-place drop
# also keeps this cell re-runnable.
export_nodes_df = clean_nodes(nodes_df.drop(columns='author_id'))
export_edges_df = clean_edges(edges_df)
export_nodes_df.to_csv('../Data/Griffin/nodes.csv', index=False)
export_edges_df.to_csv('../Data/Griffin/edges.csv', index=False)

# Calculating basic metrics and filtering

In [None]:
edges_df.info()

In [None]:
nodes_df.info()

## Calculating Communication (OUT-Degree) and IN-Degree

In [None]:
# Function to calculate number of nodes at a specific minimum of filter criterion
def calculate_nodes(df:pd.DataFrame, metric:str, min:int):
    """Count the rows of ``df`` whose ``metric`` value is at least ``min``."""
    meets_minimum = df[metric] >= min
    return len(df[meets_minimum])

In [None]:
# Function to calculate number of edges at a specific minimum of filter criterion
def calculate_edges(df:pd.DataFrame, metric:str, min:int):
    """Sum the ``metric`` values of all rows whose ``metric`` is at least ``min``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``metric`` column (further columns are ignored).
    metric : str
        Name of the column that is both filtered and summed.
    min : int
        Inclusive lower bound for ``metric``.

    Returns
    -------
    int
        The sum of the selected ``metric`` values (0 if no row qualifies).
    """
    # Select the metric column explicitly: summing the whole frame yields a
    # Series, and int(Series) is deprecated and fails outright as soon as the
    # frame has more than one column.
    selected = df.loc[df[metric] >= min, metric]

    return int(selected.sum())

In [None]:
# Function to create dataframe containing number of nodes and edges at different levels of filters

def calculate_metrics(df, metric, start=1, stop=1000):
    """Tabulate node and edge counts for a range of minimum ``metric`` values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``metric`` column.
    metric : str
        Column that the thresholds are applied to.
    start : int, optional
        First threshold (inclusive). Defaults to 1.
    stop : int, optional
        End of the threshold range (exclusive). Defaults to 1000, i.e. the
        thresholds 1..999.

    Returns
    -------
    pandas.DataFrame
        Indexed by threshold, with the columns 'node_count'
        (``calculate_nodes``) and 'edges_count' (``calculate_edges``).
    """
    # Named `thresholds` rather than `min` so the builtin is not shadowed.
    thresholds = list(range(start, stop))

    node_count = [calculate_nodes(df, metric, threshold) for threshold in thresholds]
    edge_count = [calculate_edges(df, metric, threshold) for threshold in thresholds]

    return pd.DataFrame(
        {'node_count': node_count, 'edges_count': edge_count},
        index=thresholds,
    )

In [None]:
# Calculate IN-Degree Metric
degree_df = pd.DataFrame(data={'in_degree':edges_df.groupby(by='dst_id_str').count()['src_id_str']})
degree_df.sort_values(by='in_degree', ascending=False, inplace=True)
degree_df

In [None]:
deg_metrics = calculate_metrics(degree_df,'in_degree')
deg_metrics

In [None]:
import matplotlib.pyplot as plt

In [None]:
def plot(metrics, start, end):
    """Plot node and edge counts for the minimum degrees ``start`` .. ``end - 1``.

    Parameters
    ----------
    metrics : pandas.DataFrame
        Frame indexed by minimum degree with the columns 'node_count' and
        'edges_count'.
    start : int
        First minimum degree shown (inclusive).
    end : int
        End of the shown range (exclusive).
    """
    # Select by index *label*. The metrics index holds the threshold values
    # themselves, so a positional iloc[start:end] would pair each x value
    # with the counts of a different threshold whenever the index does not
    # start at 0.
    window = metrics.loc[start:end - 1]
    x = window.index

    fig, (ax_nodes, ax_edges) = plt.subplots(1, 2, figsize=(14, 4))

    ax_nodes.plot(x, window['node_count'].values)
    ax_nodes.set_xlabel('min degree')
    ax_nodes.set_ylabel('number of nodes')
    ax_nodes.set_yscale('log')

    ax_edges.plot(x, window['edges_count'].values)
    ax_edges.set_xlabel('min degree')
    ax_edges.set_ylabel('number of edges')
    ax_edges.set_yscale('log')

    plt.show()

In [None]:
plot(deg_metrics,20,100)

Probably a degree around 50 is a reasonable value to filter the nodes and edges

In [None]:
# Apparently there are only 758 nodes with a degree greater than 50, with 204000 Edges connecting them
deg_metrics.loc[50]

In [None]:
dst_nodes_filtered = degree_df[degree_df['in_degree'] >= 30]
len(dst_nodes_filtered)

In [None]:
# Just checking if the calculated numbers above are correct
dst_nodes_filtered.sum()

In [None]:
# Select only edges whose dst_id_str is contained in dst_nodes_filtered (the nodes that passed the in-degree filter)
edges_deg_filtered_df = edges_df[edges_df['dst_id_str'].isin(dst_nodes_filtered.index)]
edges_deg_filtered_df.info()

In [None]:
# Calculate Communication for remaining nodes
com_src_nodes = pd.DataFrame(edges_deg_filtered_df.groupby(by=['src_id_str']).count()['src'])
com_src_nodes.rename(columns={'src':'communication_freq'}, inplace=True)
len(com_src_nodes)

In [None]:
com_src_nodes.describe()

In [None]:
com_metrics = calculate_metrics(com_src_nodes, 'communication_freq')
com_metrics.loc[3:10]

In [None]:
# Drop all Edges (and SRC Nodes), which contribute less than average to the network (min. 8 Tweets, Replies, Quotes)
src_nodes_filtered = com_src_nodes[com_src_nodes['communication_freq'] >= 8]
src_nodes_filtered

In [None]:
edges_filtered_df = edges_deg_filtered_df[edges_deg_filtered_df['src_id_str'].isin(src_nodes_filtered.index)]
edges_filtered_df

In [None]:
# Select nodes which are SRC or DST 
nodes_filtered_df = nodes_df[nodes_df['author_id'].isin(dst_nodes_filtered.index) | nodes_df['author_id'].isin(src_nodes_filtered.index)]
nodes_filtered_df.reset_index(drop=True, inplace=True)
nodes_filtered_df

In [None]:
def clean_nodes(df):
    """Strip unwanted characters from the 'name' column of ``df``.

    NOTE: this notebook defines ``clean_nodes`` a second time here; this later
    definition is the one in effect for all cells below.

    Removes escaped and real tab/newline/carriage-return characters, double
    and single quotes and the sequence ' &amp;', then turns every ';' into ','.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'name' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient chaining.
    """
    # (regex pattern, replacement) pairs, applied in this order.
    # ' &amp;' must be handled before ';' is rewritten, otherwise the entity
    # would no longer match.
    substitutions = [
        (r"\\t|\\n|\\r", ""),   # literal backslash sequences
        ("\t|\n|\r", ""),       # actual control characters
        ('"', ""),
        ("'", ""),
        (" &amp;", ""),
        (";", ","),
    ]

    cleaned = df['name']
    for pattern, replacement in substitutions:
        cleaned = cleaned.replace(to_replace=pattern, value=replacement, regex=True)

    # Assign the result back explicitly: an in-place replace on df['name']
    # acts on a temporary object and is not guaranteed to update df itself.
    df['name'] = cleaned
    return df

In [None]:
def clean_edges(df):
    """Clean the text and name columns of an edge frame and parse 'created_at'.

    NOTE: this notebook defines ``clean_edges`` a second time here; this later
    definition is the one in effect for all cells below.

    * 'text': removes escaped and real tab/newline/carriage-return characters,
      double quotes and '&amp;' entities, then turns every ';' into ','.
    * 'src_name' / 'dst_name': removes single quotes and turns ';' into ','.
    * 'created_at': converted with ``pd.to_datetime`` using the format
      '%Y-%m-%dT%H:%M:%S.%f'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'text', 'src_name', 'dst_name' and
        'created_at'. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient chaining.
    """
    def substitute(column, substitutions):
        # Apply the (regex pattern, replacement) pairs one after another and
        # assign the result back explicitly; an in-place replace on
        # df[column] is not guaranteed to update df itself.
        cleaned = df[column]
        for pattern, replacement in substitutions:
            cleaned = cleaned.replace(to_replace=pattern, value=replacement, regex=True)
        df[column] = cleaned

    # clean twitter text from unwanted characters
    # The '&amp;' patterns have to run BEFORE ';' is rewritten to ',':
    # afterwards the entity reads '&amp,' and would never match.
    substitute('text', [
        (r"\\t|\\n|\\r", ""),   # literal backslash sequences
        ("\t|\n|\r", ""),       # actual control characters
        ('"', ""),
        (" &amp;", ""),
        ("&amp;", ""),
        (";", ","),
    ])

    # clean source- and destination-name from unwanted characters
    name_substitutions = [("'", ""), (";", ",")]
    substitute('src_name', name_substitutions)
    substitute('dst_name', name_substitutions)

    # format the datetime to be griffin readable
    df['created_at'] = pd.to_datetime(df["created_at"], format="%Y-%m-%dT%H:%M:%S.%f")
    return df

In [None]:
# Exporting Edges and Nodes

# Clean independent copies: dropping 'author_id' in place would make this cell
# fail on a second run (the column is gone), and both filtered frames are
# selections of larger frames that should not be modified through them.
export_nodes_df = clean_nodes(nodes_filtered_df.drop(columns='author_id'))
export_edges_df = clean_edges(edges_filtered_df.copy())
export_nodes_df.to_csv('nodes_d30_com8.csv', index=False)
export_edges_df.to_csv('edges_d30_com8.csv', index=False)

In [None]:
# Split the exported edges into one frame per calendar year of 'created_at'
created_year = export_edges_df['created_at'].dt.year

sub_df_18 = export_edges_df[created_year == 2018]
sub_df_19 = export_edges_df[created_year == 2019]
sub_df_20 = export_edges_df[created_year == 2020]
sub_df_21 = export_edges_df[created_year == 2021]
sub_df_22 = export_edges_df[created_year == 2022]

In [None]:
# Write each yearly edge frame to its own CSV file (without the index column)
yearly_exports = (
    (sub_df_18, 'FEdges18.csv'),
    (sub_df_19, 'FEdges19.csv'),
    (sub_df_20, 'FEdges20.csv'),
    (sub_df_21, 'FEdges21.csv'),
    (sub_df_22, 'FEdges22.csv'),
)
for yearly_edges, csv_name in yearly_exports:
    yearly_edges.to_csv(csv_name, index=False)