In [1]:
# Import of libraries
import pandas as pd
import numpy as np
import datetime
import time
import json
import re
import glob
import os

# Definition of Functions

In [2]:
# Function returns dataframe consisting of the separate files in specified directory

def import_data_from_folder(path):

    csv_files = glob.glob(os.path.join(path, "*.pk"))
    frames = []
    
    # loop over the list of csv files
    for f in csv_files:

        # read the csv file
        df = pd.read_json(f)
        frames.append(df)
    
    # Concat DataFrame list to single DataFrame
    res_df = pd.concat(frames)
    res_df.reset_index(inplace=True)

    # Drop Index column which is also created while importing files
    res_df.drop(columns=['index'], inplace=True)
    
    return res_df

In [3]:
# Function returns a list containing all hashtags in the given list of JSON objects

def extract_hashtags(hashtag_ls:list):
    
    hashtags = [item.get('tag','') for item in hashtag_ls]
    
    return hashtags

# Importing Data

In [4]:
# Importing all files into dataframes
user_df= pd.read_pickle("./all_user_new.pkl")
tweet_df = pd.read_pickle("./all_tweets_new.pkl")
#tweet_df = pd.read_json("../Data/Tweets/Tweet_Data_#extinctionrebellion_20220622092130.json")
#user_df = pd.read_json("../Data/Users/User_Data_#extinctionrebellion_20220622092129.json")
#tweet_df = import_data_from_folder("../Data/Tweets/")
#ref_tweet_df = import_data_from_folder("../Data/Retweets/")
#user_df = import_data_from_folder("../Data/Users/")
#media_df = import_data_from_folder("../Data/Media/")
#place_df = import_data_from_folder("../Data/Place/")


# Preparing for Griffin Nodes

In [5]:
nodes = user_df.drop(['created_at', 'description','location', 'pinned_tweet_id', 'profile_image_url', 'protected', 'url', 'verified' ], axis = 1)
nodes = nodes.rename(columns = {'following_count': 'friends_count', 'tweet_count' : 'statuses_count'}, errors="raise")
nodes

Unnamed: 0,author_id,name,username,withheld,followers_count,friends_count,statuses_count,listed_count
0,1380145054644379652,man is human,EarthManat,,1234,20,14631,1
1,1425101738957283339,sacccay,sacccay,,272,282,43294,0
2,2817792555,WIE ICH FM,wieichfm,,211,1529,5861,40
3,1519629408256024578,Le Dernier Roi de Sarrebruck,betonmicha_,,0,31,4,0
4,1421341587649540097,Green Actions Senegal 🌳,Greenactions221,,1288,627,1191,17
...,...,...,...,...,...,...,...,...
630433,873797052,Yann Louvel,YannLouvel,,680,601,3493,23
630435,537191901,Simon Bush,SimRogBush,,1786,895,2535,23
630438,257538755,Rob Lake,roblake1959,,1960,544,4363,69
630439,927470194657759232,Stephen Hine,StephenHine8,,375,381,1400,8


# Preparing for Griffin Edges

In [6]:
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1124773 entries, 0 to 34526798
Data columns (total 14 columns):
 #   Column           Non-Null Count    Dtype              
---  ------           --------------    -----              
 0   author_id        1124773 non-null  int64              
 1   conversation_id  1124773 non-null  int64              
 2   created_at       1124773 non-null  datetime64[ns, UTC]
 3   entities         1123594 non-null  object             
 4   geo              47091 non-null    object             
 5   id               1124773 non-null  int64              
 6   text             1124773 non-null  object             
 7   withheld         145 non-null      object             
 8   retweet_count    1124773 non-null  int64              
 9   reply_count      1124773 non-null  int64              
 10  like_count       1124773 non-null  int64              
 11  quote_count      1124773 non-null  int64              
 12  media_keys       336735 non-null   object

Apparently there are some Tweets without entities (mentions). Those can be dropped, because they would be displayed as single points in the network. 

In [7]:
# Dropping Tweets without any entities / mentions
tweet_df.dropna(subset=['entities'], inplace=True)
len(tweet_df)

1123594

## Extracting Mentions and Hashtags

In [8]:
# Extract mentions and hashtags list from entities
tweet_df['mentions'] = tweet_df['entities'].apply(lambda entity: entity.get('mentions'))
tweet_df['hashtags_ls'] = tweet_df['entities'].apply(lambda entity: entity.get('hashtags'))
tweet_df

Unnamed: 0,author_id,conversation_id,created_at,entities,geo,id,text,withheld,retweet_count,reply_count,like_count,quote_count,media_keys,username,mentions,hashtags_ls
0,1380145054644379652,1539504187083456512,2022-06-22 07:02:55+00:00,"{'annotations': [{'start': 180, 'end': 193, 'p...",,1539504187083456512,ClimateCrisis ClimateAction GlobalWarming Frid...,,2,0,1,0,,EarthManat,,"[{'start': 0, 'end': 14, 'tag': 'ClimateCrisis..."
221,1380145054644379652,1539308556096184321,2022-06-21 18:05:33+00:00,"{'annotations': [{'start': 166, 'end': 179, 'p...",,1539308556096184321,ExtinctionRebellion ClimateAction ClimateCrisi...,,3,0,1,0,,EarthManat,,"[{'start': 0, 'end': 20, 'tag': 'ExtinctionReb..."
442,1380145054644379652,1539268828781334531,2022-06-21 15:27:41+00:00,"{'annotations': [{'start': 179, 'end': 192, 'p...",,1539268828781334531,Renewables Greenwashing ClimateStrike Exti...,,1,0,1,0,,EarthManat,,"[{'start': 0, 'end': 11, 'tag': 'Renewables'},..."
663,1380145054644379652,1539119110105321475,2022-06-21 05:32:45+00:00,"{'annotations': [{'start': 166, 'end': 179, 'p...",,1539119110105321475,ClimateAction ClimateCrisis GlobalWarming Fri...,,2,0,2,0,,EarthManat,,"[{'start': 0, 'end': 14, 'tag': 'ClimateAction..."
1547,1380145054644379652,1538821899299569665,2022-06-20 09:51:45+00:00,"{'annotations': [{'start': 180, 'end': 193, 'p...",,1538821899299569665,ClimateAction GlobalWarming FridaysForFuture C...,,2,0,1,0,,EarthManat,,"[{'start': 0, 'end': 14, 'tag': 'ClimateAction..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34526766,4827256263,1073599612697616384,2018-12-14 15:24:28+00:00,"{'hashtags': [{'start': 0, 'end': 15, 'tag': '...",{'place_id': 'c019b9c22419e1c3'},1073599612697616384,Fridays4Future https:tcoRUQEmDrJPg,,0,0,3,0,,RaumanVihreat,,"[{'start': 0, 'end': 15, 'tag': 'Fridays4Futur..."
34526767,2149327604,1073581344150704128,2018-12-14 14:11:53+00:00,"{'hashtags': [{'start': 167, 'end': 182, 'tag'...",,1073581344150704128,Its amazing how kids are standing strong toget...,,2,0,1,0,,TheForgeLaForge,,"[{'start': 167, 'end': 182, 'tag': 'Fridays4Fu..."
34526795,3315444739,1073342816624001025,2018-12-14 11:48:17+00:00,"{'mentions': [{'start': 11, 'end': 25, 'userna...",,1073545205570187264,@janine_ok @GretaThunberg @Fridays4future She ...,,1,0,1,0,,ForeverThenNow,"[{'start': 11, 'end': 25, 'username': 'GretaTh...",
34526797,79992130,1072490992257953792,2018-12-11 13:59:13+00:00,"{'mentions': [{'start': 10, 'end': 25, 'userna...",,1072490992257953792,Go Greta @Fridays4future May we all open our h...,,0,0,0,0,,Kelly_Lavelle,"[{'start': 10, 'end': 25, 'username': 'Fridays...",


In [9]:
# Drop all entries without mentions 
tweet_df.dropna(subset=['mentions'], inplace = True) 
tweet_df

Unnamed: 0,author_id,conversation_id,created_at,entities,geo,id,text,withheld,retweet_count,reply_count,like_count,quote_count,media_keys,username,mentions,hashtags_ls
241111,1380145054644379652,1413992980936110086,2021-09-05 20:30:48+00:00,"{'hashtags': [{'start': 15, 'end': 29, 'tag': ...",,1434615014820876290,@extinctsymbol ClimateChange GlobalWarming Upr...,,5,1,6,0,3_1434614678706135044,EarthManat,"[{'start': 0, 'end': 14, 'username': 'extincts...","[{'start': 15, 'end': 29, 'tag': 'ClimateChang..."
277576,1380145054644379652,1424786549716045826,2021-08-09 18:38:55+00:00,"{'hashtags': [{'start': 15, 'end': 32, 'tag': ...",,1424802388356050945,@ProtectThPass FridaysForFuture LithiumForFutu...,,1,0,1,0,3_1424801802206187523,EarthManat,"[{'start': 0, 'end': 14, 'username': 'ProtectT...","[{'start': 15, 'end': 32, 'tag': 'FridaysForFu..."
282438,1380145054644379652,1423662685141536770,2021-08-06 15:42:33+00:00,"{'mentions': [{'start': 0, 'end': 16, 'usernam...",,1423670839292178624,@StephenCorrySvl FridaysForFuture push for ren...,,4,1,6,0,,EarthManat,"[{'start': 0, 'end': 16, 'username': 'StephenC...",
298350,1380145054644379652,1417881812811063304,2021-07-21 16:31:34+00:00,"{'hashtags': [{'start': 263, 'end': 280, 'tag'...",,1417884969532264450,@faznet @wk_juergs 22 Gleichzeitig fördern sie...,,1,1,1,0,,EarthManat,"[{'start': 0, 'end': 7, 'username': 'faznet', ...","[{'start': 263, 'end': 280, 'tag': 'FridaysFor..."
299013,1380145054644379652,1417783978925375489,2021-07-21 11:19:19+00:00,"{'hashtags': [{'start': 295, 'end': 312, 'tag'...",,1417806388852576257,@RinglerSchaff @FAZ_NET @wk_juergs @Luisamneub...,,0,0,2,0,,EarthManat,"[{'start': 0, 'end': 14, 'username': 'RinglerS...","[{'start': 295, 'end': 312, 'tag': 'FridaysFor..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34526764,711299241709277184,1071043066230923265,2018-12-14 18:00:36+00:00,"{'mentions': [{'start': 0, 'end': 11, 'usernam...",,1073638903960223744,@svasterias @StrikeClimate @Fridays4future @Gr...,,0,0,0,0,,SecretzChannel,"[{'start': 0, 'end': 11, 'username': 'svasteri...",
34526765,617320528,1073484224672489473,2018-12-14 15:43:09+00:00,"{'hashtags': [{'start': 28, 'end': 42, 'tag': ...",{'place_id': '99cdab25eddd6bce'},1073604313551908864,@GretaThunberg You made it ClimateStrike Frida...,,0,0,2,0,,PeetWiel,"[{'start': 0, 'end': 14, 'username': 'GretaThu...","[{'start': 28, 'end': 42, 'tag': 'ClimateStrik..."
34526795,3315444739,1073342816624001025,2018-12-14 11:48:17+00:00,"{'mentions': [{'start': 11, 'end': 25, 'userna...",,1073545205570187264,@janine_ok @GretaThunberg @Fridays4future She ...,,1,0,1,0,,ForeverThenNow,"[{'start': 11, 'end': 25, 'username': 'GretaTh...",
34526797,79992130,1072490992257953792,2018-12-11 13:59:13+00:00,"{'mentions': [{'start': 10, 'end': 25, 'userna...",,1072490992257953792,Go Greta @Fridays4future May we all open our h...,,0,0,0,0,,Kelly_Lavelle,"[{'start': 10, 'end': 25, 'username': 'Fridays...",


In [10]:
# Drop all entries with same text / user (bots) 
test = tweet_df[tweet_df.duplicated(subset=['username', 'text'], keep=False)]
len(test)
tweet_df.drop_duplicates(subset=['text','username'], inplace = True)

In [11]:
# Expand all mentions in dataframe
tweet_df = tweet_df.explode(column='mentions')
tweet_df.reset_index(drop=True, inplace=True)

In [12]:
# Extract username and ids of mentioned users into Griffin edge attributes dst, dst_screen_name, dst_id_str
tweet_df['dst'] = tweet_df[tweet_df['mentions'].notna()]['mentions'].apply(lambda mention: mention.get('username',''))
tweet_df['dst_screen_name'] = tweet_df[tweet_df['mentions'].notna()]['mentions'].apply(lambda mention: mention.get('username',''))
tweet_df['dst_id_str'] = tweet_df[tweet_df['mentions'].notna()]['mentions'].apply(lambda mention: mention.get('id',''))

In [13]:
# Extract hashtags from tweets into list for each Tweet
tweet_df['hashtags'] = tweet_df[tweet_df['hashtags_ls'].notna()]['hashtags_ls'].apply(lambda ls: extract_hashtags(ls))
tweet_df

Unnamed: 0,author_id,conversation_id,created_at,entities,geo,id,text,withheld,retweet_count,reply_count,like_count,quote_count,media_keys,username,mentions,hashtags_ls,dst,dst_screen_name,dst_id_str,hashtags
0,1380145054644379652,1413992980936110086,2021-09-05 20:30:48+00:00,"{'hashtags': [{'start': 15, 'end': 29, 'tag': ...",,1434615014820876290,@extinctsymbol ClimateChange GlobalWarming Upr...,,5,1,6,0,3_1434614678706135044,EarthManat,"{'start': 0, 'end': 14, 'username': 'extinctsy...","[{'start': 15, 'end': 29, 'tag': 'ClimateChang...",extinctsymbol,extinctsymbol,522593098,"[ClimateChange, GlobalWarming, UprootTheSystem..."
1,1380145054644379652,1424786549716045826,2021-08-09 18:38:55+00:00,"{'hashtags': [{'start': 15, 'end': 32, 'tag': ...",,1424802388356050945,@ProtectThPass FridaysForFuture LithiumForFutu...,,1,0,1,0,3_1424801802206187523,EarthManat,"{'start': 0, 'end': 14, 'username': 'ProtectTh...","[{'start': 15, 'end': 32, 'tag': 'FridaysForFu...",ProtectThPass,ProtectThPass,1351964909920264203,"[FridaysForFuture, LithiumForFuture]"
2,1380145054644379652,1423662685141536770,2021-08-06 15:42:33+00:00,"{'mentions': [{'start': 0, 'end': 16, 'usernam...",,1423670839292178624,@StephenCorrySvl FridaysForFuture push for ren...,,4,1,6,0,,EarthManat,"{'start': 0, 'end': 16, 'username': 'StephenCo...",,StephenCorrySvl,StephenCorrySvl,1001498767525335040,
3,1380145054644379652,1417881812811063304,2021-07-21 16:31:34+00:00,"{'hashtags': [{'start': 263, 'end': 280, 'tag'...",,1417884969532264450,@faznet @wk_juergs 22 Gleichzeitig fördern sie...,,1,1,1,0,,EarthManat,"{'start': 0, 'end': 7, 'username': 'faznet', '...","[{'start': 263, 'end': 280, 'tag': 'FridaysFor...",faznet,faznet,18047862,"[FridaysForFuture, KlimaKrise]"
4,1380145054644379652,1417881812811063304,2021-07-21 16:31:34+00:00,"{'hashtags': [{'start': 263, 'end': 280, 'tag'...",,1417884969532264450,@faznet @wk_juergs 22 Gleichzeitig fördern sie...,,1,1,1,0,,EarthManat,"{'start': 8, 'end': 18, 'username': 'wk_juergs...","[{'start': 263, 'end': 280, 'tag': 'FridaysFor...",wk_juergs,wk_juergs,425677723,"[FridaysForFuture, KlimaKrise]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2386338,79992130,1072490992257953792,2018-12-11 13:59:13+00:00,"{'mentions': [{'start': 10, 'end': 25, 'userna...",,1072490992257953792,Go Greta @Fridays4future May we all open our h...,,0,0,0,0,,Kelly_Lavelle,"{'start': 10, 'end': 25, 'username': 'Fridays4...",,Fridays4future,Fridays4future,1053768884732547072,
2386339,2725679701,1071043066230923265,2018-12-07 17:52:13+00:00,"{'mentions': [{'start': 0, 'end': 11, 'usernam...",,1071100079757389824,@svasterias @StrikeClimate @Fridays4future @Gr...,,0,0,2,0,,nakwat555,"{'start': 0, 'end': 11, 'username': 'svasteria...",,svasterias,svasterias,820311848771788800,
2386340,2725679701,1071043066230923265,2018-12-07 17:52:13+00:00,"{'mentions': [{'start': 0, 'end': 11, 'usernam...",,1071100079757389824,@svasterias @StrikeClimate @Fridays4future @Gr...,,0,0,2,0,,nakwat555,"{'start': 12, 'end': 26, 'username': 'StrikeCl...",,StrikeClimate,StrikeClimate,1050599210012164101,
2386341,2725679701,1071043066230923265,2018-12-07 17:52:13+00:00,"{'mentions': [{'start': 0, 'end': 11, 'usernam...",,1071100079757389824,@svasterias @StrikeClimate @Fridays4future @Gr...,,0,0,2,0,,nakwat555,"{'start': 27, 'end': 42, 'username': 'Fridays4...",,Fridays4future,Fridays4future,1053768884732547072,


## Dropping irrelevant columns (for Griffin Edges)

In [14]:
# Drop irrelevant columns for Griffin
edges_df = tweet_df.drop(columns=['entities',
               'id',
               #'referenced_tweets',
               #'lang',
               #'public_metrics',
               'conversation_id',
               #'in_reply_to_user_id',
               #'attachments',
               'geo',
               'withheld',
               'mentions',
               'hashtags_ls'], axis=1)

edges_df.rename(columns={'author_id':'src_id_str'}, inplace=True)
edges_df.head()

Unnamed: 0,src_id_str,created_at,text,retweet_count,reply_count,like_count,quote_count,media_keys,username,dst,dst_screen_name,dst_id_str,hashtags
0,1380145054644379652,2021-09-05 20:30:48+00:00,@extinctsymbol ClimateChange GlobalWarming Upr...,5,1,6,0,3_1434614678706135044,EarthManat,extinctsymbol,extinctsymbol,522593098,"[ClimateChange, GlobalWarming, UprootTheSystem..."
1,1380145054644379652,2021-08-09 18:38:55+00:00,@ProtectThPass FridaysForFuture LithiumForFutu...,1,0,1,0,3_1424801802206187523,EarthManat,ProtectThPass,ProtectThPass,1351964909920264203,"[FridaysForFuture, LithiumForFuture]"
2,1380145054644379652,2021-08-06 15:42:33+00:00,@StephenCorrySvl FridaysForFuture push for ren...,4,1,6,0,,EarthManat,StephenCorrySvl,StephenCorrySvl,1001498767525335040,
3,1380145054644379652,2021-07-21 16:31:34+00:00,@faznet @wk_juergs 22 Gleichzeitig fördern sie...,1,1,1,0,,EarthManat,faznet,faznet,18047862,"[FridaysForFuture, KlimaKrise]"
4,1380145054644379652,2021-07-21 16:31:34+00:00,@faznet @wk_juergs 22 Gleichzeitig fördern sie...,1,1,1,0,,EarthManat,wk_juergs,wk_juergs,425677723,"[FridaysForFuture, KlimaKrise]"


## Merging data sets to create griffin edges

In [15]:
# Create Dataframe containing the required fields for SRC nodes
src = nodes[['author_id',
               'username',
               'followers_count',
               'friends_count',
               'listed_count',
               'statuses_count']].copy()

src.rename(columns={
   'author_id':'src_id_str',
   'username':'src_name',
   'followers_count':'src_followers_count',
   'friends_count':'src_friends_count',
   'listed_count':'src_listed_count',
   'statuses_count':'src_statuses_count'
    }, inplace=True)

# Cleaning Nodes
src.sort_values(by=['src_id_str','src_statuses_count'], ascending=True, inplace=True)
src.drop_duplicates(subset='src_id_str', keep='last',inplace=True) 
src.reset_index(drop=True, inplace=True)

In [16]:
edges_df['src_id_str'] = edges_df['src_id_str'].astype(str)
src['src_id_str'] = src['src_id_str'].astype(str)
src.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 409740 entries, 0 to 409739
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   src_id_str           409740 non-null  object
 1   src_name             409740 non-null  object
 2   src_followers_count  409740 non-null  int64 
 3   src_friends_count    409740 non-null  int64 
 4   src_listed_count     409740 non-null  int64 
 5   src_statuses_count   409740 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 18.8+ MB


In [17]:
# Merge dataframes to create SRC columns
edges_src = edges_df.merge(src, how='left', left_on='src_id_str', right_on='src_id_str')
edges_src

Unnamed: 0,src_id_str,created_at,text,retweet_count,reply_count,like_count,quote_count,media_keys,username,dst,dst_screen_name,dst_id_str,hashtags,src_name,src_followers_count,src_friends_count,src_listed_count,src_statuses_count
0,1380145054644379652,2021-09-05 20:30:48+00:00,@extinctsymbol ClimateChange GlobalWarming Upr...,5,1,6,0,3_1434614678706135044,EarthManat,extinctsymbol,extinctsymbol,522593098,"[ClimateChange, GlobalWarming, UprootTheSystem...",EarthManat,1234,20,1,14631
1,1380145054644379652,2021-08-09 18:38:55+00:00,@ProtectThPass FridaysForFuture LithiumForFutu...,1,0,1,0,3_1424801802206187523,EarthManat,ProtectThPass,ProtectThPass,1351964909920264203,"[FridaysForFuture, LithiumForFuture]",EarthManat,1234,20,1,14631
2,1380145054644379652,2021-08-06 15:42:33+00:00,@StephenCorrySvl FridaysForFuture push for ren...,4,1,6,0,,EarthManat,StephenCorrySvl,StephenCorrySvl,1001498767525335040,,EarthManat,1234,20,1,14631
3,1380145054644379652,2021-07-21 16:31:34+00:00,@faznet @wk_juergs 22 Gleichzeitig fördern sie...,1,1,1,0,,EarthManat,faznet,faznet,18047862,"[FridaysForFuture, KlimaKrise]",EarthManat,1234,20,1,14631
4,1380145054644379652,2021-07-21 16:31:34+00:00,@faznet @wk_juergs 22 Gleichzeitig fördern sie...,1,1,1,0,,EarthManat,wk_juergs,wk_juergs,425677723,"[FridaysForFuture, KlimaKrise]",EarthManat,1234,20,1,14631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2386338,79992130,2018-12-11 13:59:13+00:00,Go Greta @Fridays4future May we all open our h...,0,0,0,0,,Kelly_Lavelle,Fridays4future,Fridays4future,1053768884732547072,,Kelly_Lavelle,72,238,3,143
2386339,2725679701,2018-12-07 17:52:13+00:00,@svasterias @StrikeClimate @Fridays4future @Gr...,0,0,2,0,,nakwat555,svasterias,svasterias,820311848771788800,,nakwat555,24,246,0,709
2386340,2725679701,2018-12-07 17:52:13+00:00,@svasterias @StrikeClimate @Fridays4future @Gr...,0,0,2,0,,nakwat555,StrikeClimate,StrikeClimate,1050599210012164101,,nakwat555,24,246,0,709
2386341,2725679701,2018-12-07 17:52:13+00:00,@svasterias @StrikeClimate @Fridays4future @Gr...,0,0,2,0,,nakwat555,Fridays4future,Fridays4future,1053768884732547072,,nakwat555,24,246,0,709


In [18]:
# Create Dataframe containing the required fields for DST nodes
dst = nodes[['author_id',
               'username'
               ]].copy()

dst.rename(columns={
   'author_id':'dst_id_str',
   'username':'dst_name'
    }, inplace=True)

dst['dst_id_str'] = dst['dst_id_str'].astype(str)

# Cleaning Nodes
dst.sort_values(by=['dst_id_str'], inplace=True)
dst.drop_duplicates(subset='dst_id_str', keep='last',inplace=True)
dst.reset_index(drop=True, inplace=True)

dst

Unnamed: 0,dst_id_str,dst_name
0,1000001291945500672,hiddlvsbatch
1,1000001295342858241,ACF_EU
2,1000002112238039040,Loucif3r_
3,1000002274536689664,MinimalerNils3
4,1000002497363283969,netspring
...,...,...
409735,99999491,Tanisha_RR
409736,999996798793109506,MariannaLaReina
409737,999996985578020864,sascha303_
409738,999999578127650816,B0RdErZ_


In [19]:
# Merge dataframes to create DST columns
edges_df = edges_src.merge(dst, how='inner', left_on='dst_id_str', right_on='dst_id_str')
edges_df

Unnamed: 0,src_id_str,created_at,text,retweet_count,reply_count,like_count,quote_count,media_keys,username,dst,dst_screen_name,dst_id_str,hashtags,src_name,src_followers_count,src_friends_count,src_listed_count,src_statuses_count,dst_name
0,1380145054644379652,2021-09-05 20:30:48+00:00,@extinctsymbol ClimateChange GlobalWarming Upr...,5,1,6,0,3_1434614678706135044,EarthManat,extinctsymbol,extinctsymbol,522593098,"[ClimateChange, GlobalWarming, UprootTheSystem...",EarthManat,1234,20,1,14631,extinctsymbol
1,1380145054644379652,2021-07-11 17:01:22+00:00,ExtinctionRebellion ClimateEmergency ClimateCr...,2,0,1,0,,EarthManat,extinctsymbol,extinctsymbol,522593098,"[ExtinctionRebellion, ClimateEmergency, Climat...",EarthManat,1234,20,1,14631,extinctsymbol
2,1048180379847258112,2021-12-17 11:15:00+00:00,ClimateStrike Week17\nOnce Earth was full of B...,28,3,47,3,3_1471790930470400002,Nasadox_,extinctsymbol,extinctsymbol,522593098,"[ClimateStrike, ClimateActionNow, FridaysForFu...",Nasadox_,1041,1175,11,470,extinctsymbol
3,1244306606441070594,2021-12-31 20:13:32+00:00,@extinctsymbol Id rather we make 2022 the year...,2,1,0,0,7_1448600355844116480,TheCartHorse1,extinctsymbol,extinctsymbol,522593098,"[MarchForTheUNcharter, FridaysForFuture]",TheCartHorse1,5247,5773,12,125150,extinctsymbol
4,1244306606441070594,2021-01-03 14:57:12+00:00,@KillTheCorpora @GretaThunberg @Fridays4future...,2,1,1,0,,TheCartHorse1,extinctsymbol,extinctsymbol,522593098,[FridaysForFuture],TheCartHorse1,5247,5773,12,125150,extinctsymbol
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340026,775034466536452097,2018-12-28 18:28:22+00:00,@craigzimmerman6 @AlexandriaV2005 @UN @GretaTh...,0,0,2,0,,meg_Y12,craigzimmerman6,craigzimmerman6,2371297874,,meg_Y12,15351,15031,28,234724,craigzimmerman6
2340027,2371297874,2018-12-28 18:24:47+00:00,@meg_Y12 @AlexandriaV2005 @UN @GretaThunberg @...,0,1,0,0,,craigzimmerman6,meg_Y12,meg_Y12,775034466536452097,,craigzimmerman6,1672,4982,28,46993,meg_Y12
2340028,85175625,2018-12-23 06:30:32+00:00,The transformation of the planet that lies ahe...,4,0,4,0,3_1076726580586602497,jornbettin,Center4NewEcon,Center4NewEcon,1077599426,[AutisticCollaboration],jornbettin,2313,1945,75,12982,Center4NewEcon
2340029,85175625,2018-12-23 06:30:32+00:00,The transformation of the planet that lies ahe...,4,0,4,0,3_1076726580586602497,jornbettin,CulturalEvolSoc,CulturalEvolSoc,3837815958,[AutisticCollaboration],jornbettin,2313,1945,75,12982,CulturalEvolSoc


In [20]:
edges_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2340031 entries, 0 to 2340030
Data columns (total 19 columns):
 #   Column               Dtype              
---  ------               -----              
 0   src_id_str           object             
 1   created_at           datetime64[ns, UTC]
 2   text                 object             
 3   retweet_count        int64              
 4   reply_count          int64              
 5   like_count           int64              
 6   quote_count          int64              
 7   media_keys           object             
 8   username             object             
 9   dst                  object             
 10  dst_screen_name      object             
 11  dst_id_str           object             
 12  hashtags             object             
 13  src_name             object             
 14  src_followers_count  int64              
 15  src_friends_count    int64              
 16  src_listed_count     int64              
 17  src_stat

In [21]:
datetime.timedelta(seconds=24*60*60).total_seconds()

86400.0

In [22]:
edges_df['time'] = edges_df['created_at'].apply(lambda x: int(time.mktime(x.timetuple())))

In [23]:
edges_df['src'] = edges_df['src_name']

In [24]:
# Reorder Columns for Griffin Format
edges_df[[
'src_followers_count',
'src_friends_count',
'src_listed_count',
'src_statuses_count',
'src_name',
'src_id_str',
'text',
'created_at',
'hashtags',
'dst_name',
'dst_id_str',
'time',
'src',
'dst']]


Unnamed: 0,src_followers_count,src_friends_count,src_listed_count,src_statuses_count,src_name,src_id_str,text,created_at,hashtags,dst_name,dst_id_str,time,src,dst
0,1234,20,1,14631,EarthManat,1380145054644379652,@extinctsymbol ClimateChange GlobalWarming Upr...,2021-09-05 20:30:48+00:00,"[ClimateChange, GlobalWarming, UprootTheSystem...",extinctsymbol,522593098,1630866648,EarthManat,extinctsymbol
1,1234,20,1,14631,EarthManat,1380145054644379652,ExtinctionRebellion ClimateEmergency ClimateCr...,2021-07-11 17:01:22+00:00,"[ExtinctionRebellion, ClimateEmergency, Climat...",extinctsymbol,522593098,1626015682,EarthManat,extinctsymbol
2,1041,1175,11,470,Nasadox_,1048180379847258112,ClimateStrike Week17\nOnce Earth was full of B...,2021-12-17 11:15:00+00:00,"[ClimateStrike, ClimateActionNow, FridaysForFu...",extinctsymbol,522593098,1639736100,Nasadox_,extinctsymbol
3,5247,5773,12,125150,TheCartHorse1,1244306606441070594,@extinctsymbol Id rather we make 2022 the year...,2021-12-31 20:13:32+00:00,"[MarchForTheUNcharter, FridaysForFuture]",extinctsymbol,522593098,1640978012,TheCartHorse1,extinctsymbol
4,5247,5773,12,125150,TheCartHorse1,1244306606441070594,@KillTheCorpora @GretaThunberg @Fridays4future...,2021-01-03 14:57:12+00:00,[FridaysForFuture],extinctsymbol,522593098,1609682232,TheCartHorse1,extinctsymbol
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340026,15351,15031,28,234724,meg_Y12,775034466536452097,@craigzimmerman6 @AlexandriaV2005 @UN @GretaTh...,2018-12-28 18:28:22+00:00,,craigzimmerman6,2371297874,1546018102,meg_Y12,craigzimmerman6
2340027,1672,4982,28,46993,craigzimmerman6,2371297874,@meg_Y12 @AlexandriaV2005 @UN @GretaThunberg @...,2018-12-28 18:24:47+00:00,,meg_Y12,775034466536452097,1546017887,craigzimmerman6,meg_Y12
2340028,2313,1945,75,12982,jornbettin,85175625,The transformation of the planet that lies ahe...,2018-12-23 06:30:32+00:00,[AutisticCollaboration],Center4NewEcon,1077599426,1545543032,jornbettin,Center4NewEcon
2340029,2313,1945,75,12982,jornbettin,85175625,The transformation of the planet that lies ahe...,2018-12-23 06:30:32+00:00,[AutisticCollaboration],CulturalEvolSoc,3837815958,1545543032,jornbettin,CulturalEvolSoc


# Exporting Edges and Nodes

In [25]:
edges_df['created_at'].min()

Timestamp('2018-09-08 13:44:24+0000', tz='UTC')

In [26]:
edges_df['created_at'].dt.year.max()

2022

In [27]:
nodes

Unnamed: 0,author_id,name,username,withheld,followers_count,friends_count,statuses_count,listed_count
0,1380145054644379652,man is human,EarthManat,,1234,20,14631,1
1,1425101738957283339,sacccay,sacccay,,272,282,43294,0
2,2817792555,WIE ICH FM,wieichfm,,211,1529,5861,40
3,1519629408256024578,Le Dernier Roi de Sarrebruck,betonmicha_,,0,31,4,0
4,1421341587649540097,Green Actions Senegal 🌳,Greenactions221,,1288,627,1191,17
...,...,...,...,...,...,...,...,...
630433,873797052,Yann Louvel,YannLouvel,,680,601,3493,23
630435,537191901,Simon Bush,SimRogBush,,1786,895,2535,23
630438,257538755,Rob Lake,roblake1959,,1960,544,4363,69
630439,927470194657759232,Stephen Hine,StephenHine8,,375,381,1400,8


In [28]:
len(edges_df)

2340031

In [29]:
# Export Nodes as csv
nodes.to_csv('nodes.csv', index=False)
edges_df.to_csv('unfiltered_edges.csv', index=False)

# Filter Nodes based on number of Edges  

1. wie oft kommt node in den edges vor 
2. alle nodes unter 20 edges droppen 
3. alle edges wo nodes drinne sind die nicht in nodes sind droppen 

In [30]:
edges_df_filtered = edges_df.groupby('dst_id_str').count()
edges_df_filtered = edges_df_filtered[edges_df_filtered['src_id_str']>=200]
edges_df_filtered_complete = edges_df[edges_df['dst_id_str'].isin(edges_df_filtered.index)]
edges_df_filtered_complete

Unnamed: 0,src_id_str,created_at,text,retweet_count,reply_count,like_count,quote_count,media_keys,username,dst,...,dst_id_str,hashtags,src_name,src_followers_count,src_friends_count,src_listed_count,src_statuses_count,dst_name,time,src
0,1380145054644379652,2021-09-05 20:30:48+00:00,@extinctsymbol ClimateChange GlobalWarming Upr...,5,1,6,0,3_1434614678706135044,EarthManat,extinctsymbol,...,522593098,"[ClimateChange, GlobalWarming, UprootTheSystem...",EarthManat,1234,20,1,14631,extinctsymbol,1630866648,EarthManat
1,1380145054644379652,2021-07-11 17:01:22+00:00,ExtinctionRebellion ClimateEmergency ClimateCr...,2,0,1,0,,EarthManat,extinctsymbol,...,522593098,"[ExtinctionRebellion, ClimateEmergency, Climat...",EarthManat,1234,20,1,14631,extinctsymbol,1626015682,EarthManat
2,1048180379847258112,2021-12-17 11:15:00+00:00,ClimateStrike Week17\nOnce Earth was full of B...,28,3,47,3,3_1471790930470400002,Nasadox_,extinctsymbol,...,522593098,"[ClimateStrike, ClimateActionNow, FridaysForFu...",Nasadox_,1041,1175,11,470,extinctsymbol,1639736100,Nasadox_
3,1244306606441070594,2021-12-31 20:13:32+00:00,@extinctsymbol Id rather we make 2022 the year...,2,1,0,0,7_1448600355844116480,TheCartHorse1,extinctsymbol,...,522593098,"[MarchForTheUNcharter, FridaysForFuture]",TheCartHorse1,5247,5773,12,125150,extinctsymbol,1640978012,TheCartHorse1
4,1244306606441070594,2021-01-03 14:57:12+00:00,@KillTheCorpora @GretaThunberg @Fridays4future...,2,1,1,0,,TheCartHorse1,extinctsymbol,...,522593098,[FridaysForFuture],TheCartHorse1,5247,5773,12,125150,extinctsymbol,1609682232,TheCartHorse1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2267900,611984073,2022-03-28 08:05:50+00:00,@vatmarker @klimatbevakaren @jordensvanner @An...,0,1,10,0,,barludde,AnderssonMagda,...,499854916,,barludde,353,548,3,20172,AnderssonMagda,1648447550,barludde
2267901,1060847293228859392,2022-03-28 10:17:01+00:00,@barludde @vatmarker @klimatbevakaren @jordens...,0,1,2,0,,IsabelleLetell1,AnderssonMagda,...,499854916,,IsabelleLetell1,5424,4651,81,23090,AnderssonMagda,1648455421,IsabelleLetell1
2267902,1060847293228859392,2022-03-28 09:59:43+00:00,@barludde @vatmarker @klimatbevakaren @jordens...,0,2,1,0,,IsabelleLetell1,AnderssonMagda,...,499854916,,IsabelleLetell1,5424,4651,81,23090,AnderssonMagda,1648454383,IsabelleLetell1
2267903,1060847293228859392,2022-03-28 09:28:55+00:00,@barludde @vatmarker @klimatbevakaren @jordens...,0,3,0,0,,IsabelleLetell1,AnderssonMagda,...,499854916,,IsabelleLetell1,5424,4651,81,23090,AnderssonMagda,1648452535,IsabelleLetell1


In [31]:
df = edges_df_filtered_complete.groupby('dst_id_str').count()
#df = edges_df_filtered_src_complete.groupby('dst_id_str').count()
df.describe()

Unnamed: 0,src_id_str,created_at,text,retweet_count,reply_count,like_count,quote_count,media_keys,username,dst,dst_screen_name,hashtags,src_name,src_followers_count,src_friends_count,src_listed_count,src_statuses_count,dst_name,time,src
count,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0
mean,1202.379475,1202.379475,1202.379475,1202.379475,1202.379475,1202.379475,1202.379475,262.795545,1202.379475,1202.379475,1202.379475,557.588703,1202.379475,1202.379475,1202.379475,1202.379475,1202.379475,1202.379475,1202.379475,1202.379475
std,7612.201433,7612.201433,7612.201433,7612.201433,7612.201433,7612.201433,7612.201433,1745.26655,7612.201433,7612.201433,7612.201433,3181.306923,7612.201433,7612.201433,7612.201433,7612.201433,7612.201433,7612.201433,7612.201433,7612.201433
min,200.0,200.0,200.0,200.0,200.0,200.0,200.0,0.0,200.0,200.0,200.0,0.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
25%,271.0,271.0,271.0,271.0,271.0,271.0,271.0,42.0,271.0,271.0,271.0,104.0,271.0,271.0,271.0,271.0,271.0,271.0,271.0,271.0
50%,399.0,399.0,399.0,399.0,399.0,399.0,399.0,80.0,399.0,399.0,399.0,219.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0
75%,760.0,760.0,760.0,760.0,760.0,760.0,760.0,200.0,760.0,760.0,760.0,416.0,760.0,760.0,760.0,760.0,760.0,760.0,760.0,760.0
max,195076.0,195076.0,195076.0,195076.0,195076.0,195076.0,195076.0,46138.0,195076.0,195076.0,195076.0,91201.0,195076.0,195076.0,195076.0,195076.0,195076.0,195076.0,195076.0,195076.0


In [32]:
edges_df_filtered_src = edges_df.groupby('src_id_str').count()
edges_df_filtered_src = edges_df_filtered_src[edges_df_filtered_src['dst_id_str']>=200]
edges_df_filtered_src_complete = edges_df_filtered_complete[edges_df_filtered_complete['src_id_str'].isin(edges_df_filtered_src.index)]
edges_df_filtered_src_complete

Unnamed: 0,src_id_str,created_at,text,retweet_count,reply_count,like_count,quote_count,media_keys,username,dst,...,dst_id_str,hashtags,src_name,src_followers_count,src_friends_count,src_listed_count,src_statuses_count,dst_name,time,src
2,1048180379847258112,2021-12-17 11:15:00+00:00,ClimateStrike Week17\nOnce Earth was full of B...,28,3,47,3,3_1471790930470400002,Nasadox_,extinctsymbol,...,522593098,"[ClimateStrike, ClimateActionNow, FridaysForFu...",Nasadox_,1041,1175,11,470,extinctsymbol,1639736100,Nasadox_
3,1244306606441070594,2021-12-31 20:13:32+00:00,@extinctsymbol Id rather we make 2022 the year...,2,1,0,0,7_1448600355844116480,TheCartHorse1,extinctsymbol,...,522593098,"[MarchForTheUNcharter, FridaysForFuture]",TheCartHorse1,5247,5773,12,125150,extinctsymbol,1640978012,TheCartHorse1
4,1244306606441070594,2021-01-03 14:57:12+00:00,@KillTheCorpora @GretaThunberg @Fridays4future...,2,1,1,0,,TheCartHorse1,extinctsymbol,...,522593098,[FridaysForFuture],TheCartHorse1,5247,5773,12,125150,extinctsymbol,1609682232,TheCartHorse1
6,1403598307,2019-10-04 11:12:23+00:00,@extinctsymbol CLIMATESTRIKE SYDNEY Friday Se...,1,0,0,0,,nicolasnicola22,extinctsymbol,...,522593098,"[CLIMATESTRIKE, SYDNEY, FridaysForFuture, clim...",nicolasnicola22,1540,98,398,198503,extinctsymbol,1570180343,nicolasnicola22
7,372151755,2022-04-23 12:57:53+00:00,@extinctsymbol yes carfree gocarfree istayonth...,1,0,1,0,,mogulc,extinctsymbol,...,522593098,"[carfree, gocarfree, istayontheground, bikedon...",mogulc,354,1522,2,3141,extinctsymbol,1650711473,mogulc
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2267733,873814437601447936,2022-04-05 09:11:09+00:00,@vatmarker @Supermiljoblogg @klimataktion @Fri...,0,0,2,0,,knastergast,AnderssonMagda,...,499854916,,knastergast,1081,2721,7,28959,AnderssonMagda,1649142669,knastergast
2267901,1060847293228859392,2022-03-28 10:17:01+00:00,@barludde @vatmarker @klimatbevakaren @jordens...,0,1,2,0,,IsabelleLetell1,AnderssonMagda,...,499854916,,IsabelleLetell1,5424,4651,81,23090,AnderssonMagda,1648455421,IsabelleLetell1
2267902,1060847293228859392,2022-03-28 09:59:43+00:00,@barludde @vatmarker @klimatbevakaren @jordens...,0,2,1,0,,IsabelleLetell1,AnderssonMagda,...,499854916,,IsabelleLetell1,5424,4651,81,23090,AnderssonMagda,1648454383,IsabelleLetell1
2267903,1060847293228859392,2022-03-28 09:28:55+00:00,@barludde @vatmarker @klimatbevakaren @jordens...,0,3,0,0,,IsabelleLetell1,AnderssonMagda,...,499854916,,IsabelleLetell1,5424,4651,81,23090,AnderssonMagda,1648452535,IsabelleLetell1


In [33]:
sub_df_18 = edges_df_filtered_src_complete[edges_df_filtered_src_complete['created_at'].dt.year == 2018] 
sub_df_19 = edges_df_filtered_src_complete[edges_df_filtered_src_complete['created_at'].dt.year == 2019]
sub_df_20 = edges_df_filtered_src_complete[edges_df_filtered_src_complete['created_at'].dt.year == 2020] 
sub_df_21 = edges_df_filtered_src_complete[edges_df_filtered_src_complete['created_at'].dt.year == 2021] 
sub_df_22 = edges_df_filtered_src_complete[edges_df_filtered_src_complete['created_at'].dt.year == 2022]  

In [34]:
edges_df_filtered_src_complete.to_csv('edges_filtered.csv', index=False)
sub_df_18.to_csv('FEdges18.csv', index=False)
sub_df_19.to_csv('FEdges19.csv', index=False)
sub_df_20.to_csv('FEdges20.csv', index=False)
sub_df_21.to_csv('FEdges21.csv', index=False)
sub_df_22.to_csv('FEdges22.csv', index=False)

In [None]:
nodes.head()