In [1]:
import json
import pandas as pd
import numpy as np
from zipfile import ZipFile
import re
from io import BytesIO
from tqdm import tqdm


In [99]:
def find_between(s, start, end):
    """Return the piece of ``s`` that follows the first ``start`` delimiter.

    The piece stops at whichever comes first: the next ``start`` or the
    first ``end``.

    Args:
        s (str): text to search
        start (str): delimiter that opens the wanted piece
        end (str): delimiter that closes the wanted piece

    Returns:
        str: the text between the delimiters

    Raises:
        IndexError: if ``start`` does not occur in ``s``
    """
    # Only the element right after the first `start` is needed, so the split
    # can stop early without changing which element is returned.
    after_start = s.split(start, 2)[1]
    return after_start.split(end, 1)[0]

def normalize_json(data: dict) -> dict:
    """Flatten one level of nesting in a dictionary.

    A value that is itself a dict is replaced by one entry per inner key,
    named ``<outer key>_<inner key>``. Every other value is copied as is.
    Dicts nested deeper than one level are left untouched.

    Args:
        data (dict): mapping to flatten

    Returns:
        dict: a new, flattened mapping; ``data`` itself is not modified
    """
    flattened = {}
    for outer_key, outer_value in data.items():
        if isinstance(outer_value, dict):
            flattened.update(
                (outer_key + "_" + inner_key, inner_value)
                for inner_key, inner_value in outer_value.items()
            )
            continue
        flattened[outer_key] = outer_value
    return flattened
    
def read_files(filepath):
    """Load every JSON document of a zip archive into one DataFrame.

    Each file in the archive is parsed as JSON and flattened with
    ``pd.json_normalize``. The per-file frames are concatenated and the first
    column of the result becomes the index.

    Directory entries and anything whose name contains ``__MACOSX`` are
    skipped by name. The previous version skipped the first archive entry by
    position, which dropped a data file when the archive had no leading
    root-folder entry and failed on nested directory entries.

    Args:
        filepath (str): location of the zip archive (anything ``ZipFile``
            accepts, including a file-like object)

    Returns:
        DataFrame: one row per JSON document, indexed by the first column

    Raises:
        ValueError: if the archive contains no file to read
    """
    ignore_file = '__MACOSX'
    frames = []
    with ZipFile(filepath, "r") as z:
        for filename in z.namelist():
            # Directory entries end with "/" and hold no JSON payload.
            if filename.endswith('/') or ignore_file in filename:
                continue
            with z.open(filename, 'r') as f:
                frames.append(pd.json_normalize(json.loads(f.read())))

    if not frames:
        raise ValueError('No JSON files found in archive: {}'.format(filepath))

    df = pd.concat(frames)
    return df.set_index(df.columns[0])  # Set first column as index

def read_news_files(filepath, is_fake_news, type_of_file):
    """Extracts data from politifact_fake and politifact_real and converts to dataframe

    ``filepath`` is a zip of zips. Every inner archive is opened in memory and
    each of its members under the ``type_of_file`` folder is parsed as JSON,
    flattened with ``normalize_json`` and turned into one record.

    Each record also receives:
      * ``news_id``: the inner archive name up to its first ``.``
      * ``is_fake_news``: the value passed in
      * ``tweet_id``: the member file name between ``/`` and ``.``

    Members that cannot be parsed or flattened are reported with a printed
    message and skipped.

    Args:
        filepath (str): Zip filepath
        is_fake_news (bool): Zip file classification
        type_of_file (str): subfolder name for the news article for users that have: retweet, likes, tweet, reply

    Returns:
        DataFrame: one row per successfully parsed JSON member
    """
    ignore_file = '__MACOSX'
    # Escape the folder name so it is matched literally, not as a regex.
    member_pattern = re.compile('^' + re.escape(type_of_file) + '/[a-zA-Z0-9].')
    records = []
    with ZipFile(filepath, "r") as outer_zip:
        # NOTE(review): the first archive entry is still skipped by position,
        # as before -- presumably a root folder entry; confirm against the data.
        for zipname in tqdm(outer_zip.namelist()[1:]):
            # Directories and macOS metadata are not zip archives; opening
            # them would raise BadZipFile.
            if zipname.endswith('/') or ignore_file in zipname:
                continue
            news_id = re.split(r"\.+", zipname)[0]
            zfiledata = BytesIO(outer_zip.read(zipname))

            with ZipFile(zfiledata, "r") as z:
                for filename in z.namelist():
                    if not member_pattern.search(filename):
                        continue
                    with z.open(filename, 'r') as f:
                        raw = f.read()
                    try:
                        data = normalize_json(json.loads(raw))
                    except Exception:
                        # Best effort: skip malformed members, but let
                        # KeyboardInterrupt / SystemExit propagate.
                        print('error on a json')
                        continue
                    data['news_id'] = news_id
                    data['is_fake_news'] = is_fake_news
                    data['tweet_id'] = find_between(filename, '/', '.')
                    records.append(data)
    return pd.DataFrame(records)

def extract_id_from_json(json_col, type_of_file):
    """Pull the user id out of each record of a column of parsed JSON.

    Where the id lives depends on ``type_of_file``:
      * ``'retweets'``: ``record['user']['id']``, returned as ``str``
      * ``'replies'``: ``record['user_id']``, returned unchanged
      * ``'likes'``: the input is already the ids and is returned as is
      * ``'tweets'``: ``record['user']['id']`` as ``str``, or ``''`` when the
        record has no usable user id

    Records are indexed directly. The earlier ``json.loads(json.dumps(...))``
    round trip returned an equal object for already-parsed JSON and only added
    cost.

    Args:
        json_col (list): parsed JSON records (or ids, for ``'likes'``)
        type_of_file (str): one of ``'retweets'``, ``'replies'``, ``'likes'``,
            ``'tweets'``

    Returns:
        list: one id per input record; an empty list for an unknown
        ``type_of_file``
    """
    if type_of_file == 'likes':
        return json_col

    if type_of_file == 'retweets':
        return [str(record['user']['id']) for record in tqdm(json_col)]

    if type_of_file == 'replies':
        return [record['user_id'] for record in tqdm(json_col)]

    id_list = []
    if type_of_file == 'tweets':
        for record in tqdm(json_col):
            try:
                user_id = str(record['user']['id'])
            except Exception:
                # Best effort: a record without a user id maps to ''. Narrower
                # than a bare except so Ctrl-C still stops the loop.
                user_id = ''
            id_list.append(user_id)

    return id_list

In [100]:
## Uncomment to process the original data. Takes a long time.

# df_followers = read_files('user_followers.zip')
# df_followers = df_followers.explode('followers')
# df_followers = df_followers.reset_index()

# df_following = read_files('user_following.zip')
# df_following = df_following.explode('followees')
# df_following = df_following.reset_index()

# df_user_profiles = read_files('user_profiles.zip')

# news_retweet_fake = read_news_files('politifact_fake.zip', is_fake_news=1, type_of_file='retweets')
# news_retweet_fake = news_retweet_fake[news_retweet_fake['retweets'].map(lambda d: len(d)) > 0]
# news_retweet_fake['retweets'] = news_retweet_fake['retweets'].map(lambda x: x[0])
# news_retweet_fake['user_id'] = extract_id_from_json(news_retweet_fake.retweets.tolist(), type_of_file='retweets')
# news_likes_fake = read_news_files('politifact_fake.zip', is_fake_news=1, type_of_file='likes')

# news_tweet_fake = read_news_files('politifact_fake.zip', is_fake_news=1, type_of_file='tweets')
# news_tweet_fake['user_id'] =  news_tweet_fake['user_id_str']
# news_tweet_fake = news_tweet_fake.drop(columns=['id_str','user_id_str'])

# news_reply_fake = read_news_files('politifact_fake.zip', is_fake_news=1, type_of_file='replies')

# news_retweet_real = read_news_files('politifact_real.zip', is_fake_news=0, type_of_file='retweets')
# news_retweet_real = news_retweet_real[news_retweet_real['retweets'].map(lambda d: len(d)) > 0]
# news_retweet_real['retweets'] = news_retweet_real['retweets'].map(lambda x: x[0])
# news_retweet_real['user_id'] = extract_id_from_json(news_retweet_real.retweets.tolist(), type_of_file='retweets')


# news_likes_real = read_news_files('politifact_real.zip', is_fake_news=0, type_of_file='likes')

# news_tweet_real = read_news_files('politifact_real.zip', is_fake_news=0, type_of_file='tweets')
# news_tweet_real['user_id'] =  news_tweet_real['user_id_str']
# news_tweet_real = news_tweet_real.drop(columns=['id_str','user_id_str'])

# news_reply_real = read_news_files('politifact_real.zip', is_fake_news=0, type_of_file='replies')

100%|██████████| 432/432 [00:26<00:00, 16.04it/s]


In [103]:
## Uncomment to save the processed unmerged csv

# df_followers.to_csv('data/df_followers.csv')
# df_following.to_csv('data/df_following.csv')
# df_user_profiles.to_csv('data/df_user_profiles.csv')
# news_retweet_fake.to_csv('data/news_retweet_fake.csv')
# news_likes_fake.to_csv('data/news_likes_fake.csv')
# news_tweet_fake.to_csv('data/news_tweet_fake.csv')
# news_reply_fake.to_csv('data/news_reply_fake.csv')
# news_retweet_real.to_csv('data/news_retweet_real.csv')
# news_likes_real.to_csv('data/news_likes_real.csv')
# news_tweet_real.to_csv('data/news_tweet_real.csv')
# news_reply_real.to_csv('data/news_reply_real.csv')

# news_retweet_fake.drop(columns=['retweets'], inplace=True)
# news_likes_fake.drop(columns=['likes'], inplace=True)
# news_reply_fake.drop(columns=['replies'], inplace=True)
# news_retweet_real.drop(columns=['retweets'], inplace=True)
# news_likes_real.drop(columns=['likes'], inplace=True)
# news_reply_real.drop(columns=['replies'], inplace=True)

# df_followers.to_csv('data_summary/df_followers.csv')
# df_following.to_csv('data_summary/df_following.csv')
# df_user_profiles.to_csv('data_summary/df_user_profiles.csv')
# news_retweet_fake.to_csv('data_summary/news_retweet_fake.csv')
# news_likes_fake.to_csv('data_summary/news_likes_fake.csv')
# news_tweet_fake.to_csv('data_summary/news_tweet_fake.csv')
# news_reply_fake.to_csv('data_summary/news_reply_fake.csv')
# news_retweet_real.to_csv('data_summary/news_retweet_real.csv')
# news_likes_real.to_csv('data_summary/news_likes_real.csv')
# news_tweet_real.to_csv('data_summary/news_tweet_real.csv')
# news_reply_real.to_csv('data_summary/news_reply_real.csv')

## Combine datasets to make network data

### Tweet-Retweet network

In [149]:
# Stack the real and fake exports; user ids are read as strings.
tweet_df = pd.concat(
    [
        pd.read_csv(csv_path, index_col=0, dtype={'user_id':str})
        for csv_path in ('data_summary/news_tweet_real.csv', 'data_summary/news_tweet_fake.csv')
    ]
)
retweet_df = pd.concat(
    [
        pd.read_csv(csv_path, index_col=0, dtype={'user_id':str})
        for csv_path in ('data_summary/news_retweet_real.csv', 'data_summary/news_retweet_fake.csv')
    ]
)

  tweet_df = pd.concat([pd.read_csv('data_summary/news_tweet_real.csv', index_col=0, dtype={'user_id':str}), pd.read_csv('data_summary/news_tweet_fake.csv', index_col=0, dtype={'user_id':str})])
  tweet_df = pd.concat([pd.read_csv('data_summary/news_tweet_real.csv', index_col=0, dtype={'user_id':str}), pd.read_csv('data_summary/news_tweet_fake.csv', index_col=0, dtype={'user_id':str})])


In [156]:
# Merge retweets with the tweets they refer to, then keep only consistent rows
tweet_retweet_network = (
    retweet_df
    .merge(
        tweet_df[['tweet_id','user_id','is_fake_news', 'news_id']],
        left_on='tweet_id',
        right_on='tweet_id',
        how='outer',
        suffixes=['_retweet_df', '_tweet_df'],
    )
    # Both sides name the same news item, or one side is missing
    .loc[lambda d: (d['news_id_retweet_df']==d['news_id_tweet_df']) | (d['news_id_retweet_df'].isna()) | (d['news_id_tweet_df'].isna())]
    # Only keep rows where users are different
    .loc[lambda d: d['user_id_retweet_df']!=d['user_id_tweet_df']]
    # Only keep rows where labeling is equal or NA
    .loc[lambda d: (d['is_fake_news_retweet_df']==d['is_fake_news_tweet_df']) | (d['is_fake_news_retweet_df'].isna()) | (d['is_fake_news_tweet_df'].isna())]
    # The tweet side supplies the final news id and label
    .assign(
        news_id=lambda d: d['news_id_tweet_df'],
        is_fake_news=lambda d: d['is_fake_news_tweet_df'],
    )
    .drop(columns=['news_id_retweet_df','is_fake_news_retweet_df','news_id_tweet_df','is_fake_news_tweet_df'])
)
print(tweet_retweet_network.shape)
tweet_retweet_network.head()

(573637, 5)


Unnamed: 0,tweet_id,user_id_retweet_df,user_id_tweet_df,news_id,is_fake_news
0,1033706162695356417,787311228,43350851,politifact99,0.0
3,1033706162695356417,787311228,43350851,politifact340,0.0
4,1035580865160638464,3338246572,16297707,politifact99,0.0
7,1035580865160638464,3338246572,16297707,politifact340,0.0
8,934206237708865537,754310205546954757,5820642,politifact99,0.0


In [158]:
# Print column dtypes and non-null counts of the merged tweet/retweet network.
tweet_retweet_network.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 573637 entries, 0 to 586580
Data columns (total 5 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   tweet_id            573637 non-null  int64  
 1   user_id_retweet_df  89027 non-null   object 
 2   user_id_tweet_df    544027 non-null  object 
 3   news_id             573636 non-null  object 
 4   is_fake_news        573636 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 26.3+ MB


In [132]:
# Number of tweets per user, most active users first
(
    tweet_df[['tweet_id','user_id']]
    .groupby('user_id')
    .count()
    .sort_values(by='tweet_id', ascending=False)
)

Unnamed: 0_level_0,tweet_id
user_id,Unnamed: 1_level_1
871466086293794817,1383
894687595527057415,1362
54039049,1243
2161036874,729
1044725409953136640,541
...,...
27368525,1
27368623,1
2736863183,1
2736870509,1


In [157]:
# Save the tweet/retweet network; the row index is not written.
tweet_retweet_network.to_csv('data_summary/tweet_retweet_network.csv', index=False)

### Follower network


In [142]:
# Stack the real and fake tweet exports; user ids are read as strings.
tweet_df = pd.concat(
    [
        pd.read_csv(csv_path, index_col=0, dtype={'user_id':str})
        for csv_path in ('data_summary/news_tweet_real.csv', 'data_summary/news_tweet_fake.csv')
    ]
)
# Both id columns are read as strings.
df_followers = pd.read_csv(
    'data_summary/df_followers.csv',
    index_col=0,
    dtype={'user_id':str,'followers':str},
)

  tweet_df = pd.concat([pd.read_csv('data_summary/news_tweet_real.csv', index_col=0, dtype={'user_id':str}), pd.read_csv('data_summary/news_tweet_fake.csv', index_col=0, dtype={'user_id':str})])
  tweet_df = pd.concat([pd.read_csv('data_summary/news_tweet_real.csv', index_col=0, dtype={'user_id':str}), pd.read_csv('data_summary/news_tweet_fake.csv', index_col=0, dtype={'user_id':str})])


In [146]:
# Merge
# Attach to every tweet the followers of its author (join on user_id).
follower_network = tweet_df[['tweet_id','user_id','is_fake_news', 'news_id']].merge(df_followers, left_on='user_id', right_on='user_id')
# First-hop followers are stored under 'followers_1'.
follower_network = follower_network.rename(columns={'followers':'followers_1'})
# Second hop: followers of the first-hop followers.
# NOTE(review): the trailing [[]] selects zero columns, so follower_network2 keeps
# only the row index of this merge and is not used in the visible cells --
# looks like an unfinished column selection; confirm the intended columns.
follower_network2 = follower_network.merge(df_followers, left_on='followers_1', right_on='user_id')[[]]
# NOTE(review): repeats the rename above; 'followers' no longer exists at this point,
# so this line changes nothing. The recorded output of the display cell shows
# user_id_x / user_id_y / followers columns, which this cell as saved does not
# assign to follower_network -- the output presumably comes from an earlier version.
follower_network = follower_network.rename(columns={'followers':'followers_1'})



In [147]:
# Display the follower network built above.
follower_network

Unnamed: 0,tweet_id,user_id_x,is_fake_news,news_id,followers_1,user_id_y,followers
0,907064409230643201,153218852,0,politifact99,316449565,316449565,965044995559477248
1,907064409230643201,153218852,0,politifact99,316449565,316449565,1053390064791760896
2,907064409230643201,153218852,0,politifact99,316449565,316449565,261045341
3,907064409230643201,153218852,0,politifact99,316449565,316449565,977013050518704128
4,907064409230643201,153218852,0,politifact99,316449565,316449565,3343757261
...,...,...,...,...,...,...,...
63633459,1034741192817295360,861663665719840773,1,politifact15514,3416678236,3416678236,3306283622
63633460,1034741192817295360,861663665719840773,1,politifact15514,3416678236,3416678236,4102376488
63633461,1034741192817295360,861663665719840773,1,politifact15514,3416678236,3416678236,4165642155
63633462,1034741192817295360,861663665719840773,1,politifact15514,3416678236,3416678236,384708626


### Tweet-Retweet-Follower network

In [None]:
# Load the follower edge list; both id columns are read as strings.
df_followers = pd.read_csv('data_summary/df_followers.csv', index_col=0, dtype={'user_id':str,'followers':str})

In [219]:
# Build one edge list (followers -> user_id) that combines
#   * retweet edges: `followers` is the retweeting user, `user_id` the tweet author
#   * follower edges: followers of tweet authors, and followers of those followers
#
# What changed compared with the earlier version of this cell:
#   * No variable named `list` is created any more (it shadowed the builtin
#     for the rest of the kernel session).
#   * The "follow and retweet" flag is gone. It tested `x in <Series>`, which
#     looks at index labels rather than values, so the filter on it never
#     removed a row. The final drop_duplicates already gives the intended
#     precedence.
#   * Edge type is recorded where each edge comes from instead of being
#     inferred from whether `is_fake_news` is NaN, so a retweet row with a
#     missing label is no longer tagged 'follower'.
#   * Join keys are de-duplicated before merging to keep intermediates small.
#     Rows and their order are unchanged; only the index labels differ.

# Retweets
# NOTE(review): rows of tweet_retweet_network without a retweeting user
# (`user_id_retweet_df` is NaN) are kept here, as before, and end up tagged
# 'retweet' -- confirm whether those should be dropped.
retweet_edges = (
    tweet_retweet_network[['user_id_retweet_df','user_id_tweet_df','is_fake_news']]
    .rename(columns={'user_id_retweet_df':'followers','user_id_tweet_df':'user_id'})
    .assign(follow_or_retweet='retweet')
)

# Followers of tweets
followers_of_tweeters = (
    tweet_retweet_network[['user_id_tweet_df']]
    .drop_duplicates()
    .merge(df_followers, left_on='user_id_tweet_df', right_on='user_id')
    .drop(columns=['user_id_tweet_df'])
    .drop_duplicates()
)

# Followers of followers
followers_of_followers = (
    followers_of_tweeters[['followers']]
    .drop_duplicates()
    .rename(columns={'followers':'user_id'})
    .merge(df_followers, left_on='user_id', right_on='user_id', suffixes=['_1','_2'])
    .drop_duplicates()
)

# Join FoF with FoT
follower_edges = (
    pd.concat([followers_of_tweeters, followers_of_followers])
    .drop_duplicates()
    .assign(follow_or_retweet='follower')
)

# Join with retweets. Retweet edges come first, so when a pair is both a
# retweet edge and a follower edge, drop_duplicates (keep='first') keeps the
# retweet edge.
tweet_retweet_follower_network = (
    pd.concat([retweet_edges, follower_edges])
    .drop_duplicates(subset=['followers','user_id'])
)
tweet_retweet_follower_network

tweet_retweet_follower_network

Unnamed: 0,followers,user_id,is_fake_news,follow_or_retweet
0,787311228,43350851,0.0,retweet
4,3338246572,16297707,0.0,retweet
8,754310205546954757,5820642,0.0,retweet
9,2157919340,1716121,0.0,retweet
13,4867759271,31056977,0.0,retweet
...,...,...,...,...
18258071,3306283622,3416678236,,follower
18258072,4102376488,3416678236,,follower
18258073,4165642155,3416678236,,follower
18258074,384708626,3416678236,,follower


In [220]:
# Save the combined retweet/follower edge list; the row index is not written.
tweet_retweet_follower_network.to_csv('data_summary/tweet_retweet_follower_network.csv', index=False)