In [1]:
import sys
import pandas as pd
import glob
import os 
from tqdm import tqdm
import lzma
import numpy as np
import ast
import pickle
import collections
from fake_outlets_list import fake_outlets_timelines

# Load all timeline CSV files of the reliable (TRUE) or unreliable (FAKE) outlets from their per-outlet folders


In [2]:
def load_all_timelines_from_source_folder(reliability, base_path=None):
    """Load every timeline CSV of every outlet in the reliable or unreliable group.

    Parameters
    ----------
    reliability : bool
        ``True`` selects the hard-coded list of reliable outlets and the
        ``timelines_TRUE`` folder; ``False`` selects ``fake_outlets_timelines``
        and the ``timelines_FAKE`` folder.
    base_path : str, optional
        Folder containing one sub-folder per outlet. When omitted, the
        original hard-coded location of the selected group is used.

    Returns
    -------
    tuple
        ``(comments_per_outlet, outlets)``. ``comments_per_outlet`` maps each
        outlet name to the list of DataFrames read from its CSV files (an
        empty list when no CSV is found); ``outlets`` is the sequence of
        outlet names that was iterated.

    Raises
    ------
    ValueError
        If ``reliability`` is neither ``True`` nor ``False``.
    """
    # `==` (rather than `is`) keeps accepting 1/0 and numpy booleans as before.
    if reliability == True:
        outlets = ['immunizeorg','NewsOn6', 'KyivPost', 'KyivIndependent', 'CNN', 'FoxNews',
        'parentsmagazine', 'nbc6', 'CBSLA', 'esquire', 'TheMarySue', 'dailyherald', 'voxdotcom',
        'NYDailyNews', 'USATODAY','WIRED', 'NPR', 'nytimes', 'GayCityNews', 'IndianCountry', 'blackenterprise',
        'VoceroPR', 'abc15']
        default_path = '/Users/alessandroquattrociocchi/Documents/data/Twitter/timelines_labelled_newsguard/timelines_TRUE/'
    elif reliability == False:
        outlets = fake_outlets_timelines
        default_path = '/Users/alessandroquattrociocchi/Documents/data/Twitter/timelines_labelled_newsguard/timelines_FAKE/'
    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError(
            "reliability must be True or False, got {!r}".format(reliability)
        )

    path = default_path if base_path is None else base_path

    comments_per_outlet = {}
    for outlet in tqdm(outlets):
        # glob returns files in arbitrary order; sort so the list (and its
        # first element, which downstream code relies on) is reproducible.
        all_files = sorted(glob.glob(os.path.join(path, str(outlet), "*.csv")))
        comments_per_outlet[str(outlet)] = [
            pd.read_csv(filename, index_col=None, header=0, low_memory=False)
            for filename in all_files
        ]
    return comments_per_outlet, outlets


In [3]:
def _parse_public_metrics(raw):
    """Return the metrics mapping stored in one ``public_metrics`` cell, or None if the cell is missing."""
    if isinstance(raw, dict):
        # Already parsed (e.g. the frame did not come from a CSV round-trip).
        return raw
    if raw is None or (isinstance(raw, float) and np.isnan(raw)):
        return None
    return ast.literal_eval(raw)


def preprocessing_page_timeline(df):
    """Restrict a timeline to the analysis columns and unpack its public metrics.

    When the frame has a ``referenced_tweets`` column, only the rows where it
    is null are kept, i.e. tweets that do not reference another tweet. The
    index is then reset and four float columns (``retweet_count``,
    ``reply_count``, ``like_count``, ``quote_count``) are filled from the
    ``public_metrics`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Timeline with at least the columns ``text``, ``id``, ``created_at``,
        ``lang``, ``conversation_id``, ``author_id`` and ``public_metrics``.
        ``public_metrics`` cells may be the string form of a dict or a dict.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified. Rows whose ``public_metrics``
        is missing get NaN in the four metric columns.

    Raises
    ------
    KeyError
        If a required column, or one of the four metric keys, is absent.
    """
    metric_columns = ['retweet_count', 'reply_count', 'like_count', 'quote_count']
    cols_to_keep = ['text', 'id', 'created_at', 'lang',
                    'conversation_id', 'author_id',
                    'public_metrics']

    if 'referenced_tweets' in df.columns:
        # Keep the column in the same position the original layout used.
        cols_to_keep.insert(3, 'referenced_tweets')
        # Keep only tweets that reference no other tweet.
        df = df[df['referenced_tweets'].isnull()]

    df = df[cols_to_keep].reset_index(drop=True)

    # Parse every metrics cell once, then fill each metric column in one go.
    parsed = [_parse_public_metrics(raw) for raw in df['public_metrics']]
    for column in metric_columns:
        values = [np.nan if metrics is None else metrics[column] for metrics in parsed]
        # float64 matches the dtype the NaN-initialised columns used to have.
        df[column] = pd.Series(values, index=df.index, dtype='float64')
    return df

In [4]:
def thresholding_cascade(df,lower_threshold, ascending=False):
        """Keep the posts with at least `lower_threshold` replies, ordered by reply count.

        Rows whose `reply_count` is greater than or equal to `lower_threshold`
        are retained, sorted by `reply_count` (descending unless `ascending`
        is True) and given a fresh 0..n-1 index.
        """
        enough_replies = df['reply_count'] >= lower_threshold
        return (
                df.loc[enough_replies]
                .sort_values(by=['reply_count'], ascending=ascending)
                .reset_index(drop=True)
        )

In [5]:
def write_lzma_pickle(df, address):
    """Pickle `df` into an LZMA-compressed file at `address` and print a confirmation."""
    compressed_file = lzma.open(address, "wb")
    with compressed_file:
        pickle.dump(df, compressed_file)
    print('pickle file correctly compressed and saved...')

Questa pipeline importa tutte le timeline, filtra i tweet in base al campo `referenced_tweets` (tenendo solo quelli che non fanno riferimento ad altri tweet) e ordina il dataset, mantenendo tutti i post che hanno ricevuto un numero di commenti maggiore o uguale a 20.

In [6]:
# Load the unreliable (FAKE) outlets; call with reliability=True to load the reliable ones instead.
timelines_F, outlets = load_all_timelines_from_source_folder(reliability=False)

100%|██████████| 138/138 [00:15<00:00,  9.17it/s]


In [7]:
# Per-outlet frames of the posts that pass the reply threshold, plus the
# total number of replies those posts collected.
comments_dict = {}
total_comments = 0

for outlet in tqdm(outlets):
    if not timelines_F[outlet]:
        # No CSV was loaded for this outlet.
        continue
    # NOTE(review): only the first CSV of each outlet is processed; confirm
    # that no outlet folder holds more than one file.
    tmp_df = thresholding_cascade(
        preprocessing_page_timeline(timelines_F[outlet][0]),
        lower_threshold=20,
    )
    total_comments += tmp_df['reply_count'].sum()
    # Outlets with no post above the threshold are left out of the result.
    if len(tmp_df) > 0:
        comments_dict[outlet] = tmp_df
print('Total Number of Comments: ', total_comments)


100%|██████████| 138/138 [00:41<00:00,  3.35it/s]

Total Number of Comments:  20380661.0





In [12]:
# Preview the ten most-replied posts of one outlet.
comments_dict['BreitbartNews'].head(10)

Unnamed: 0,text,id,created_at,referenced_tweets,lang,conversation_id,author_id,public_metrics,retweet_count,reply_count,like_count,quote_count
0,".@Franklin_Graham: ""President Trump will go do...",1339028078069239809,2020-12-16T02:02:26.000Z,,en,1339028078069239809,457984599,"{'retweet_count': 8134, 'reply_count': 20334, ...",8134.0,20334.0,59476.0,2684.0
1,The elitist snobs in the fashion press have ke...,1342440129953202177,2020-12-25T12:00:42.000Z,,en,1342440129953202177,457984599,"{'retweet_count': 5844, 'reply_count': 16540, ...",5844.0,16540.0,32077.0,2395.0
2,Let's roll. https://t.co/lxmoxLEpEa,1325965921886867457,2020-11-10T00:58:05.000Z,,en,1325965921886867457,457984599,"{'retweet_count': 11061, 'reply_count': 8233, ...",11061.0,8233.0,65431.0,1873.0
3,Why wait? https://t.co/Icl16emcV2,1316394269503098880,2020-10-14T15:03:45.000Z,,en,1316394269503098880,457984599,"{'retweet_count': 1961, 'reply_count': 7703, '...",1961.0,7703.0,12672.0,1842.0
4,"Michelle Obama says she is ""dealing with some ...",1291078360504754176,2020-08-05T18:27:22.000Z,,en,1291078360504754176,457984599,"{'retweet_count': 935, 'reply_count': 7596, 'l...",935.0,7596.0,3937.0,2515.0
5,Tom Fitton: Americans weren’t generally aware ...,1310772464650260480,2020-09-29T02:44:43.000Z,,en,1310772464650260480,457984599,"{'retweet_count': 9021, 'reply_count': 6986, '...",9021.0,6986.0,26140.0,676.0
6,"""Please remain standing for the National Anthe...",1338586645151543298,2020-12-14T20:48:20.000Z,,en,1338586645151543298,457984599,"{'retweet_count': 1632, 'reply_count': 6804, '...",1632.0,6804.0,4186.0,3565.0
7,President Donald Trump has received more popul...,1325129980691558400,2020-11-07T17:36:21.000Z,,en,1325129980691558400,457984599,"{'retweet_count': 1408, 'reply_count': 6534, '...",1408.0,6534.0,5469.0,1439.0
8,Ouch. https://t.co/uwCChJoX2V,1325878050035298305,2020-11-09T19:08:55.000Z,,en,1325878050035298305,457984599,"{'retweet_count': 9168, 'reply_count': 6387, '...",9168.0,6387.0,49671.0,1997.0
9,Every American needs to see Joe Biden's latest...,1242143644917866498,2020-03-23T17:38:17.000Z,,en,1242143644917866498,457984599,"{'retweet_count': 8122, 'reply_count': 5420, '...",8122.0,5420.0,20960.0,2291.0


# Salvare il dataset di timelines in forma aggregata

In [9]:
# Persist the aggregated dictionary built above. It currently holds the
# unreliable (FAKE) outlets, hence the `_F` suffix in the file name.
write_lzma_pickle(
    df=comments_dict,
    address='/Users/alessandroquattrociocchi/Desktop/timelines_aggregated_F.pickle.xz',
)

pickle file correctly compressed and saved...


In [10]:
# FAKE (disabled: the previous cell already writes the aggregated dictionary to this same `_F` file)
#write_lzma_pickle(comments_dict,'/Users/alessandroquattrociocchi/Desktop/timelines_aggregated_F.pickle.xz')

In [11]:
# Flat list of outlet handles; the first 23 entries are the reliable outlets
# hard-coded in load_all_timelines_from_source_folder.
l = [
    'immunizeorg', 'NewsOn6', 'KyivPost', 'KyivIndependent', 'CNN', 'FoxNews',
    'parentsmagazine', 'nbc6', 'CBSLA', 'esquire', 'TheMarySue', 'dailyherald',
    'voxdotcom', 'NYDailyNews', 'USATODAY', 'WIRED', 'NPR', 'nytimes',
    'GayCityNews', 'IndianCountry', 'blackenterprise', 'VoceroPR', 'abc15',
    'ascienthusiast', 'newsmax', 'BIZPACReview', 'drchrisnorthrup',
    'healthychildren', 'NewsBecker', 'chicksonright', 'scarymommy',
    'EpochTimes', 'ebonymag', 'GovMikeHuckabee', 'twpundit', 'TheGoodGodAbove',
    'thetnstar', 'houstonpress', 'WGNRadio', 'nypost', 'tassagency_en',
    'globaltimesnews', 'wearemitu', 'percolately', 'bright_side_me',
    'GeorgiaStarNews', 'mindys4Biden', 'ChinaDailyUSA', 'MediaTakeoutTV',
    'NationalMemo', 'newsandguts', 'theinquisitr', 'digg', 'VoiceofPD', 'nra',
    'mercola', 'veteranstoday', 'thedailybanter', 'unhealthytruth',
    'Greg_Palast', 'townhallcom', 'lifebiomedguru', 'V_of_Europe',
    'voguemagazine', 'TheOhioStar', 'MadWorldNews', 'PoliTribune',
    'strange_sounds', 'realdennislynch', 'NatEnquirer', 'thrive', 'PanData19',
    'AliciaFixLuke', 'amerdailyindy', 'FDRLST', 'LiveAction', 'LifeNewsHQ',
    'WayneDupreeShow',
]
