In [1]:
import os 
import glob
import random 
import pandas as pd 
from tqdm import tqdm
from collections import defaultdict
from test_backbone import * 
from tree_metrics  import *
from annotations_processing import *

%load_ext autoreload
%autoreload 2

# Turn off pandas' chained-assignment check (None disables the SettingWithCopy warning).
pd.options.mode.chained_assignment = None
# Fix the seed of Python's `random` module so any sampling done with it is repeatable.
random.seed(1)

In [2]:
# Twitter handles of every outlet available for the analysis (one folder of
# thread files per handle is expected under the data directory).
outlets = [
    'rt_com', 'dailyherald', 'parentsmagazine', 'TheGoodGodAbove', 'bright_side_me',
    'TheMarySue', 'ChinaDailyUSA', 'nbc6', 'MediaTakeoutTV', 'NationalMemo',
    'newsandguts', 'theinquisitr', 'VoiceofPD', 'mercola', 'CBSLA',
    'veteranstoday', 'NewsOn6', 'thedailybanter', 'unhealthytruth', 'Greg_Palast',
    'lifebiomedguru', 'V_of_Europe', 'voguemagazine', 'TheOhioStar', 'MadWorldNews',
    'PoliTribune', 'strange_sounds', 'realdennislynch', 'NatEnquirer', 'thrive',
    'PanData19', 'ascienthusiast', 'BIZPACReview', 'FoxNews', 'drchrisnorthrup',
    'healthychildren', 'NewsBecker', 'chicksonright', 'USATODAY', 'WayneDupreeShow',
    'scarymommy', 'EpochTimes', 'ebonymag', 'NYDailyNews', 'twpundit',
    'thetnstar', 'houstonpress', 'WGNRadio', 'nypost', 'tassagency_en',
    'wearemitu', 'percolately', 'GeorgiaStarNews', 'mindys4Biden', 'esquire',
    'KyivIndependent', 'digg', 'nra', 'voxdotcom', 'CNN',
    'FDRLST', 'nytimes', 'BreitbartNews', 'KyivPost', 'SputnikInt',
]

In [3]:
# Restrict the run to a single outlet. This rebinds `outlets`, so the full list
# assigned in the previous cell is no longer in effect once this cell has run.
outlets = ['CNN']

In [4]:
# NewsGuard table for US outlets. The metrics loop below looks outlets up by the
# 'Twitter Handle' column and reads the 'Score' and 'Rating' columns.
# NOTE(review): absolute, machine-specific path — the cell only runs where this file exists.
ng = pd.read_csv('/Users/alessandroquattrociocchi/Documents/data/NewsGuard/Countries/USA_newsguard_handle.csv')

In [5]:
# Per-thread metrics, one list per output column; every thread appends exactly
# one value to each key so the lists stay aligned row by row.
get_results_dict = defaultdict(list)
# Running lists over all processed threads (values are appended per thread, so
# the same user can appear more than once across threads).
overall_unique_users = []
overall_toxic_authors = []
# Single source of truth for the toxicity cut-off passed to filter_toxic_comments.
toxicity_threshold = 0.60
path = '/Users/alessandroquattrociocchi/Documents/Data/Twitter/comments_labelled_newsguard/comments_all_evaluated/'

tree_metrics = Metrics()

for outlet in outlets:
    # One compressed CSV per conversation thread, stored in <path>/<outlet>/.
    # Joining the components separately does not rely on `path` ending with a separator.
    all_files = glob.glob(os.path.join(path, str(outlet), "*.csv.xz"))
    if not all_files:
        continue

    # The NewsGuard row is the same for every thread of an outlet: look it up
    # once per outlet instead of three times per file.
    # NOTE(review): get_label is now called once per outlet — assumes it depends
    # only on the score it is given; confirm.
    outlet_rows = ng[ng['Twitter Handle'] == outlet]
    if outlet_rows.empty:
        raise ValueError(
            "Outlet {!r} not found in the NewsGuard 'Twitter Handle' column".format(outlet)
        )
    outlet_score = outlet_rows.Score.values[0]
    outlet_label = PrePreprocessing.get_label(outlet_score)
    outlet_flag = outlet_rows.Rating.values[0]

    for filename in tqdm(all_files):
        # File name without the '.csv.xz' suffix; kept for inspection only, the
        # recorded thread identifier is `root_node` below.
        tweet_id = os.path.basename(filename)[:-len(".csv.xz")]
        df = pd.read_csv(filename, index_col=None, header=0, low_memory=False, dtype=str)

        ##
        df = PrePreprocessing.adjust_columns_name(df)
        df = PrePreprocessing.preprocessing_df(df, filter_na=True)
        toxic_df = PrePreprocessing.filter_toxic_comments(df, tox_threshold=toxicity_threshold)
        root_node = PrePreprocessing.get_root(df)

        # Users seen in this thread: comment authors plus the users they reply to.
        thread_users = set(df.author_id.tolist() + df.in_reply_to_user_id.tolist())
        overall_unique_users += list(thread_users)
        overall_toxic_authors += toxic_df["author_id"].tolist()

        # retrieving the tweet identifier
        get_results_dict['tweet_id'].append(root_node)
        # retrieving the outlet's name
        get_results_dict['outlet_name'].append(outlet)
        # retrieving the outlet's score and the label derived from it
        get_results_dict['outlet_score'].append(outlet_score)
        get_results_dict['outlet_label'].append(outlet_label)
        # retrieving the outlet's flag
        get_results_dict['outlet_flag'].append(outlet_flag)
        # retrieving the number of unique users in the thread
        get_results_dict['unique_users'].append(len(thread_users))
        # 'created_at' of the first row of the preprocessed thread
        get_results_dict['created_at'].append(df.created_at.iloc[0])
        # total number of comments in the conversation thread (rows of df)
        get_results_dict['n_comments'].append(len(df))
        # number of comments kept by filter_toxic_comments at toxicity_threshold
        get_results_dict['n_tox_comments'].append(len(toxic_df))

        ##
        # compute the toxicity score
        get_results_dict['toxicity_ratio'].append(tree_metrics.get_toxicity_ratio(toxic_df, df))

        ##
        # building the edge list according to the algorithm presented
        vertices = list(set(df.id.tolist() + df.replied_id.tolist()))
        edge_list = tree_metrics.create_edge_list(vertices, root_node, df)
        gtree = tree_metrics.create_graph(vertices, edge_list)
        # setting toxicity values as attribute of the nodes
        # NOTE(review): `vertices` is built from a set (arbitrary order, and it may
        # contain ids that are not rows of df), while this list follows df row
        # order — verify that create_graph yields one vertex per df row, in df order.
        gtree.vs['toxicity'] = df.toxicity_score.tolist()

        # Tree metrics: size, depth, wiener index, assortativity and average toxicity distance
        get_results_dict['size'].append(len(gtree.vs['name']))
        get_results_dict['depth'].append(tree_metrics.get_depth(gtree, root_node))
        get_results_dict['wiener_index'].append(tree_metrics.get_wiener_index(T=gtree, root_node=root_node))
        get_results_dict['assortativity_tox'].append(
            tree_metrics.get_assortativity(gtree, numeric_prop='toxicity', directed_flag=False)
        )
        get_results_dict['avg_tox_distance'].append(tree_metrics.mean_root_distance(gtree, toxic_df, root_node))
        get_results_dict['top_3_annotations'].append(ExtractAnnotations.get_top_annotations_from_thread(df))

 10%|█         | 157/1500 [00:07<00:37, 36.25it/s]

In [None]:
# Turn the collected per-thread lists into one table (one row per thread) and
# order the rows by their 'created_at' value.
xdf = pd.DataFrame(get_results_dict).sort_values(by='created_at')
#xdf.to_csv('/Users/alessandroquattrociocchi/Git/free-speech-analysis/plots/thesis_data/full_metrics_thesis.csv', index=False)

# Quali sono stati i topic che maggiormente hanno favorito il linguaggio d'odio?  

In [None]:
# Subset of xdf selected by score_flag='very_low'. The selection rule lives in
# ExtractAnnotations.divide_dataframe_by_score — presumably the lowest outlet
# score bucket; TODO confirm against annotations_processing.
very_low_df = ExtractAnnotations.divide_dataframe_by_score(xdf, score_flag='very_low')

# Annotations of that subset grouped by year. The meaning of `threshold` is
# defined in annotations_processing (presumably a minimum toxicity ratio —
# verify); `top_n_annotations` caps how many annotations are kept per year.
top_ann_very_low = ExtractAnnotations.get_annotations_per_year(very_low_df, threshold=0.1, top_n_annotations=10)

  annotations_dict['2020'] = pd.Series(list(itertools.chain.from_iterable(annotations20))).value_counts()[:top_n_annotations].index
  annotations_dict['2021'] = pd.Series(list(itertools.chain.from_iterable(annotations21))).value_counts()[:top_n_annotations].index
  annotations_dict['2022'] = pd.Series(list(itertools.chain.from_iterable(annotations22))).value_counts()[:top_n_annotations].index


In [None]:
# Bare last expression: the notebook renders the per-year annotation result as the cell output.
top_ann_very_low

{'2020': Float64Index([], dtype='float64'),
 '2021': Float64Index([], dtype='float64'),
 '2022': Float64Index([], dtype='float64')}