In [10]:
import os 
import glob
import random 
import pandas as pd 
from tqdm import tqdm
from collections import defaultdict
from test_backbone import * 
from tree_metrics  import *
from annotations_processing import *

%load_ext autoreload
%autoreload 2

pd.options.mode.chained_assignment = None
random.seed(1)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Twitter handles of the news outlets to analyse. Each handle is used further down
# both as a sub-directory name under the comments folder and as the lookup key in
# the NewsGuard table's 'Twitter Handle' column.
outlets = [
    'rt_com', 'dailyherald', 'parentsmagazine', 'TheGoodGodAbove', 'bright_side_me',
    'TheMarySue', 'ChinaDailyUSA', 'nbc6', 'MediaTakeoutTV', 'NationalMemo',
    'newsandguts', 'theinquisitr', 'VoiceofPD', 'mercola', 'CBSLA',
    'veteranstoday', 'NewsOn6', 'thedailybanter', 'unhealthytruth', 'Greg_Palast',
    'lifebiomedguru', 'V_of_Europe', 'voguemagazine', 'TheOhioStar', 'MadWorldNews',
    'PoliTribune', 'strange_sounds', 'realdennislynch', 'NatEnquirer', 'thrive',
    'PanData19', 'ascienthusiast', 'BIZPACReview', 'FoxNews', 'drchrisnorthrup',
    'healthychildren', 'NewsBecker', 'chicksonright', 'USATODAY', 'WayneDupreeShow',
    'scarymommy', 'EpochTimes', 'ebonymag', 'NYDailyNews', 'twpundit',
    'thetnstar', 'houstonpress', 'WGNRadio', 'nypost', 'tassagency_en',
    'wearemitu', 'percolately', 'GeorgiaStarNews', 'mindys4Biden', 'esquire',
    'KyivIndependent', 'digg', 'nra', 'voxdotcom', 'CNN',
    'FDRLST', 'nytimes', 'BreitbartNews', 'KyivPost', 'SputnikInt',
]

In [12]:
# NewsGuard ratings table. The metrics loop below looks outlets up through its
# 'Twitter Handle' column and reads the 'Score' and 'Rating' columns.
ng = pd.read_csv(
    filepath_or_buffer='/Users/alessandroquattrociocchi/Documents/data/NewsGuard/Countries/USA_newsguard_handle.csv',
)

In [13]:
# Build one record per conversation thread (one *.csv.xz file) for every outlet.
# `get_results_dict` maps column name -> list of values; each processed file appends
# exactly one value to every column, so the lists stay aligned row by row.
get_results_dict = defaultdict(list)
overall_unique_users = []
overall_toxic_authors = []
# Single source of truth for the toxicity cut-off used by filter_toxic_comments below.
toxicity_threshold = 0.60
path = '/Users/alessandroquattrociocchi/Documents/Data/Twitter/comments_labelled_newsguard/comments_all_evaluated/'

tree_metrics = Metrics()

for outlet in outlets:
    # The NewsGuard rows for an outlet do not depend on the file being processed:
    # filter the table once per outlet instead of three times per file.
    outlet_rows = ng[ng['Twitter Handle'] == outlet]

    all_files = glob.glob(os.path.join(path + str(outlet), "*.csv.xz"))
    for filename in tqdm(all_files):
        # File name without directory and without the '.csv.xz' suffix.
        # NOTE(review): not used below -- the 'tweet_id' column is filled from root_node.
        tweet_id = os.path.basename(filename)[:-len('.csv.xz')]
        df = pd.read_csv(filename, index_col=None, header=0, low_memory=False, dtype=str)

        ## Pre-processing
        df = PrePreprocessing.adjust_columns_name(df)
        df = PrePreprocessing.preprocessing_df(df, filter_na=True)
        toxic_df = PrePreprocessing.filter_toxic_comments(df, tox_threshold=toxicity_threshold)
        root_node = PrePreprocessing.get_root(df)

        # Users seen in this thread, either as comment author or as reply target.
        thread_users = set(df.author_id.tolist() + df.in_reply_to_user_id.tolist())
        overall_unique_users += list(thread_users)
        overall_toxic_authors += toxic_df["author_id"].tolist()

        # NewsGuard metadata of the outlet; `.values[0]` raises IndexError when the
        # outlet has no row in `ng`.
        outlet_score = outlet_rows.Score.values[0]

        # retrieving the tweet identifier
        get_results_dict['tweet_id'].append(root_node)
        # retrieving the outlet's name
        get_results_dict['outlet_name'].append(outlet)
        # retrieving the outlet's score and the label derived from it
        get_results_dict['outlet_score'].append(outlet_score)
        get_results_dict['outlet_label'].append(PrePreprocessing.get_label(outlet_score))
        # retrieving the outlet's flag
        get_results_dict['outlet_flag'].append(outlet_rows.Rating.values[0])
        # retrieving the number of unique users
        get_results_dict['unique_users'].append(len(thread_users))
        # `created_at` of the first row of the pre-processed thread
        get_results_dict['created_at'].append(df.created_at.iloc[0])
        # number of comment rows left in the thread after pre-processing
        get_results_dict['n_comments'].append(len(df))
        # number of comments returned by filter_toxic_comments for the threshold above
        get_results_dict['n_tox_comments'].append(len(toxic_df))

        ## Toxicity ratio of the thread
        get_results_dict['toxicity_ratio'].append(tree_metrics.get_toxicity_ratio(toxic_df, df))

        ## Conversation tree
        # building the edge list according to the algorithm presented
        vertices = list(set(df.id.tolist() + df.replied_id.tolist()))
        edge_list = tree_metrics.create_edge_list(vertices, root_node, df)
        gtree = tree_metrics.create_graph(vertices, edge_list)
        # setting toxicity values as attribute of the nodes
        # NOTE(review): this assignment is positional. `vertices` comes from a set union
        # of df.id and df.replied_id, so its length and order need not match the rows of
        # `df` (the results table shows size != n_comments). Confirm that create_graph
        # yields vertices aligned with df.toxicity_score, otherwise scores may be
        # attached to the wrong nodes.
        gtree.vs['toxicity'] = df.toxicity_score.tolist()

        # Tree metrics: size, depth, wiener index, assortativity and average toxicity distance
        get_results_dict['size'].append(len(gtree.vs['name']))
        get_results_dict['depth'].append(tree_metrics.get_depth(gtree, root_node))
        get_results_dict['wiener_index'].append(tree_metrics.get_wiener_index(T=gtree, root_node=root_node))
        get_results_dict['assortativity_tox'].append(tree_metrics.get_assortativity(gtree, numeric_prop='toxicity', directed_flag=False))
        get_results_dict['avg_tox_distance'].append(tree_metrics.mean_root_distance(gtree, toxic_df, root_node))
        get_results_dict['top_3_annotations'].append(ExtractAnnotations.get_top_annotations_from_thread(df))

100%|██████████| 1500/1500 [00:30<00:00, 49.40it/s]
100%|██████████| 7/7 [00:00<00:00, 76.95it/s]
100%|██████████| 17/17 [00:00<00:00, 57.98it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 60.15it/s]
100%|██████████| 70/70 [00:01<00:00, 60.18it/s]
100%|██████████| 1/1 [00:00<00:00, 49.18it/s]
100%|██████████| 3/3 [00:00<00:00, 62.65it/s]
100%|██████████| 94/94 [00:00<00:00, 95.99it/s] 
100%|██████████| 17/17 [00:00<00:00, 30.84it/s]
0it [00:00, ?it/s]
100%|██████████| 526/526 [00:11<00:00, 45.08it/s]
100%|██████████| 1497/1497 [00:41<00:00, 36.44it/s]
100%|██████████| 8/8 [00:00<00:00, 23.15it/s]
100%|██████████| 15/15 [00:00<00:00, 83.50it/s]
100%|██████████| 2/2 [00:00<00:00, 90.51it/s]
100%|██████████| 1919/1919 [00:40<00:00, 47.29it/s]
100%|██████████| 106/106 [00:03<00:00, 32.41it/s]
100%|██████████| 252/252 [00:04<00:00, 52.73it/s]
100%|██████████| 782/782 [00:08<00:00, 87.59it/s] 
100%|██████████| 785/785 [00:51<00:00, 15.27it/s]


In [14]:
# Turn the per-column lists into a table with one row per thread, ordered by the
# 'created_at' value recorded for each thread (original row labels are kept).
xdf = pd.DataFrame(get_results_dict).sort_values(by='created_at')
# Optional export, currently disabled:
#xdf.to_csv('/Users/alessandroquattrociocchi/Git/free-speech-analysis/plots/thesis_data/full_metrics_thesis.csv', index=False)

In [17]:
# Show the per-thread metrics table (the notebook renders a truncated preview).
xdf

Unnamed: 0,tweet_id,outlet_name,outlet_score,outlet_label,outlet_flag,unique_users,created_at,n_comments,n_tox_comments,toxicity_ratio,size,depth,wiener_index,assortativity_tox,avg_tox_distance,top_3_annotations
23609,1212168201355907072,BreitbartNews,49.5,mixed,N,75,2020-01-01 00:27:03+00:00,74,8,0.108108,77,2.0,2.049897,-0.259439,0.500000,"[Bloomberg Organization , White House Place , ..."
23261,1212181756679462912,nytimes,100.0,very_high,T,79,2020-01-01 01:21:32+00:00,120,6,0.050000,134,13.0,4.129054,0.066161,0.115385,"[Trump Person , Iraq Place , US Place ]"
10886,1212185171186929665,EpochTimes,49.5,mixed,N,13,2020-01-01 01:35:36+00:00,12,1,0.083333,13,1.0,1.846154,-0.233671,1.000000,"[New York Times Organization , democrats Organ..."
9531,1212186923244171266,chicksonright,69.5,high,T,22,2020-01-01 01:43:41+00:00,22,2,0.090909,24,2.0,1.992754,-0.287864,0.500000,"[US Place , Hollyweirdo Person , Liz Person ]"
12977,1212197271502303234,EpochTimes,49.5,mixed,N,71,2020-01-01 02:23:18+00:00,64,4,0.062500,74,2.0,2.262125,-0.242626,0.500000,"[Joe Person , joe Person , Hunter Person ]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24516,1512913030715260931,BreitbartNews,49.5,mixed,N,20,2022-04-09 22:00:00+00:00,21,1,0.047619,23,3.0,2.300395,0.266579,1.000000,"[United States Place , Kremlin Person , Americ..."
9924,1512913323586637837,WayneDupreeShow,7.5,very_low,N,8,2022-04-09 22:20:22+00:00,7,0,0.000000,8,1.0,1.750000,-0.037931,,[Wayne Person ]
24410,1512932408017965072,BreitbartNews,49.5,mixed,N,27,2022-04-09 23:16:40+00:00,28,1,0.035714,29,2.0,2.182266,-0.244981,0.500000,"[Bush Person , Hinckley Person , Jodie Foster ..."
9870,1512942095828402176,WayneDupreeShow,7.5,very_low,N,133,2022-04-09 23:55:19+00:00,135,5,0.037037,137,4.0,2.234435,-0.049188,0.250000,"[Trump Person , Oz Person , Dr Oz Person ]"


# Quali sono stati i topic che maggiormente hanno favorito il linguaggio d'odio? (Which topics most fostered hate speech?)

In [15]:
# Restrict the per-thread table to the 'very_low' score class.
# NOTE(review): presumably this filters on the 'outlet_label' column -- confirm in
# ExtractAnnotations.divide_dataframe_by_score.
very_low_df = ExtractAnnotations.divide_dataframe_by_score(
    xdf,
    score_flag='very_low',
)

# Top annotations per year for that subset.
# NOTE(review): the meaning of `threshold=0.1` is defined inside
# get_annotations_per_year and is not visible here -- confirm before tuning.
top_ann_very_low = ExtractAnnotations.get_annotations_per_year(
    very_low_df,
    threshold=0.1,
    top_n_annotations=10,
)

In [16]:
# Show the result: a dict keyed by year string, each value an Index of annotation labels.
top_ann_very_low

{'2020': Index(['Trump Person ', 'America Place ', 'US Place ', 'Russia Place ',
        'China Place ', 'Biden Person ', 'God Other ', 'trump Person ',
        'Democrats Organization ', 'Federalist Person '],
       dtype='object'),
 '2021': Index(['Trump Person ', 'America Place ', 'Biden Person ', 'US Place ',
        'GOP Organization ', 'Democrats Organization ', 'Americans Person ',
        'God Other ', 'Republicans Organization ', 'Federalist Person '],
       dtype='object'),
 '2022': Index(['Russia Place ', 'Ukraine Place ', 'Putin Person ',
        'Sputnik Organization ', 'US Place ', 'Twitter Product ',
        'Moscow Place ', 'BLM Organization ', 'China Place ', 'PUTIN Person '],
       dtype='object')}