In [1]:
import os
import json
import time
import numpy as np
import pandas as pd

import pyarrow
import pyarrow.parquet as pq

from tqdm.notebook import tqdm

from googleapiclient import discovery
from googleapiclient.errors import HttpError

from data_io import *
from prepare_data import *

In [2]:
# Directory holding the parquet files read (and, in the disabled cells, written) by this notebook.
data_dir = os.path.join('..', 'data', 'pan21-author-profiling-training-2021-03-14')
# Perspective API attribute names requested for every scored comment; the order of this
# list is the order of the values in each returned score vector.
attribute_list = ['SEVERE_TOXICITY', 'TOXICITY', 'IDENTITY_ATTACK', 'INSULT', 
                  'PROFANITY', 'THREAT', 'SEXUALLY_EXPLICIT', 'FLIRTATION']

In [3]:
# Load the 'en' tweet table (columns: author_id, tweet, label) and show its shape.
en_df = pq.read_table(os.path.join(data_dir, 'en_df.parquet')).to_pandas()
en_df.shape

(40000, 3)

In [4]:
# Preview the first rows of the 'en' table.
en_df.head()

Unnamed: 0,author_id,tweet,label
0,043e2766cc6d22ae4e447ca5f2885a2a,Fuck New York #URL#,1
1,043e2766cc6d22ae4e447ca5f2885a2a,#USER# #USER# I think I'm in love,1
2,043e2766cc6d22ae4e447ca5f2885a2a,Trump is awesome #URL#,1
3,043e2766cc6d22ae4e447ca5f2885a2a,#USER# You have the greatest tweets sweetheart...,1
4,043e2766cc6d22ae4e447ca5f2885a2a,"#USER# It's free pizza Hun, just free food",1


In [5]:
# Flat list of tweet texts, in table order.
tweets = en_df['tweet'].tolist()
# filter_tweets comes from the project helpers (star import); its exact cleaning rules
# are not visible here -- TODO confirm what it removes besides honouring lowercase=False.
tweets = filter_tweets(tweets, lowercase=False)

# prepare_authlist is a project helper; presumably it reduces the per-tweet author ids to
# one entry per author (200 entries for 40000 rows in the saved output) -- verify ordering.
authors = prepare_authlist(en_df['author_id'].tolist())
len(tweets), len(authors), len(attribute_list)

(40000, 200, 8)

In [6]:
# The Perspective API key is read from the PERSPECTIVE_API_KEY environment variable so
# that it never appears in the notebook source or in a saved cell output.
# NOTE: a key used to be hardcoded (and echoed) in this cell; treat that key as leaked,
# revoke it in the Google Cloud console and issue a new one.
API_KEY = os.environ.get('PERSPECTIVE_API_KEY', '')
if not API_KEY:
    # Do not raise: the cells that only read cached parquet scores work without a key.
    print('PERSPECTIVE_API_KEY is not set; live Perspective API requests will fail.')

In [7]:
def get_perspective_client():
    """Build a Google API client for the `commentanalyzer` service, version `v1alpha1`.

    The client is authenticated with the module-level ``API_KEY``. Static discovery is
    disabled, so the service description is taken from the discovery URL given below.

    Returns:
        The resource object returned by ``googleapiclient.discovery.build``.
    """
    discovery_url = 'https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1'
    return discovery.build(
        'commentanalyzer',
        'v1alpha1',
        developerKey=API_KEY,
        discoveryServiceUrl=discovery_url,
        static_discovery=False,
    )

# Notebook-level client, reused by the demo calls in the following cells.
client = get_perspective_client()

In [8]:
def prepare_attribute_labels_for_spanish(attribute_list):
    """Translate attribute names into the variants requested for Spanish text.

    Args:
        attribute_list: iterable of attribute names (e.g. ``'TOXICITY'``).

    Returns:
        A new list, in input order, where ``SEXUALLY_EXPLICIT`` and ``FLIRTATION`` are
        dropped, ``IDENTITY_ATTACK``/``INSULT``/``PROFANITY``/``THREAT`` get the
        ``_EXPERIMENTAL`` suffix, and every other name is passed through unchanged.
    """
    unavailable = ('SEXUALLY_EXPLICIT', 'FLIRTATION')
    experimental = ('IDENTITY_ATTACK', 'INSULT', 'PROFANITY', 'THREAT')

    return [
        name + '_EXPERIMENTAL' if name in experimental else name
        for name in attribute_list
        if name not in unavailable
    ]

# Show the attribute names that will be requested for Spanish text.
prepare_attribute_labels_for_spanish(attribute_list)

['SEVERE_TOXICITY',
 'TOXICITY',
 'IDENTITY_ATTACK_EXPERIMENTAL',
 'INSULT_EXPERIMENTAL',
 'PROFANITY_EXPERIMENTAL',
 'THREAT_EXPERIMENTAL']

In [9]:
def get_perspective_scores_for_comment(client, text, attributes, language):
    """Score one piece of text with the Perspective API.

    Args:
        client: object exposing ``comments().analyze(body=...).execute()`` (see
            ``get_perspective_client``).
        text: the comment text to analyse, sent as ``PLAIN_TEXT``.
        attributes: attribute names to request; for ``language == 'es'`` they are first
            mapped through ``prepare_attribute_labels_for_spanish``.
        language: language code placed in the request's ``languages`` field.

    Returns:
        ``np.ndarray`` of summary-score values, one per requested attribute, in the
        order of the (possibly remapped) attribute list. For ``'es'`` the array can be
        shorter than ``attributes`` because some names are dropped by the remapping.
    """
    if language == 'es':
        attributes = prepare_attribute_labels_for_spanish(attributes)

    # Every attribute is requested the same way: a probability with no threshold.
    requested_attributes = {
        name: {'scoreType': 'PROBABILITY', 'scoreThreshold': 0.0}
        for name in attributes
    }

    payload = {
        'comment': {'text': text, 'type': 'PLAIN_TEXT'},
        'requestedAttributes': requested_attributes,
        'languages': [language],
        'doNotStore': True,
    }
    response = client.comments().analyze(body=payload).execute()

    return np.array([
        response['attributeScores'][name]['summaryScore']['value']
        for name in attributes
    ])

# Smoke test: score a single English sentence (performs one live API request).
comment = 'friendly greetings from python, now fuck you lol'
scores = get_perspective_scores_for_comment(client=client, text=comment, attributes=attribute_list, language='en')
scores

array([0.8064043 , 0.9480856 , 0.26663992, 0.734651  , 0.95862216,
       0.4150672 , 0.7893632 , 0.61857504])

In [10]:
def combine_and_get_perspective_scores(client, tweets, steps, attribute_list, lang, halt_time=1):
    """Score `tweets` in `steps` consecutive chunks, one API request per chunk.

    The tweets are split into `steps` contiguous chunks. Each chunk is joined with single
    spaces into one comment and scored with ``get_perspective_scores_for_comment``.

    When ``len(tweets)`` is a multiple of `steps` every chunk has ``len(tweets) // steps``
    tweets. Otherwise the first ``len(tweets) % steps`` chunks get one extra tweet, so that
    no trailing tweet is silently left out of the scoring.

    Args:
        client: Perspective API client.
        tweets: sequence of tweet strings (must support ``len`` and slicing).
        steps: number of chunks / API requests; must be positive and not exceed
            ``len(tweets)``.
        attribute_list: attribute names to request.
        lang: language code forwarded to the API.
        halt_time: seconds to sleep after every request (crude rate limiting; the pause
            after the last request also spaces out back-to-back calls of this function).

    Returns:
        list of `steps` score arrays, in chunk order.

    Raises:
        ValueError: if `steps` is not positive or there are fewer tweets than steps
            (which would otherwise send empty comments to the API).
    """
    n = len(tweets)
    if steps <= 0:
        raise ValueError('steps must be a positive integer, got %r' % (steps,))
    if n < steps:
        raise ValueError('cannot split %d tweets into %d non-empty chunks' % (n, steps))

    base_size, extra = divmod(n, steps)
    scores = list()

    start = 0
    for s in range(steps):
        # The first `extra` chunks absorb the remainder, one tweet each.
        stop = start + base_size + (1 if s < extra else 0)
        combined_tweet = ' '.join(tweets[start:stop])
        start = stop

        step_scores = get_perspective_scores_for_comment(client, combined_tweet, attribute_list, lang)
        time.sleep(halt_time)

        scores.append(step_scores)

    return scores

# Smoke test: the first 200 tweets scored as two combined comments (two live requests).
# The tuple unpacking relies on steps=2 producing exactly two score arrays.
scores1, scores2 = combine_and_get_perspective_scores(client, tweets[:200], steps=2,
                                                      attribute_list=attribute_list, lang='en')
scores1, scores2

(array([0.8486256 , 0.88824683, 0.8600546 , 0.895227  , 0.9265979 ,
        0.7581372 , 0.88350433, 0.7423529 ]),
 array([0.88629097, 0.93708044, 0.7437222 , 0.942489  , 0.9541912 ,
        0.708491  , 0.8474576 , 0.650735  ]))

In [39]:
def ordinal(n):
    """Return the integer `n` followed by its English ordinal suffix.

    Examples: 1 -> '1st', 2 -> '2nd', 3 -> '3rd', 4 -> '4th', 11 -> '11th', 21 -> '21st'.
    These strings are used as column names for the per-step score tables.
    """
    last_digit = n % 10
    # 11, 12 and 13 (tens digit 1) always take 'th'; so does any last digit of 0 or 4-9.
    if n // 10 % 10 != 1 and last_digit < 4:
        suffix = ('th', 'st', 'nd', 'rd')[last_digit]
    else:
        suffix = 'th'
    return "%d%s" % (n, suffix)


ordinal(4)

'4th'

In [46]:
def collect_perspective_scores(authors, tweets, attribute_list, lang, steps=20, halt_time=1,
                               tweets_per_author=200):
    """Collect per-author Perspective scores into a DataFrame.

    `tweets` is expected to hold the authors' tweets back to back: the i-th author owns
    ``tweets[i * tweets_per_author : (i + 1) * tweets_per_author]``. Each author's tweets
    are scored in `steps` chunks via ``combine_and_get_perspective_scores``.

    Error handling for ``HttpError`` raised while scoring an author:
      * message contains 'Quota exceeded': wait 2 minutes, then retry once with twice the
        pause between requests;
      * message contains 'Comment text was too many bytes.': retry with twice as many
        (half-sized) chunks and merge each consecutive pair with an element-wise maximum,
        so that `steps` score vectors are still produced;
      * anything else is re-raised. (It used to be swallowed, which made the code below
        either fail with a NameError or silently store the previous author's scores.)

    Args:
        authors: sequence of author ids; becomes the index of the result.
        tweets: flat sequence of tweet strings, grouped by author as described above.
        attribute_list: attribute names to request.
        lang: language code forwarded to the API.
        steps: number of chunks (and result columns) per author.
        halt_time: seconds to sleep after each API request.
        tweets_per_author: number of consecutive tweets belonging to each author.

    Returns:
        DataFrame indexed by author with columns '1st', '2nd', ... (one per step); each
        cell holds the score array for that chunk.
    """
    n = len(authors)

    df_columns = [ordinal(i+1) for i in range(steps)]
    scores_df = pd.DataFrame(index=authors, columns=df_columns)

    client = get_perspective_client()

    for i in tqdm(range(n)):
        start = i * tweets_per_author
        author_tweets = tweets[start : start + tweets_per_author]
        try:
            step_scores = combine_and_get_perspective_scores(
                client, author_tweets, steps=steps,
                attribute_list=attribute_list, lang=lang, halt_time=halt_time)
        except HttpError as err:
            message = str(err)
            if 'Quota exceeded' in message:
                print('Quota exceeded, halting for 2 min ....')
                time.sleep(120)
                print('Resuming service ....')

                step_scores = combine_and_get_perspective_scores(
                    client, author_tweets, steps=steps,
                    attribute_list=attribute_list, lang=lang, halt_time=2*halt_time)

            elif 'Comment text was too many bytes.' in message:
                half_step_scores = combine_and_get_perspective_scores(
                    client, author_tweets,
                    steps=2*steps, attribute_list=attribute_list, lang=lang, halt_time=halt_time)

                # Chunks 2k and 2k+1 together cover what chunk k would have covered.
                step_scores = [
                    np.maximum(half_step_scores[k], half_step_scores[k+1])
                    for k in range(0, 2*steps, 2)
                ]

            else:
                # Unknown API failure: never continue with undefined or stale scores.
                raise

        for j in range(steps):
            scores_df.loc[authors[i], df_columns[j]] = step_scores[j]

    return scores_df

# Smoke test: two authors x 200 tweets, 20 chunks each (40 live requests).
# NOTE(review): the saved output below lists author ids that appear in es_df, although
# lang='en' is passed -- this cell was apparently last run after `tweets`/`authors` had
# been rebound to the Spanish data. Re-run on a fresh kernel before trusting the output.
scores_df = collect_perspective_scores(authors[:2], tweets[:400], attribute_list=attribute_list, lang='en')
scores_df

  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,1st,2nd,3rd,4th,5th,6th,7th,8th,9th,10th,11th,12th,13th,14th,15th,16th,17th,18th,19th,20th
0035a3060d075506f5b9b978a910aa1f,"[0.7720432, 0.7758766, 0.62421805, 0.734651, 0...","[0.67047954, 0.70770013, 0.4832601, 0.64531505...","[0.28759164, 0.38455915, 0.40176675, 0.3479256...","[0.30843088, 0.3446915, 0.39856848, 0.33818233...","[0.36292765, 0.45601103, 0.4101707, 0.4742807,...","[0.38778928, 0.40324634, 0.34681928, 0.3101104...","[0.5056421, 0.54957163, 0.58523935, 0.5676359,...","[0.5116584, 0.5380188, 0.6082535, 0.5392099, 0...","[0.6654899, 0.6933736, 0.5645213, 0.64188564, ...","[0.3690786, 0.38028604, 0.38957348, 0.29204777...","[0.5956171, 0.7188183, 0.3710072, 0.68785954, ...","[0.6914786, 0.72622824, 0.6866913, 0.7160134, ...","[0.29362434, 0.38333434, 0.46419477, 0.3864112...","[0.68487024, 0.7417715, 0.60678005, 0.6045247,...","[0.31678766, 0.4031544, 0.4375782, 0.39012516,...","[0.5039542, 0.5314379, 0.42656603, 0.50431955,...","[0.65954137, 0.67190486, 0.6281916, 0.660731, ...","[0.58454615, 0.6389125, 0.813082, 0.6511785, 0...","[0.5424295, 0.6364068, 0.4250893, 0.6567349, 0...","[0.732805, 0.74516535, 0.8008965, 0.7831149, 0..."
00c1418fce0e39063eee22ec3e5179ec,"[0.26453677, 0.27825636, 0.3929555, 0.26884308...","[0.32534984, 0.3700093, 0.40937057, 0.34748325...","[0.55158395, 0.62858665, 0.6207266, 0.6869784,...","[0.4183646, 0.51657027, 0.42527074, 0.370317, ...","[0.6470379, 0.71999276, 0.688637, 0.64821833, ...","[0.35080814, 0.43650666, 0.4026303, 0.31536314...","[0.22991377, 0.28986102, 0.33109003, 0.2951461...","[0.3062412, 0.357817, 0.3158862, 0.30127966, 0...","[0.62785465, 0.7157248, 0.42644167, 0.57936835...","[0.5202308, 0.55547756, 0.58523935, 0.48292056...","[0.45998403, 0.4725994, 0.51396626, 0.48341003...","[0.38168204, 0.4307239, 0.4146178, 0.30556637,...","[0.15962079, 0.19198523, 0.33953568, 0.1996326...","[0.40388632, 0.43018353, 0.4431528, 0.4211461,...","[0.40692574, 0.43191385, 0.57656044, 0.4354249...","[0.5546326, 0.6283048, 0.72697806, 0.6944822, ...","[0.2934576, 0.32661867, 0.35959294, 0.2928756,...","[0.30914468, 0.3347669, 0.321175, 0.28163093, ...","[0.37003204, 0.4031903, 0.35716957, 0.3601305,...","[0.51240265, 0.5598796, 0.47973466, 0.4548482,..."


In [13]:
# en_perspective_scores_1 = collect_perspective_scores(authors[0:50], tweets[0*200:50*200], attribute_list, 
#                                                      lang='en', halt_time=1)
# en_perspective_scores_1.shape

  0%|          | 0/50 [00:00<?, ?it/s]

(50, 20)

In [15]:
# en_perspective_scores_2 = collect_perspective_scores(authors[50:100], tweets[50*200:100*200], attribute_list, 
#                                                      lang='en', halt_time=0.7)
# en_perspective_scores_2.shape

  0%|          | 0/50 [00:00<?, ?it/s]

(50, 20)

In [17]:
# en_perspective_scores_3 = collect_perspective_scores(authors[100:150], tweets[100*200:150*200], attribute_list, 
#                                                      lang='en', halt_time=0.7)
# en_perspective_scores_3.shape

  0%|          | 0/50 [00:00<?, ?it/s]

(50, 20)

In [18]:
# en_perspective_scores_4 = collect_perspective_scores(authors[150:200], tweets[150*200:200*200], attribute_list, 
#                                                      lang='en', halt_time=0.7)
# en_perspective_scores_4.shape

  0%|          | 0/50 [00:00<?, ?it/s]

(50, 20)

In [30]:
# en_perspective_scores = pd.concat(
#     (en_perspective_scores_1, en_perspective_scores_2, en_perspective_scores_3, en_perspective_scores_4))
# en_perspective_scores.shape

(200, 20)

In [31]:
# en_perspective_scores.head()

Unnamed: 0,1st,2nd,3rd,4th,5th,6th,7th,8th,9th,10th,11th,12th,13th,14th,15th,16th,17th,18th,19th,20th
043e2766cc6d22ae4e447ca5f2885a2a,"[0.7638062, 0.8606264, 0.3912061, 0.7970631, 0...","[0.87427866, 0.93232936, 0.49263316, 0.9383387...","[0.8466297, 0.8935496, 0.8921792, 0.87720364, ...","[0.8490961, 0.94992846, 0.58523935, 0.95808834...","[0.6603118, 0.7329791, 0.56534106, 0.734651, 0...","[0.69607526, 0.77590674, 0.8008965, 0.8213792,...","[0.84895986, 0.9211528, 0.6749164, 0.8956145, ...","[0.7580121, 0.8428421, 0.38629955, 0.68785954,...","[0.8020613, 0.8785311, 0.7180557, 0.8453147, 0...","[0.69478977, 0.8785311, 0.42012486, 0.9091764,...","[0.8416842, 0.94992846, 0.30924588, 0.9626704,...","[0.8416842, 0.9131554, 0.7262056, 0.895227, 0....","[0.77522963, 0.8606264, 0.4136869, 0.895227, 0...","[0.8360954, 0.8912007, 0.58418703, 0.8457096, ...","[0.7570049, 0.8793836, 0.53978914, 0.9091764, ...","[0.8382448, 0.8901554, 0.85807014, 0.9091764, ...","[0.76381993, 0.90173465, 0.62590176, 0.9338795...","[0.8416842, 0.9178798, 0.58523935, 0.895227, 0...","[0.8416842, 0.8785311, 0.8921792, 0.88075686, ...","[0.88724667, 0.94992846, 0.60822374, 0.942489,..."
06893abba0bb8f94fed7562350233ed7,"[0.13567817, 0.19501047, 0.2923241, 0.12882288...","[0.08397212, 0.119783536, 0.21523024, 0.134365...","[0.24582219, 0.36352384, 0.6271969, 0.38566154...","[0.19823803, 0.25707558, 0.4018703, 0.2510756,...","[0.68433094, 0.69542736, 0.8264823, 0.6356477,...","[0.27819535, 0.37564182, 0.5722838, 0.3669941,...","[0.06258979, 0.15815501, 0.1438973, 0.19868493...","[0.24409193, 0.31089434, 0.4521896, 0.29815647...","[0.65782034, 0.69542736, 0.77115285, 0.7019946...","[0.13070925, 0.20278536, 0.39151874, 0.1570818...","[0.24460183, 0.32900724, 0.54866326, 0.3344697...","[0.46209514, 0.5634692, 0.76403475, 0.6356477,...","[0.27004248, 0.4032475, 0.53238416, 0.36286297...","[0.34914795, 0.4456197, 0.63262814, 0.38069126...","[0.47163448, 0.5725183, 0.78480315, 0.5027966,...","[0.37159792, 0.46397626, 0.6350989, 0.4292296,...","[0.24524134, 0.31089434, 0.45559555, 0.2900097...","[0.22878365, 0.31089434, 0.39374894, 0.2821979...","[0.25926167, 0.35078368, 0.5333142, 0.31020233...","[0.38449043, 0.46479547, 0.62191415, 0.4876976..."
0a3ce42bea89e2a92a28f685735e605e,"[0.7042307, 0.81055385, 0.86248994, 0.88075686...","[0.6826267, 0.7792435, 0.87621015, 0.734651, 0...","[0.30925795, 0.53490317, 0.5861173, 0.64572185...","[0.34911436, 0.49474093, 0.707714, 0.4506255, ...","[0.17076196, 0.19738834, 0.26711276, 0.1907166...","[0.13513605, 0.31089434, 0.13855198, 0.3540402...","[0.1407635, 0.29193908, 0.2994997, 0.33165616,...","[0.27304062, 0.4282019, 0.6039103, 0.42848337,...","[0.53273445, 0.61866987, 0.8008965, 0.6301576,...","[0.38520372, 0.5876557, 0.58523935, 0.660731, ...","[0.22985509, 0.4285567, 0.22347814, 0.4488336,...","[0.24753945, 0.37904406, 0.2513022, 0.41806275...","[0.099255376, 0.17428601, 0.07972292, 0.141405...","[0.42420036, 0.58199257, 0.72995174, 0.5873429...","[0.28393394, 0.40312752, 0.53845096, 0.3425190...","[0.40814704, 0.60474813, 0.6300597, 0.5837193,...","[0.44577324, 0.53490317, 0.698391, 0.518069, 0...","[0.20933463, 0.27734217, 0.22447824, 0.2368295...","[0.5233043, 0.6047101, 0.8008965, 0.63499856, ...","[0.20622422, 0.35344788, 0.57832354, 0.3092566..."
0a6700c6023c6249bcc5820e2f5ee0de,"[0.89362836, 0.93232936, 0.95459044, 0.9323444...","[0.7317537, 0.76876384, 0.8424161, 0.79041666,...","[0.9197861, 0.95651215, 0.9653199, 0.9481788, ...","[0.76388687, 0.84309494, 0.8422688, 0.87689155...","[0.76393366, 0.794729, 0.91835845, 0.8059788, ...","[0.86663723, 0.930754, 0.9419227, 0.9091764, 0...","[0.9393468, 0.9590299, 0.9660949, 0.96008027, ...","[0.8741936, 0.92079043, 0.953994, 0.92305475, ...","[0.93098795, 0.9591509, 0.95402986, 0.9481788,...","[0.87427866, 0.93233377, 0.9352267, 0.9091764,...","[0.7638499, 0.8606264, 0.92455935, 0.8496918, ...","[0.89945644, 0.9357672, 0.9547644, 0.93234444,...","[0.8590148, 0.9189394, 0.92215836, 0.90904963,...","[0.89362836, 0.9296117, 0.95631635, 0.9089759,...","[0.87427866, 0.89614266, 0.9445409, 0.90904224...","[0.8416842, 0.8870531, 0.9411386, 0.895227, 0....","[0.87427866, 0.9248923, 0.9429178, 0.91870445,...","[0.9393468, 0.94992846, 0.9651932, 0.9481788, ...","[0.9004059, 0.93242764, 0.95402986, 0.9341467,...","[0.89362836, 0.9480856, 0.9448219, 0.9253436, ..."
0d02a3f644c9313315ecc6655ccfa3b9,"[0.6568953, 0.7108109, 0.8929873, 0.6956019, 0...","[0.76379895, 0.78395367, 0.92455935, 0.7945978...","[0.764759, 0.799822, 0.92455935, 0.80613285, 0...","[0.8416842, 0.8785311, 0.95402986, 0.879425, 0...","[0.6835485, 0.78857404, 0.9040422, 0.80613285,...","[0.73218995, 0.769541, 0.9047819, 0.80595934, ...","[0.76381856, 0.80310655, 0.9358519, 0.74853384...","[0.70340794, 0.74443483, 0.8930021, 0.734651, ...","[0.5977498, 0.69542736, 0.8632592, 0.7110012, ...","[0.6814334, 0.74713576, 0.9047819, 0.76738805,...","[0.61279494, 0.65276694, 0.8624928, 0.6668961,...","[0.63748103, 0.69542736, 0.8631113, 0.70114475...","[0.7722939, 0.8544898, 0.93795425, 0.88075686,...","[0.8486256, 0.90182865, 0.9500094, 0.9235945, ...","[0.80791426, 0.8606264, 0.92294437, 0.895227, ...","[0.87484336, 0.91536015, 0.95402986, 0.9091764...","[0.6789188, 0.7467408, 0.9047819, 0.734651, 0....","[0.8011641, 0.8294156, 0.9352267, 0.8760719, 0...","[0.89362836, 0.93232936, 0.9430493, 0.93234444...","[0.8486256, 0.931167, 0.96805567, 0.93234444, ..."


In [32]:
# en_table = pyarrow.Table.from_pandas(en_perspective_scores)
# pq.write_table(en_table, os.path.join(data_dir, 'en_perspective_scores_20.parquet'))

In [20]:
# Load the 'es' tweet table (columns: author_id, tweet, label) and show its shape.
es_df = pq.read_table(os.path.join(data_dir, 'es_df.parquet')).to_pandas()
es_df.shape

(40000, 3)

In [21]:
# Preview the first rows of the 'es' table.
es_df.head()

Unnamed: 0,author_id,tweet,label
0,0035a3060d075506f5b9b978a910aa1f,#USER# pasta con bichos de agua,0
1,0035a3060d075506f5b9b978a910aa1f,De verdad puto lol de mierda qué asco de juego...,0
2,0035a3060d075506f5b9b978a910aa1f,RT #USER#: me hice una pcr y ya tengo los resu...,0
3,0035a3060d075506f5b9b978a910aa1f,"Y un lomo queso de baguette entera, tranqui #URL#",0
4,0035a3060d075506f5b9b978a910aa1f,Me cambio de curro y me llegan 3 ofertas direc...,0


In [22]:
# Rebinds the notebook-level `tweets` and `authors` from the English to the Spanish data;
# every later cell that reads these names now sees the Spanish values.
# NOTE(review): unlike the English cell, filter_tweets is not applied here -- confirm
# that this is intentional.
tweets = es_df['tweet'].tolist()
authors = prepare_authlist(es_df['author_id'].tolist())
len(tweets), len(authors), len(attribute_list)

(40000, 200, 8)

In [24]:
# es_perspective_scores_1 = collect_perspective_scores(authors[0:50], tweets[0*200:50*200], attribute_list, 
#                                                      lang='es', halt_time=0.7)
# es_perspective_scores_1.shape

  0%|          | 0/50 [00:00<?, ?it/s]

(50, 20)

In [25]:
# es_perspective_scores_2 = collect_perspective_scores(authors[50:100], tweets[50*200:100*200], attribute_list, 
#                                                      lang='es', halt_time=0.7)
# es_perspective_scores_2.shape

  0%|          | 0/50 [00:00<?, ?it/s]

(50, 20)

In [26]:
# es_perspective_scores_3 = collect_perspective_scores(authors[100:150], tweets[100*200:150*200], attribute_list, 
#                                                      lang='es', halt_time=0.7)
# es_perspective_scores_3.shape

  0%|          | 0/50 [00:00<?, ?it/s]

(50, 20)

In [27]:
# es_perspective_scores_4 = collect_perspective_scores(authors[150:200], tweets[150*200:200*200], attribute_list, 
#                                                      lang='es', halt_time=0.7)
# es_perspective_scores_4.shape

  0%|          | 0/50 [00:00<?, ?it/s]

(50, 20)

In [33]:
# es_perspective_scores = pd.concat(
#     (es_perspective_scores_1, es_perspective_scores_2, es_perspective_scores_3, es_perspective_scores_4))
# es_perspective_scores.shape

(200, 20)

In [34]:
# es_perspective_scores.head()

Unnamed: 0,1st,2nd,3rd,4th,5th,6th,7th,8th,9th,10th,11th,12th,13th,14th,15th,16th,17th,18th,19th,20th
0035a3060d075506f5b9b978a910aa1f,"[0.84578586, 0.8103466, 0.48857453, 0.7406555,...","[0.73219585, 0.7216203, 0.38971007, 0.64922506...","[0.64140385, 0.6148493, 0.64994794, 0.5439803,...","[0.13112165, 0.090696946, 0.20398536, 0.087929...","[0.32466528, 0.3100549, 0.31929135, 0.21316645...","[0.70190024, 0.5772601, 0.6801018, 0.52926695,...","[0.84624016, 0.8134399, 0.6672078, 0.8369582, ...","[0.887, 0.87062573, 0.8572327, 0.8838457, 0.94...","[0.8142734, 0.7832541, 0.56886077, 0.77600086,...","[0.5185071, 0.4904714, 0.48279938, 0.38487494,...","[0.8787185, 0.87006974, 0.5591265, 0.7915732, ...","[0.8092042, 0.6988898, 0.640752, 0.66641134, 0...","[0.4419923, 0.5355338, 0.38971007, 0.43899506,...","[0.80018216, 0.76157147, 0.43312222, 0.6772958...","[0.84624016, 0.78252614, 0.76722693, 0.7404679...","[0.4202294, 0.5126151, 0.2819738, 0.3814522, 0...","[0.887, 0.89834154, 0.6150715, 0.8994134, 0.97...","[0.46044177, 0.41501662, 0.5943694, 0.43641984...","[0.9393093, 0.92420757, 0.61728394, 0.9292263,...","[0.93806, 0.9240819, 0.8063158, 0.91980004, 0...."
00c1418fce0e39063eee22ec3e5179ec,"[0.28196722, 0.25016078, 0.4760394, 0.2613653,...","[0.28196722, 0.20308973, 0.25975698, 0.1117435...","[0.73219585, 0.71948063, 0.38971007, 0.6959486...","[0.13358954, 0.10921866, 0.2887967, 0.06852475...","[0.6572104, 0.54599214, 0.66609454, 0.5626418,...","[0.20246385, 0.13562119, 0.2753793, 0.10929312...","[0.53063244, 0.40986502, 0.75352806, 0.3484180...","[0.53063244, 0.5514374, 0.6672078, 0.5358881, ...","[0.45647702, 0.44318923, 0.21789187, 0.2410147...","[0.50416183, 0.46774587, 0.54806954, 0.3433741...","[0.34846792, 0.2619559, 0.48586312, 0.2756512,...","[0.20246385, 0.13718398, 0.48857453, 0.1052739...","[0.16364901, 0.10050835, 0.31929135, 0.0831883...","[0.46371466, 0.31458744, 0.37718487, 0.2284214...","[0.20583463, 0.14273375, 0.4783197, 0.11048249...","[0.68065315, 0.5370227, 0.8224907, 0.4761374, ...","[0.13112165, 0.07091247, 0.084612966, 0.057424...","[0.07481416, 0.047224812, 0.08774634, 0.035190...","[0.06681233, 0.046990495, 0.13039842, 0.031016...","[0.70190024, 0.69415885, 0.38971007, 0.5626418..."
017dda89bbe1c2eac6fa75596b6a80f9,"[0.8444065, 0.77061665, 0.8063158, 0.78330505,...","[0.887, 0.85981286, 0.8073804, 0.85902256, 0.8...","[0.80607814, 0.8133253, 0.5983437, 0.82316566,...","[0.8826929, 0.8578459, 0.8572327, 0.8716805, 0...","[0.84624016, 0.7790317, 0.8063158, 0.76788914,...","[0.11423506, 0.25023085, 0.38971007, 0.3048980...","[0.8098634, 0.77363473, 0.5378186, 0.76788914,...","[0.624502, 0.534077, 0.5356536, 0.5220468, 0.7...","[0.4202294, 0.5057463, 0.6395722, 0.5358881, 0...","[0.6530991, 0.52800053, 0.5354809, 0.5246005, ...","[0.3039941, 0.42225134, 0.5983437, 0.46067, 0....","[0.354325, 0.5816195, 0.38971007, 0.6622303, 0...","[0.56127584, 0.64408785, 0.5597331, 0.6848057,...","[0.42689446, 0.50547695, 0.3200384, 0.51967835...","[0.3516836, 0.26814008, 0.5541417, 0.23660722,...","[0.6572104, 0.6818591, 0.38971007, 0.75175196,...","[0.13112165, 0.06111935, 0.20093524, 0.0454105...","[0.624502, 0.5041009, 0.5556876, 0.4482993, 0....","[0.8820656, 0.87062573, 0.6375556, 0.89928204,...","[0.46526358, 0.64408785, 0.3440822, 0.7051636,..."
05637e07406ea8532afc02fb024f9301,"[0.28817728, 0.19968636, 0.38971007, 0.1927046...","[0.6572104, 0.29722732, 0.5796163, 0.29344314,...","[0.20246385, 0.19189163, 0.5401675, 0.1834041,...","[0.34500942, 0.4081173, 0.307916, 0.2756512, 0...","[0.8419014, 0.7409685, 0.29711923, 0.7069018, ...","[0.32324103, 0.31844237, 0.18436617, 0.2344732...","[0.43013704, 0.5209511, 0.38971007, 0.44205344...","[0.6348179, 0.6203095, 0.25559628, 0.5442512, ...","[0.16349857, 0.20138511, 0.15339807, 0.1722846...","[0.20246385, 0.23043641, 0.09896417, 0.1108971...","[0.07210682, 0.048602644, 0.1927588, 0.0352256...","[0.20246385, 0.102434434, 0.13607745, 0.085627...","[0.12071859, 0.13775103, 0.15494664, 0.1155143...","[0.52383083, 0.45568064, 0.48857453, 0.2284214...","[0.07210682, 0.059126627, 0.1724801, 0.0456662...","[0.3039941, 0.3100549, 0.26330706, 0.2185763, ...","[0.4202294, 0.30261317, 0.36216995, 0.2284214,...","[0.4154469, 0.350558, 0.31929135, 0.25669858, ...","[0.4202294, 0.350558, 0.48857453, 0.2756512, 0...","[0.4207099, 0.40873787, 0.38971007, 0.35951155..."
0619b86cbc76d49467e11abc75e87577,"[0.16441442, 0.1929216, 0.20201543, 0.14397733...","[0.32985553, 0.34846967, 0.2819738, 0.25138804...","[0.84624016, 0.83393896, 0.5372374, 0.8630924,...","[0.624502, 0.53138936, 0.34467202, 0.38487494,...","[0.31522697, 0.350558, 0.3132468, 0.24387756, ...","[0.3514414, 0.39621878, 0.38971007, 0.34841806...","[0.8797368, 0.68591446, 0.64651525, 0.5626418,...","[0.73219585, 0.5849759, 0.8224907, 0.5400516, ...","[0.5221179, 0.48083884, 0.38971007, 0.43899506...","[0.16393042, 0.102434434, 0.27705324, 0.083878...","[0.35206613, 0.38528833, 0.47170648, 0.3272809...","[0.7291932, 0.71878284, 0.38971007, 0.64146227...","[0.16141844, 0.10180156, 0.19374429, 0.0833352...","[0.8673143, 0.84404, 0.55519557, 0.76491326, 0...","[0.4592364, 0.54369086, 0.31929135, 0.5245089,...","[0.888657, 0.86787325, 0.8224907, 0.8360052, 0...","[0.73219585, 0.6163291, 0.48857453, 0.48281586...","[0.93777776, 0.90129495, 0.9162911, 0.89209676...","[0.8029106, 0.6926508, 0.75352806, 0.63932574,...","[0.53063244, 0.62996227, 0.63761, 0.657609, 0...."


In [35]:
# es_table = pyarrow.Table.from_pandas(es_perspective_scores)
# pq.write_table(es_table, os.path.join(data_dir, 'es_perspective_scores_20.parquet'))

In [36]:
# Reload the cached English scores from parquet (written by the disabled cell above that
# saves 'en_perspective_scores_20.parquet'); no API access is needed from here on.
en_perspective_scores = pq.read_table(os.path.join(data_dir, 'en_perspective_scores_20.parquet')).to_pandas()
print(en_perspective_scores.shape)
en_perspective_scores.head()

(200, 20)


Unnamed: 0,1st,2nd,3rd,4th,5th,6th,7th,8th,9th,10th,11th,12th,13th,14th,15th,16th,17th,18th,19th,20th
043e2766cc6d22ae4e447ca5f2885a2a,"[0.7638062, 0.8606264, 0.3912061, 0.7970631, 0...","[0.87427866, 0.93232936, 0.49263316, 0.9383387...","[0.8466297, 0.8935496, 0.8921792, 0.87720364, ...","[0.8490961, 0.94992846, 0.58523935, 0.95808834...","[0.6603118, 0.7329791, 0.56534106, 0.734651, 0...","[0.69607526, 0.77590674, 0.8008965, 0.8213792,...","[0.84895986, 0.9211528, 0.6749164, 0.8956145, ...","[0.7580121, 0.8428421, 0.38629955, 0.68785954,...","[0.8020613, 0.8785311, 0.7180557, 0.8453147, 0...","[0.69478977, 0.8785311, 0.42012486, 0.9091764,...","[0.8416842, 0.94992846, 0.30924588, 0.9626704,...","[0.8416842, 0.9131554, 0.7262056, 0.895227, 0....","[0.77522963, 0.8606264, 0.4136869, 0.895227, 0...","[0.8360954, 0.8912007, 0.58418703, 0.8457096, ...","[0.7570049, 0.8793836, 0.53978914, 0.9091764, ...","[0.8382448, 0.8901554, 0.85807014, 0.9091764, ...","[0.76381993, 0.90173465, 0.62590176, 0.9338795...","[0.8416842, 0.9178798, 0.58523935, 0.895227, 0...","[0.8416842, 0.8785311, 0.8921792, 0.88075686, ...","[0.88724667, 0.94992846, 0.60822374, 0.942489,..."
06893abba0bb8f94fed7562350233ed7,"[0.13567817, 0.19501047, 0.2923241, 0.12882288...","[0.08397212, 0.119783536, 0.21523024, 0.134365...","[0.24582219, 0.36352384, 0.6271969, 0.38566154...","[0.19823803, 0.25707558, 0.4018703, 0.2510756,...","[0.68433094, 0.69542736, 0.8264823, 0.6356477,...","[0.27819535, 0.37564182, 0.5722838, 0.3669941,...","[0.06258979, 0.15815501, 0.1438973, 0.19868493...","[0.24409193, 0.31089434, 0.4521896, 0.29815647...","[0.65782034, 0.69542736, 0.77115285, 0.7019946...","[0.13070925, 0.20278536, 0.39151874, 0.1570818...","[0.24460183, 0.32900724, 0.54866326, 0.3344697...","[0.46209514, 0.5634692, 0.76403475, 0.6356477,...","[0.27004248, 0.4032475, 0.53238416, 0.36286297...","[0.34914795, 0.4456197, 0.63262814, 0.38069126...","[0.47163448, 0.5725183, 0.78480315, 0.5027966,...","[0.37159792, 0.46397626, 0.6350989, 0.4292296,...","[0.24524134, 0.31089434, 0.45559555, 0.2900097...","[0.22878365, 0.31089434, 0.39374894, 0.2821979...","[0.25926167, 0.35078368, 0.5333142, 0.31020233...","[0.38449043, 0.46479547, 0.62191415, 0.4876976..."
0a3ce42bea89e2a92a28f685735e605e,"[0.7042307, 0.81055385, 0.86248994, 0.88075686...","[0.6826267, 0.7792435, 0.87621015, 0.734651, 0...","[0.30925795, 0.53490317, 0.5861173, 0.64572185...","[0.34911436, 0.49474093, 0.707714, 0.4506255, ...","[0.17076196, 0.19738834, 0.26711276, 0.1907166...","[0.13513605, 0.31089434, 0.13855198, 0.3540402...","[0.1407635, 0.29193908, 0.2994997, 0.33165616,...","[0.27304062, 0.4282019, 0.6039103, 0.42848337,...","[0.53273445, 0.61866987, 0.8008965, 0.6301576,...","[0.38520372, 0.5876557, 0.58523935, 0.660731, ...","[0.22985509, 0.4285567, 0.22347814, 0.4488336,...","[0.24753945, 0.37904406, 0.2513022, 0.41806275...","[0.099255376, 0.17428601, 0.07972292, 0.141405...","[0.42420036, 0.58199257, 0.72995174, 0.5873429...","[0.28393394, 0.40312752, 0.53845096, 0.3425190...","[0.40814704, 0.60474813, 0.6300597, 0.5837193,...","[0.44577324, 0.53490317, 0.698391, 0.518069, 0...","[0.20933463, 0.27734217, 0.22447824, 0.2368295...","[0.5233043, 0.6047101, 0.8008965, 0.63499856, ...","[0.20622422, 0.35344788, 0.57832354, 0.3092566..."
0a6700c6023c6249bcc5820e2f5ee0de,"[0.89362836, 0.93232936, 0.95459044, 0.9323444...","[0.7317537, 0.76876384, 0.8424161, 0.79041666,...","[0.9197861, 0.95651215, 0.9653199, 0.9481788, ...","[0.76388687, 0.84309494, 0.8422688, 0.87689155...","[0.76393366, 0.794729, 0.91835845, 0.8059788, ...","[0.86663723, 0.930754, 0.9419227, 0.9091764, 0...","[0.9393468, 0.9590299, 0.9660949, 0.96008027, ...","[0.8741936, 0.92079043, 0.953994, 0.92305475, ...","[0.93098795, 0.9591509, 0.95402986, 0.9481788,...","[0.87427866, 0.93233377, 0.9352267, 0.9091764,...","[0.7638499, 0.8606264, 0.92455935, 0.8496918, ...","[0.89945644, 0.9357672, 0.9547644, 0.93234444,...","[0.8590148, 0.9189394, 0.92215836, 0.90904963,...","[0.89362836, 0.9296117, 0.95631635, 0.9089759,...","[0.87427866, 0.89614266, 0.9445409, 0.90904224...","[0.8416842, 0.8870531, 0.9411386, 0.895227, 0....","[0.87427866, 0.9248923, 0.9429178, 0.91870445,...","[0.9393468, 0.94992846, 0.9651932, 0.9481788, ...","[0.9004059, 0.93242764, 0.95402986, 0.9341467,...","[0.89362836, 0.9480856, 0.9448219, 0.9253436, ..."
0d02a3f644c9313315ecc6655ccfa3b9,"[0.6568953, 0.7108109, 0.8929873, 0.6956019, 0...","[0.76379895, 0.78395367, 0.92455935, 0.7945978...","[0.764759, 0.799822, 0.92455935, 0.80613285, 0...","[0.8416842, 0.8785311, 0.95402986, 0.879425, 0...","[0.6835485, 0.78857404, 0.9040422, 0.80613285,...","[0.73218995, 0.769541, 0.9047819, 0.80595934, ...","[0.76381856, 0.80310655, 0.9358519, 0.74853384...","[0.70340794, 0.74443483, 0.8930021, 0.734651, ...","[0.5977498, 0.69542736, 0.8632592, 0.7110012, ...","[0.6814334, 0.74713576, 0.9047819, 0.76738805,...","[0.61279494, 0.65276694, 0.8624928, 0.6668961,...","[0.63748103, 0.69542736, 0.8631113, 0.70114475...","[0.7722939, 0.8544898, 0.93795425, 0.88075686,...","[0.8486256, 0.90182865, 0.9500094, 0.9235945, ...","[0.80791426, 0.8606264, 0.92294437, 0.895227, ...","[0.87484336, 0.91536015, 0.95402986, 0.9091764...","[0.6789188, 0.7467408, 0.9047819, 0.734651, 0....","[0.8011641, 0.8294156, 0.9352267, 0.8760719, 0...","[0.89362836, 0.93232936, 0.9430493, 0.93234444...","[0.8486256, 0.931167, 0.96805567, 0.93234444, ..."


In [37]:
# get_single_split is a project helper (star import); it returns two frames, used here as
# train and dev parts. How the split is chosen is not visible in this notebook -- TODO
# confirm (the saved output shows 32000 / 8000 rows).
en_train, en_dev = get_single_split(en_df, data_dir, lang='en')
print(en_train.shape, en_dev.shape)
en_train.head()

(32000, 3) (8000, 3)


Unnamed: 0,author_id,tweet,label
200,06893abba0bb8f94fed7562350233ed7,"Romanian graftbuster’s firing violated rights,...",0
201,06893abba0bb8f94fed7562350233ed7,Russian ventilators sent to U.S. made by firm ...,0
202,06893abba0bb8f94fed7562350233ed7,Hezbollah prevented ISIS from reaching Europe:...,0
203,06893abba0bb8f94fed7562350233ed7,Epidemiologist Dr Knut Wittkowski: ‘Lockdown H...,0
204,06893abba0bb8f94fed7562350233ed7,China refuses to let WHO investigate truth beh...,0


In [45]:
def extract_perspective_scores_for_authors(tweet_df, data_dir, lang, steps):
    """Build one flat score vector per author from the cached score table.

    Reads ``<lang>_perspective_scores_<steps>.parquet`` from `data_dir` and, for every
    author obtained from ``tweet_df['author_id']`` via ``prepare_authlist``, concatenates
    the arrays stored in columns '1st' ... ordinal(steps) in that order.

    Args:
        tweet_df: DataFrame with an ``author_id`` column.
        data_dir: directory containing the cached parquet file.
        lang: language prefix of the file name.
        steps: number of step columns stored in the file (also part of the file name).

    Returns:
        ``np.ndarray`` with one row per author.
    """
    author_ids = prepare_authlist(tweet_df['author_id'].tolist())

    parquet_name = lang + '_perspective_scores_' + str(steps) + '.parquet'
    score_df = pq.read_table(os.path.join(data_dir, parquet_name)).to_pandas()

    # The first column is always read; the remaining ones follow in step order.
    column_names = [ordinal(1)] + [ordinal(k + 1) for k in range(1, steps)]

    per_author = []
    for author in author_ids:
        pieces = [score_df.loc[author, column] for column in column_names]
        if len(pieces) == 1:
            per_author.append(pieces[0])
        else:
            per_author.append(np.concatenate(pieces))

    return np.array(per_author)

# Flatten the cached English scores: 20 steps x 8 attributes = 160 values per author.
scores = extract_perspective_scores_for_authors(en_df, data_dir, lang='en', steps=20)
scores.shape

(200, 160)