# Preliminaries

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn.metrics import classification_report
import scipy.special

# Processed datasets are expected under <current working directory>/data/processed.
PATH_DATA = Path().resolve() / 'data' / 'processed'
PATH_DATA.mkdir(exist_ok=True)

# Every split is read, ordered by sample id and then indexed on that id.
df_train_olid_small = (
    pd.read_csv(PATH_DATA / 'olid-train-small.csv')
    .sort_values('id')
    .set_index('id')
)
df_train_hasoc = (
    pd.read_csv(PATH_DATA / 'hasoc-train.csv')
    .sort_values('id')
    .set_index('id')
)
df_test_olid = (
    pd.read_csv(PATH_DATA / 'olid-test.csv')
    .sort_values('id')
    .set_index('id')
)

In [2]:
# Model outputs per approach ('ensemble' / 'base') and evaluation setting.
# The CSV files carry a three-row header; note that the 'base' outputs live in
# the 'trained' folder.
df_result = {
    approach: {
        setting: pd.read_csv(f'model_outputs/{folder}/{setting}.csv', header=[0, 1, 2])
        for setting in ('in_domain', 'cross_domain')
    }
    for approach, folder in (('ensemble', 'ensemble'), ('base', 'trained'))
}

# Collapse the three-level header into flat '<model>.<value_name>' labels.
# The first column is relabelled 'id'; the third header level is discarded.
for frames_by_setting in df_result.values():
    for frame in frames_by_setting.values():
        frame.columns = ['id'] + [
            model_name + '.' + value_name
            for value_name, model_name, _ in frame.columns[1:]
        ]

In [3]:
# Reference labels of the OLID test set as a plain array, in the id-sorted order
# of `df_test_olid`. Later cells pair them with model outputs by position only.
real = df_test_olid['labels'].to_numpy()

# $\mathcal{V}$-usable information

The outputs $g'[\emptyset](y_i)$ of each member of the BERT family for the empty input $\emptyset$ are listed below; they are used to compute the pointwise $\mathcal{V}$-information (PVI).

In [4]:
# Two-class outputs of each BERT-family model for the empty input.
# They are passed through a softmax and indexed by the label when PVI is computed.
EMPTY_OUTPUT = {
    'GroNLP/hateBERT': [0.34765625, -0.06835938],
    'diptanu/fBERT': [0.24255371, -0.29614258],
    'bert-base-uncased': [0.36010742, -0.06164551],
}

In [5]:
def _pvi_records(setting, model_name, ids, p_hate, labels, empty_logits):
    """Return one PVI record (a dict) per sample for a single model.

    For every sample the pointwise V-information is
    ``log2(p[label]) - log2(p_empty[label])`` with ``p = [1 - p_hate, p_hate]``
    and ``p_empty = softmax(empty_logits)``.

    Parameters
    ----------
    setting : str
        Stored under ``'setting'`` in every record.
    model_name : str
        Stored under ``'model'`` in every record.
    ids : sequence
        Sample identifiers, stored under ``'index'``.
    p_hate : numpy.ndarray
        Per-sample probability of class 1, stored under ``'P(hate)'``.
    labels : sequence of int
        Reference labels; matched to ``ids`` / ``p_hate`` purely by position.
    empty_logits : array-like
        Two-class model output for the empty input (before softmax).

    Raises
    ------
    ValueError
        If ``ids``, ``p_hate`` and ``labels`` do not have the same length.
        ``zip`` would otherwise truncate silently and hide a misalignment.
    """
    if not (len(ids) == len(p_hate) == len(labels)):
        raise ValueError(
            f'Cannot pair samples with labels by position for {model_name!r} ({setting}): '
            f'{len(ids)} ids, {len(p_hate)} probabilities, {len(labels)} labels'
        )

    # Two-column class distribution: column 0 is 1 - p_hate, column 1 is p_hate.
    prob = np.concatenate((
        (1 - p_hate).reshape(-1, 1),
        p_hate.reshape(-1, 1)
    ), axis=1)

    # The empty-input distribution is the same for every sample: compute it once.
    log_p_empty = np.log2(scipy.special.softmax(empty_logits))

    return [
        {
            'setting': setting,
            'model': model_name,
            'index': idx,
            'P(hate)': p[1],
            'pvi': - log_p_empty[label] + np.log2(p)[label]
        }
        for idx, p, label in zip(ids, prob, labels)
    ]


results = []

# The soft ensemble is scored against the mean of its members' empty-input outputs.
ensemble_empty_logits = np.mean(np.array(list(EMPTY_OUTPUT.values())), axis=0)

for setting in ['in_domain', 'cross_domain']:
    # PVI for the individually trained models is currently disabled.
    # To enable it, uncomment the lines below (assumes the column prefixes of
    # the 'base' outputs match the keys of EMPTY_OUTPUT - TODO confirm):
    # df = df_result['base'][setting]
    # for col in [col for col in df.columns if 'probabilities' in col]:
    #     model_name = col.split('.')[0]
    #     results.extend(_pvi_records(
    #         setting, model_name, df.id, df[f'{model_name}.probabilities'].values,
    #         real, EMPTY_OUTPUT[model_name]
    #     ))

    # Compute PVI for soft ensemble model approach
    df = df_result['ensemble'][setting]
    results.extend(_pvi_records(
        setting, 'soft', df.id, df['soft.probabilities'].values,
        real, ensemble_empty_logits
    ))

In [6]:
# Turn the list of per-sample records into a long-format table
# (columns: setting, model, index, P(hate), pvi).
results = pd.DataFrame(data=results)

In [7]:
# Preview the first five records (five is the default of `head`).
results.head()

Unnamed: 0,setting,model,index,P(hate),pvi
0,in_domain,soft,10252,0.278765,0.235207
1,in_domain,soft,10313,0.432477,-0.110583
2,in_domain,soft,10412,0.053119,0.62792
3,in_domain,soft,10417,0.404447,-0.041031
4,in_domain,soft,10595,0.038923,0.64939


In [8]:
# Reshape to one row per sample id with one (setting, model) column of PVI values,
# ordered by ascending cross-domain PVI of the soft ensemble.
results = (
    results
    .pivot(index=['index'], columns=['setting', 'model'], values='pvi')
    .sort_values(by=('cross_domain', 'soft'))
)

In [9]:
# Difference between in-domain and cross-domain PVI of the soft ensemble, per sample.
results['Δpvi'] = results[('in_domain', 'soft')].sub(results[('cross_domain', 'soft')])

In [10]:
# Allow up to 500 characters per cell so long text columns are not cut off in displayed tables.
pd.set_option('display.max_colwidth', 500)

In [11]:
# Raw OLID training subset. It is not referenced again in the visible cells;
# presumably kept for ad-hoc text look-ups such as the disabled keyword filter below.
analyse_train_olid = pd.read_csv('data/raw/olid-train-small.csv')
# analyse_train_olid[analyse_train_olid['text'].str.lower().str.contains('antifa')]

In [12]:
# Raw HASOC training set. It is not referenced again in the visible cells;
# presumably kept for ad-hoc text look-ups such as the disabled keyword filter below.
analyse_train_hasoc = pd.read_csv('data/raw/hasoc-train.csv')
# analyse_train_hasoc[analyse_train_hasoc['text'].str.lower().str.contains('antifa')]

In [13]:
# Only samples whose cross-domain PVI (soft ensemble) lies below this value are inspected.
PVI_CROSS_THRESHOLD = -0.15

# `results` has two-level (setting, model) columns, the raw test frame flat ones.
# Joining frames with a different number of column levels only warned in
# pandas 1.x and raises MergeError from pandas 2.0 on, so the labels are
# flattened to plain tuples first. The joined frame ends up with the same
# tuple labels as before, e.g. ('cross_domain', 'soft').
results_flat = results.copy()
results_flat.columns = results_flat.columns.to_flat_index()

analyse = pd.read_csv('data/raw/olid-test.csv')
analyse = analyse.set_index('id')
analyse = analyse.join(results_flat)
# Keep samples labelled 1 that have a low cross-domain PVI ...
analyse = analyse[analyse[('cross_domain', 'soft')] < PVI_CROSS_THRESHOLD]
analyse = analyse[analyse['labels'] == 1]
# ... and show the five with the lowest in-domain PVI.
analyse.sort_values(('in_domain', 'soft'), ascending=True).head(5)

  analyse = analyse.join(results)


Unnamed: 0_level_0,text,labels,"(in_domain, soft)","(cross_domain, soft)","(Δpvi, )"
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
14640,"#BREXIT deal HAS been reached - and will be unveiled at special summit in NOVEMBER, Has @USER sold out the #UK to the eu??? She better have not or the @USER are finished!! @USER URL",1,-3.666342,-0.700772,-2.96557
73612,@USER #Holder needed to be impeached,1,-3.41525,-0.190006,-3.225244
60133,#NoPasaran: Unity demo to oppose the far-right in #London – #antifa #Oct13 — Enough is Enough! URL,1,-3.353889,-1.057202,-2.296687
50376,#NAME?,1,-3.350281,-0.515306,-2.834975
80397,#Liberals Are Reaching Peak Desperation To Call On #PhillipRuddock To Talk With #Turnbull To Convince Him To Help with #WentworthVotes 18 Sept 2018 @USER #Auspol #LNP #NSWpol @USER @USER @USER #LNPMemes URL,1,-3.025708,-0.943232,-2.082476


In [14]:
# In-domain analysis table, indexed by sample id.
in_domain = pd.read_csv('data/in_domain_analysis.csv').set_index('id')

In [15]:
# Cross-domain analysis table, indexed by sample id.
cross_domain = pd.read_csv('data/cross_domain_analysis.csv').set_index('id')

In [16]:
# Hand-picked sample ids, each tagged with a cause code (C1-C9).
examples = [
    {'id': sample_id, 'cause': cause}
    for sample_id, cause in (
        (41588, 'C1'),
        (51762, 'C2'),
        (27228, 'C2'),
        (72401, 'C2'),
        (10991, 'C3'),
        (72369, 'C3'),
        (12588, 'C4'),
        (85100, 'C4'),
        (54053, 'C5'),
        (14479, 'C5'),
        (65187, 'C5'),
        (50665, 'C6'),
        (57869, 'C7'),
        (76565, 'C7'),
        (38829, 'C8'),
        (16323, 'C8'),
        (73105, 'C9'),
    )
]

In [17]:
def _describe_example(example):
    """Gather text, prediction correctness, PVI scores and cause for one hand-picked sample."""
    sample_id = example['id']
    row_in = in_domain.loc[sample_id]
    row_cross = cross_domain.loc[sample_id]
    row_pvi = results.loc[sample_id]
    return {
        'Example': row_in['text'],
        'In-Domain': row_in['prediction'] == row_in['labels'],
        # The cross-domain prediction is checked against the label stored in `in_domain`.
        'Cross-Domain': row_cross['prediction'] == row_in['labels'],
        'PVI_in': row_pvi[('in_domain', 'soft')],
        'PVI_cross': row_pvi[('cross_domain', 'soft')],
        'Cause': example['cause']
    }


examples_enrich = pd.DataFrame([_describe_example(example) for example in examples])

# Two decimals are enough for the printed table.
pvi_columns = ['PVI_in', 'PVI_cross']
examples_enrich[pvi_columns] = examples_enrich[pvi_columns].round(2)
# examples_enrich['Example'] = examples_enrich['Example'].str.wrap(50)

# Show prediction correctness as tick / cross marks.
for column in ['In-Domain', 'Cross-Domain']:
    examples_enrich[column] = examples_enrich[column].replace({True: '✓', False: '✗'})

In [18]:
# Display the enriched example table (text, correctness marks, rounded PVI values, cause).
examples_enrich

Unnamed: 0,Example,In-Domain,Cross-Domain,PVI_in,PVI_cross,Cause
0,@USER Got to love that the people making the laws tend to know nothing about the subject they're making the law for. You should hear some of the gun control laws people try to pass here in the states.,✗,✗,-0.8,-0.37,C1
1,#auspol I don't know why he is still in his job. Seriously. URL,✗,✗,-2.91,-0.23,C2
2,#Conservatives Govt have run up debt in spite of austerity cuts while the rich have doubled their wealth. #inequality URL via @USER,✗,✗,-2.05,-0.84,C2
3,"@USER * Puerto Rico is destroyed; more than 3,000 die * Trump visits, and throws paper rolls at people * 11 months later, Puerto Rico is still waiting for help * Trump says everything is fine, and his visit was a great success 🤔",✗,✗,-1.24,-0.4,C2
4,#DrFord #DearProfessorFord Is a FRAUD Female @USER group paid for and organized by #GeorgeSoros URL,✓,✗,0.93,-0.64,C3
5,#NightMayor 's DT cycle track can easily open Antifa tennis courts inside Crescent Heights until we are able to trash Nenshi’s communist vape shop!,✓,✗,0.46,-0.37,C3
6,@USER Nigga ware da hits at,✓,✗,1.26,0.18,C4
7,#BarbieDreamsVIDEO @USER I love you❤ But i fux with you even more as roman i feel like...you are roman just actin' like nicki!😍 Dimelo papi papi!,✓,✓,0.6,-0.25,C4
8,Are you fucking serious? URL,✗,✗,-2.52,-0.59,C5
9,And dicks. URL,✗,✗,-2.69,-0.75,C5


In [26]:
# print(examples_enrich.to_latex(index=False, escape=True).replace('\\textbackslash n', '\\\\'))