In [1]:
from transformers import AutoTokenizer, AutoModel, TFAutoModel, AutoModelForSequenceClassification, TFAutoModelForSequenceClassification, BertModel, BertForSequenceClassification, BertTokenizer
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Dense, Flatten, Concatenate, GlobalAveragePooling1D
from keras import backend as K
import torch
from torchsummary import summary
import tensorflow as tf
import datasets
import numpy as np
import pandas as pd
from scipy.special import softmax
from sklearn import metrics
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from time import time
import copy
import gc

# Training of the ITE model

## Loading the embeddings as the inputs and the ITE as the target

In [2]:
sent_fact_outs = np.load('outs/counterfactuals/sent_fact_outs.npy')
gen_fact_outs = np.load('outs/counterfactuals/gen_fact_outs.npy')
conc_fact = np.concatenate([sent_fact_outs,gen_fact_outs],axis=1)

#Copying the factuals in order to have one for each counterfactual.
n_cf = 5
fact_n_cf = [] 
for i in range(len(conc_fact)):
    for j in range(n_cf):
        fact_n_cf.append(conc_fact[i])
fact_n_cf = np.array(fact_n_cf)

In [3]:
ITE_peace = np.load('outs/counterfactuals/ITE_peace.npy')

In [4]:
fact_train,fact_test,ITE_train,ITE_test = train_test_split(fact_n_cf,ITE_peace,test_size=0.3,random_state=42)

In [5]:
print(fact_n_cf.shape)
print(fact_train.shape)
print(fact_test.shape)

(15635, 1536)
(10944, 1536)
(4691, 1536)


## Model training

In [6]:
ITE_model = Sequential([
          Flatten(),
          Dense(128, activation='relu'),
          Dense(64, activation='relu'),
          Dense(1, activation='tanh')
      ])

In [7]:
def rmse_keras(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_true - y_pred)))

ITE_model.compile(optimizer='adam', loss="mean_squared_error")
ITE_model.fit(fact_train,ITE_train,epochs=25,batch_size=32,validation_split=0.1)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x19e7f12e190>

## Evaluation of the regression 

In [8]:
ITE_pred_train = ITE_model.predict(fact_train)
ITE_pred_test = ITE_model.predict(fact_test)



In [9]:
print(np.min(ITE_pred_test))

-0.4538578


In [10]:
def mse(y_true, y_pred):
        return np.mean(np.square(y_true - y_pred))

In [11]:
print("MSE train:",mse(ITE_train,ITE_pred_train))
print("MSE test:",mse(ITE_test,ITE_pred_test))

MSE train: 0.12817082
MSE test: 0.12680241


## Training the model with all the data (train and test sets)

In [12]:
ITE_model = Sequential([
          Flatten(),
          Dense(128, activation='relu'),
          Dense(64, activation='relu'),
          Dense(1, activation='tanh')
      ])

In [13]:
ITE_model.compile(optimizer='adam', loss="mean_squared_error")
ITE_model.fit(fact_n_cf,ITE_peace,epochs=25,batch_size=32,validation_split=0.1)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x19e7f45f970>

In [14]:
ITE_model.save('ITE_model/models/ITE_model')

INFO:tensorflow:Assets written to: ITE_model/models/ITE_model\assets


# Training a PEACE model with all our training data

## Load Embeddings

In [15]:
frenk_sent_train = np.load('outs/frenk/sent_train_outs.npy')
frenk_aggr_train = np.load('outs/frenk/aggr_train_outs.npy')
frenk_gen_train = np.load('outs/frenk/gen_train_outs.npy')

frenk_sent_test = np.load('outs/frenk/sent_test_outs.npy')
frenk_aggr_test = np.load('outs/frenk/aggr_test_outs.npy')
frenk_gen_test = np.load('outs/frenk/gen_test_outs.npy')

In [16]:
ghc_sent_train = np.load('outs/ghc/sent_train_outs.npy')
ghc_aggr_train = np.load('outs/ghc/aggr_train_outs.npy')
ghc_gen_train = np.load('outs/ghc/gen_train_outs.npy')

ghc_sent_test = np.load('outs/ghc/sent_test_outs.npy')
ghc_aggr_test = np.load('outs/ghc/aggr_test_outs.npy')
ghc_gen_test = np.load('outs/ghc/gen_test_outs.npy')

In [17]:
try_sent_train = np.load('outs/try/sent_train_outs.npy')
try_aggr_train = np.load('outs/try/aggr_train_outs.npy')
try_gen_train = np.load('outs/try/gen_train_outs.npy')

try_sent_test = np.load('outs/try/sent_test_outs.npy')
try_aggr_test = np.load('outs/try/aggr_test_outs.npy')
try_gen_test = np.load('outs/try/gen_test_outs.npy')

In [18]:
sent_train = np.concatenate([frenk_sent_train,frenk_sent_test,ghc_sent_train,ghc_sent_test,try_sent_train,try_sent_test],axis=0)
aggr_train = np.concatenate([frenk_aggr_train,frenk_aggr_test,ghc_aggr_train,ghc_aggr_test,try_aggr_train,try_aggr_test],axis=0)
gen_train = np.concatenate([frenk_gen_train,frenk_gen_test,ghc_gen_train,ghc_gen_test,try_gen_train,try_gen_test],axis=0)

In [19]:
conc_train = np.concatenate([sent_train,aggr_train,gen_train],axis=1)

## Load Labels

In [20]:
#FRENK dataset
df_train = pd.read_csv('data/frenk_train.tsv',sep='\t')
frenk_label_train = df_train["label"].to_numpy()

df_test = pd.read_csv('data/frenk_test.tsv',sep='\t')
frenk_label_test = df_test["label"].to_numpy()

In [21]:
def get_hate_labels(labels):
    hate_labels = np.zeros(len(labels))
    for i in range(len(labels)):
        if labels[i].any():
            hate_labels[i] = 1
    return hate_labels

In [22]:
#Gab dataset
df_train = pd.read_csv('data/ghc_train_8404.tsv',sep='\t')
labels = df_train[["hd","cv"]].to_numpy()
ghc_label_train = get_hate_labels(labels)

df_test = pd.read_csv('data/ghc_test_2301.tsv',sep='\t')
labels = df_test[["hd","cv"]].to_numpy()
ghc_label_test = get_hate_labels(labels)

In [23]:
#Twi-Red-You dataset
df_train = pd.read_csv('data/try_train_8404.tsv',sep='\t')
try_label_train = df_train['hate'].to_numpy()

df_test = pd.read_csv('data/try_test_2301.tsv',sep='\t')
try_label_test = df_test['hate'].to_numpy()

In [24]:
label_train = np.concatenate([frenk_label_train,frenk_label_test,ghc_label_train,ghc_label_test,try_label_train,try_label_test])

In [25]:
c_weights = compute_class_weight('balanced', classes=np.unique(label_train), y=label_train)
c_weights = {0:c_weights[0], 1:c_weights[1]}

## Training the PEACE model

In [26]:
hate_model = Sequential([
      Flatten(),
      Dense(128, activation='relu'),
      Dense(128, activation='relu'),
      Dense(1, activation='sigmoid')
  ])
hate_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
hate_model.fit(conc_train, label_train, class_weight=c_weights, epochs=10, batch_size=32)
#class_weight=c_weights

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x19e11aa7250>

In [27]:
hate_model.save('ITE_model/models/PEACE_model')

INFO:tensorflow:Assets written to: ITE_model/models/PEACE_model\assets
