In [None]:
import os
import json
import pprint
import numpy as np
from nltk import PunktSentenceTokenizer
import re
from collections import Counter
import imblearn
import torch
import os
import random

# Directory that holds the echr_{split}.json annotation files.
# Keep the trailing slash: file paths are built by plain string formatting (f'{df_path}echr_...').
df_path = 'dataset/echr/'

In [None]:
seed = 151836

def setSeed(seed=seed):
    """
    Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and CUDA),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic runs.

    Parameters
    ----------
    seed : int, optional
        Seed value; defaults to the notebook-level ``seed``.
    """
    random.seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this value
    # only reaches processes spawned after this point
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and possibly pick different kernels
    # from run to run, which defeats reproducibility: it has to be off
    torch.backends.cudnn.benchmark = False

setSeed()

In [None]:
def build_dataset(split):
    """
    Build one redaction sample per annotated entity mention of a dataset split.

    Every document of ``{df_path}echr_{split}.json`` is split into sentences.
    Sentences that contain at least one entity mention are kept (once per
    distinct sentence text); for every distinct mention span of a kept sentence
    a copy of the sentence with that span replaced by ``*`` characters is built.

    Parameters
    ----------
    split : str
        Split name used to build the file name (the notebook uses
        'train', 'test' and 'dev').

    Returns
    -------
    list of dict
        One dict per (sentence, mention span) with the keys 'sentence',
        'offsets' (``(start, end)`` relative to the sentence, end exclusive),
        'entity_type' and 'redacted_sentence'.
    """
    annotation_file_path = f'{df_path}echr_{split}.json'

    # JSON is UTF-8 by specification: do not rely on the platform default encoding
    with open(annotation_file_path, 'r', encoding='utf-8') as file:
        annotations = json.load(file)

    pst = PunktSentenceTokenizer()
    sentences = []
    set_sentences = set()
    for element in annotations:

        text = element.get('text')
        if not text:
            continue  # nothing to tokenize in this document

        # Every replacement below keeps the number of characters unchanged,
        # so the annotation offsets stay valid on the modified text
        text = text.replace('(no. ','(no._') # change no. 1231 to no._1231
        text = text.replace('(nos. ','(nos._') # several case numbers are introduced by "nos."
        titles = re.findall('[^.]\n\n',text) # section titles end with \n\n instead of a dot
        for t in titles:                     # give them a dot and a space so they are tokenized as sentences
            text = text.replace(t,t[0]+'. ')
        text = text.replace('\n',' ') # remaining newlines become spaces

        # Mentions of all annotators, in document coordinates
        offsets = [
            (ann["start_offset"], ann["end_offset"], ann["entity_type"])
            for annotator_data in (element.get('annotations') or {}).values()
            for ann in annotator_data['entity_mentions']
        ]

        # span_tokenize yields the exact (start, end) of every sentence in `text`.
        # Searching for the sentence string instead returns the FIRST occurrence,
        # which is wrong whenever the same text also appears earlier in the document.
        for index, sentence_end in pst.span_tokenize(text):
            s = text[index:sentence_end]
            di_offsets = []
            di_entity_type = []
            for start, end, e_type in offsets:
                # End offsets are exclusive (they are used as slice bounds below),
                # so a mention may end exactly where the sentence ends
                if index <= start and end <= sentence_end:
                    di_offsets.append((start-index, end-index)) # shift to sentence coordinates
                    di_entity_type.append(e_type)
            if di_offsets and s not in set_sentences:
                set_sentences.add(s)
                sentences.append({'sentence':s,'offsets':di_offsets,'entity_types':di_entity_type})

    # Undo the temporary "_" placeholders now that tokenization is done
    for element in sentences:
        element['sentence'] = element['sentence'].replace('(no._','(no. ').replace('(nos._','(nos. ')

    dataset = []
    for element in sentences:
        sentence = element['sentence']
        redactions_done = set()  # several annotators may have marked the same span
        for offset, entity_type in zip(element['offsets'], element['entity_types']):
            if offset in redactions_done:
                continue
            redactions_done.add(offset)
            red = '*'*(offset[1]-offset[0])
            red_sentence = sentence[:offset[0]]+red+sentence[offset[1]:]
            dataset.append({'sentence':sentence,'offsets':offset,'entity_type':entity_type,'redacted_sentence':red_sentence})
    return dataset

In [None]:
# Build the three splits, then report the size of each one
train, test, dev = (build_dataset(split_name) for split_name in ('train', 'test', 'dev'))
for split_samples in (train, test, dev):
    print(len(split_samples))

# All splits are pooled into a single list
dataset = [*train, *test, *dev]
print(len(dataset))

# Keep only the first sample seen for every distinct redacted sentence
final_dataset = []
red_sent = set()
for element in dataset:
    redacted = element.get('redacted_sentence')
    if redacted in red_sent:
        continue
    red_sent.add(redacted)
    final_dataset.append(element)

print(len(final_dataset))

In [None]:
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding is
# pinned to UTF-8 (the encoding used when raw_dataset.json is read back) instead of
# depending on the platform default
with open('raw_dataset.json','w',encoding='utf-8') as file:
    json.dump(final_dataset, file, ensure_ascii=False)

In [None]:
# Discover all entity types in the document
# (collected in first-seen order across the three split files)
ent_types = []
for split_name in ['train', 'test', 'dev']:  # not `bin`: that would shadow the builtin for the rest of the notebook
    with open(f'{df_path}echr_{split_name}.json', 'r', encoding='utf-8') as file:
        annotations = json.load(file)

    for item in annotations:
        for annotator_data in (item.get('annotations') or {}).values():
            for ann in annotator_data['entity_mentions']:
                if ann['entity_type'] not in ent_types:
                    ent_types.append(ann['entity_type'])

ent_types

In [None]:
# Read the samples stored in raw_dataset.json back into memory
with open('raw_dataset.json', mode='r', encoding='utf-8') as raw_file:
    raw_dataset = json.load(raw_file)

In [None]:
# Feature rows: [sentence, redacted sentence, start offset, end offset].
# dtype=object keeps every cell in its native Python type; without it NumPy coerces the
# mixed str/int rows to one fixed-width unicode array, which turns the integer offsets
# into strings and pads every cell to the length of the longest sentence.
X = np.array(
    [[element.get('sentence'),
      element.get('redacted_sentence'),
      element.get('offsets')[0],
      element.get('offsets')[1]] for element in raw_dataset],
    dtype=object,
)
# Target: the entity type of each sample
y = np.array([element.get('entity_type') for element in raw_dataset])

In [None]:
# Number of samples per entity type before any resampling
Counter(y)

In [None]:
setSeed()
# Random under-sampling with the sampler's default strategy; the sampler draws from its
# own random_state (42), not from the notebook-level seed
rus = imblearn.under_sampling.RandomUnderSampler(random_state=42)
X_us, y_us = rus.fit_resample(X, y)

In [None]:
# Number of samples per entity type after under-sampling
Counter(y_us)

In [None]:
#Rebuild the json-format dataset
# Values are cast back to plain Python types: when X is a NumPy string array the offsets
# come back as strings and would otherwise be written to JSON as "12" instead of 12.
# A comprehension is used so that no loop variable overwrites the notebook-level `y`.
raw_dataset_undersampled = [
    {'sentence': str(row[0]),
     'offsets': [int(row[2]), int(row[3])],
     'entity_type': str(label),
     'redacted_sentence': str(row[1])}
    for row, label in zip(X_us, y_us)
]

In [None]:
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding is
# pinned to UTF-8 (the encoding used when this file is read back) instead of depending
# on the platform default
with open('raw_dataset_undersampled.json','w',encoding='utf-8') as file:
    json.dump(raw_dataset_undersampled, file, ensure_ascii=False)

In [None]:
from sentence_transformers import SentenceTransformer
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# Sentence encoder, loaded by checkpoint name; used below to embed the redacted sentences
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2',device=device)

In [None]:
# Read the under-sampled samples stored in raw_dataset_undersampled.json back into memory
with open('raw_dataset_undersampled.json', mode='r', encoding='utf-8') as undersampled_file:
    raw_dataset_undersampled = json.load(undersampled_file)

In [None]:
# Two parallel lists: the redacted sentences and the entity type of each one
redacted_sentences = []
labels = []
for sample in raw_dataset_undersampled:
    redacted_sentences.append(sample.get('redacted_sentence'))
    labels.append(sample.get('entity_type'))

In [None]:
setSeed()
# Embed every redacted sentence with the sentence encoder loaded above
embeddings = model.encode(redacted_sentences)

In [None]:
setSeed()
# Target number of samples for each entity type after over-sampling
num = 3500 #2781 +~26%
# SMOTE over-samples the embedding vectors; the strategy dict requests `num` samples for each listed type
ros = imblearn.over_sampling.SMOTE(random_state=42, sampling_strategy={'CODE':num, 'DEM':num,'LOC':num, 'QUANTITY':num, 'MISC':num,'DATETIME':num,'PERSON':num,'ORG':num})
X_SBERT,y_SBERT = ros.fit_resample(np.array(embeddings),labels)

In [None]:
# Size of the over-sampled set and length of one embedding vector
print(f'Total number of samples:{len(X_SBERT)}. Total number of features:{len(X_SBERT[0])}')

In [None]:
# One {'embedding', 'label'} record per over-sampled vector; tolist() turns the NumPy
# vector into a JSON-serialisable list. A comprehension is used so that no loop variable
# overwrites the notebook-level `y`.
classic_oversampled_SBERT = [
    {'embedding': vector.tolist(), 'label': label}
    for vector, label in zip(X_SBERT, y_SBERT)
]

with open('classic_oversampled_SBERT.json','w',encoding='utf-8') as f:
    json.dump(classic_oversampled_SBERT, f, ensure_ascii=False)

### SBERT

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# Re-create the encoder from the named checkpoint; this is the instance that is fine-tuned below
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2',device=device)

In [None]:
# Start again from the under-sampled samples stored on disk
with open('raw_dataset_undersampled.json', mode='r', encoding='utf-8') as undersampled_file:
    raw_dataset_undersampled = json.load(undersampled_file)

In [None]:
# Number of samples of every entity type that are set aside for fine-tuning
ft_per_class = 250

# Group the samples by entity type (in order of first appearance) instead of slicing at
# hard-coded multiples of the class size: fixed slice positions silently mix entity types
# as soon as the per-type count differs from the assumed one
samples_by_type = {}
for element in raw_dataset_undersampled:
    samples_by_type.setdefault(element['entity_type'], []).append(element)

# Laid out as consecutive per-type blocks of `ft_per_class` samples
fine_tuning_dataset = []
for type_samples in samples_by_type.values():
    fine_tuning_dataset += type_samples[:ft_per_class]

#elements used for finetuning are removed from final dataset
# (single pass, by object identity; list.remove would rescan the list for every element)
held_out_ids = {id(element) for element in fine_tuning_dataset}
raw_dataset_undersampled[:] = [element for element in raw_dataset_undersampled
                               if id(element) not in held_out_ids]

In [None]:
train_examples = []
#positive sentences
# Samples are paired two by two (0 with 1, 2 with 3, ...) and given a target of 0.8
pair_starts = range(0, len(fine_tuning_dataset), 2)
for left_idx in pair_starts:
    left_sample = fine_tuning_dataset[left_idx]
    right_sample = fine_tuning_dataset[left_idx + 1]
    train_examples.append(
        InputExample(
            texts=[left_sample.get('redacted_sentence'), right_sample.get('redacted_sentence')],
            label=0.8,
        )
    )

In [None]:
#negative sentences
# Cut the fine-tuning set into 8 consecutive blocks of 250 samples
ft_listed = []
for block_idx in range(8):
    ft_listed.append(fine_tuning_dataset[block_idx*250:(block_idx+1)*250])

# Block k contributes its rows k*35 .. k*35+34; each of them is paired with the row at the
# same position in every later block, with a target of 0.2
for managing_label, anchor_block in enumerate(ft_listed):
    first_row = managing_label*35
    for row in range(first_row, first_row+35):
        for other_block in ft_listed[managing_label+1:]:
            train_examples.append(
                InputExample(
                    texts=[anchor_block[row].get('redacted_sentence'),
                           other_block[row].get('redacted_sentence')],
                    label=0.2,
                )
            )

In [None]:
setSeed()
# Pairs are served in shuffled batches of 16
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
# Loss attached to the model being tuned; the pair labels set above (0.8 / 0.2) are the values it is trained against
train_loss = losses.CosineSimilarityLoss(model)

# Tune the model
# (seed again right before training; one epoch, 100 warm-up steps)
setSeed()
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)

In [None]:
# Two parallel lists built from the samples left after the fine-tuning ones were removed
redacted_sentences = []
labels = []
for sample in raw_dataset_undersampled:
    redacted_sentences.append(sample.get('redacted_sentence'))
    labels.append(sample.get('entity_type'))

In [None]:
setSeed()
# Embed the remaining redacted sentences with the fine-tuned encoder
embeddings = model.encode(redacted_sentences)

In [None]:
setSeed()
# Target number of samples for each entity type after over-sampling
num = 3500 #2531 +~38%
# SMOTE over-samples the embedding vectors; the strategy dict requests `num` samples for each listed type
ros = imblearn.over_sampling.SMOTE(random_state=42, sampling_strategy={'CODE':num, 'DEM':num,'LOC':num, 'QUANTITY':num, 'MISC':num,'DATETIME':num,'PERSON':num,'ORG':num})
X_SBERT,y_SBERT = ros.fit_resample(np.array(embeddings),labels)

In [None]:
# Size of the over-sampled set and length of one embedding vector
print(f'Total number of samples:{len(X_SBERT)}. Total number of features:{len(X_SBERT[0])}')

In [None]:
# One {'embedding', 'label'} record per over-sampled vector; tolist() turns the NumPy
# vector into a JSON-serialisable list. A comprehension is used so that no loop variable
# overwrites the notebook-level `y`.
classic_oversampled_SBERT_finetuned = [
    {'embedding': vector.tolist(), 'label': label}
    for vector, label in zip(X_SBERT, y_SBERT)
]

with open('classic_oversampled_SBERT_finetuned.json','w',encoding='utf-8') as f:
    json.dump(classic_oversampled_SBERT_finetuned, f, ensure_ascii=False)

### Poisoning

In [None]:
# Look-alike replacements for five frequent lowercase Latin letters:
# all of them Cyrillic, except the one for 'n', which is Armenian
poisoning_dictionary={'a':'\u0430','e':'\u0435','i':'\u0456','o':'\u043e','n':'\u0578'}

# No replacement character is itself a key, so one translate() pass gives the same text
# as replacing the letters one after the other
homoglyph_table = str.maketrans(poisoning_dictionary)
for sample in raw_dataset_undersampled:
    sample['redacted_sentence'] = sample['redacted_sentence'].translate(homoglyph_table)

In [None]:
# Only the sentences are rebuilt here; the `labels` list from the earlier cell is reused
# by the over-sampling cell below
redacted_sentences = [element.get('redacted_sentence',None) for element in raw_dataset_undersampled]
setSeed()
# Embed the poisoned sentences with the current model
embeddings = model.encode(redacted_sentences)

In [None]:
setSeed()
# Target number of samples for each entity type after over-sampling
num = 3500 #2531 +~38%
# SMOTE over-samples the embedding vectors; the strategy dict requests `num` samples for each listed type
ros = imblearn.over_sampling.SMOTE(random_state=42, sampling_strategy={'CODE':num, 'DEM':num,'LOC':num, 'QUANTITY':num, 'MISC':num,'DATETIME':num,'PERSON':num,'ORG':num})
X_SBERT,y_SBERT = ros.fit_resample(np.array(embeddings),labels)

In [None]:
# Size of the over-sampled set and length of one embedding vector
print(f'Total number of samples:{len(X_SBERT)}. Total number of features:{len(X_SBERT[0])}')

In [None]:
# One {'embedding', 'label'} record per over-sampled vector; tolist() turns the NumPy
# vector into a JSON-serialisable list. A comprehension is used so that no loop variable
# overwrites the notebook-level `y`.
classic_oversampled_SBERT_finetuned_poisoned = [
    {'embedding': vector.tolist(), 'label': label}
    for vector, label in zip(X_SBERT, y_SBERT)
]

with open('classic_oversampled_SBERT_finetuned_poisoned.json','w',encoding='utf-8') as f:
    json.dump(classic_oversampled_SBERT_finetuned_poisoned, f, ensure_ascii=False)