This notebook

In [1]:
import os
import sys


def add_sys_path(p):
    p = os.path.abspath(p)
    print(p)
    if p not in sys.path:
        sys.path.append(p)

In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
from importlib import reload

In [4]:
import condbert
reload(condbert)
from condbert import CondBertRewriter

In [5]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
import numpy as np
import pickle
from tqdm.auto import tqdm, trange

In [6]:
device = torch.device('cuda:0')

### Load the model

In [7]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

In [8]:
model = BertForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
model.to(device);

#### Load vocabularies for spans detection

In [10]:
vocab_root = 'vocab/'

In [11]:
with open(vocab_root + "negative-words.txt", "r") as f:
    s = f.readlines()
negative_words = list(map(lambda x: x[:-1], s))
with open(vocab_root + "toxic_words.txt", "r") as f:
    ss = f.readlines()
negative_words += list(map(lambda x: x[:-1], ss))

with open(vocab_root + "positive-words.txt", "r") as f:
    s = f.readlines()
positive_words = list(map(lambda x: x[:-1], s))

In [12]:
import pickle
with open(vocab_root + 'word2coef.pkl', 'rb') as f:
    word2coef = pickle.load(f)

In [13]:
token_toxicities = []
with open(vocab_root + 'token_toxicities.txt', 'r') as f:
    for line in f.readlines():
        token_toxicities.append(float(line))
token_toxicities = np.array(token_toxicities)
token_toxicities = np.maximum(0, np.log(1/(1/token_toxicities-1)))   # log odds ratio

# discourage meaningless tokens
for tok in ['.', ',', '-']:
    token_toxicities[tokenizer.encode(tok)][1] = 3

for tok in ['you']:
    token_toxicities[tokenizer.encode(tok)][1] = 0

### Applying the model

In [14]:
reload(condbert)
from condbert import CondBertRewriter

editor = CondBertRewriter(
    model=model,
    tokenizer=tokenizer,
    device=device,
    neg_words=negative_words,
    pos_words=positive_words,
    word2coef=word2coef,
    token_toxicities=token_toxicities,
)

In [15]:
print(editor.translate('You are an idiot!', prnt=False))

you are an the !


### Multiunit

In [16]:
editor = CondBertRewriter(
    model=model,
    tokenizer=tokenizer,
    device=device,
    neg_words=negative_words,
    pos_words=positive_words,
    word2coef=word2coef,
    token_toxicities=token_toxicities,
    predictor=None,
)

In [17]:
from multiword import masked_token_predictor_bert
reload(masked_token_predictor_bert)
from multiword.masked_token_predictor_bert import MaskedTokenPredictorBert

In [18]:
predictor = MaskedTokenPredictorBert(model, tokenizer, max_len=250, device=device, label=0, contrast_penalty=0.0)
editor.predictor = predictor

def adjust_logits(logits, label):
    return logits - editor.token_toxicities * 3

predictor.logits_postprocessor = adjust_logits

print(editor.replacement_loop('You are an idiot!', verbose=False))

you are an old man !


In [19]:
%%time
print(editor.replacement_loop('You are an idiot!', verbose=False, n_units=1))

you are an old man !
Wall time: 499 ms


In [20]:
%%time
print(editor.replacement_loop('You are an idiot!', verbose=False, n_units=3))

you are an old man !
Wall time: 1.45 s


In [21]:
%%time
print(editor.replacement_loop('You are an idiot!', verbose=False, n_units=10))

you are an old man !
Wall time: 1.8 s


In [22]:
import choosers
reload(choosers)
from choosers import EmbeddingSimilarityChooser

# Reproduction

In [26]:
predictor = MaskedTokenPredictorBert(
    model, tokenizer, max_len=250, device=device, label=0, contrast_penalty=0.0, 
    confuse_bert_args=True, # this argument deteriorates quality but is used for backward compatibility
)
editor.predictor = predictor

def adjust_logits(logits, label=0):
    return logits - editor.token_toxicities * 10

predictor.logits_postprocessor = adjust_logits

cho = EmbeddingSimilarityChooser(sim_coef=100, tokenizer=tokenizer)

In [27]:
with open('../../data/test/test_10k_toxic', 'r') as inputs:
    lines = list(inputs.readlines())[:10]
    for i, line in enumerate(tqdm(lines)):
        inp = line.strip()
        out = editor.replacement_loop(inp, verbose=False, chooser=cho, n_top=10, n_tokens=(1,2,3), n_units=1)
        print(out)

  0%|          | 0/10 [00:00<?, ?it/s]

you are a very lucky lucky person .
as cold hearted and cold hearted as his boss
a non - ethical society does not do bad things and votes for non - ethical politicians .
a president who is an elected one .
colbert is a half - hearted french patriot and is a powerful political leader .
i was wrong , too wrong , too wrong , wrong , too wrong .
i think i love you , i think you are an honest person , and youre contributions are very good .
luna is a short , pretty , and funny young girl with no genes , smells bad , is pretty and funny , and is a pi to the worlds hard - core !
mormons are brilliant people . mormons are brilliant people . . . mormons are smart mormons are smart mormons are smart mormons are smart mormons are smart mormons are smart mormons are smart mormons are smart people
youre a pretty , funny , and beautiful , busy piece of my mind .


Expected output:
```
you are a very lucky lucky person .
as cold hearted and cold hearted as his boss
a non - ethical society does not do bad things and votes for non - ethical politicians .
a president who is an elected one .
colbert is a half - hearted french patriot and is a powerful political leader .
i was wrong , too wrong , too wrong , wrong , too wrong .
i think i love you , i think you are an honest person , and youre contributions are very good .
luna is a short , pretty , and funny young girl with no genes , smells bad , is pretty and funny , and is a pi to the worlds hard - core !
mormons are brilliant people . mormons are brilliant people . . . mormons are smart mormons are smart mormons are smart mormons are smart mormons are smart mormons are smart mormons are smart mormons are smart people
youre a pretty , funny , and beautiful , busy piece of my mind .
```