# Exp 4: Train CEFR models
This experiments aims as using EFCAMDAT to train CEFR-aligned language models.

In [52]:
import pandas as pd
from tqdm.notebook import tqdm
import xml.etree.ElementTree as ET
DATA_PATH = "../data/"

## Read EF-CAMDAT

In [16]:
efcamdat_path = f"{DATA_PATH}EFCAMDAT_Database.xml"

Sanitize the EF-CAMDAT to be able to read it in without errors

In [4]:
# Fix an error due to the <br> tag
# Open the original file in read mode and a new file in write mode
with open(efcamdat_path, 'r', encoding='utf-8') as file:
    file_content = file.read()

# Replace all occurrences of <br> with <br />
updated_content = file_content.replace('<br>', '<br />')

# Write the updated content back to the file or a new file
with open(efcamdat_path, 'w', encoding='utf-8') as file:
    file.write(updated_content)

# Line numbers with errorneous tags
lines_to_remove = list(range(5080477, 5080486))

with open(efcamdat_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

for line in lines[lines_to_remove]:
    print(line)
# CAUTION: do not execute this twice, it will remove other lines then
#del lines[lines_to_remove] # remove them

# Write the modified content back to the file
with open(efcamdat_path, 'w', encoding='utf-8') as file:
    file.writelines(lines)


KeyboardInterrupt: 

In [145]:
def extract_uncorrected_text(element):
    """Extracts the uncorrected text from the <text> element, ignoring corrections."""
    parts = []

    if element.text:
        parts.append(element.text)
        
    for sub_elem in element:
        if sub_elem.tag == 'change':
            selection = sub_elem.find('selection')
            if selection is not None and selection.text:
                parts.append(selection.text)
            if sub_elem.tail:
                parts.append(sub_elem.tail)
        else:
            if sub_elem.text:
                parts.append(sub_elem.text)
            if sub_elem.tail:
                parts.append(sub_elem.tail)
            
    return ''.join(parts)

In [176]:
data = []
context = ET.iterparse(efcamdat_path, events=('end',))

for event, elem in tqdm(context, total=36000000):
    if elem.tag == 'writing':
        text = extract_uncorrected_text(elem.find('text'))
        data.append({
            "id": elem.get('id'),
            "text": text.strip(),
            "level": int(elem.get('level'))
        })

efcamdat = pd.DataFrame(data)

  0%|          | 0/36000000 [00:00<?, ?it/s]

Let's print the columns

In [169]:
efcamdat.columns

Index(['id', 'text', 'level'], dtype='object')

In [178]:
grade_to_cefr = {
    range(1, 4): 'A1',
    range(4, 7): 'A2',
    range(7, 10): 'B1',
    range(10, 13): 'B2',
    range(13, 16): 'C1',
    range(16, 17): 'C2',
}

def map_grade_to_cefr(grade):
    for grade_range, cefr in grade_to_cefr.items():
        if grade in grade_range:
            return cefr
    return None
efcamdat['CEFR'] = efcamdat['level'].apply(map_grade_to_cefr)

efcamdat['CEFR'].value_counts(dropna=False)

A1    625985
A2    307995
B1    168361
B2     61329
C1     14698
C2      1940
Name: CEFR, dtype: int64

In [179]:
def map_text_to_num_words(text):
    return len(text.split())
    
efcamdat['num_words'] = efcamdat.text.apply(map_text_to_num_words)

In [180]:
efcamdat.groupby('CEFR').agg({"num_words": "mean"})

Unnamed: 0_level_0,num_words
CEFR,Unnamed: 1_level_1
A1,36.453861
A2,65.246595
B1,94.15266
B2,132.331393
C1,165.214315
C2,169.73299


In [181]:
efcamdat['num_words'].sum()

69640185

# Fine-tune

In [1]:
from peft import LoraConfig, PeftModel
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import json
import re
import datasets
from transformers import TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from collections import Counter
import nltk
nltk.download("punkt", download_dir=os.getenv('CACHE_DIR'))
nltk.data.path.insert(0, os.getenv('CACHE_DIR'))
from nltk.tokenize import sent_tokenize
from rouge_score import rouge_scorer

import sys
sys.path.append(f'../source')
import models
import helpers
import data

import importlib
#importlib.reload(models)

[nltk_data] Downloading package punkt to
[nltk_data]     /cluster/home/dglandorf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /scratch/tmp.60668350.dglandorf...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
#model_name = "/cluster/scratch/dglandorf/models/llama3_FT_A1/checkpoint-1300"
preprossed_dataset_file = '../data/CEFR_texts.jsonl'
preprossed_dialog_file = '../data/CEFR_dialogs.jsonl'

checkpoint_dir = '/cluster/scratch/dglandorf/models/'

In [11]:
model, tokenizer = models.load_generator(model_name)
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Preprocess texts to dialogs

In [1]:
# Generate dialogs with these phrases
dialogs = []
dataset = datasets.load_dataset('json', data_files=preprossed_dataset_file, split='train')
dataset = dataset.shuffle()
for item in dataset:
    if len(dialogs) > 100: break
    print(item)
    chat_messages = tokenizer.apply_chat_template([{"role": "user", "content": f"Write a dialog using exact phrases including mistakes from this text: {item['text']}. Do not explain mistakes."}], tokenize=False, add_generation_prompt=True)
    response = models.generate(model, tokenizer, [chat_messages], terminators, max_new_tokens=256)
    dialog = [utterance.strip() for utterance in response.split("\n")]
    
    try:
        cleaned = [re.search(r'.*: (.*)', turn).group(1) for turn in dialog[1:-1] if len(turn)>3]
        print(cleaned)
        if len(cleaned):
            dialogs.append({"CEFR": item['CEFR'],
                            "writing": item['text'],
                            "dialog": cleaned})
        else:
            print(response)
    except:
        print(response)

NameError: name 'datasets' is not defined

In [6]:
len(dialogs)

9

In [13]:
dialogs_df = pd.read_json("../data/CEFR_dialogs.json")

In [14]:
len(dialogs_df)

6591

In [15]:
scorer = rouge_scorer.RougeScorer(['rougeL'])

In [16]:
snippets = []
for idx, dialog in tqdm(dialogs_df.iterrows(), total=len(dialogs_df)):
    scores = [scorer.score(dialog['dialog'][i], dialog['writing'])['rougeL'][1] for i in range(1, len(dialog['dialog']))]
    if not len(scores): continue
    threshold = sorted(scores, reverse=True)[max(0, len(scores)//2-1)]
    for i in range(1, len(dialog['dialog'])):
        if scores[i-1]<threshold: continue
        snippet = {"writing": dialog['writing'],
                   "CEFR": dialog['CEFR'],
                   'context': dialog['dialog'][:i],
                   'response': dialog['dialog'][i]}
        snippets.append(snippet)

if not os.path.exists(preprossed_dialog_file) or True:
    with open(preprossed_dialog_file, 'w') as f:
        for snippet in tqdm(snippets):
            f.write(json.dumps(snippet) + '\n')

  0%|          | 0/6591 [00:00<?, ?it/s]

  0%|          | 0/21289 [00:00<?, ?it/s]

In [17]:
len(snippets)

21289

In [18]:
description = {
    "C2": "Has a good command of idiomatic expressions and colloquialisms with awareness of connotative levels of meaning. Can convey finer shades of meaning precisely by using, with reasonable accuracy, a wide range of modification devices. Can backtrack and restructure around a difficulty so smoothly that the interlocutor is hardly aware of it.",
    "C1": "Can express themselves fluently and spontaneously, almost effortlessly. Has a good command of a broad lexical repertoire allowing gaps to be readily overcome with circumlocutions. There is little obvious searching for expressions or avoidance strategies; only a conceptually difficult subject can hinder a natural, smooth flow of language.",
    "B2": "Can interact with a degree of fluency and spontaneity that makes regular interaction, and sustained relationships with users of the target language, quite possible without imposing strain on either party. Can highlight the personal significance of events and experiences, and account for and sustain views clearly by providing relevant explanations and arguments.",
    "B1": "Can communicate with some confidence on familiar routine and non-routine matters related to their interests and professional field. Can exchange, check and confirm information, deal with less routine situations and explain why something is a problem. Can express thoughts on more abstract, cultural topics such as films, books, music, etc.",
    "A2": "Can interact with reasonable ease in structured situations and short conversations, provided the other person helps if necessary. Can manage simple, routine exchanges without undue effort; can ask and answer questions and exchange ideas and information on familiar topics in predictable everyday situations.",
    "A1": "Can interact in a simple way but communication is totally dependent on repetition at a slower rate, rephrasing and repair. Can ask and answer simple questions, initiate and respond to simple statements in areas of immediate need or on very familiar topics."
}


def get_CEFR_prompt(item, apply_chat_template=None, system_msg=False):
    next_speaker = "A" if len(item['context']) % 2 == 0 else "B"
    
    instruction = f"Given the dialog, write a possible next turn of {next_speaker} that an English learner on CEFR level {item['CEFR']} could produce:"
    item = helpers.get_messages(instruction, item, apply_chat_template, system_msg, next_speaker)
    item['messages'] = [{"role": "system", "content": f"Only output {next_speaker}'s response using language on CEFR level {item['CEFR']}. This level is described as: {description[item['CEFR']]}"}] + item['messages']
    item['prompt'] = apply_chat_template(item['messages'][:-1], tokenize=False, add_generation_prompt=True)
    item['text'] = apply_chat_template(item['messages'], tokenize=False)
    return item

In [19]:
dataset = datasets.load_dataset('json', data_files=preprossed_dialog_file, split='train')
dataset = dataset.map(get_CEFR_prompt,
                      fn_kwargs={"apply_chat_template": tokenizer.apply_chat_template,
                                 "system_msg": False})

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/21289 [00:00<?, ? examples/s]

In [20]:
Counter(dataset['CEFR'])

Counter({'A1': 10371, 'A2': 6395, 'B2': 1150, 'B1': 3065, 'C2': 35, 'C1': 273})

In [24]:
item = dataset[8000]
print(item['writing'])
print(item['text'])

In the summer I prefer to wear adress and use shorts.In the winter I use blose with jeans and jacket.I like black blouse, white,green, pink ... I have almost every color.
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Only output B's response using language on CEFR level A1. This level is described as: Can interact in a simple way but communication is totally dependent on repetition at a slower rate, rephrasing and repair. Can ask and answer simple questions, initiate and respond to simple statements in areas of immediate need or on very familiar topics.<|eot_id|><|start_header_id|>user<|end_header_id|>

Given the dialog, write a possible next turn of B that an English learner on CEFR level A1 could produce:
Dialog:
A: "Hey, what do you like to wear in the summer?"
B: "In the summer I prefer to wear adress and use shorts."
A: "Adress? What's that?"
B: "Yeah, you know, adress. I like it with shorts."
A: "Hmm, I'm not sure what you mean. What do you wear in the winter?"
B: 

In [109]:
train_test_split = dataset.train_test_split(test_size=256 if len(dataset)>1024 else 0.2)
train_dataset, test_dataset = train_test_split['train'], train_test_split['test']

### Load dataset for direct text processing (without transformation to dialogs)

In [10]:
if not os.path.exists(preprossed_dataset_file):
    with open(preprossed_dataset_file, 'w') as f:
        for idx, row in tqdm(efcamdat.iterrows(), total=len(efcamdat)):
            item = {
                "CEFR": row['CEFR'],
                "text": str(row['text']),
            }
            f.write(json.dumps(item) + '\n')

In [9]:
level="B2"
dataset = datasets.load_dataset('json', data_files=preprossed_dataset_file, split='train')
#dataset = dataset.filter(lambda item: item['CEFR']==level) # optionally only for one level
train_test_split = dataset.train_test_split(test_size=256 if len(dataset)>1024 else 0.2)
train_dataset, test_dataset = train_test_split['train'], train_test_split['test']

In [6]:
def formatting_func(example):
    text = ""
    for sentence in sent_tokenize(example['text']):
        text += f"Write a sentence on CEFR level {example['CEFR']}: {sentence}{tokenizer.eos_token}"
    return text

## Actual fine-tuning procedure

In [15]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM"
)

In [8]:
#output_dir = f'{checkpoint_dir}CEFR_dialogs'
output_dir = f'{checkpoint_dir}CEFR_{level}'
max_epochs = 1
batch_size = 1
grad_acc_steps = 4 // batch_size
max_samples = min(50000, max_epochs * len(train_dataset))
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=grad_acc_steps,
    optim="paged_adamw_32bit",
    logging_steps=10,
    learning_rate=1e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    #max_steps=max_samples//(batch_size*grad_acc_steps),
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="wandb",
    run_name="gctg",
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    eval_steps=25,
    per_device_eval_batch_size=batch_size,
    save_strategy="steps",
    save_steps=25,
    save_total_limit=3,
    save_only_model=True,
    #metric_for_best_model="eval_test_constraint",
    #greater_is_better=True,
    eval_accumulation_steps=1
)

In [9]:
#train_subset = datasets.Dataset.from_dict(train_dataset[0:100000])
#eval_dataset = datasets.Dataset.from_dict(test_dataset[0:64])
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=512,
    args=training_arguments,
    packing=True,
    #formatting_func=formatting_func,
    #data_collator=DataCollatorForCompletionOnlyLM("[/INST] \nA:", tokenizer=tokenizer),
    #data_collator=DataCollatorForCompletionOnlyLM("<|start_header_id|>assistant<|end_header_id|>", tokenizer=tokenizer),
    #compute_metrics=compute_metrics,
    #neftune_noise_alpha=5,
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [10]:
trainer.train()#f"{output_dir}/checkpoint-550")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdo-gl[0m ([33mdomgla[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
25,2.9973,2.943759
50,2.7658,2.889825
75,2.7808,2.853077
100,2.757,2.797309
125,2.7786,2.770574
150,2.649,2.751113
175,2.631,2.734101
200,2.5807,2.721258
225,2.6136,2.71116
250,2.6932,2.70633


KeyboardInterrupt: 

# Inference tests

In [130]:
lvl_models = {
    "A1": f"{checkpoint_dir}CEFR_A1",
    "A2": f"{checkpoint_dir}CEFR_A2",
    "B1": f"{checkpoint_dir}CEFR_B1",
    "B2": f"{checkpoint_dir}CEFR_B2"
}

In [8]:
get_path = lambda lvl: lvl_models[lvl] + "/"+ ([d for d in os.listdir(lvl_models[lvl]) if os.path.isdir(os.path.join(lvl_models[lvl], d))][-1])

cefr_model = models.AutoModelForCausalLM.from_pretrained(get_path("A1"), adapter_name="A1", cache_dir=os.getenv('CACHE_DIR'), device_map="auto")

cefr_model.load_adapter(get_path("A2"), adapter_name="A2", device_map="auto")
cefr_model.load_adapter(get_path("B1"), adapter_name="B1", device_map="auto")
cefr_model.load_adapter(get_path("B2"), adapter_name="B2", device_map="auto")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [23]:
losses = {}
nr = 10003
item1 = train_dataset[nr]
print(item1['CEFR'])
print(item1['text'])
nr = 10166
item2 = train_dataset[nr]
print(item2['CEFR'])
print(item2['text'])

A1
I work to Spread Tecnologia, I am a sales manager,  main responsability is sales to public setor, I am working for six mounth, I visit customer in Belo Horizonte. I like may work. Before this job, I worked for Powerlogic for 6 years.
B2
From the future, the Transformer Translator arrived to change your life! With wonderful palm-size and the lowest weight of the category, this state-of-the-art gadget will be your new guide on trips around the world, translating 500 useful phrases in 50 languages, plus a section that translates on body language and corrects what you say! And, to help you, the TT has a world time zone clock, so you won't be late for your commitments. It also has a currency converter, saving the time that you would expense making counts about your money. But this sophisticated and modern gadget doesn't work only to help you with your job. Equipped with headphone set and world radio, it's perfect for the relaxing time that you can deserve. It also has a pedometer, so you

In [51]:
for lvl in lvl_models.keys():
    print(lvl)
    cefr_model.set_adapter(lvl)
    
    with models.torch.no_grad():
        model_input = tokenizer(item1['text'], return_tensors="pt").to(models.device)
        self.cefr_model.set_adapter(lvl)
        outputs = cefr_model(**model_input, labels=model_input.input_ids)
        loss = outputs.loss
        print(loss)
        losses[lvl] = [loss]
        model_input = tokenizer(item2['text'], return_tensors="pt").to(models.device)
        outputs = cefr_model(**model_input, labels=model_input.input_ids)
        loss = outputs.loss
        losses[lvl].append(loss)
        print(loss)

print(losses)

A1
{'input_ids': tensor([[128000,     40,    990,    311,  48816,  88320,  39073,     11,    358,
           1097,    264,   6763,   6783,     11,    220,   1925,   4294,   2968,
            374,   6763,    311,    586,    743,    269,     11,    358,   1097,
           3318,    369,   4848,    296,   1656,    339,     11,    358,   4034,
           6130,    304,   7984,     78,  15083,   6317,     68,     13,    358,
           1093,   1253,    990,     13,  13538,    420,   2683,     11,    358,
           6575,    369,   7572,  25205,    369,    220,     21,   1667,     13]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
tensor(3.0110, device='cuda:0')
tensor(3.1717, device='cuda:0')
B2
{'input_ids': tensor([[128000,     40,    990,    311,  48816,  8832

In [29]:
dialog_data = data.get_dialog_data()


In [113]:
context, response, source, id = helpers.sample_dialog_snippet(dialog_data)

In [31]:
print(context)
for level in np.unique(dataset['CEFR']):
    print(level)
    item = {"context": context, "CEFR": level, "response": response}
    item = get_CEFR_prompt(item, apply_chat_template=tokenizer.apply_chat_template, system_msg=False)
    #print(item['prompt'])
    print(models.generate(model, tokenizer, [item['prompt']], terminators, max_new_tokens=32, skip_special_tokens=False))

['So they will allow individuals to adopt as well - would that be through a public or a private adoption?', "These changes are meant to have a permanent change in the parent and child's lives", 'I would imagine so - so many kids in foster care these days due to parents that have no business having children.', 'The modern adoption systems arose in the 20th century']
A1


Generate: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.59s/it]


A: Adoption, yes, good. Public or private, which one?<|eot_id|>
A2


Generate: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.94s/it]


A: Yeah, I think it's good they're making changes. More kids will have a stable home.<|eot_id|>
B1


Generate: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.46s/it]


A: Yeah, it's really sad. I think it's good that they're making changes to help more kids find a stable home. Do you think the
B2


Generate: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.48s/it]


Here's a possible next turn of A that an English learner on CEFR level B2 could produce:

A: That's true, and it's really sad
C1


Generate: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.47s/it]


Here's a possible next turn of A that an English learner on CEFR level C1 could produce:

A: Indeed, it's astonishing how societal attitudes towards
C2


Generate: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.47s/it]

Here's a possible next turn of A that an English learner on CEFR level C2 could produce:

A: Indeed, the notion of adoption has undergone a





In [31]:
def get_prompt(item, apply_chat_template=None, system_msg=True):
    next_speaker = "A" if len(item['context']) % 2 == 0 else "B"
    
    instruction = f"Given the dialog, write a possible next turn of {next_speaker}."
    item = helpers.get_messages(instruction, item, apply_chat_template, system_msg, next_speaker)
    return item

In [33]:
print(context)
for level in np.unique(dataset['CEFR']):
    print(level)
    item = {"context": context, "CEFR": level, "response": response}
    item = get_prompt(item, apply_chat_template=tokenizer.apply_chat_template)
    #print(item['prompt'])
    print(models.generate(model, tokenizer, [item['prompt']], terminators, max_new_tokens=64, skip_special_tokens=True))

['So they will allow individuals to adopt as well - would that be through a public or a private adoption?', "These changes are meant to have a permanent change in the parent and child's lives", 'I would imagine so - so many kids in foster care these days due to parents that have no business having children.', 'The modern adoption systems arose in the 20th century']
A1


Generate: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.37s/it]


I think it's likely that the adoption process would be a private one, given the emphasis on a permanent change in the parent-child relationship.<|eot_id|>
A2


Generate: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.28s/it]


I think it's likely that the adoption process would be a private one, given the emphasis on a permanent change in the parent-child relationship.<|eot_id|>
B1


Generate: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.28s/it]


I think it's likely that the adoption process would be a private one, given the emphasis on a permanent change in the parent-child relationship.<|eot_id|>
B2


Generate: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.28s/it]


I think it's likely that the adoption process would be a private one, given the emphasis on a permanent change in the parent-child relationship.<|eot_id|>
C1


Generate: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.28s/it]


I think it's likely that the adoption process would be a private one, given the emphasis on a permanent change in the parent-child relationship.<|eot_id|>
C2


Generate: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.28s/it]

I think it's likely that the adoption process would be a private one, given the emphasis on a permanent change in the parent-child relationship.<|eot_id|>





In [32]:
import torch
from transformers import LogitsProcessor

In [168]:
class CEFRLogitsProcessor(LogitsProcessor):
    def __init__(self, tokenizer, cefr_model, levels, input_len, level, alpha):
        super().__init__()
        self.tokenizer = tokenizer
        self.cefr_model = cefr_model
        self.levels = levels
        self.input_len = input_len
        self.level = level
        self.alpha = alpha

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        candidate_tokens, _ = models.get_top_p_tok_k(scores, top_k=50)
        new_scores = float('-inf') * torch.ones_like(scores)
        if 128009 in candidate_tokens[:5]: # end of sequence likely
            new_scores[:,128009] = 1.
            return new_scores
        
        candidate_sequences = torch.cat([input_ids[:,self.input_len:].expand(len(candidate_tokens), -1), candidate_tokens.unsqueeze(1)], dim=-1)
        candidates = self.tokenizer.batch_decode(candidate_sequences)
        #print(candidates)
        
        model_input = tokenizer(candidates, return_tensors="pt").to(models.device)
        #print(model_input)
        last_logits = {}
        for lvl in self.levels:
            self.cefr_model.set_adapter(lvl)
            outputs = self.cefr_model(**model_input, labels=model_input.input_ids)
            last_logits[lvl] = outputs.logits.gather(2, model_input.input_ids.unsqueeze(-1)).squeeze(-1)[:,-1]

        #print(CEFR_logits)
        CEFR_scores = last_logits[level]
        #print(CEFR_scores)
        CEFR_sums = torch.zeros_like(last_logits[level])
        for lvl in self.levels:
            if lvl != level:
                CEFR_sums += last_logits[lvl]
        #print(list(zip(CEFR_scores, candidates)))
        #print(list(zip(CEFR_sums / (len(self.levels)-1), candidates)))
        #CEFR_scores -= CEFR_sums / (len(self.levels)-1)

        print(candidates[torch.argmax(CEFR_scores)])

        # choose the logit that has the highest difference to the others
        new_scores[:,candidate_tokens[torch.argmax(CEFR_scores)]] = 1.
        print(len(candidates))
        #new_scores[:,candidate_tokens] = scores[:,candidate_tokens]
        #for nr, score in CEFR_losses.items():
        #    CEFR_logits = torch.log(score[0]/score[0].sum() + 1e-9).to(models.device)
        #    CEFR_logits = grammar_logits - grammar_logits.mean() # re-center logits
        #    new_scores[:,candidate_tokens] = new_scores[:,candidate_tokens] + self.alpha * CEFR_logits
        return new_scores

def decoding(model, tokenizer, prompt, cefr_model, levels, level, do_sample=False, alpha=1):
    model_input = tokenizer(prompt, return_tensors="pt").to(models.device)
    input_len = model_input.input_ids.shape[1]
    kwargs = {"logits_processor": [CEFRLogitsProcessor(tokenizer, cefr_model, levels, input_len, level, alpha)],
              "renormalize_logits": True}
    
    token_ids = model.generate(**model_input,
                               max_new_tokens=128,
                               pad_token_id=tokenizer.eos_token_id,
                               eos_token_id=terminators,
                               do_sample=do_sample,
                               temperature=1 if do_sample else None,
                               top_p=0.95 if do_sample else None,
                               top_k=300 if do_sample else None,
                               **kwargs)
    return tokenizer.batch_decode(token_ids[:,input_len:], skip_special_tokens=True)[0]

In [169]:
context, response, source, id = helpers.sample_dialog_snippet(dialog_data)

In [170]:
item = {"context": context, "CEFR": level, "response": response}
item = get_prompt(item, apply_chat_template=tokenizer.apply_chat_template)
text = item['prompt']
print(text)

result = {lvl: decoding(model, tokenizer, item['prompt'], cefr_model, lvl_models.keys(), lvl) for lvl in lvl_models.keys()}

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Only output A's response.<|eot_id|><|start_header_id|>user<|end_header_id|>

Given the dialog, write a possible next turn of A.
Dialog:
A: I live not too far away from Allen.  The community is affluent.
B: As Americans we really take football seriously. The highest paid in the US Department defense are three football coaches from services branches.
A: Yes, we take football too seriously.  Do you know the football team that has the highest score ever? 
B: Hmm, I'm not sure.<|eot_id|><|start_header_id|>assistant<|end_header_id|>


"I
29
"I believe
3
"I believe it
2
"I believe it was
2
"I believe it was 
28
"I believe it was 201
50
"I believe it was 2019
6
"I believe it was 2019 and
5
"I believe it was 2019 and it
24
"I believe it was 2019 and it was
1
"I believe it was 2019 and it was 
50
"I believe it was 2019 and it was 111
50
"I believe it was 2019 and it was 111 to
3
"I believe it was 2019 and it was 111 to 
1
"I believe it

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [166]:
result

{'A1': '"We mostly work on routine checkups and diagnoses, though, no need to wait rooms drama, let\'s get on to cases, how can you describe this info concept, does it give you sense, feel out, so help us, help us feel, feel, give us to guide',
 'A2': '"I\'m at Memorial Healthcare. Yes, district hospital typically, by designation, services, referral complexity would, of all, would that, if, or are, or the, and,',
 'B1': "Unfortunately, our information for response from 'A' at the specified response of '<A at the stated character of '<I''",
 'B2': 'Unfortunately, our information suggests a type C designation makes much of a difference'}

In [167]:
for lvl, text in result.items():
    print(lvl)
    for lvl_test in lvl_models.keys():
        print(lvl_test)
        cefr_model.set_adapter(lvl_test)
        with models.torch.no_grad():
            model_input = tokenizer(text, return_tensors="pt").to(models.device)
            outputs = cefr_model(**model_input, labels=model_input.input_ids)
            loss = outputs.loss
            print(loss)

A1
A1
tensor(4.5044, device='cuda:0')
A2
tensor(4.6348, device='cuda:0')
B1
tensor(4.7345, device='cuda:0')
B2
tensor(4.5746, device='cuda:0')
A2
A1
tensor(5.3446, device='cuda:0')
A2
tensor(5.6318, device='cuda:0')
B1
tensor(5.4935, device='cuda:0')
B2
tensor(5.3641, device='cuda:0')
B1
A1
tensor(5.7747, device='cuda:0')
A2
tensor(5.9723, device='cuda:0')
B1
tensor(6.2233, device='cuda:0')
B2
tensor(5.9677, device='cuda:0')
B2
A1
tensor(5.8016, device='cuda:0')
A2
tensor(5.9299, device='cuda:0')
B1
tensor(6.0253, device='cuda:0')
B2
tensor(5.8694, device='cuda:0')
