In [2]:
!pip install -U transformers
!pip install -U datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/9a/06/e4ec2a321e57c03b7e9345d709d554a52c33760e5015fdff0919d9459af0/transformers-4.35.0-py3-none-any.whl.metadata
  Downloading transformers-4.35.0-py3-none-any.whl.metadata (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.1/123.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Obtaining dependency information for huggingface-hub<1.0,>=0.16.4 from https://files.pythonhosted.org/packages/ef/b5/b6107bd65fa4c96fdf00e4733e2fe5729bb9e5e09997f63074bb43d3ab28/huggingface_hub-0.18.0-py3-none-any.whl.metadata
  Downloading huggingface_hub-0.18.0-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Obtaining dependency information for regex!=2019.12.17 from https://files.pythonhosted.org/packages/8f/3e/4b8b40eb3c80aeaf360f0361d956d129bb3d23b2a3ecbe3a04a8f3bdd6d

## Download Dataset

In [3]:
!wget "https://www.dropbox.com/scl/fi/525gv6tmdi3n32mipo6mr/input.zip?rlkey=5jdsxahphk2ped5wxbxnv0n4y&dl=1" -O input.zip

--2023-11-08 06:11:01--  https://www.dropbox.com/scl/fi/525gv6tmdi3n32mipo6mr/input.zip?rlkey=5jdsxahphk2ped5wxbxnv0n4y&dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.80.18, 2620:100:6030:18::a27d:5012
Connecting to www.dropbox.com (www.dropbox.com)|162.125.80.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uc1704a7a42d1ecc97ace73260fa.dl.dropboxusercontent.com/cd/0/inline/CHKlgmfPs5xmpae6mqZx9E5dkNXAacxUijyP2cLhkyw8fN3IrJihLlWaOGZUaanX5M3RzRX5BGcZW7Y2KtP4omgSWj38v_Eyvrk6wX6eCYfqUHZja9ZSGgaam-5ENoL5FOU0n6Lnoqp58XN2u1_qAQKb/file?dl=1# [following]
--2023-11-08 06:11:02--  https://uc1704a7a42d1ecc97ace73260fa.dl.dropboxusercontent.com/cd/0/inline/CHKlgmfPs5xmpae6mqZx9E5dkNXAacxUijyP2cLhkyw8fN3IrJihLlWaOGZUaanX5M3RzRX5BGcZW7Y2KtP4omgSWj38v_Eyvrk6wX6eCYfqUHZja9ZSGgaam-5ENoL5FOU0n6Lnoqp58XN2u1_qAQKb/file?dl=1
Resolving uc1704a7a42d1ecc97ace73260fa.dl.dropboxusercontent.com (uc1704a7a42d1ecc97ace73260fa.dl.dropboxusercontent.com)... 162.1

In [4]:
!unzip input.zip

Archive:  input.zip
   creating: input/
  inflating: input/train.csv         
  inflating: input/valid.csv         


## Imports

In [5]:
import torch

from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset

## Configurations

In [6]:
MODEL = 't5-small'
BATCH_SIZE = 48
NUM_PROCS = 16
EPOCHS = 10
OUT_DIR = 'results_t5small'
MAX_LENGTH = 256 # Maximum context length to consider while preparing dataset.

## Prepare Dataset

In [7]:
dataset_train = load_dataset(
    'csv', 
    data_files='input/train.csv', 
    split='train'
)
dataset_valid = load_dataset(
    'csv', 
    data_files='input/valid.csv', 
    split='train'
)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['Id', 'Title', 'Body', 'Tags', 'CreationDate', 'Y'],
    num_rows: 45000
})
Dataset({
    features: ['Id', 'Title', 'Body', 'Tags', 'CreationDate', 'Y'],
    num_rows: 15000
})


In [9]:
print(dataset_train[0])

{'Id': 34552656, 'Title': 'Java: Repeat Task Every Random Seconds', 'Body': '<p>I\'m already familiar with repeating tasks every n seconds by using Java.util.Timer and Java.util.TimerTask. But lets say I want to print "Hello World" to the console every random seconds from 1-5. Unfortunately I\'m in a bit of a rush and don\'t have any code to show so far. Any help would be apriciated.  </p>\n', 'Tags': '<java><repeat>', 'CreationDate': '2016-01-01 00:21:59', 'Y': 'LQ_CLOSE'}


### Tokenization

In [10]:
tokenizer = T5Tokenizer.from_pretrained(MODEL)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Function to convert text data into model inputs and targets
def preprocess_function(examples):
    inputs = [f"assign tag: {title} {body}" for (title, body) in zip(examples['Title'], examples['Body'])]
    model_inputs = tokenizer(
        inputs, 
        max_length=MAX_LENGTH, 
        truncation=True,
        padding='max_length'
    )

    # Set up the tokenizer for targets
    cleaned_tag = [' '.join(''.join(tag.split('<')).split('>')[:-1]) for tag in examples['Tags']]
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            cleaned_tag, 
            max_length=MAX_LENGTH, 
            truncation=True,
            padding='max_length'
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the function to the whole dataset
tokenized_train = dataset_train.map(
    preprocess_function, 
    batched=True,
    num_proc=NUM_PROCS
)
tokenized_valid = dataset_valid.map(
    preprocess_function, 
    batched=True,
    num_proc=NUM_PROCS
)

Map (num_proc=16):   0%|          | 0/45000 [00:00<?, ? examples/s]

  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=16):   0%|          | 0/15000 [00:00<?, ? examples/s]



In [12]:
print(tokenized_train[0])

{'Id': 34552656, 'Title': 'Java: Repeat Task Every Random Seconds', 'Body': '<p>I\'m already familiar with repeating tasks every n seconds by using Java.util.Timer and Java.util.TimerTask. But lets say I want to print "Hello World" to the console every random seconds from 1-5. Unfortunately I\'m in a bit of a rush and don\'t have any code to show so far. Any help would be apriciated.  </p>\n', 'Tags': '<java><repeat>', 'CreationDate': '2016-01-01 00:21:59', 'Y': 'LQ_CLOSE', 'input_ids': [12317, 7860, 10, 10318, 10, 20469, 16107, 2181, 25942, 5212, 7, 3, 2, 102, 3155, 196, 31, 51, 641, 3324, 28, 6103, 53, 4145, 334, 3, 29, 3978, 57, 338, 10318, 5, 13780, 5, 13368, 52, 11, 10318, 5, 13780, 5, 13368, 52, 382, 9, 7, 157, 5, 299, 8857, 497, 27, 241, 12, 2281, 96, 566, 7126, 1150, 121, 12, 8, 8990, 334, 6504, 3978, 45, 209, 4525, 5, 4877, 27, 31, 51, 16, 3, 9, 720, 13, 3, 9, 10505, 11, 278, 31, 17, 43, 136, 1081, 12, 504, 78, 623, 5, 2372, 199, 133, 36, 3, 9, 2246, 4915, 1054, 5, 3, 2, 87, 1

## Model

In [13]:
model = T5ForConditionalGeneration.from_pretrained(MODEL)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

60,506,624 total parameters.
60,506,624 training parameters.


## Training

In [14]:
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=10,
    evaluation_strategy='steps',
    save_steps=500,
    eval_steps=500,
    load_best_model_at_end=True,
    save_total_limit=5,
    report_to='tensorboard',
    learning_rate=0.0001,
    fp16=True,
    dataloader_num_workers=4
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
)

history = trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
500,0.0953,0.080837
1000,0.0745,0.066722
1500,0.0701,0.061305
2000,0.0657,0.057793
2500,0.0572,0.0554
3000,0.0586,0.05365
3500,0.0592,0.052196
4000,0.0534,0.050981
4500,0.0529,0.049928
5000,0.0572,0.04918


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


In [15]:
tokenizer.save_pretrained(OUT_DIR)

('results_t5small/tokenizer_config.json',
 'results_t5small/special_tokens_map.json',
 'results_t5small/spiece.model',
 'results_t5small/added_tokens.json')

## Inference

In [16]:
!wget "https://www.dropbox.com/scl/fi/9brsjizymq5zvqi7hff09/inference_data.zip?rlkey=ukmdy5egmdld80r5hhmsja78v&dl=1" -O inference_data.zip

--2023-11-08 07:47:35--  https://www.dropbox.com/scl/fi/9brsjizymq5zvqi7hff09/inference_data.zip?rlkey=ukmdy5egmdld80r5hhmsja78v&dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.80.18, 2620:100:6030:18::a27d:5012
Connecting to www.dropbox.com (www.dropbox.com)|162.125.80.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://ucb8f5d590b9cff38e7b279767f0.dl.dropboxusercontent.com/cd/0/inline/CHKmPq4pDvYcp1JWemQFiSBOiC6q5w_icrL0vSG6Y1coa6ejAlaz7n1Is9E8zYRWrj7VJyJkTSe0cRGvjiF3nzpY2qPr-Pcup4sfg9dGu5l1nEuZTbUqzygIy4t7nDe8K0q5gYuy8804K2asQVFreSfe/file?dl=1# [following]
--2023-11-08 07:47:36--  https://ucb8f5d590b9cff38e7b279767f0.dl.dropboxusercontent.com/cd/0/inline/CHKmPq4pDvYcp1JWemQFiSBOiC6q5w_icrL0vSG6Y1coa6ejAlaz7n1Is9E8zYRWrj7VJyJkTSe0cRGvjiF3nzpY2qPr-Pcup4sfg9dGu5l1nEuZTbUqzygIy4t7nDe8K0q5gYuy8804K2asQVFreSfe/file?dl=1
Resolving ucb8f5d590b9cff38e7b279767f0.dl.dropboxusercontent.com (ucb8f5d590b9cff38e7b279767f0.dl.dropboxusercontent.com)

In [17]:
!unzip inference_data.zip

Archive:  inference_data.zip
   creating: inference_data/
  inflating: inference_data/text_1.txt  
  inflating: inference_data/text_2.txt  
  inflating: inference_data/text_3.txt  
  inflating: inference_data/text_4.txt  
  inflating: inference_data/text_5.txt  
  inflating: inference_data/text_6.txt  


In [18]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

import os

In [19]:
model_path = 'results_t5small/checkpoint-9000/'  # the path where you saved your model
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained('results_t5small')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
def do_correction(text, model, tokenizer):
    input_text = f"assign tag: {text}"
    inputs = tokenizer.encode(
        input_text,
        return_tensors='pt',
        max_length=256,
        padding='max_length',
        truncation=True
    )

    # Get correct sentence ids.
    corrected_ids = model.generate(
        inputs,
        max_length=256,
        num_beams=5, # `num_beams=1` indicated temperature sampling.
        early_stopping=True
    )

    # Decode.
    corrected_sentence = tokenizer.decode(
        corrected_ids[0],
        skip_special_tokens=True
    )
    return corrected_sentence

In [21]:
for file in os.listdir('inference_data/'):
    f = open(f"inference_data/{file}", 'r')
    sentence = f.read()
    corrected_sentence = do_correction(sentence, model, tokenizer)
    print(f"QUERY: {sentence}\nTAGS: {corrected_sentence}")
    print('-'*100)

QUERY: Repeat Task Every Random Seconds <p>I\'m already familiar with repeating tasks every n seconds by using Java.util.Timer and Java.util.TimerTask. But lets say I want to print \"Hello World\" to the console every random seconds from 1-5. Unfortunately I\'m in a bit of a rush and don\'t have any code to show so far. Any help would be apriciated

TAGS: java
----------------------------------------------------------------------------------------------------
QUERY: I have a C++ program that I compile on Mac OS 13.4.1 using Cmake

One of my users has the following error: dyld: cannot load 'my_program' (load command 0x80000034 is unknown)

I have no idea why he has this error message, he is on Mac OS 10.14.6 and we both use an Intel Mac

Here are some information about the binary that might be useful:

otool -L my_program
my_program:
        /System/Library/Frameworks/OpenCL.framework/Versions/A/OpenCL (compatibility version 1.0.0, current version 1.0.0)
        /usr/lib/libc++.1.dylib 