In [1]:
# Hugging Face checkpoint used by default
MODEL_NAME = 'microsoft/deberta-v3-large'
# Base path (no extension) for the exported model on the mounted Drive; built
# from MODEL_NAME so the two cannot drift apart.
output_filename = f'/content/drive/MyDrive/OM/{MODEL_NAME}-english-language-learning-phraseology'
# Index of the fold held out for evaluation
FOLD = 0
# Maximum number of tokens per sample passed to the tokenizer (longer texts are truncated)
MAX_LENGTH = 512
# Per-device batch size for both training and evaluation
BATH = 4
# Number of training epochs
EPOCHS = 2
# Interval, in steps, used for evaluation, checkpointing and logging
STEP = 400

In [2]:
# Install (or upgrade to) the latest wandb release; no version is pinned here.
!pip install --upgrade wandb



In [3]:
import wandb
# Authenticate against Weights & Biases before starting a run
wandb.login()

# Start a run in the 'llmfine' project. The config dict only carries a single
# `autolog` entry -- TODO confirm whether anything actually reads it.
wandb.init(project='llmfine', config={"autolog": True})


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrafaelszimmermann[0m ([33mrafaelsudbrackzimmermann[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Montando ambiente

In [4]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset archive and the exported
# model paths used in this notebook live under that mount point.
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
import importlib.util
def is_installed(package_name):
    """Return True when `package_name` can be located by the import system.

    `importlib.util.find_spec` does not simply return None for every missing
    module: for a dotted name it imports the parent first and raises
    ModuleNotFoundError when that parent is absent, and it raises ValueError
    for a module that is loaded but has no `__spec__`. Both cases are turned
    into a boolean answer here instead of propagating.

    :param package_name: Importable module or package name, e.g. 'datasets'.
    :return: True if the module can be found (or is already loaded), else False.
    """
    import sys

    try:
        return importlib.util.find_spec(package_name) is not None
    except ImportError:
        # Parent package of a dotted name is missing, or the name is relative
        return False
    except ValueError:
        # No usable spec; fall back to whether the module is already loaded
        return package_name in sys.modules
# Check for 'datasets' and install it if necessary
# NOTE(review): 'accelerate' is upgraded only inside this branch, i.e. only
# when 'datasets' was missing -- confirm that coupling is intended.
if not is_installed('datasets'):
    !pip install datasets
    !pip install accelerate -U

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [6]:
import pandas as pd
import numpy as np
import torch

from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding

from datasets import Dataset
from sklearn.metrics import cohen_kappa_score

# Tokenizer

In [7]:
# Tokenizer matching the checkpoint currently named by MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize(sample):
    """Tokenize the ``full_text`` field of a sample, truncating at MAX_LENGTH tokens.

    Reads the module-level ``tokenizer`` at call time.
    """
    text = sample['full_text']
    return tokenizer(text, truncation=True, max_length=MAX_LENGTH)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



# separa os dados

In [8]:
import os
import zipfile
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def extract_file(zip_ref, file, destination):
    """Extract one archive member from an open ZipFile into `destination`."""
    zip_ref.extract(member=file, path=destination)

def extract_zip_parallel(zip_path, extract_path):
    """
    Extract every entry of a ZIP archive with a pool of worker threads.

    Entries are written under `extract_path` using the names stored in the
    archive. A progress bar advances once per entry, in submission order.

    NOTE(review): all workers share one open ZipFile, and entries that share a
    not-yet-existing parent directory may be extracted concurrently -- confirm
    this is safe on the Python version in use.

    :param zip_path: Path of the ZIP archive to extract.
    :param extract_path: Destination directory; created when it does not exist.
    """
    os.makedirs(extract_path, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as archive:
        # Names of all files and directories recorded in the archive
        members = archive.namelist()

        with tqdm(total=len(members), desc="Descompactando", unit="file") as progress, \
                ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
            # Queue every extraction before waiting on any of them
            pending = []
            for member in members:
                pending.append(pool.submit(extract_file, archive, member, extract_path))

            # result() blocks until the task is done and re-raises its exception, if any
            for task in pending:
                task.result()
                progress.update(1)

# Example usage
# The archive is read from the mounted Drive. The destination is a relative
# path, so it resolves against the current working directory.
zip_path = '/content/drive/MyDrive/OM/datasets/feedback-prize-english-language-learning.zip'
extract_path = 'learning'
extract_zip_parallel(zip_path, extract_path)

Descompactando: 100%|██████████| 3/3 [00:00<00:00,  7.18file/s]


In [9]:
# Load the training table from the directory the archive was extracted into.
# The path is built from `extract_path` so that extraction and loading always
# agree, instead of assuming the working directory is /content.
df_train = pd.read_csv(os.path.join(extract_path, 'train.csv'))

df_train.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [10]:
# Show how many rows carry each score value for two of the rated columns
for score_column in ('phraseology', 'conventions'):
    display(df_train[score_column].value_counts())

Unnamed: 0_level_0,count
phraseology,Unnamed: 1_level_1
3.0,1153
3.5,929
2.5,772
4.0,553
2.0,350
4.5,108
5.0,25
1.5,11
1.0,10


Unnamed: 0_level_0,count
conventions,Unnamed: 1_level_1
3.0,1151
3.5,908
2.5,784
4.0,484
2.0,402
4.5,122
5.0,25
1.5,20
1.0,15


In [11]:
def safe_hex_conversion(x):
    """Convert an identifier to an integer without ever raising.

    Conversion is attempted in this order:

    1. Parse `x` as a base-16 string.
    2. Parse `x` as a float and truncate it to an integer.
    3. Give up and return 0.

    Non-string inputs (None, NaN, plain numbers) and non-finite values such as
    'inf' also end up in steps 2 and 3 instead of raising.

    :param x: Value to convert, normally a hexadecimal string.
    :return: The parsed integer, or 0 when no conversion is possible.
    """
    try:
        # Try a direct base-16 conversion first
        return int(x, 16)
    except (TypeError, ValueError):
        # TypeError: int() with an explicit base only accepts str/bytes
        pass

    try:
        # Otherwise read the value as a float and truncate it
        return int(float(x))
    except (TypeError, ValueError, OverflowError):
        # ValueError: unparsable text or NaN; OverflowError: infinity
        return 0


In [12]:
def round_to_half(number):
    """Round a scalar or NumPy array to the nearest multiple of 0.5."""
    doubled = number * 2
    return np.round(doubled) / 2

def map_predictions(preds):
    """Map scores on the 0.0-4.0 half-point grid to integer classes 0-8.

    Each value is rounded to the nearest half point and converted to its class
    index (0.0 -> 0, 0.5 -> 1, ..., 4.0 -> 8). Values beyond either end of the
    grid are clamped to the nearest valid class rather than raising, so a
    regression output slightly outside the label range cannot abort an
    evaluation pass.

    :param preds: Scalar or array-like of scores; any shape, flattened to 1-D.
    :return: 1-D integer NumPy array of class indices in [0, 8].
    :raises ValueError: If any value is NaN.
    """
    values = np.asarray(preds, dtype=float).reshape(-1)
    if np.isnan(values).any():
        raise ValueError("map_predictions received NaN values")

    # Doubling turns half-point steps into whole numbers: class = round(score * 2)
    half_steps = np.round(values * 2)
    return np.clip(half_steps, 0, 8).astype(int)

def compute_metrics(p):
    """Compute quadratic weighted kappa (QWK) between predictions and labels.

    :param p: Pair ``(predictions, labels)``; both are flattened to 1-D.
    :return: Dict with a single entry, ``{'qwk': <float>}``.
    """
    preds, labels = p

    # Flatten so (N, 1) regression outputs and (N,) labels are handled alike,
    # and clamp predictions to the 0.0-4.0 range that map_predictions covers,
    # so an out-of-range output cannot abort evaluation.
    preds = np.clip(np.asarray(preds, dtype=float).reshape(-1), 0.0, 4.0)
    labels = np.asarray(labels, dtype=float).reshape(-1)

    mapped_preds = map_predictions(round_to_half(preds))
    mapped_labels = map_predictions(labels)

    # Pass the full class list explicitly: without it the quadratic weights are
    # built from the rank of the classes that happen to be present, which
    # misstates the distance between scores when a class is absent from a fold.
    score = cohen_kappa_score(
        mapped_labels,
        mapped_preds,
        weights='quadratic',
        labels=list(range(9)),
    )
    return {'qwk': score}

# Baseline Trainer configuration. The training cell further down builds its own
# TrainingArguments, so this object acts as a reference default.
train_args = TrainingArguments(
    output_dir='deberta-large-fold0',
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type='linear',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATH,
    per_device_eval_batch_size=BATH,
    gradient_accumulation_steps=4,
    fp16=True,
    # evaluation, checkpointing and logging all happen every STEP steps
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=STEP,
    save_strategy="steps",
    save_steps=STEP,
    save_total_limit=1,
    save_safetensors=True,
    logging_steps=STEP,
    report_to="none",
    # model selection: higher QWK is better
    metric_for_best_model="qwk",
    greater_is_better=True,
)



# Treino

In [13]:
import shutil

# Size suffix of the DeBERTa-v3 checkpoint trained in this cell. Together with
# STEP below, it overrides the values set in the configuration cell at the top.
m = 'small'
# Evaluate, checkpoint and log every 100 steps for this run
STEP = 100

# Score columns to fit, one regression model per column. Other score columns in
# train.csv: 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'.
target_columns = ['cohesion']

# Fold membership (five folds) depends only on text_id, so it is computed once
# here instead of on every pass through the loop.
df_train['fold'] = df_train['text_id'].map(safe_hex_conversion).mod(5)

for i, name_col in enumerate(target_columns):
    display(df_train[name_col].value_counts())
    MODEL_NAME = f'microsoft/deberta-v3-{m}'
    output_filename = f'/content/drive/MyDrive/OM/microsoft/deberta-v3-{m}-english-language-learning-{name_col}'

    # data -----------------------------------------------------------------
    # Reload the tokenizer for the checkpoint being fine-tuned. tokenize() and
    # the data collator read this module-level `tokenizer`, which would
    # otherwise still belong to the MODEL_NAME of the configuration cell.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Shift the scores down by one so that labels start at 0
    df_train['labels'] = df_train[name_col] - 1

    ds_train = Dataset.from_pandas(df_train[df_train.fold != FOLD])
    ds_eval = Dataset.from_pandas(df_train[df_train.fold == FOLD])

    # Tokenize, then drop every column that is not fed to the model
    columns_to_keep = ['labels', 'input_ids', 'token_type_ids', 'attention_mask']
    columns_to_remove = [col for col in ds_train.column_names if col not in columns_to_keep]

    ds_train = ds_train.map(tokenize).remove_columns(columns_to_remove)
    ds_eval = ds_eval.map(tokenize).remove_columns(columns_to_remove)

    # args -----------------------------------------------------------------
    # NOTE(review): `i` is the position of the target column, not a fold index,
    # despite the "fold" in the directory name.
    # NOTE(review): newer transformers releases may expect `eval_strategy`
    # instead of `evaluation_strategy` -- confirm against the installed version.
    train_args = TrainingArguments(
        output_dir=f'deberta-{m}-fold{i}',
        fp16=True,
        learning_rate=2e-5,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATH,
        per_device_eval_batch_size=BATH,
        gradient_accumulation_steps=4,
        report_to="wandb",  # log metrics to the active wandb run
        evaluation_strategy="steps",
        do_eval=True,
        eval_steps=STEP,
        save_steps=STEP,
        logging_steps=STEP,
        save_total_limit=1,
        save_strategy="steps",
        lr_scheduler_type='linear',
        metric_for_best_model="qwk",
        greater_is_better=True,
        warmup_ratio=0.1,
        weight_decay=0.01,
        save_safetensors=True
    )

    # model ----------------------------------------------------------------
    # A single output unit: the score is predicted as a regression target
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=1,
    )

    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=ds_train,
        eval_dataset=ds_eval,
        data_collator=DataCollatorWithPadding(tokenizer),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # train ----------------------------------------------------------------
    trainer.train()

    # save -----------------------------------------------------------------
    # Zip the directory the Trainer actually wrote to, rather than a hard-coded
    # /content path, and upload the file make_archive reports having created.
    output_dir = output_filename.replace('.zip', '')
    archive_path = shutil.make_archive(output_dir, 'zip', train_args.output_dir)
    artifact = wandb.Artifact(name='model_artifact', type='model')
    artifact.add_file(archive_path)
    wandb.log_artifact(artifact)


Unnamed: 0_level_0,count
cohesion,Unnamed: 1_level_1
3.0,1096
3.5,988
2.5,790
4.0,534
2.0,315
4.5,125
1.5,27
5.0,26
1.0,10


Map:   0%|          | 0/3128 [00:00<?, ? examples/s]

Map:   0%|          | 0/783 [00:00<?, ? examples/s]



config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

Step,Training Loss,Validation Loss,Qwk
100,2.0511,0.438098,0.208951
200,0.3006,0.281559,0.536553
300,0.2549,0.317053,0.545991


In [14]:
# Close the active wandb run
wandb.finish()

VBox(children=(Label(value='22.250 MB of 807.595 MB uploaded\r'), FloatProgress(value=0.02755089774099807, max…

0,1
eval/loss,█▁▃
eval/qwk,▁██
eval/runtime,▄█▁
eval/samples_per_second,▅▁█
eval/steps_per_second,▅▁█
train/epoch,▁▁▃▃▆▆█
train/global_step,▁▁▃▃▆▆█
train/grad_norm,█▂▁
train/learning_rate,█▅▁
train/loss,█▁▁

0,1
eval/loss,0.31705
eval/qwk,0.54599
eval/runtime,9.3327
eval/samples_per_second,83.899
eval/steps_per_second,21.001
total_flos,813395342562000.0
train/epoch,1.99488
train/global_step,390.0
train/grad_norm,11.79097
train/learning_rate,1e-05


In [None]:
import time

from google.colab import runtime

# Grace period, in seconds, before the Colab runtime is released
inactivity_time = 2 * 60

# This is a fixed delay, not real inactivity detection. A single sleep waits
# exactly `inactivity_time`, where polling once a minute would round the delay
# up to the next whole minute.
time.sleep(inactivity_time)

# Ask Colab to unassign (release) this runtime
print("Logging out of Colab...")
runtime.unassign()