In [None]:
# instalar librerias

!pip install torch==2.2.2
!pip install -q git+https://github.com/huggingface/transformers
!pip install -q bitsandbytes accelerate xformers einops
!pip install -q hf_transfer
!pip install trl
!pip install peft

Collecting torch==2.2.2
  Downloading torch-2.2.2-cp310-cp310-manylinux1_x86_64.whl (755.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.2)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.2)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.2)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.2)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.2.2)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2

In [None]:
# Model, LoRA, training and quantization configuration shared by every fold.

import os

model_name = 'meta-llama/Meta-Llama-3-8B'
# Hugging Face access token. Read from the HF_TOKEN environment variable so the
# secret is never stored in the notebook; when it is unset, None is passed and
# the Hub client falls back to its own cached credentials.
access_token = os.environ.get('HF_TOKEN') or None

# LoRA hyperparameters: alpha (scaling) and rank
lora_alpha = 16
r = 16

# load LoRA configuration
from peft import LoraConfig
lora_config = LoraConfig(
    lora_alpha = lora_alpha,
    r = r,
    lora_dropout = 0.1,
    bias = 'none',
)

# training arguments
# NOTE(review): with lr_scheduler_type='constant' the warmup_ratio below appears
# to have no effect ('constant_with_warmup' would apply it) -- confirm intent.
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
training_arguments = TrainingArguments(
    output_dir = './results',
    num_train_epochs = 1,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 1,
    optim = 'paged_adamw_32bit',
    save_steps = 25,
    logging_steps = 25,
    learning_rate = 2e-4,
    weight_decay = 0.001,
    fp16 = False,
    bf16 = False,
    max_grad_norm = 0.3,
    warmup_ratio = 0.03,
    group_by_length = True,
    lr_scheduler_type = 'constant',
    report_to = 'tensorboard'
)

# bitsandbytes configuration: 4-bit NF4 weights, float16 compute, no double quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = 'float16',
    bnb_4bit_use_double_quant = False,
)

# tokenizer
# NOTE(review): trust_remote_code=True is kept from the original; it is probably
# not required for this model -- confirm and drop if so.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token = access_token,
    trust_remote_code = True
    )

# The tokenizer is given the EOS token as padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right. The attribute is `padding_side`; the previous
# `padding_size` only created an unused attribute and changed nothing.
tokenizer.padding_side = 'right'

# Seed for reproducibility of the experiments
def set_seed(seed):
    """Seed the PyTorch, NumPy and Python RNGs and set the cuDNN flags.

    Args:
        seed: value handed unchanged to ``torch.manual_seed``,
            ``np.random.seed`` and ``random.seed`` (in that order).

    Returns:
        None. The function only mutates global RNG / backend state.
    """
    # Same seeding order as before: torch, then numpy, then the stdlib.
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(seed)

    # Ask cuDNN for deterministic kernels and disable its auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


In [None]:
# Load libraries and data
import pandas as pd
from sklearn.model_selection import train_test_split
from trl import SFTTrainer
import torch
import transformers
import numpy as np
from sklearn.model_selection import KFold
import datasets
from datasets import Dataset
from accelerate.utils import release_memory
import random

# Read the dataset from a CSV file in the working directory. The
# cross-validation cell reads its 'text' column and splits every entry on
# 'Phrase: ' and 'Lemmas: ', so each row is expected to contain both markers.
data = pd.read_csv('training_test_data_textFormat_en.csv')

In [None]:
# Cross validation.
# For every fold: reload the 4-bit base model, fine-tune it with LoRA on the
# training split, then generate a phrase for each test prompt. The reference
# phrases, generated phrases and lemma lists are accumulated across all folds.

n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=False)  # create folds

real_phrases = []
generated_phrases = []
lemmas = []

for train_index, test_index in kf.split(data):
    set_seed(7)
    # KFold yields positional indices, so select rows with .iloc (label-based
    # indexing would only be correct for a default RangeIndex).
    training_data = data['text'].iloc[train_index]
    test_data = data['text'].iloc[test_index]

    training_data = Dataset.from_pandas(training_data.to_frame().reset_index())
    test_data = Dataset.from_pandas(test_data.to_frame().reset_index())

    # Free the model and trainer left over from the previous fold
    if 'model' in locals():
        model, trainer = release_memory(model, trainer)
        print('Offloaded previous model and trainer')

    # Load a fresh quantized base model for this fold
    model = AutoModelForCausalLM.from_pretrained(model_name,
                                                 device_map='auto',
                                                 quantization_config=bnb_config,
                                                 token=access_token)

    model.config.use_cache = False
    model.config.pretraining_tp = 1

    trainer = SFTTrainer(
        model=model,
        train_dataset=training_data,
        peft_config=lora_config,
        dataset_text_field='text',
        max_seq_length=30,
        tokenizer=tokenizer,
        args=training_arguments,
        packing=True
    )

    trainer.train()

    # Training leaves the modules in train mode; switch to eval mode so that
    # dropout (including the LoRA dropout) is disabled while generating.
    model.eval()

    for p in test_data:   # iterate over the test phrases
        text = p['text'].split('Phrase: ')
        prompt = text[0]  # everything before 'Phrase: ', i.e. the 'Lemmas: {list of lemmas}' part
        ph = text[1]
        real_phrases.append(ph)
        lemmas.append(prompt.split('Lemmas: ')[1].strip('\n'))

        # Feed the test prompt to the model. The encoded inputs are moved to the
        # model's device, and the attention mask and pad token id are passed
        # explicitly instead of being guessed by generate().
        inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
        # NOTE(review): max_length counts the prompt tokens as well as the
        # generated ones -- confirm 50 is enough for the longest prompt.
        generate_ids = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=50,
            pad_token_id=tokenizer.eos_token_id,
        )
        output = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

        # If the output follows the prompt format, the predicted phrase is what
        # comes after 'Phrase: '; otherwise an empty phrase is recorded.
        output_parts = output.split('Phrase: ')
        generated_phrases.append(output_parts[1] if len(output_parts) > 1 else '')

In [None]:
# Clean the generated phrases and save the results
from bs4 import BeautifulSoup

# Parse every generated phrase with the lxml parser, keep only its text
# content and strip trailing whitespace.
clean = [
    BeautifulSoup(phrase, "lxml").text.rstrip()
    for phrase in generated_phrases
]

# One row per test phrase: its lemmas, the reference phrase and the cleaned prediction.
results = pd.DataFrame({
    'lemmas': lemmas,
    'real phrases': real_phrases,
    'generated phrases': clean,
})

# The file name records the LoRA rank, alpha and number of folds used.
results.to_csv(f'llama3_results_r_{r}_alpha_{lora_alpha}_n{n_splits}_en.csv')