<a href="https://colab.research.google.com/github/rafaeljosem/MNA-ProyectoIntegrador_EQ10/blob/main/finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Instalación de dependencias
!pip install transformers datasets peft accelerate evaluate nltk numpy pandas

In [None]:
import inspect

import evaluate
import nltk
import numpy as np
import pandas as pd
import torch
from accelerate import Accelerator
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from tqdm import tqdm
from transformers import (
  AutoModelForCausalLM,
  AutoTokenizer,
  BitsAndBytesConfig,
  DataCollatorForLanguageModeling,
  Trainer,
  TrainingArguments,
)

# NOTE(review): this extension is loaded after pandas has already been imported
# in this cell; cudf.pandas presumably needs to be loaded before the first
# pandas import to take effect — confirm whether acceleration is active here.
# Assumes cudf is preinstalled in the runtime (it is not in the pip install
# list) — TODO confirm.
%load_ext cudf.pandas

In [None]:
# Download the NLTK 'punkt' resource into the local nltk_data directory.
nltk.download(info_or_id='punkt')

In [None]:
# Load and preprocess the data:
# read the compiled CSV, keep only the three columns used downstream, drop rows
# with missing values, renumber the index, and switch to snake_case column names.
df = (
  pd.read_csv('MexicanLaws_Clean_Compiled_PrePro_DataSet.csv')
  .loc[:, ['File Name', 'Text', 'Tokens']]
  .dropna()
  .reset_index(drop=True)
  .rename(columns={'File Name': 'file_name', 'Text': 'text', 'Tokens': 'tokens'})
)

In [None]:
# Tokenization and data formatting
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The data collator pads every batch and compute_metrics reads
# tokenizer.pad_token_id, so a padding token must exist. When the checkpoint
# does not define one, reuse the end-of-sequence token for padding.
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token

In [None]:
def format_prompt(text):
  """Wrap ``text`` in the instruction markers used for model prompts.

  Returns the literal ``<s>[INST] `` followed by ``text``, two spaces and
  ``[/INST]``.
  """
  # NOTE(review): the literal "<s>" is part of the template; if the tokenizer
  # also prepends its own BOS token the sequence may start with two — confirm.
  template = "<s>[INST] {}  [/INST]"
  return template.format(text)

def format_completion(text):
  """Wrap ``text`` in the ``[RESP]`` / ``[/RESP]`` response markers.

  Returns ``[RESP] ``, then ``text``, then `` [/RESP]``.
  """
  opening, closing = "[RESP] ", " [/RESP]"
  return opening + format(text) + closing

def tokenize(examples, max_length=512):
  """Turn a batch of texts into causal-LM training sequences.

  For every entry of ``examples['text']`` the formatted prompt and the
  formatted completion are tokenized (each truncated to ``max_length`` tokens)
  and concatenated into ONE sequence, so the model sees the prompt followed by
  the response it should produce.

  Previously ``input_ids`` held only the prompt and ``labels`` held the
  separately tokenized completion. The two had different lengths, which a
  causal LM cannot consume (labels must line up token-for-token with
  ``input_ids``). No ``labels`` column is written here: the
  ``DataCollatorForLanguageModeling(mlm=False)`` collator used with the Trainer
  builds labels from ``input_ids`` after padding, and a ragged ``labels``
  column would prevent it from batching the examples.

  Args:
    examples: batch mapping with a ``'text'`` list (as passed by
      ``Dataset.map(..., batched=True)``).
    max_length: per-part token budget for the prompt and for the completion.

  Returns:
    The same ``examples`` mapping with ``'input_ids'`` and
    ``'attention_mask'`` added.
  """
  prompts = [format_prompt(text) for text in examples['text']]
  completions = [format_completion(text) for text in examples['text']]

  tokenized_prompts = tokenizer(prompts, max_length=max_length, truncation=True)
  # The completion continues the prompt, so it must not get its own
  # beginning-of-sequence token.
  tokenized_completions = tokenizer(
    completions, max_length=max_length, truncation=True, add_special_tokens=False
  )

  input_ids = [
    prompt_ids + completion_ids
    for prompt_ids, completion_ids in zip(
      tokenized_prompts['input_ids'], tokenized_completions['input_ids']
    )
  ]

  examples['input_ids'] = input_ids
  # Every token is real content at this point; padding is added by the collator.
  examples['attention_mask'] = [[1] * len(ids) for ids in input_ids]

  return examples

In [None]:
# Convert the DataFrame into a Hugging Face Dataset and tokenize it, dropping
# the raw columns once the model inputs have been produced.
dataset = Dataset.from_pandas(df)
dataset = dataset.map(tokenize, batched=True, remove_columns=['file_name', 'text', 'tokens'])

# train_test_split returns a DatasetDict with 'train' and 'test' entries. Keep
# it under its own name so BOTH splits are read from it: indexing the unsplit
# `dataset` with 'test' would be a column lookup, not the held-out split.
# A fixed seed makes the 80/20 split reproducible across kernel restarts.
dataset_splits = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_splits['train']
val_dataset = dataset_splits['test']

In [None]:
# Load the base model.
# 8-bit quantization is requested through an explicit BitsAndBytesConfig;
# passing `load_in_8bit=True` directly to from_pretrained is deprecated.
# device_map='auto' lets the library place the weights on the available devices.
model = AutoModelForCausalLM.from_pretrained(
  model_name,
  quantization_config=BitsAndBytesConfig(load_in_8bit=True),
  device_map='auto',
)

In [None]:
# LoRA configuration for causal language modelling.
lora_config = LoraConfig(
  task_type="CAUSAL_LM",
  r=16,               # rank of the low-rank update matrices
  lora_alpha=32,      # scaling factor applied to the LoRA update
  lora_dropout=0.05,  # dropout applied inside the LoRA layers
  bias="none",        # bias parameters are not trained
)

In [None]:
# The base model was loaded in 8-bit, so preprocess it for training before
# attaching the adapters (PEFT's documented step for quantized models).
model = prepare_model_for_kbit_training(model)

# Attach the LoRA adapters and report how many parameters will be trained.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# Training setup.
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases (the old name is deprecated). Dependencies are not pinned, so pick
# whichever keyword the installed TrainingArguments actually accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
  'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

training_args = TrainingArguments(
  output_dir='./lora-mexican-laws',
  learning_rate=3e-4,
  num_train_epochs=3,
  per_device_train_batch_size=4,
  per_device_eval_batch_size=4,
  weight_decay=0.02,
  # Evaluate and checkpoint on the same 200-step cadence; keep the 3 newest checkpoints.
  eval_steps=200,
  save_strategy='steps',
  save_steps=200,
  save_total_limit=3,
  logging_steps=50,
  report_to='wandb',
  **{_eval_strategy_key: 'steps'},
)

In [None]:
# Assemble the Trainer from the LoRA-wrapped model, the training arguments and
# the train/validation splits.
trainer = Trainer(
  args=training_args,
  model=model,
  # mlm=False selects causal (next-token) collation instead of masked-LM collation.
  data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
  train_dataset=train_dataset,
  eval_dataset=val_dataset,
)

In [None]:
# Fine-tuning: run the Trainer's training loop.
trainer.train()

In [None]:
# Evaluation: load the ROUGE metric used by compute_metrics.
metric = evaluate.load(path='rouge')

In [None]:
def compute_metrics(eval_pred):
  """Compute ROUGE between teacher-forced predictions and the reference tokens.

  Args:
    eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be raw
      logits of shape (batch, seq, vocab) or already-reduced token ids of
      shape (batch, seq); ``labels`` are token ids with ``-100`` marking
      positions to ignore.

  Returns:
    The dictionary produced by ``metric.compute``.
  """
  predictions, labels = eval_pred
  if isinstance(predictions, tuple):
    predictions = predictions[0]
  predictions = np.asarray(predictions)
  labels = np.asarray(labels)

  # Raw logits cannot be decoded; reduce them to the most likely token id.
  if predictions.ndim == 3:
    predictions = predictions.argmax(axis=-1)

  # In a causal LM the output at position t predicts the token at t + 1, so
  # drop the last prediction and the first label to line the two up.
  predictions = predictions[:, :-1]
  labels = labels[:, 1:]

  # Fall back to EOS when the tokenizer defines no padding token.
  pad_id = tokenizer.pad_token_id
  if pad_id is None:
    pad_id = tokenizer.eos_token_id

  # Ignore every position the labels mark with -100, and any negative filler
  # value in the predictions, by replacing them with a token that
  # skip_special_tokens removes when decoding.
  keep = labels != -100
  predictions = np.where(keep & (predictions >= 0), predictions, pad_id)
  labels = np.where(keep, labels, pad_id)

  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
  return metric.compute(predictions=decoded_preds, references=decoded_labels)

def _logits_to_token_ids(logits, labels):
  """Reduce logits to argmax token ids so evaluation does not accumulate full-vocabulary logits."""
  if isinstance(logits, tuple):
    logits = logits[0]
  return logits.argmax(dim=-1)

# Trainer.evaluate() takes no `compute_metrics` argument; the metric function
# is read from the trainer instance, so register it there before evaluating.
trainer.compute_metrics = compute_metrics
trainer.preprocess_logits_for_metrics = _logits_to_token_ids
trainer.evaluate(eval_dataset=val_dataset)

In [None]:
# Inference
def inference(prompt, max_length=512):
  """Generate a response for ``prompt`` and print both prompt and response.

  Args:
    prompt: raw user text; it is wrapped with ``format_prompt`` before tokenizing.
    max_length: maximum total sequence length passed to ``model.generate``.

  Returns:
    None. The prompt and the decoded response are printed.
  """
  print(f"Prompt: {prompt}")
  # Move the inputs to wherever the model lives instead of assuming CUDA, and
  # keep the attention mask alongside the input ids.
  inputs = tokenizer(format_prompt(prompt), return_tensors='pt').to(model.device)
  # Make sure dropout (including LoRA dropout) is disabled while generating.
  model.eval()
  # Keyword arguments only: some PEFT wrappers reject positional input_ids.
  response = model.generate(**inputs, max_length=max_length)
  print(f"Response: {tokenizer.decode(response[0], skip_special_tokens=True)}")

# Try the model on a sample question (Spanish for "Who wrote the Mexican Constitution?").
inference("¿Quién escribió la Constitución Mexicana?")

In [None]:
# Packaging and sharing: write the model and the tokenizer files to the same
# local directory so they can be reloaded together.
model.save_pretrained(save_directory='./lora-mexican-laws')
tokenizer.save_pretrained(save_directory='./lora-mexican-laws')