In [None]:
pip install datasets transformers torch bert_score accelerate -U vaderSentiment --quiet sentence_transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━

In [None]:
import os
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import Dataset, load_dataset
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer, util
from bert_score import score
import re
import pandas as pd
from google.colab import drive
from itertools import product
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def load_and_preprocess_data(data, cantidad_ejemplos, desde=0):
    """Load a conversational dataset and flatten it into User/Therapist pairs.

    The ``train`` split must expose a ``conversations`` column in which every
    entry is a sequence of dicts with ``from`` (``'human'`` or ``'gpt'``) and
    ``value`` keys. Human and gpt turns are paired in order with ``zip``, so
    unmatched trailing turns are dropped.

    Args:
        data: Dataset identifier forwarded to ``load_dataset``.
        cantidad_ejemplos: Number of conversations to read.
        desde: Index of the first conversation to read (default 0).

    Returns:
        A DataFrame with a single ``conversation`` column; each row has the
        form ``"User: <human turn> Therapist: <gpt turn>"``.
    """
    from itertools import islice

    start = max(desde, 0)
    stop = desde + cantidad_ejemplos
    if cantidad_ejemplos <= 0 or stop <= start:
        # The previous counter-based loop never reached its stop condition
        # for an empty window and silently consumed the whole split.
        return pd.DataFrame({'conversation': []})

    dataset = load_dataset(data)
    name_pattern = re.compile(r'\b(charlie|Charlie)\b')

    conversations = []
    for conversation in islice(dataset['train']['conversations'], start, stop):
        human_conv = [conv['value'] for conv in conversation if conv['from'] == 'human']
        gpt_conv = [conv['value'] for conv in conversation if conv['from'] == 'gpt']

        for h, g in zip(human_conv, gpt_conv):
            # Replace the hard-coded name with a placeholder so the user can
            # be asked for their name when the chat starts.
            h = name_pattern.sub('username', h)
            g = name_pattern.sub('username', g)

            conversations.append(f"User: {h} Therapist: {g}")

    return pd.DataFrame({'conversation': conversations})

# Split the dataset into training and test sets
def split_dataset(df, test_size=0.2, random_state=None):
    """Shuffle ``df`` (after dropping rows with missing values) and split it.

    Implemented with pandas only: the previous version called
    ``train_test_split``, which is never imported in this file.

    Args:
        df: DataFrame to split.
        test_size: A float strictly between 0 and 1 (fraction of rows placed
            in the test set, rounded up) or an int (absolute number of test
            rows).
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the shuffle reproducible.

    Returns:
        A ``(train_df, test_df)`` tuple of disjoint DataFrames.

    Raises:
        ValueError: If ``test_size`` is out of range or either resulting
            subset would be empty.
    """
    import math

    clean_df = df.dropna()
    n_rows = len(clean_df)

    if isinstance(test_size, float):
        if not 0.0 < test_size < 1.0:
            raise ValueError(f"test_size={test_size} must be strictly between 0 and 1")
        n_test = math.ceil(n_rows * test_size)
    else:
        n_test = int(test_size)

    if not 0 < n_test < n_rows:
        raise ValueError(
            f"Cannot split {n_rows} rows into {n_test} test rows and a non-empty training set"
        )

    shuffled = clean_df.sample(frac=1, random_state=random_state)
    test_df = shuffled.iloc[:n_test]
    train_df = shuffled.iloc[n_test:]
    return train_df, test_df

def preprocess_data(examples, tokenizer):
    """Tokenize a batch of conversations and attach language-modeling labels.

    Args:
        examples: Mapping with a ``conversation`` entry holding the texts of
            the batch.
        tokenizer: Callable tokenizer returning PyTorch tensors; must expose
            ``pad_token_id``.

    Returns:
        The tokenizer output with every tensor converted to a nested list
        (one inner list per example) plus a ``labels`` entry that copies
        ``input_ids`` with padding positions set to -100.
    """
    inputs = tokenizer(
        examples['conversation'],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    # Labels start as a copy of the input ids; positions equal to the pad
    # token id are set to -100 so the loss ignores them.
    labels = inputs['input_ids'].clone()
    labels[inputs['input_ids'] == tokenizer.pad_token_id] = -100
    inputs['labels'] = labels

    # Convert tensors to lists for dataset compatibility. No squeeze() here:
    # squeezing collapses the batch dimension when the batch holds a single
    # example, producing a flat list instead of one row per example.
    for key in list(inputs.keys()):
        inputs[key] = inputs[key].tolist()

    return inputs

def fine_tune_gpt2(train_dataset, tokenizer, device, output_dir, batch_num, size):
    """Fine-tune GPT-2 on one data batch and save the resulting checkpoint.

    For ``batch_num == 0`` training starts from the pretrained ``gpt2-medium``
    weights; otherwise it resumes from the checkpoint saved for the previous
    batch under ``output_dir``. The model and tokenizer are written both to
    ``<output_dir>/batch_<batch_num>`` and to a fixed Google Drive folder.

    Args:
        train_dataset: Tokenized dataset passed to the ``Trainer``.
        tokenizer: Tokenizer used by the data collator and saved next to the
            model.
        device: Device the model is moved to (``torch.device`` or string).
        output_dir: Root directory for the per-batch checkpoints.
        batch_num: Zero-based index of the incremental training step.
        size: Value used only in the progress message.
    """
    # Load or initialize the model
    if batch_num == 0:
        model = GPT2LMHeadModel.from_pretrained("gpt2-medium").to(device)
    else:
        model = GPT2LMHeadModel.from_pretrained(os.path.join(output_dir, f"batch_{batch_num-1}")).to(device)

    # Set pad_token_id in the model config
    model.config.pad_token_id = tokenizer.pad_token_id

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # Mixed precision is only enabled on CUDA; requesting fp16 on the CPU
    # fallback makes the training setup fail.
    use_fp16 = torch.device(device).type == "cuda"

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=1,
        per_device_train_batch_size=8,
        fp16=use_fp16,
        logging_dir='./logs',
        logging_steps=500,
        dataloader_num_workers=2,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    print(f"Starting fine-tuning for batch size {size}")
    trainer.train()

    batch_output_dir = os.path.join(output_dir, f"batch_{batch_num}")
    os.makedirs(batch_output_dir, exist_ok=True)
    model.save_pretrained(batch_output_dir)
    tokenizer.save_pretrained(batch_output_dir)
    print(f"Fine-tuned model saved to {batch_output_dir}")

    # Google Drive directory where the model is also stored
    gdrive_output_dir = f'/content/drive/My Drive/ModeloLLM2/batch_{batch_num}'

    # Create the directory if it does not exist
    os.makedirs(gdrive_output_dir, exist_ok=True)

    # Save the model and the tokenizer to the Google Drive directory
    model.save_pretrained(gdrive_output_dir)
    tokenizer.save_pretrained(gdrive_output_dir)
    print(f"Fine-tuned model also saved to Google Drive at {gdrive_output_dir}")

# Project entry point: incremental fine-tuning
def main():
    """Load the data, tokenize it, and fine-tune GPT-2 in cumulative slices.

    ``batch_sizes`` holds cumulative end offsets into the tokenized dataset
    (rows are User/Therapist pairs, not whole conversations): step ``k``
    trains on rows ``[batch_sizes[k-1], batch_sizes[k])``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Dataset (labelled "ALEX" in the original notebook)
    train_df = load_and_preprocess_data("jerryjalapeno/nart-100k-synthetic", 20000)

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium", padding_side='left')
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    # Convert the DataFrame into a `datasets` Dataset
    train_dataset = Dataset.from_pandas(train_df)
    train_dataset = train_dataset.map(lambda x: preprocess_data(x, tokenizer), batched=True)

    # Incremental fine-tuning
    output_dir = "./finetuned_model"
    batch_sizes = [10000, 20000]

    start = 0
    for batch_num, size in enumerate(batch_sizes):
        end = min(size, len(train_dataset))
        if start >= end:
            # The dataset ran out before this slice: training on an empty
            # selection would fail, and later slices would be empty as well.
            print(f"No data left for batch {batch_num}; stopping incremental fine-tuning")
            break

        batch = train_dataset.select(range(start, end))
        fine_tune_gpt2(batch, tokenizer, device, output_dir, batch_num, size)
        start = size

if __name__ == "__main__":
    main()

In [None]:
# Generation-parameter configuration and evaluation helpers

def load_model_from_drive(device, model_dir='/content/drive/My Drive/ModelosLLM/batch_7'):
    """Load a fine-tuned GPT-2 checkpoint and its tokenizer.

    Args:
        device: Device the model is moved to.
        model_dir: Directory holding the saved model and tokenizer. Defaults
            to the ``batch_7`` checkpoint on Google Drive that this function
            previously hard-coded.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer pads on the left and
        reuses the EOS token as its padding token.
    """
    model = GPT2LMHeadModel.from_pretrained(model_dir).to(device)
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    tokenizer.padding_side = 'left'
    tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer


def output_params(temperature, top_k, repetition, ngram_size):
    """Assemble the keyword arguments used for sampled text generation.

    Args:
        temperature: Sampling temperature.
        top_k: Value stored under ``top_k``.
        repetition: Value stored under ``repetition_penalty``.
        ngram_size: Value stored under ``no_repeat_ngram_size``.

    Returns:
        A dict with the four values above plus the fixed settings
        ``max_new_tokens=50`` and ``do_sample=True``.
    """
    return dict(
        max_new_tokens=50,
        top_k=top_k,
        temperature=temperature,
        repetition_penalty=repetition,
        do_sample=True,
        no_repeat_ngram_size=ngram_size,
    )

def generate_responses_batch(model, tokenizer, questions, device, temperature, top_k, repetition, ngram_size, batch_size=8):
    """Generate one therapist reply per question, processing them in batches.

    Each question is wrapped as ``"User: <question> Therapist:"`` and passed
    to ``model.generate`` with the settings built by ``output_params``.

    Args:
        model: Causal language model exposing ``eval`` and ``generate``.
        tokenizer: Tokenizer matching ``model``; must have a pad token.
        questions: Sequence of user messages.
        device: Device the tokenized inputs are moved to.
        temperature, top_k, repetition, ngram_size: Sampling settings
            forwarded to ``output_params``.
        batch_size: Number of prompts generated per forward pass.

    Returns:
        A list of reply strings, one per question, in input order.
    """
    model.eval()
    generation_kwargs = output_params(temperature, top_k, repetition, ngram_size)
    responses = []
    for i in range(0, len(questions), batch_size):
        batch = questions[i:i + batch_size]
        prompts = [f"User: {question} Therapist:" for question in batch]
        inputs = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        # Every row of the padded batch has this length, so anything beyond
        # it in the output is newly generated text.
        prompt_length = inputs['input_ids'].shape[1]
        with torch.no_grad():
            outputs = model.generate(
                inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                pad_token_id=tokenizer.pad_token_id,
                **generation_kwargs
            )
        for output in outputs:
            # Decode only the generated tokens. Splitting the full decoded
            # text on "Therapist:" raised IndexError when truncation cut the
            # marker from the prompt, and returned part of the question when
            # the question itself contained the marker.
            completion = tokenizer.decode(output[prompt_length:], skip_special_tokens=True)
            # Keep only the first reply if the model starts another turn.
            responses.append(completion.split("Therapist:")[0].strip())
    return responses

def affective_content_analysis(responses):
    """Score each response with VADER and return the compound values.

    Args:
        responses: Iterable of response strings.

    Returns:
        A list with the ``compound`` polarity score of each response, in
        input order.
    """
    scorer = SentimentIntensityAnalyzer()
    compound_scores = []
    for text in responses:
        polarity = scorer.polarity_scores(text)
        compound_scores.append(polarity['compound'])
    return compound_scores

def sentence_transformer_evaluation(human_responses, generated_responses):
    """Return the mean cosine similarity between paired responses.

    Both lists are embedded with the ``paraphrase-MiniLM-L6-v2`` sentence
    transformer, and the i-th reference is compared with the i-th generated
    response.

    Args:
        human_responses: Reference responses.
        generated_responses: Model responses, aligned with the references.

    Returns:
        The average pairwise cosine similarity as a float, or ``0.0`` when
        there are no pairs to compare.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    pair_count = len(human_responses)
    if pair_count != len(generated_responses):
        raise ValueError(
            f"Expected equally sized inputs, got {pair_count} references "
            f"and {len(generated_responses)} generated responses"
        )
    if pair_count == 0:
        # Nothing to compare; avoids a division by zero and a model load.
        return 0.0

    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    # Encode each side once instead of issuing one encode call per pair.
    reference_embeddings = model.encode(list(human_responses), convert_to_tensor=True)
    generated_embeddings = model.encode(list(generated_responses), convert_to_tensor=True)
    similarities = torch.nn.functional.cosine_similarity(
        reference_embeddings, generated_embeddings, dim=1
    )
    return similarities.double().mean().item()

def evaluate_model(model, tokenizer, questions, reference_responses, device, temperature, top_k, repetition, ngram_size):
    """Generate replies for ``questions`` and score them.

    Returns:
        A ``(vader_scores, sentence_transformer_score)`` tuple: the
        per-response scores from ``affective_content_analysis`` and the
        similarity to ``reference_responses`` from
        ``sentence_transformer_evaluation``.
    """
    replies = generate_responses_batch(
        model,
        tokenizer,
        questions,
        device,
        temperature,
        top_k,
        repetition,
        ngram_size,
    )
    sentiment = affective_content_analysis(replies)
    similarity = sentence_transformer_evaluation(reference_responses, replies)
    return sentiment, similarity

def evaluate_gpt2_base(test_df, device, temperature, top_k, repetition, ngram_size):
    """Evaluate the pretrained ``gpt2`` checkpoint and print the averages.

    Args:
        test_df: DataFrame whose ``conversation`` column holds strings of the
            form ``"User: <question> Therapist: <answer>"``.
        device: Device the model is moved to.
        temperature, top_k, repetition, ngram_size: Sampling settings
            forwarded to ``evaluate_model``.
    """
    checkpoint = "gpt2"
    model = GPT2LMHeadModel.from_pretrained(checkpoint).to(device)
    tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
    # Pad on the left and reuse the EOS token as the padding token.
    tokenizer.padding_side = 'left'
    tokenizer.pad_token = tokenizer.eos_token

    # Split every conversation into the user question and the reference reply.
    questions = []
    reference_responses = []
    for conv in test_df['conversation'].tolist():
        pieces = conv.split(" Therapist:")
        questions.append(pieces[0].replace("User: ", ""))
        reference_responses.append(pieces[1].strip())

    print('Evaluando Modelo Base...')
    vader_scores, similarity_score = evaluate_model(
        model,
        tokenizer,
        questions,
        reference_responses,
        device,
        temperature,
        top_k,
        repetition,
        ngram_size,
    )

    mean_vader = sum(vader_scores) / len(vader_scores)
    print("Resultados del modelo base GPT-2:")
    print("Promedio VADER:", mean_vader)
    print("Promedio Sentence Transformer:", similarity_score)

def evaluate_finetuned_models(test_df, temperature, top_k, repetition_penalties, ngram_sizes, device):
    """Evaluate the fine-tuned model over a grid of generation settings.

    Every combination of ``repetition_penalties`` and ``ngram_sizes`` is
    evaluated with the given ``temperature`` and ``top_k``, and the average
    scores are printed for each one.

    Args:
        test_df: DataFrame whose ``conversation`` column holds strings of the
            form ``"User: <question> Therapist: <answer>"``.
        temperature: Sampling temperature used for every combination.
        top_k: ``top_k`` value used for every combination.
        repetition_penalties: Iterable of repetition penalties to try.
        ngram_sizes: Iterable of ``no_repeat_ngram_size`` values to try.
        device: Device the model is loaded onto.
    """
    questions = [conv.split(" Therapist:")[0].replace("User: ", "") for conv in test_df['conversation'].tolist()]
    reference_responses = [conv.split(" Therapist:")[1].strip() for conv in test_df['conversation'].tolist()]

    # The checkpoint does not depend on the generation settings, so load it
    # once instead of re-reading it from Drive for every combination.
    finetuned_model, finetuned_tokenizer = load_model_from_drive(device)

    for repetition, ngram_size in product(repetition_penalties, ngram_sizes):
        print('-------------')
        print(f'Evaluando Modelo Batch 7 with temperature {temperature}, top k{top_k}, repetition penalty {repetition}, ngram size {ngram_size}...')
        finetuned_vader_scores, finetuned_sentence_transformer_score = evaluate_model(finetuned_model, finetuned_tokenizer, questions, reference_responses, device, temperature, top_k, repetition, ngram_size)

        print(f"Resultados del modelo fine-tuneado GPT-2 batch 7 with temperature {temperature}:")
        print("Promedio VADER:", sum(finetuned_vader_scores) / len(finetuned_vader_scores))
        print("Promedio Sentence Transformer:", finetuned_sentence_transformer_score)


def main():
    """Evaluate the base GPT-2 model and then the fine-tuned checkpoint."""
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # Generation parameters: a single temperature and top-k, plus the grids
    # of repetition penalties and n-gram sizes to sweep.
    temperature = 0.2
    top_k = 50
    repetition_penalties = (0.5, 1.3, 1.9)
    ngram_sizes = (5, 10, 20)

    # NOTE(review): test_df is not defined in this cell; it is presumably a
    # notebook-level global created elsewhere - confirm before running.

    # Evaluate the base model
    evaluate_gpt2_base(test_df, device, 1, 50, 1.3, 5)

    # Evaluate the fine-tuned models
    evaluate_finetuned_models(
        test_df,
        temperature,
        top_k,
        repetition_penalties,
        ngram_sizes,
        device,
    )


if __name__ == "__main__":
    main()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/546M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/99086 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Evaluando Modelo Base...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Resultados del modelo base GPT-2:
Promedio VADER: 0.34514046454767744
Promedio Sentence Transformer: 0.2529532886781983
-------------
Evaluando Modelo Batch 7 with temperature 0.2, top k50, repetition penalty 0.5, ngram size 5...
Resultados del modelo fine-tuneado GPT-2 batch 7 with temperature 0.2:
Promedio VADER: 0.3416155256723718
Promedio Sentence Transformer: 0.4378565338259212
-------------
Evaluando Modelo Batch 7 with temperature 0.2, top k50, repetition penalty 0.5, ngram size 10...
Resultados del modelo fine-tuneado GPT-2 batch 7 with temperature 0.2:
Promedio VADER: 0.3180980440097806
Promedio Sentence Transformer: 0.42606166858581285
-------------
Evaluando Modelo Batch 7 with temperature 0.2, top k50, repetition penalty 0.5, ngram size 20...
Resultados del modelo fine-tuneado GPT-2 batch 7 with temperature 0.2:
Promedio VADER: 0.33676907090464575
Promedio Sentence Transformer: 0.4196543010124768
-------------
Evaluando Modelo Batch 7 with temperature 0.2, top k50, repetiti