In [None]:
%%capture
# Install the third-party libraries used by this notebook; %%capture hides pip's output.
# NOTE(review): versions are unpinned, so later cells may break when these libraries change
# their APIs — consider pinning the releases this notebook was actually run with.
%pip install accelerate peft bitsandbytes transformers trl jsonlines

In [None]:
from huggingface_hub import login
# Interactive Hugging Face Hub authentication (renders a login widget in the notebook).
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import os
import torch
import jsonlines
from pprint import pprint
import datasets
import pandas as pd
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

In [None]:
# Model from Hugging Face hub
base_model = "microsoft/DialoGPT-small"

# Name of the negotiation dataset passed to load_dataset
bargains_dataset = "craigslist_bargains"

# Name for the fine-tuned model
# NOTE(review): the name says "llama-2-7b-chat" although base_model is DialoGPT-small, and no
# visible cell reads new_model — confirm whether it is still needed and rename accordingly.
new_model = "llama-2-7b-chat-bargains"

# Load and format the Craigslist Bargains dataset

In [None]:
# Load the "train" split of the dataset named by bargains_dataset.
dataset = load_dataset(bargains_dataset, split="train")

In [None]:
def create_prompt_dict(data):
    """Convert one raw dataset record into a nested prompt dictionary.

    Utterances are walked in turn order. Each one is stored as a single-key
    dict ``{role: utterance}``, where the role is looked up from the agent id
    that spoke on that turn. Consecutive turns by the same agent are grouped
    into one inner list.

    Args:
        data: Mapping with the keys read here:
            ``agent_info`` (with ``Role`` and ``Target``),
            ``agent_turn`` (sequence of agent ids, one per turn),
            ``utterance`` (sequence of strings, one per turn) and
            ``items`` (with ``Title``, ``Category``, ``Price``, ``Description``).

    Returns:
        dict with two keys:
            ``"Conversation"``: list of groups, each a list of ``{role: utterance}`` dicts.
            ``"Items"``: dict with ``Title``, ``Category``, ``Price``, ``Target``
            and ``Description`` copied unchanged from the record.
    """
    agent_info = data['agent_info']
    agent_turn = data['agent_turn']
    utterances = data['utterance']
    items = data['items']

    roles = agent_info['Role']

    conversation_prompt = []
    current_conversation = []
    last_index = len(agent_turn) - 1

    # Iterate over turn *positions*; the previous code iterated over the agent
    # ids themselves, so it only ever read utterances 0 and 1.
    for turn_index, agent in enumerate(agent_turn):
        # As before, only agents 0 and 1 contribute utterances.
        if agent in (0, 1):
            # Role is indexed by agent id, utterance by turn position.
            current_conversation.append({roles[agent]: utterances[turn_index]})

        # Close the current group when the speaker changes or the dialogue ends.
        if turn_index == last_index or agent != agent_turn[turn_index + 1]:
            conversation_prompt.append(current_conversation)
            current_conversation = []

    # Construct the full prompt in dictionary format
    full_prompt_dict = {
        "Conversation": conversation_prompt,
        "Items": {
            "Title": items['Title'],
            "Category": items['Category'],
            "Price": items['Price'],
            "Target": agent_info['Target'],
            "Description": items['Description'],
        },
    }

    return full_prompt_dict



In [None]:
# Build one prompt dictionary per dataset record.
formatted_prompts = [create_prompt_dict(record) for record in dataset]

In [None]:
# Persist the prompt dictionaries to bargains_processed.jsonl via jsonlines.
# (The file name is a plain literal; the previous f-string had no placeholders.)
with jsonlines.open('bargains_processed.jsonl', 'w') as writer:
    writer.write_all(formatted_prompts)

In [None]:
# Pretty-print the first formatted record as a sanity check.
pprint(formatted_prompts[0])

{'Conversation': [[{'buyer': 'Hi, not sure if the charger would work for my '
                             'car. Can you sell it to me for $5?'}],
                  [{'seller': 'It will work, i have never seen a car without a '
                              'cigarette lighter port.\\'}],
                  [{'buyer': 'Hi, not sure if the charger would work for my '
                             'car. Can you sell it to me for $5?'}],
                  [{'seller': 'It will work, i have never seen a car without a '
                              'cigarette lighter port.\\'}],
                  [{'buyer': 'Hi, not sure if the charger would work for my '
                             'car. Can you sell it to me for $5?'}],
                  [{'seller': 'It will work, i have never seen a car without a '
                              'cigarette lighter port.\\'}],
                  [{'buyer': 'Hi, not sure if the charger would work for my '
                             'car. Can you sell it to m

In [None]:


# Define the function to convert data to a string prompt
def convert_data_to_prompt(data):
    """Render a prompt dictionary as a single training string.

    The string starts with five header lines (Category, Price, Description,
    Title, Target) followed by one line per utterance. Buyer utterances are
    wrapped in ``[INST] ... [/INST]``; seller utterances are emitted as
    `` Seller: ...``. Dialogue entries whose role is neither ``'buyer'`` nor
    ``'seller'`` are skipped.

    Args:
        data: Mapping with ``"Conversation"`` (list of groups, each a list of
            single-key ``{role: utterance}`` dicts) and ``"Items"`` (with
            ``Category``, ``Price``, ``Description``, ``Title``, ``Target``).

    Returns:
        str: the assembled prompt, every line terminated by a newline.
    """
    conversation = data['Conversation']
    items = data['Items']

    # Category/Description/Title use their first element; Price and Target are
    # formatted as stored.
    # NOTE(review): if Price/Target are lists they render as Python list
    # literals (e.g. "[10.0, 10.0]") — confirm that is the intended format.
    parts = [
        f"Category: {items['Category'][0]}\n",
        f"Price: {items['Price']}\n",
        f"Description: {items['Description'][0]}\n",
        f"Title: {items['Title'][0]}\n",
        f"Target: {items['Target']}\n",
    ]

    for turn in conversation:
        for dialogue in turn:
            # Each dialogue dict carries a single role -> utterance pair.
            role = list(dialogue.keys())[0]
            utterance = dialogue[role]
            if role == 'buyer':
                parts.append(f"[INST] Buyer: {utterance} [/INST]\n")
            elif role == 'seller':
                # The stray "[" that used to trail every seller line is removed.
                parts.append(f" Seller: {utterance}\n")

    return "".join(parts)




In [None]:
# Render every prompt dictionary as a flat training string.
prompts = [convert_data_to_prompt(prompt_dict) for prompt_dict in formatted_prompts]

In [None]:
# Wrap the prompt strings in a single-column ("Prompt") DataFrame, then convert it to a Dataset.
prompt_frame = pd.DataFrame(data=prompts, columns=['Prompt'])
train_dataset = datasets.Dataset.from_pandas(prompt_frame)

In [None]:
# dtype handed to bitsandbytes as the 4-bit compute dtype.
compute_dtype = torch.float16

# 4-bit NF4 quantization settings, double quantization turned off.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

In [None]:
# Load the base model with the quantization config defined in quant_config.
# NOTE(review): device_map={"": 0} presumably places the whole model on device 0 — confirm
# a GPU is available before running this cell.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
# Turn off the config's use_cache flag for training.
model.config.use_cache = False
# NOTE(review): pretraining_tp looks like a Llama-specific config field; it presumably has
# no effect on DialoGPT — confirm and drop if unused.
model.config.pretraining_tp = 1

In [None]:
# Load the tokenizer that matches base_model, padding on the left.
# NOTE(review): add_eos_token / add_bos_token look like options carried over from a Llama
# setup; the training log reports a GPT2TokenizerFast — confirm they take effect here.
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# LoRA adapter settings: rank 64, alpha 16, dropout 0.1, no bias terms, causal-LM task.
# NOTE(review): no target_modules are given, so the adapted layers presumably come from
# peft's defaults for this architecture — confirm.
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
# Training hyperparameters; checkpoints and logs go to ./results, metrics to TensorBoard.
# NOTE(review): num_train_epochs=100 combined with save_steps=25 will write a very large
# number of checkpoints — confirm this is intended.
# NOTE(review): warmup_ratio is set together with lr_scheduler_type="constant"; the plain
# constant schedule presumably ignores warmup ("constant_with_warmup" would use it) — confirm.
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=100,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,  # checkpoint interval, in optimizer steps
    logging_steps=25,  # matches the 25-step spacing of the logged training loss
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,  # -1: no step cap is set here; duration comes from num_train_epochs
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [None]:
# Build the supervised fine-tuning trainer: quantized base model + LoRA config, trained on
# the "Prompt" text column of train_dataset.
# NOTE(review): max_seq_length=None presumably defers to the trainer's default length — confirm.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    dataset_text_field="Prompt",
    peft_config=peft_params,
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params
)



Map:   0%|          | 0/5247 [00:00<?, ? examples/s]



In [None]:
# Run fine-tuning; the training loss is reported every 25 steps (logging_steps).
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,17.2512
50,13.2261
75,9.4877
100,7.1802
125,6.7113
150,6.1097
175,6.0808
200,5.5216
225,5.6119
250,5.2171


In [None]:
# Save the trained model to the directory "bargains", relative to the working directory.
# NOTE(review): with a peft_config this presumably stores only the LoRA adapter weights — confirm.
trainer.save_model("bargains")

In [None]:
# Absolute path of the "bargains" directory written by trainer.save_model, resolved against
# the current working directory instead of hard-coding Colab's /content.
path = os.path.abspath("bargains")

In [None]:
# Reload the saved model from `path`, restricted to local files (local_files_only=True).
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(path, local_files_only=True)


In [None]:
#clear cache
#torch.cuda.empty_cache()