In [1]:
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import pandas as pd
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# List of Parquet files
files = [
    "data/test-00000-of-00001.parquet",
    "data/train-00000-of-00001.parquet",
    "data/validation-00000-of-00001.parquet"
]

# Read all files and concatenate them into a single DataFrame
df_combined = pd.concat([pd.read_parquet(f) for f in files], ignore_index=True)

# Check the combined DataFrame
print(df_combined.shape)  # To check number of rows and columns
print(df_combined.head())  # Preview first few rows


(20424, 13)
                                            personas additional_context  \
0  [i hate talking to people., i believe dragons ...     Social anxiety   
1  [i have three daughters., my wife and i like t...                      
2      [i hate the taste of fish., i like to paint.]                      
3  [my favorite movie is good burger., i like can...                      
4         [my hair is black., i like rock climbing.]      Rock climbing   

                                  previous_utterance               context  \
0  [Wow, I am never shy. Do you have anxiety?, Ye...   wizard_of_wikipedia   
1  [My turtle ran away from me today., Oh my god....  empathetic_dialogues   
2  [Our son in the Army is taking a leave to visi...  empathetic_dialogues   
3  [that's awesome , i like running in the mornin...               convai2   
4  [Are there different skill levels? , I do not ...   wizard_of_wikipedia   

                                       free_messages  \
0  [and why 

In [3]:
df_combined.shape

(20424, 13)

In [4]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20424 entries, 0 to 20423
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   personas                   20424 non-null  object
 1   additional_context         20424 non-null  object
 2   previous_utterance         20424 non-null  object
 3   context                    20424 non-null  object
 4   free_messages              20424 non-null  object
 5   guided_messages            20424 non-null  object
 6   suggestions                20424 non-null  object
 7   guided_chosen_suggestions  20424 non-null  object
 8   label_candidates           20424 non-null  object
 9   template_name              20424 non-null  object
 10  template                   20424 non-null  object
 11  rendered_input             20424 non-null  object
 12  rendered_output            20424 non-null  object
dtypes: object(13)
memory usage: 2.0+ MB


In [5]:
# Proper train/validation split
from sklearn.model_selection import train_test_split
train_df, eval_df = train_test_split(df_combined, test_size=0.1, random_state=42)
print(f"Training set: {train_df.shape}, Validation set: {eval_df.shape}")


Training set: (18381, 13), Validation set: (2043, 13)


In [6]:
df_combined['previous_utterance'][0]

array(['Wow, I am never shy. Do you have anxiety?',
       "Yes. I end up sweating and blushing and feel like i'm going to throw up."],
      dtype=object)

In [7]:
# Format data in a way appropriate for conversational models
def format_dialog(row):
   
    user_input = row["previous_utterance"][0]  
    bot_response = row["previous_utterance"][1]  
    
    return {
        "text": f"<|endoftext|>User: {user_input}<|endoftext|>Bot: {bot_response}<|endoftext|>"
    }

In [8]:
# Process the data
train_data = [format_dialog(row) for _, row in train_df.iterrows()]
eval_data = [format_dialog(row) for _, row in eval_df.iterrows()]


In [9]:
# Create datasets
train_dataset = Dataset.from_list(train_data)
eval_dataset = Dataset.from_list(eval_data)

In [10]:
eval_dataset

Dataset({
    features: ['text'],
    num_rows: 2043
})

In [11]:
# Load model with 4-bit quantization for efficiency
model_name = "microsoft/DialoGPT-medium"

# Configure quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [12]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


In [13]:
# Load model with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

In [14]:
# Prepare model for training
model = prepare_model_for_kbit_training(model)

In [15]:
# Configure LoRA
lora_config = LoraConfig(
    r=4,  
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["c_attn", "c_proj"]  
)


In [16]:
# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,081,344 || all params: 355,904,512 || trainable%: 0.3038


In [17]:
def tokenize_function(examples):
    # Tokenize the texts
    encodings = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    
    encodings["labels"] = encodings["input_ids"].clone()
    
    return encodings

In [18]:
# Tokenize datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_eval = eval_dataset.map(tokenize_function, batched=True, remove_columns=["text"])


Map: 100%|██████████| 18381/18381 [00:01<00:00, 14183.98 examples/s]
Map: 100%|██████████| 2043/2043 [00:00<00:00, 13498.85 examples/s]


In [19]:
tokenized_eval

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2043
})

In [20]:
# Configure training - optimized for RTX 3050
training_args = TrainingArguments(
    output_dir="./dialogpt_lora_rtx3050",
    num_train_epochs=3,
    per_device_train_batch_size=2,  
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8, 
    evaluation_strategy="steps",
    eval_steps=1000,  
    save_strategy="steps",
    save_steps=1000,  
    save_total_limit=2,  
    logging_steps=200,
    learning_rate=1e-4, 
    weight_decay=0.01,
    fp16=True,
    optim="adamw_8bit",
    warmup_steps=200, 
    report_to="tensorboard",
    push_to_hub=False,
    gradient_checkpointing=False,
    # Memory optimization
    dataloader_pin_memory=False,  
    torch_compile=False,  
)



In [21]:
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [22]:
# Train model
trainer.train()


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
1000,0.8874,0.861915
2000,0.8608,0.838297
3000,0.8538,0.830105


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=3444, training_loss=1.026591715220098, metrics={'train_runtime': 7422.882, 'train_samples_per_second': 7.429, 'train_steps_per_second': 0.464, 'total_flos': 1.2844203220402176e+16, 'train_loss': 1.026591715220098, 'epoch': 2.9992383853769993})

In [23]:
# Save final model
model.save_pretrained("./final_dialogpt_lora")
tokenizer.save_pretrained("./final_dialogpt_lora")



('./final_dialogpt_lora\\tokenizer_config.json',
 './final_dialogpt_lora\\special_tokens_map.json',
 './final_dialogpt_lora\\vocab.json',
 './final_dialogpt_lora\\merges.txt',
 './final_dialogpt_lora\\added_tokens.json',
 './final_dialogpt_lora\\tokenizer.json')