<a href="https://colab.research.google.com/github/shubha07m/LLM_Dialogue_Generation/blob/main/llm_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installing and importing libraries

In [None]:
# Install the libraries

!pip install -U -q PyDrive
!pip install transformers datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m


In [None]:
# Importing libraries
from google.colab import drive

from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import (
    BlenderbotTokenizer,
    BlenderbotForConditionalGeneration,
    Trainer,
    TrainingArguments,
    GenerationConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer
)

## Loading the files and preprocessing the data

In [None]:
# Function to read files directly from Google Drive

def download_and_read_file_from_drive(file_id, file_name):
    """
    Download a Google Drive file by ID and, when it is a CSV, load it with pandas.

    The function is best-effort: any failure (authentication, download or
    parsing) is printed and reported to the caller as a None return value.

    Args:
    file_id (str): The ID of the file in Google Drive.
    file_name (str): The local name to save the file as (including extension).

    Returns:
    DataFrame: The parsed file when `file_name` ends in ".csv" (any letter case).
    None when the file is not a CSV or when any step raised an exception.
    """
    try:
        # Authenticate the Colab user and hand the default credentials to PyDrive
        auth.authenticate_user()
        gauth = GoogleAuth()
        gauth.credentials = GoogleCredentials.get_application_default()
        # Named `drive_client` so it does not shadow the `drive` module that the
        # notebook imports from google.colab.
        drive_client = GoogleDrive(gauth)

        # Create a GoogleDriveFile handle for the ID and save its content locally
        downloaded = drive_client.CreateFile({'id': file_id})
        downloaded.GetContentFile(file_name)

        print(f'File {file_name} downloaded successfully.')

        # Guard clause: anything that is not a CSV is downloaded but not parsed.
        # The check is case-insensitive so "DATA.CSV" is treated like "data.csv".
        if not file_name.lower().endswith('.csv'):
            print('File is not a CSV. No DataFrame created.')
            return None

        df = pd.read_csv(file_name)
        print('CSV file read into DataFrame.')
        return df

    except Exception as e:
        # Deliberately broad: report the problem and signal failure with None
        print(f'An error occurred: {e}')
        return None

In [None]:
# Download the first file (lee_cronin3.csv) from Drive and load it as df1
file_id, file_name = '18g5y5GmBQNgU8z2fPdushrdu0XfmQjph', 'lee_cronin3.csv'

# Alternate Drive ID that was kept commented out for this same file name:
#   '15EGbylkuobQtA0zXkeHmmhhNaIxoz50D'
df1 = download_and_read_file_from_drive(file_id=file_id, file_name=file_name)

File lee_cronin3.csv downloaded successfully.
CSV file read into DataFrame.


In [None]:
# Download the second file (lisa_randall.csv) from Drive and load it as df2
file_id, file_name = '1Rm-ItCDv44iDqLaaEZTz-Cqu_xQPM6s5', 'lisa_randall.csv'

# Alternate Drive ID that was kept commented out for this same file name:
#   '1x3prg2ZD8h4PfOkd3Ftohyy8gtDPR3-v'
df2 = download_and_read_file_from_drive(file_id=file_id, file_name=file_name)

File lisa_randall.csv downloaded successfully.
CSV file read into DataFrame.


In [None]:
# Alias the two loaded frames: the first file is the data used for the initial
# fine-tuning below, the second is kept under `new_data`.
initial_data, new_data = df1, df2

In [None]:
# Build the (input, output) pairs used for fine-tuning from consecutive rows

def _as_turn(row):
    """Render one transcript row as "<speaker>: <text>"."""
    return f"{row['speaker']}: {row['text']}"

# Walk the rows two at a time (0-1, 2-3, ...); a pair is kept only when the
# speaker changes between its two rows.
row_pairs = (
    (initial_data.loc[start], initial_data.loc[start + 1])
    for start in range(0, len(initial_data) - 1, 2)
)
data = [
    {'input': _as_turn(first), 'output': _as_turn(second)}
    for first, second in row_pairs
    if first['speaker'] != second['speaker']
]

# Quick sanity check of the container type, the record type and the first record
for preview in (type(data), type(data[0]), data[0]):
    print(preview)

<class 'list'>
<class 'dict'>
{'input': 'lee cronin:  every star in the sky probably has planets and life is probably emerging on these planets but i think the commentorial space associated with these planets is so different our causal cones are never going to overlap or not easily and this is the thing that makes me sad about alien life why we have to create alien life in the lab as quickly as possible because i dont know if we are going to be able to build architectures that will intersect with alien intelligence architectures', 'output': 'lex fridman:  intersect you dont mean in time or space'}


In [None]:
# Turn the list of pair dicts into a DataFrame restricted to the two text columns
df = pd.DataFrame(data).loc[:, ['input', 'output']]
df.head()

Unnamed: 0,input,output
0,lee cronin: every star in the sky probably ha...,lex fridman: intersect you dont mean in time ...
1,lee cronin: time and the ability to communicate,lex fridman: the ability to communicate
2,lee cronin: yeah my biggest fear in a way is ...,lex fridman: the following is a conversation ...
3,lee cronin: thanks,lex fridman: it created i think its fair to s...
4,lee cronin: go for it,lex fridman: so assembly theory says that if ...


In [None]:
# Split the pairs: hold out 20% as the test set, then take 10% of the
# remainder as the validation set (fixed seed for repeatable splits).
pairs_df = df[['input', 'output']]
train_val_df, test_df = train_test_split(pairs_df, random_state=42, test_size=0.2)
train_df, val_df = train_test_split(train_val_df, random_state=42, test_size=0.1)

In [None]:
# Wrap each pandas split in a Hugging Face Dataset (same order: train, val, test)
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, val_df, test_df)
)

In [None]:
# Group the three splits under their names in a single DatasetDict
splits_by_name = {
    'train': train_dataset,
    'val': val_dataset,
    'test': test_dataset,
}
dataset_dict = DatasetDict(splits_by_name)

# Peek at the first held-out example
print(dataset_dict['test'][0])

{'input': 'lex fridman:  so youre talking about the factory', 'output': 'lee cronin:  yes this is really nice super important point is that when i talk about the universe having a memory or theres some magic its not that its that tells you that there must be a process encoded somewhere in physical reality be it a cell a tesla factory or something else that is making that object im not saying theres some kind of woowoo memory in the universe morphic resonance or something im saying that there is an actual causal process that is being directed constrained in some way so its not kind of just making everything', '__index_level_0__': 25}


In [None]:
# Drop the pandas index column that the DataFrame-to-Dataset conversion carried over.
# The check is done per split so the cell really is "if present": it can be
# re-run, or run on splits that never had the column, without
# `remove_columns` raising on a missing column name.
index_column = '__index_level_0__'
dataset_dict = DatasetDict({
    split_name: (
        split.remove_columns([index_column])
        if index_column in split.column_names
        else split
    )
    for split_name, split in dataset_dict.items()
})

# Verify the columns
print(dataset_dict['test'][0])

{'input': 'lex fridman:  so youre talking about the factory', 'output': 'lee cronin:  yes this is really nice super important point is that when i talk about the universe having a memory or theres some magic its not that its that tells you that there must be a process encoded somewhere in physical reality be it a cell a tesla factory or something else that is making that object im not saying theres some kind of woowoo memory in the universe morphic resonance or something im saying that there is an actual causal process that is being directed constrained in some way so its not kind of just making everything'}


## Tokenization of data

In [None]:
# Checkpoint shared by the tokenizer and the model
model_name = 'facebook/blenderbot-400M-distill'

# Load the pretrained tokenizer, then the pretrained seq2seq model
tokenizer = BlenderbotTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name
)
model = BlenderbotForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path=model_name
)

In [None]:
# Defining tokenization function
def tokenize_function(examples):
    """
    Tokenize the 'input' and 'output' text of a batch and attach training labels.

    Both sides are padded to the tokenizer's maximum length and truncated.
    Label positions that are padding (attention mask 0) are replaced with -100,
    the index the Hugging Face seq2seq loss ignores, so the model is not
    trained to predict pad tokens.

    Args:
    examples (dict): Has 'input' and 'output' entries; each is a list of strings
        when mapped with batched=True (a single string is also handled).

    Returns:
    The tokenizer output for 'input' with an added 'labels' entry built from
    the tokenized 'output'.
    """
    inputs = tokenizer(examples['input'], padding="max_length", truncation=True)
    outputs = tokenizer(examples['output'], padding="max_length", truncation=True)

    def _mask_padding(token_ids, attention_mask):
        # Keep real tokens, blank out padded positions for the loss
        return [
            token_id if is_real else -100
            for token_id, is_real in zip(token_ids, attention_mask)
        ]

    label_ids = outputs['input_ids']
    label_mask = outputs['attention_mask']
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id list per example
        inputs['labels'] = [
            _mask_padding(ids, mask) for ids, mask in zip(label_ids, label_mask)
        ]
    else:
        # Single example: a flat id list
        inputs['labels'] = _mask_padding(label_ids, label_mask)
    return inputs

In [None]:
# Tokenize datasets
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

print(type(tokenized_datasets))

# Verify the tokenized datasets with a compact preview: every sequence is
# padded to a fixed length, so printing whole samples floods the output with
# padding values. Show the text plus the first few ids of each field instead.
PREVIEW_TOKENS = 12
for split_name in ('train', 'val', 'test'):
    sample = tokenized_datasets[split_name][0]
    print(f"Sample from {split_name} dataset:")
    print("  input: ", sample['input'])
    print("  output:", sample['output'])
    for field in ('input_ids', 'attention_mask', 'labels'):
        values = sample[field]
        print(f"  {field} (len={len(values)}): {values[:PREVIEW_TOKENS]} ...")

Map:   0%|          | 0/241 [00:00<?, ? examples/s]

Map:   0%|          | 0/27 [00:00<?, ? examples/s]

Map:   0%|          | 0/67 [00:00<?, ? examples/s]

<class 'datasets.dataset_dict.DatasetDict'>
Sample from train dataset: {'input': 'lee cronin:   i think is the limit', 'output': 'lex fridman:  plus plus thats the', 'input_ids': [475, 76, 885, 273, 267, 33, 228, 228, 607, 507, 315, 271, 3548, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label

## Training of LLM

In [None]:
# Defining training arguments
#
# Two settings differ from a naive configuration on purpose:
# * Warmup is given as a ratio. This run takes only a few hundred optimizer
#   steps in total (the recorded run finished at global_step=244), so a fixed
#   500-step warmup would end training before the learning rate ever reached
#   its target value.
# * Checkpoints are saved on the same schedule as evaluation. Otherwise
#   `load_best_model_at_end` has no checkpoint to restore when the run is
#   shorter than the save interval (500 steps by default per the
#   TrainingArguments docs).

training_args = TrainingArguments(
    output_dir='./results',               # Directory where model checkpoints will be saved
    num_train_epochs=4,                   # Number of epochs
    per_device_train_batch_size=4,        # Batch size for training
    per_device_eval_batch_size=4,         # Batch size for evaluation
    warmup_ratio=0.1,                     # Warm up over the first 10% of training steps
    weight_decay=0.01,                    # Weight decay
    logging_dir='./logs',                 # Directory for logs
    logging_steps=10,                     # Log every 10 steps
    eval_strategy='steps',                # Evaluate every `eval_steps` steps
    eval_steps=50,                        # Frequency of evaluation
    save_strategy='steps',                # Save on the same schedule as evaluation
    save_steps=50,                        # Frequency of checkpointing (matches eval_steps)
    save_total_limit=3,                   # Limit the number of saved checkpoints
    load_best_model_at_end=True,          # Load the best model at the end
    metric_for_best_model='eval_loss',    # "Best" means lowest validation loss
    greater_is_better=False               # Lower loss is better
)

In [None]:
# Collect the Trainer inputs first, then build the Trainer from them
trainer_kwargs = dict(
    model=model,                                 # model being fine-tuned
    args=training_args,                          # hyperparameters and output settings
    train_dataset=tokenized_datasets['train'],   # split used for optimisation
    eval_dataset=tokenized_datasets['val'],      # split used for periodic evaluation
    tokenizer=tokenizer,                         # tokenizer handed to the Trainer
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run fine-tuning; the value returned by `train()` is shown as the cell result
train_output = trainer.train()
train_output

Step,Training Loss,Validation Loss
50,6.5028,5.319347
100,3.5381,3.005345
150,2.0229,1.580265
200,1.1831,1.18813


Non-default generation parameters: {'max_length': 60, 'min_length': 20, 'num_beams': 10, 'length_penalty': 0.65, 'no_repeat_ngram_size': 3, 'encoder_no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


TrainOutput(global_step=244, training_loss=3.9879881612590102, metrics={'train_runtime': 150.5128, 'train_samples_per_second': 6.405, 'train_steps_per_second': 1.621, 'total_flos': 262250920673280.0, 'train_loss': 3.9879881612590102, 'epoch': 4.0})

## Save tuned model and tokenizer

In [None]:
# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# Destination folder on Drive for the fine-tuned artifacts
save_path = '/content/drive/MyDrive/blenderbot_llm'

# Write the model first, then the tokenizer, into the same folder
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"Model and tokenizer saved to {save_path}")

Non-default generation parameters: {'max_length': 60, 'min_length': 20, 'num_beams': 10, 'length_penalty': 0.65, 'no_repeat_ngram_size': 3, 'encoder_no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model and tokenizer saved to /content/drive/MyDrive/blenderbot_llm


In [None]:
# Define your custom generation parameters
#
# This config is written to the same folder as the model, so it replaces any
# generation config that `model.save_pretrained(save_path)` stored there.
# The special token ids are therefore copied from the model config; otherwise
# the saved file would carry only the decoding parameters below.
# NOTE(review): assumes `model.config` exposes these four ids (expected for
# Blenderbot) - confirm against the loaded config.
gen_config = GenerationConfig(
    max_length=60,
    min_length=20,
    num_beams=10,
    length_penalty=0.65,
    no_repeat_ngram_size=3,
    encoder_no_repeat_ngram_size=3,
    forced_eos_token_id=2,
    bos_token_id=model.config.bos_token_id,
    eos_token_id=model.config.eos_token_id,
    pad_token_id=model.config.pad_token_id,
    decoder_start_token_id=model.config.decoder_start_token_id
)
# Save the GenerationConfig
gen_config.save_pretrained(save_path)