<a href="https://colab.research.google.com/github/shubha07m/LLM_Dialogue_Generation/blob/main/EWC_llm_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installing and importing libraries

In [None]:
# Install the libraries

!pip install -U -q PyDrive
!pip install transformers datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m


In [None]:
# Importing libraries
from google.colab import drive
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import (
    BlenderbotTokenizer,
    BlenderbotForConditionalGeneration,
)
import pickle
import torch
from torch import nn, optim
import torch.nn.functional as F

# Loading the files and data preprocessing

In [None]:
# Function to read files directly from Google Drive

def download_and_read_file_from_drive(file_id, file_name):
    """
    Fetch a Google Drive file by ID, store it locally, and load it when it is a CSV.

    Args:
    file_id (str): Google Drive ID of the file to fetch.
    file_name (str): Local name (including extension) under which the file is saved.

    Returns:
    DataFrame: The parsed contents when ``file_name`` ends with ``.csv``;
    None for any other extension, or when any step raises an exception.
    """
    try:
        # Colab sign-in, then hand the application-default credentials to PyDrive
        auth.authenticate_user()
        google_auth = GoogleAuth()
        google_auth.credentials = GoogleCredentials.get_application_default()
        drive_client = GoogleDrive(google_auth)

        # Reference the remote file by its ID and write its content to file_name
        remote_file = drive_client.CreateFile({'id': file_id})
        remote_file.GetContentFile(file_name)
        print(f'File {file_name} downloaded successfully.')

        # Anything that is not a CSV is downloaded but not parsed
        if not file_name.endswith('.csv'):
            print('File is not a CSV. No DataFrame created.')
            return None

        csv_frame = pd.read_csv(file_name)
        print('CSV file read into DataFrame.')
        return csv_frame

    except Exception as e:
        # Best-effort contract: report the failure and signal it with None
        print(f'An error occurred: {e}')
        return None

### Preprocessing initial dataset (Lex Fridman - Lee Cronin)

In [None]:
# Reading first file

file_id = '18g5y5GmBQNgU8z2fPdushrdu0XfmQjph'
file_name = 'lee_cronin3.csv'

# Alternative Drive file ID previously used with the same file name:
# file_id = '15EGbylkuobQtA0zXkeHmmhhNaIxoz50D'
df1 = download_and_read_file_from_drive(file_id, file_name)

# The helper reports failures by printing and returning None. Stop here with a
# clear message instead of failing later with an unrelated error on None.
if df1 is None:
    raise RuntimeError(f'Could not load {file_name} from Google Drive (file ID {file_id}).')

# Load Data
initial_data = df1

File lee_cronin3.csv downloaded successfully.
CSV file read into DataFrame.


In [None]:
# Build the (input, output) pairs for fine-tuning.
# Rows are visited two at a time (0-1, 2-3, ...); a pair is kept only when the two
# rows have different speakers, and each side is rendered as "<speaker>: <text>".

data = [
    {
        'input': f"{initial_data.loc[i, 'speaker']}: {initial_data.loc[i, 'text']}",
        'output': f"{initial_data.loc[i + 1, 'speaker']}: {initial_data.loc[i + 1, 'text']}",
    }
    for i in range(0, len(initial_data) - 1, 2)
    if initial_data.loc[i, 'speaker'] != initial_data.loc[i + 1, 'speaker']
]

# Quick sanity check of the container type, the element type and the first pair
print(type(data))
first_pair = data[0]
print(type(first_pair))
print(first_pair)

<class 'list'>
<class 'dict'>
{'input': 'lee cronin:  every star in the sky probably has planets and life is probably emerging on these planets but i think the commentorial space associated with these planets is so different our causal cones are never going to overlap or not easily and this is the thing that makes me sad about alien life why we have to create alien life in the lab as quickly as possible because i dont know if we are going to be able to build architectures that will intersect with alien intelligence architectures', 'output': 'lex fridman:  intersect you dont mean in time or space'}


In [None]:
# Tabulate the pairs, keeping exactly the two text columns in a fixed order
df = pd.DataFrame(data).loc[:, ['input', 'output']]
df.head()

Unnamed: 0,input,output
0,lee cronin: every star in the sky probably ha...,lex fridman: intersect you dont mean in time ...
1,lee cronin: time and the ability to communicate,lex fridman: the ability to communicate
2,lee cronin: yeah my biggest fear in a way is ...,lex fridman: the following is a conversation ...
3,lee cronin: thanks,lex fridman: it created i think its fair to s...
4,lee cronin: go for it,lex fridman: so assembly theory says that if ...


In [None]:
# Split the pairs: 20% is held out for testing, then 10% of the remainder is used
# for validation. The fixed random_state makes both splits repeatable.
train_val_df, test_df = train_test_split(
    df[['input', 'output']],
    test_size=0.2,
    random_state=42,
)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Wrap each split DataFrame in a Hugging Face Dataset (train, validation, test)
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_frame)
    for split_frame in (train_df, val_df, test_df)
)

In [None]:
# Bundle the three splits under the keys 'train', 'val' and 'test'
split_names = ('train', 'val', 'test')
split_datasets = (train_dataset, val_dataset, test_dataset)
dataset_dict = DatasetDict(dict(zip(split_names, split_datasets)))

# Peek at the first test example
print(dataset_dict['test'][0])

{'input': 'lex fridman:  so youre talking about the factory', 'output': 'lee cronin:  yes this is really nice super important point is that when i talk about the universe having a memory or theres some magic its not that its that tells you that there must be a process encoded somewhere in physical reality be it a cell a tesla factory or something else that is making that object im not saying theres some kind of woowoo memory in the universe morphic resonance or something im saying that there is an actual causal process that is being directed constrained in some way so its not kind of just making everything', '__index_level_0__': 25}


In [None]:
# Drop the pandas index column carried over by Dataset.from_pandas, where present.
# Splits that do not have the column are left untouched, so re-running this cell
# no longer raises on the already-cleaned dataset.
index_column = '__index_level_0__'
dataset_dict = DatasetDict({
    split_name: (
        split.remove_columns([index_column])
        if index_column in split.column_names
        else split
    )
    for split_name, split in dataset_dict.items()
})

# Verify the columns
print(dataset_dict['test'][0])

{'input': 'lex fridman:  so youre talking about the factory', 'output': 'lee cronin:  yes this is really nice super important point is that when i talk about the universe having a memory or theres some magic its not that its that tells you that there must be a process encoded somewhere in physical reality be it a cell a tesla factory or something else that is making that object im not saying theres some kind of woowoo memory in the universe morphic resonance or something im saying that there is an actual causal process that is being directed constrained in some way so its not kind of just making everything'}


In [None]:
# Pretrained checkpoint used as the starting point
model_name = 'facebook/blenderbot-400M-distill'

# The tokenizer and the seq2seq model are both loaded from that same checkpoint
tokenizer = BlenderbotTokenizer.from_pretrained(
    model_name,
)
model = BlenderbotForConditionalGeneration.from_pretrained(
    model_name,
)

tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/127k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/62.9k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/310k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/730M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

In [None]:
# Defining tokenization function
def tokenize_function(examples):
    """
    Tokenize the 'input' and 'output' text and attach the outputs as training labels.

    Both sides are padded to the tokenizer's maximum length and truncated to it.
    Padded label positions are set to -100, the default ignore index of PyTorch's
    cross-entropy loss, so the model is not trained to predict padding tokens.

    Args:
        examples: Mapping with 'input' and 'output' entries, either single strings
            or lists of strings (as passed by ``Dataset.map`` with ``batched=True``).

    Returns:
        The tokenizer output for 'input' with an added 'labels' entry holding the
        token ids of 'output' (padding replaced by -100).
    """
    ignore_index = -100

    def mask_padding(token_ids, attention):
        # attention is 0 exactly on the padded positions
        return [tok if keep else ignore_index for tok, keep in zip(token_ids, attention)]

    inputs = tokenizer(examples['input'], padding="max_length", truncation=True)
    outputs = tokenizer(examples['output'], padding="max_length", truncation=True)

    label_ids = outputs['input_ids']
    label_attention = outputs['attention_mask']
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one list of ids per example
        inputs['labels'] = [
            mask_padding(ids, attention)
            for ids, attention in zip(label_ids, label_attention)
        ]
    else:
        # Single example: one flat list of ids
        inputs['labels'] = mask_padding(label_ids, label_attention)
    return inputs

In [None]:
# Tokenize every split of the initial data in one batched pass
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

print(type(tokenized_datasets))
# Show the first tokenized example of each split
for caption, split_key in (
    ("Sample from train dataset:", 'train'),
    ("Sample from validation dataset:", 'val'),
    ("Sample from test dataset:", 'test'),
):
    print(caption, tokenized_datasets[split_key][0])

Map:   0%|          | 0/241 [00:00<?, ? examples/s]

Map:   0%|          | 0/27 [00:00<?, ? examples/s]

Map:   0%|          | 0/67 [00:00<?, ? examples/s]

<class 'datasets.dataset_dict.DatasetDict'>
Sample from train dataset: {'input': 'lee cronin:   i think is the limit', 'output': 'lex fridman:  plus plus thats the', 'input_ids': [475, 76, 885, 273, 267, 33, 228, 228, 607, 507, 315, 271, 3548, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label

In [None]:
# Turn the tokenized initial data into PyTorch DataLoaders

from torch.utils.data import DataLoader

# Pull the three tokenized splits out of the DatasetDict
train_dataset, val_dataset, test_dataset = (
    tokenized_datasets[split_key] for split_key in ('train', 'val', 'test')
)

# Batches of 8 everywhere; only the training loader shuffles its examples
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=8)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=8)

In [None]:
# Keep a second reference to the initial-data training DataLoader under the name
# old_dataloader: train_dataloader is reassigned further down when the DataLoaders
# for the new data are created.
old_dataloader = train_dataloader

In [None]:
# Verify the old dataloader format by printing only its first batch
batch = next(iter(old_dataloader), None)
if batch is not None:
    print(batch)

{'input': ['lex fridman:  well that would mean its deterministic right', 'lee cronin:  now lets say that molecule takes  steps and it is using a finite set of atoms now lets say another molecule smart ass molecule well call it comes in and can survive in that environment and can copy itself but it only needs five steps the molecule that only needs five steps continued both molecules are being destroyed but theyre creating themselves faster they can be destroyed you can see that the shortest path reigns supreme so the shortest path tells us something super interesting about the minimal amount of information required to propagate that motif in time and space and it seems to be like some kind of conservation law', 'lex fridman: time and free will', 'lee cronin:  well the dog is sentient', 'lee cronin:  so we submitted the paper and then when it was almost accepted the mass spec one and it was astrobiologists said great a mass spectroscopist said great and the chemist went nonsense biggest

### Preprocessing new dataset (Lex Fridman - Lisa Randall)

In [None]:
# Reading the second file

file_id = '1Rm-ItCDv44iDqLaaEZTz-Cqu_xQPM6s5'
file_name = 'lisa_randall.csv'

# Alternative Drive file ID previously used with the same file name:
# file_id = '1x3prg2ZD8h4PfOkd3Ftohyy8gtDPR3-v'
df2 = download_and_read_file_from_drive(file_id, file_name)

# The helper reports failures by printing and returning None. Stop here with a
# clear message instead of failing later with an unrelated error on None.
if df2 is None:
    raise RuntimeError(f'Could not load {file_name} from Google Drive (file ID {file_id}).')

new_data = df2

File lisa_randall.csv downloaded successfully.
CSV file read into DataFrame.


In [None]:
# Build the (input, output) pairs for the second fine-tuning stage.
# Rows are visited two at a time (0-1, 2-3, ...); a pair is kept only when the two
# rows have different speakers, and each side is rendered as "<speaker>: <text>".

data = [
    {
        'input': f"{new_data.loc[i, 'speaker']}: {new_data.loc[i, 'text']}",
        'output': f"{new_data.loc[i + 1, 'speaker']}: {new_data.loc[i + 1, 'text']}",
    }
    for i in range(0, len(new_data) - 1, 2)
    if new_data.loc[i, 'speaker'] != new_data.loc[i + 1, 'speaker']
]

# Quick sanity check of the container type, the element type and the first pair
print(type(data))
first_pair = data[0]
print(type(first_pair))
print(first_pair)

<class 'list'>
<class 'dict'>
{'input': 'lex fridman:  in theory it behaves just like any other matter it just doesnt interact with light', 'output': 'lisa randall:  when we say it interacts just like any other form of matter we have to be careful because gravitationally it interacts like other forms of matter but it doesnt experience electromagnetism which is why it has a different distribution'}


In [None]:
# Tabulate the new pairs, keeping exactly the two text columns in a fixed order
df = pd.DataFrame(data).loc[:, ['input', 'output']]
df.head()

Unnamed: 0,input,output
0,lex fridman: in theory it behaves just like a...,lisa randall: when we say it interacts just l...
1,lisa randall: theres also just more of it and...,lex fridman: its part of the story of the ori...
2,lisa randall: exactly in my book i make jokes...,lex fridman: thats a metaphor on top of a met...
3,lisa randall: exactly no but it is a metaphor...,lex fridman: yeah but the things we cannot se...
4,lex fridman: yeah but a lot of our intuition ...,lisa randall: thats absolutely true certainly...


In [None]:
# Split the NEW pairs before converting them. Without this step train_df, val_df
# and test_df still hold the splits of the initial dataset, so the "new" datasets
# (and the new dataloader built from them) would contain the initial data again.
# Same proportions and seed as for the initial data: 20% test, then 10% validation.
train_val_df, test_df = train_test_split(df[['input', 'output']], test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1, random_state=42)

# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
# Wrap the current train_df / val_df / test_df frames in Hugging Face Datasets
# (this cell repeats the conversion performed by the cell just before it)
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_frame)
    for split_frame in (train_df, val_df, test_df)
)

In [None]:
# Bundle the three splits under the keys 'train', 'val' and 'test'
split_names = ('train', 'val', 'test')
split_datasets = (train_dataset, val_dataset, test_dataset)
datasets = DatasetDict(dict(zip(split_names, split_datasets)))

# Peek at the first test example
print(datasets['test'][0])

{'input': 'lex fridman:  so youre talking about the factory', 'output': 'lee cronin:  yes this is really nice super important point is that when i talk about the universe having a memory or theres some magic its not that its that tells you that there must be a process encoded somewhere in physical reality be it a cell a tesla factory or something else that is making that object im not saying theres some kind of woowoo memory in the universe morphic resonance or something im saying that there is an actual causal process that is being directed constrained in some way so its not kind of just making everything', '__index_level_0__': 25}


In [None]:
# Drop the pandas index column carried over by Dataset.from_pandas, where present.
# Splits that do not have the column are left untouched, so re-running this cell
# no longer raises on the already-cleaned dataset.
index_column = '__index_level_0__'
datasets = DatasetDict({
    split_name: (
        split.remove_columns([index_column])
        if index_column in split.column_names
        else split
    )
    for split_name, split in datasets.items()
})

# Verify the columns
print(datasets['test'][0])

{'input': 'lex fridman:  so youre talking about the factory', 'output': 'lee cronin:  yes this is really nice super important point is that when i talk about the universe having a memory or theres some magic its not that its that tells you that there must be a process encoded somewhere in physical reality be it a cell a tesla factory or something else that is making that object im not saying theres some kind of woowoo memory in the universe morphic resonance or something im saying that there is an actual causal process that is being directed constrained in some way so its not kind of just making everything'}


In [None]:
# Tokenize every split of the new data in one batched pass
tokenized_new_dataset = datasets.map(tokenize_function, batched=True)

# Report the type of the object created in this cell (the previous version
# printed the type of tokenized_datasets, i.e. the initial data).
print(type(tokenized_new_dataset))
# Verify the tokenized datasets
print("Sample from train dataset:", tokenized_new_dataset['train'][0])
print("Sample from validation dataset:", tokenized_new_dataset['val'][0])
print("Sample from test dataset:", tokenized_new_dataset['test'][0])

Map:   0%|          | 0/241 [00:00<?, ? examples/s]

Map:   0%|          | 0/27 [00:00<?, ? examples/s]

Map:   0%|          | 0/67 [00:00<?, ? examples/s]

<class 'datasets.dataset_dict.DatasetDict'>
Sample from train dataset: {'input': 'lee cronin:   i think is the limit', 'output': 'lex fridman:  plus plus thats the', 'input_ids': [475, 76, 885, 273, 267, 33, 228, 228, 607, 507, 315, 271, 3548, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label

In [None]:
# Turn the tokenized new data into PyTorch DataLoaders

# Pull the three tokenized splits out of the DatasetDict
train_dataset, val_dataset, test_dataset = (
    tokenized_new_dataset[split_key] for split_key in ('train', 'val', 'test')
)

# Batches of 8 everywhere; only the training loader shuffles its examples
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=8)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=8)

In [None]:
# Keep a reference to the training DataLoader built in the preceding cell under
# the name new_dataloader; it is pickled and used for EWC training later on.
new_dataloader = train_dataloader

In [None]:
# Verify the new dataloader format by printing only its first batch
batch = next(iter(new_dataloader), None)
if batch is not None:
    print(batch)

{'input': ['lex fridman:  just the crazy part', 'lex fridman:  yes', 'lee cronin:  im happy with both depending on the day', 'lee cronin:  i think they do in argentina and they do in somalia and they do in a lot of these places where  no i think this is a great idea im a strong advocate now for  so what have we come up with burning all the nuclear material to have energy and before we do that because mad is good mutually assured destruction is very powerful lets take it into the metaverse and then get people to kind of subscribe to that and if they actually nuke each other even for fun in the metaverse there are dire consequences', 'lee cronin:  because it just basically this is a case where interpolation extrapolation worked relatively well and we were able to generate the really good molecules and then what we were able to do here is and this is a really good point and what i was trying to say earlier that we were able to generate new molecules from the known set that would bind to t

#### Saving both the dataloaders

In [None]:
# Function to save dataloader data
def save_dataloader_data(dataloader, filename):
    """Pickle ``dataloader`` into the file ``filename`` (opened in binary write mode)."""
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(dataloader)

# Save old and new dataloader data locally
save_dataloader_data(old_dataloader, 'old_dataloader.pkl')
save_dataloader_data(new_dataloader, 'new_dataloader.pkl')

from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Create the folder if it doesn't exist
!mkdir -p /content/drive/MyDrive/saved_dataloader_data

# Copy the files to Google Drive
!cp old_dataloader.pkl /content/drive/MyDrive/saved_dataloader_data/old_dataloader.pkl
!cp new_dataloader.pkl /content/drive/MyDrive/saved_dataloader_data/new_dataloader.pkl

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Fisher information collection from old data

In [None]:
from google.colab import drive
import pickle

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# Restore the dataloader of the initial data from its pickle on Drive.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
old_dataloader_path = '/content/drive/MyDrive/saved_dataloader_data/old_dataloader.pkl'
with open(old_dataloader_path, 'rb') as source:
    old_dataloader = pickle.Unpickler(source).load()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Inspect the loader's type, then the structure of its first batch only
print(type(old_dataloader))
batch = next(iter(old_dataloader), None)
if batch is not None:
    print(batch)

<class 'torch.utils.data.dataloader.DataLoader'>
{'input': ['lee cronin:  you flip the problem  and focus on the molecule evolution rather than the protein and so you can guess in the future what might happen so you rather than having to consider all possible molecules you know where to focus and thats the same thing if youre looking at in assembly spaces for an object where you dont know the entire history but you know that in the history of this object its not going to have some other motif there that it doesnt apply it doesnt appear in the past', 'lex fridman:  well clippy is definitely coming back but youre saying we dont have a great understanding of what is intelligence what is the intelligence underpinning the human mind', 'lee cronin:  yeah youre welcome would have both infrared and mass spec it would have two ports so it could shine a light and so what it would do is you would have a vacuum chamber and you would have an electrostatic analyzer and youd have a monochromator to p

In [None]:
# Function for collecting fisher information

def fisher_information_collection(model, dataloader, criterion):
    """
    Estimate the diagonal Fisher information of ``model`` on ``dataloader``.

    For every batch the loss is back-propagated and the squared gradient of each
    trainable parameter is accumulated; the result is the average of those squared
    gradients over the batches.

    Args:
        model: Model exposing ``device`` and ``config.vocab_size`` that returns an
            object with ``logits`` when called with ``input_ids``,
            ``attention_mask`` and ``labels``.
        dataloader: Sized iterable of batches with the keys 'input_ids',
            'attention_mask' and 'labels'. Each entry is either a
            (batch, seq_len) tensor or the list of per-position tensors that the
            default DataLoader collation produces for list-valued columns.
        criterion: Loss called as ``criterion(logits_2d, labels_1d)``.

    Returns:
        dict: parameter name -> tensor with the shape of that parameter.

    Note:
        The model is switched to eval mode and left in it; parameter gradients
        are cleared before returning.
    """
    def as_batch_first(field):
        # Default collation turns a column of equal-length id lists into seq_len
        # tensors of shape (batch,). Stacking them along dim=1 yields
        # (batch, seq_len); stacking along dim=0 would feed the model a
        # transposed (seq_len, batch) batch.
        if torch.is_tensor(field):
            return field.to(model.device)
        return torch.stack(list(field), dim=1).to(model.device)

    model.eval()
    fisher_information = {
        n: torch.zeros_like(p) for n, p in model.named_parameters() if p.requires_grad
    }
    num_batches = len(dataloader)

    for batch in dataloader:
        input_ids = as_batch_first(batch['input_ids'])
        attention_mask = as_batch_first(batch['attention_mask'])
        labels = as_batch_first(batch['labels'])

        model.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = criterion(outputs.logits.reshape(-1, model.config.vocab_size), labels.reshape(-1))
        loss.backward()

        for n, p in model.named_parameters():
            if p.grad is not None and n in fisher_information:
                fisher_information[n] += p.grad.detach() ** 2 / num_batches

    # Do not leave the gradients of the last batch attached to the parameters
    model.zero_grad()
    return fisher_information

In [None]:
# Tokenizer and model as saved after fine-tuning on the old data
finetuned_dir = '/content/drive/My Drive/blenderbot_llm'
tokenizer = BlenderbotTokenizer.from_pretrained(finetuned_dir)
model = BlenderbotForConditionalGeneration.from_pretrained(finetuned_dir)

In [None]:
# Token-level cross-entropy is the loss whose gradients are squared
criterion = torch.nn.CrossEntropyLoss()

# Estimate the Fisher information of the fine-tuned model on the old data
fisher_information = fisher_information_collection(
    model,
    old_dataloader,
    criterion,
)

In [None]:
# Destination on Drive for the Fisher information dictionary
fisher_info_path = '/content/drive/My Drive/saved_dataloader_data/fisher_information.pt'

# Serialize the dictionary to that path
torch.save(
    fisher_information,
    fisher_info_path,
)


In [None]:
# Snapshot of every trainable parameter, cut off from the autograd graph
old_params = {
    param_name: weights.detach().clone()
    for param_name, weights in model.named_parameters()
    if weights.requires_grad
}

# Fine-Tune first tuned model with EWC on New Data

In [None]:
from google.colab import drive
import pickle

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# Tokenizer and model as saved after fine-tuning on the old data
finetuned_dir = '/content/drive/My Drive/blenderbot_llm'
tokenizer = BlenderbotTokenizer.from_pretrained(finetuned_dir)
model = BlenderbotForConditionalGeneration.from_pretrained(finetuned_dir)

# Prefer the GPU when one is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

Mounted at /content/drive


BlenderbotForConditionalGeneration(
  (model): BlenderbotModel(
    (shared): BlenderbotScaledWordEmbedding(8008, 1280, padding_idx=0)
    (encoder): BlenderbotEncoder(
      (embed_tokens): BlenderbotScaledWordEmbedding(8008, 1280, padding_idx=0)
      (embed_positions): BlenderbotLearnedPositionalEmbedding(128, 1280)
      (layers): ModuleList(
        (0-1): 2 x BlenderbotEncoderLayer(
          (self_attn): BlenderbotAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5

In [None]:
# Restore the dataloader of the new data from its pickle on Drive.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
new_dataloader_path = '/content/drive/MyDrive/saved_dataloader_data/new_dataloader.pkl'
with open(new_dataloader_path, 'rb') as source:
    new_dataloader = pickle.Unpickler(source).load()

In [None]:
# Location on Drive where the Fisher information dictionary was saved
fisher_info_path = '/content/drive/My Drive/saved_dataloader_data/fisher_information.pt'

# Read the dictionary back from that path
fisher_information = torch.load(
    fisher_info_path,
)

In [None]:
# Every model parameter must have a Fisher entry; report the first one without
first_missing = next(
    (param_name for param_name, _ in model.named_parameters()
     if param_name not in fisher_information),
    None,
)
if first_missing is not None:
    raise ValueError(f"Fisher information for parameter '{first_missing}' is missing.")

In [None]:
class EWC:
    """
    Elastic Weight Consolidation penalty for ``model``.

    The penalty is ``lambda_ * sum(F * (theta - theta_old) ** 2)``, summed over
    every model parameter whose name appears in ``old_params``.
    """

    def __init__(self, model, old_params, fisher_info, lambda_, device):
        """
        Args:
            model: Module whose current parameters are penalised.
            old_params: Name -> reference tensor; each is moved to ``device``.
            fisher_info: Name -> importance weights; each is moved to ``device``.
            lambda_: Scale applied to the summed penalty.
            device: Device on which the stored tensors are kept.
        """
        self.model = model
        self.old_params = {key: tensor.to(device) for key, tensor in old_params.items()}
        self.fisher_info = {key: tensor.to(device) for key, tensor in fisher_info.items()}
        self.lambda_ = lambda_
        self.device = device

    def compute_ewc_loss(self):
        """Return the scaled penalty for the model's current parameters."""
        penalty = sum(
            (self.fisher_info[key] * (current - self.old_params[key]).pow(2)).sum()
            for key, current in self.model.named_parameters()
            if key in self.old_params
        )
        return self.lambda_ * penalty

In [None]:
# Ensure the model is on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Snapshot the current (old-task) parameters for EWC on the correct device.
# detach() is essential: a plain clone() stays connected to the parameter in the
# autograd graph, so the gradient of (param - old_param)^2 would flow back through
# both terms and cancel to zero, leaving the EWC penalty without any effect.
old_params = {name: param.detach().clone().to(device) for name, param in model.named_parameters()}

# Define Fisher information and lambda, and move Fisher information to the correct device
fisher_info = {name: info.to(device) for name, info in fisher_information.items()}
lambda_ = 0.4  # EWC regularization strength

# Initialize EWC instance
ewc = EWC(model, old_params, fisher_info, lambda_, device)

### Training using EWC with new data on initial tuned model

In [None]:
import torch
import torch.optim as optim
from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration

# Load the tokenizer and model (fine-tuned on the old data)
tokenizer = BlenderbotTokenizer.from_pretrained('/content/drive/My Drive/blenderbot_llm')
model = BlenderbotForConditionalGeneration.from_pretrained('/content/drive/My Drive/blenderbot_llm')

# Ensure the model is on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Assuming you have already loaded new_dataloader

# Snapshot the old-task parameters for EWC on the correct device.
# detach() is essential: a plain clone() stays connected to the parameter in the
# autograd graph, so the gradient of (param - old_param)^2 would flow back through
# both terms and cancel to zero, leaving the EWC penalty without any effect.
old_params = {name: param.detach().clone().to(device) for name, param in model.named_parameters()}

# Define Fisher information and lambda, and move Fisher information to the correct device
fisher_info = {name: info.to(device) for name, info in fisher_information.items()}
lambda_ = 0.4  # EWC regularization strength

# Initialize EWC instance
ewc = EWC(model, old_params, fisher_info, lambda_, device)

# Set training parameters
epochs = 3
learning_rate = 5e-5

# Define optimizer
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)


def _to_batch_first(field):
    """Return one batch field as a (batch, seq_len) tensor on ``device``.

    Default DataLoader collation turns a column of equal-length id lists into
    seq_len tensors of shape (batch,). Stacking them along dim=1 gives
    (batch, seq_len); stacking along dim=0 would feed the model a transposed
    batch. Fields that already are tensors are only moved to the device.
    """
    if torch.is_tensor(field):
        return field.to(device)
    return torch.stack([column.to(device) for column in field], dim=1)


# Training loop
model.train()

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    for batch in new_dataloader:
        # Get inputs and labels from the batch
        input_ids = _to_batch_first(batch['input_ids'])
        attention_mask = _to_batch_first(batch['attention_mask'])
        labels = _to_batch_first(batch['labels'])

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass (the model computes the task loss from the labels)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Add EWC regularization term
        ewc_loss = ewc.compute_ewc_loss()
        total_loss = loss + ewc_loss

        # Backward pass and optimization
        total_loss.backward()
        optimizer.step()

        print(f"Loss: {total_loss.item():.4f}")

Epoch 1/3
Loss: 14.5261
Loss: 10.1535
Loss: 9.2233
Loss: 8.9000
Loss: 5.1933
Loss: 3.1601
Loss: 3.3265
Loss: 3.4484
Loss: 3.5812
Loss: 2.0255
Loss: 3.4968
Loss: 3.7526
Loss: 2.6690
Loss: 2.6369
Loss: 2.6815
Loss: 3.5731
Loss: 4.5114
Loss: 4.6795
Loss: 3.2021
Loss: 4.8342
Loss: 3.3312
Loss: 2.2203
Loss: 3.1403
Loss: 4.0622
Loss: 3.2360
Loss: 2.4218
Loss: 3.0504
Loss: 1.5653
Loss: 2.7731
Loss: 4.3854
Loss: 0.7158
Epoch 2/3
Loss: 2.2792
Loss: 2.8175
Loss: 2.8188
Loss: 2.8959
Loss: 2.8877
Loss: 2.5271
Loss: 2.2641
Loss: 1.8412
Loss: 1.9007
Loss: 5.0412
Loss: 2.6340
Loss: 2.4133
Loss: 4.3319
Loss: 3.3543
Loss: 4.1658
Loss: 2.4745
Loss: 2.6311
Loss: 2.2131
Loss: 2.8837
Loss: 1.6431
Loss: 2.5884
Loss: 2.3630
Loss: 2.5550
Loss: 2.5067
Loss: 2.9149
Loss: 2.8717
Loss: 2.9795
Loss: 2.9603
Loss: 2.2472
Loss: 2.7782
Loss: 0.6364
Epoch 3/3
Loss: 1.8377
Loss: 3.4156
Loss: 2.1797
Loss: 3.3539
Loss: 2.6563
Loss: 2.7531
Loss: 2.7573
Loss: 2.7138
Loss: 4.3491
Loss: 3.6053
Loss: 2.6442
Loss: 2.7184
Loss: 

In [None]:

# Directory on Drive that receives the EWC-trained model and its tokenizer
model_save_path = '/content/drive/My Drive/ewc_trained_llm'

# Model weights/config first, then the tokenizer files, into the same directory
model.save_pretrained(
    model_save_path,
)
tokenizer.save_pretrained(
    model_save_path,
)

Non-default generation parameters: {'max_length': 60, 'min_length': 20, 'num_beams': 10, 'length_penalty': 0.65, 'no_repeat_ngram_size': 3, 'encoder_no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


('/content/drive/My Drive/ewc_trained_llm/tokenizer_config.json',
 '/content/drive/My Drive/ewc_trained_llm/special_tokens_map.json',
 '/content/drive/My Drive/ewc_trained_llm/vocab.json',
 '/content/drive/My Drive/ewc_trained_llm/merges.txt',
 '/content/drive/My Drive/ewc_trained_llm/added_tokens.json')