In [None]:
!pip install -r requirements.txt



In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup



In [None]:
import pandas as pd
class AppointmentDataset(Dataset):
    """Map-style dataset backed by a CSV file of command/response records.

    The CSV is read with pandas and must provide the columns ``command``,
    ``description`` and ``response``. Each sample is a dict exposing those
    values under the keys ``input``, ``description`` and ``output``.
    """

    def __init__(self, file_path):
        """Read the whole CSV at ``file_path`` into ``self.data`` (a DataFrame)."""
        self.data = pd.read_csv(file_path)

    def __len__(self):
        """Return the number of rows in the loaded table."""
        row_count, _ = self.data.shape
        return row_count

    def __getitem__(self, idx):
        """Return the record at positional index ``idx`` as a dict."""
        record = self.data.iloc[idx]
        # (sample key, CSV column) pairs, in the order the dict is populated.
        key_to_column = (
            ("input", "command"),
            ("description", "description"),
            ("output", "response"),
        )
        return {key: record[column] for key, column in key_to_column}

In [None]:
# Load the pretrained GPT-2 language model and its matching tokenizer.
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Padding is requested when the data is tokenized, so a pad token must be
# assigned explicitly; the end-of-sequence token is reused for that purpose.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Tokenize the data
def tokenize_data(data, tokenizer, max_length=512):
    """Convert one command/response record into a causal-LM training example.

    The command and the response are joined into a single token sequence
    (command, response, end-of-sequence token). A causal LM shifts the labels
    internally, so inputs and labels must describe the *same* sequence; feeding
    the command as input and the separately tokenized response as labels pairs
    up unrelated positions. Label positions belonging to the command and to
    padding are set to -100 so that only response tokens contribute to the loss.

    Args:
        data: Mapping with the text fields ``'input'`` (command) and
            ``'output'`` (response).
        tokenizer: Callable tokenizer returning a mapping with ``'input_ids'``
            and exposing ``pad_token_id`` / ``eos_token_id``.
        max_length: Fixed length of every returned tensor.

    Returns:
        Dict of 1-D ``torch.long`` tensors of length ``max_length``:
        ``'input_ids'``, ``'attention_mask'`` (1 for real tokens, 0 for
        padding) and ``'labels'``.
    """
    ignore_index = -100  # label value skipped by the cross-entropy loss

    # Fall back to EOS when no dedicated pad token has been configured.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    prompt_ids = list(tokenizer(data['input'], max_length=max_length, truncation=True)['input_ids'])
    response_ids = list(tokenizer(data['output'], max_length=max_length, truncation=True)['input_ids'])
    response_ids.append(tokenizer.eos_token_id)  # teach the model where the response ends

    # The response gets priority on the length budget so every example keeps
    # supervised tokens; the command is trimmed to whatever room is left.
    response_ids = response_ids[:max_length]
    prompt_ids = prompt_ids[:max_length - len(response_ids)]

    token_ids = prompt_ids + response_ids
    target_ids = [ignore_index] * len(prompt_ids) + response_ids
    pad_count = max_length - len(token_ids)

    return {
        'input_ids': torch.tensor(token_ids + [pad_id] * pad_count, dtype=torch.long),
        'attention_mask': torch.tensor([1] * len(token_ids) + [0] * pad_count, dtype=torch.long),
        'labels': torch.tensor(target_ids + [ignore_index] * pad_count, dtype=torch.long),
    }


In [None]:
# Build the training data: read the CSV records, tokenize every record up
# front, then serve the tokenized examples in shuffled mini-batches of two.
dataset = AppointmentDataset('dataframe.csv')
tokenized_data = list(map(lambda record: tokenize_data(record, tokenizer), dataset))
dataloader = DataLoader(dataset=tokenized_data, shuffle=True, batch_size=2)


In [None]:
# Fine-tune the model: choose the device, then build the optimizer and the
# learning-rate schedule.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Use PyTorch's AdamW; the AdamW class exported by transformers is deprecated
# in favour of it. eps and weight_decay are passed explicitly to match the
# defaults of the transformers class (1e-6 and 0.0), since PyTorch's own
# defaults differ.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
# One scheduler step is taken per batch, so the schedule spans batches * epochs.
total_steps = len(dataloader) * 3  # 3 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)




In [None]:

for epoch in range(3):  # Training for 3 epochs
    model.train()
    loss_sum = 0.0    # running sum of per-batch losses for this epoch
    batches_seen = 0
    for batch in dataloader:
        inputs = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        # Forward the padding mask when the batch provides one; batches
        # without it are handled exactly as before (mask left as None).
        attention_mask = batch.get('attention_mask')
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)

        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        loss_sum += loss.item()
        batches_seen += 1

    # Report the mean loss over the epoch: the loss of the final mini-batch
    # alone is noisy, and is undefined when the dataloader yields no batches.
    if batches_seen:
        print(f"Epoch {epoch + 1}, Loss: {loss_sum / batches_seen}")
    else:
        print(f"Epoch {epoch + 1}: no batches to train on")

# Save the fine-tuned model
model.save_pretrained('fine-tuned-gpt2')
tokenizer.save_pretrained('fine-tuned-gpt2')


Epoch 1, Loss: 0.15182027220726013
Epoch 2, Loss: 0.11591552197933197
Epoch 3, Loss: 0.20364032685756683


('fine-tuned-gpt2/tokenizer_config.json',
 'fine-tuned-gpt2/special_tokens_map.json',
 'fine-tuned-gpt2/vocab.json',
 'fine-tuned-gpt2/merges.txt',
 'fine-tuned-gpt2/added_tokens.json')

In [None]:

# Hugging Face Hub helpers used for authentication.
# NOTE(review): HfFolder and Repository are imported but not used in the
# visible cells.
from huggingface_hub import HfFolder, Repository, notebook_login

# Presumably opens an interactive login prompt and stores the access token
# that the upload cell relies on (it passes use_auth_token=True) - confirm
# against the huggingface_hub documentation.
notebook_login()

In [None]:
# Target repository on the Hugging Face Hub.
model_name = 'reginald160/gpt2-Ozougwu-Model-B2'

# Upload the model first, then the tokenizer, to the same repository.
for artifact in (model, tokenizer):
    artifact.push_to_hub(model_name, use_auth_token=True)
