<a href="https://colab.research.google.com/github/sayid-alt/eleutherai-finetuned-nvidia-faq-llm/blob/main/training/training_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Working Space

In [None]:
# @title **Install Libraries**
!pip install transformers datasets accelerate -q

In [None]:
import pandas as pd
import requests
import zipfile
import os
import torch
import tensorflow as tf

from pprint import pprint
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
# from transformers import GPTNeoXForCausalLM, AutoTokenizer

from datasets import load_dataset
import logging
import torch
import wandb
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForCausalLM



In [None]:

#@title **Load Pretrained Model**
pretrained_model = 'EleutherAI/pythia-1b'
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
base_model = AutoModelForCausalLM.from_pretrained(pretrained_model)

# The hub id has the form "<organisation>/<model>"; split it once and reuse the parts.
model_id_parts = pretrained_model.split('/')

# Repository names derived from the base model id.
dataset_hf_name = f"nvidia-faq-{model_id_parts[0].lower()}-fine-tuned"
model_finetuned_name = f"{model_id_parts[0]}-{model_id_parts[1]}-finetuned-nvidia-faq"

# The Trainer writes its checkpoints to a directory named after the fine-tuned model.
output_dir = model_finetuned_name

print(f'Finetuned Model Name: {model_finetuned_name}')
print(f'dataset hf Name: {dataset_hf_name}')

In [None]:
# @title **Logging To Hugging Face**
!pip install huggingface_hub

from huggingface_hub import notebook_login

# login to hugging face
notebook_login()

In [None]:
# @title **Load Data**
def load_nvidia_faq_data(url, zip_path='nvidia_faq.zip', extract_dir='nvidia_faq', timeout=60):
    """Download a ZIP archive, extract it and load the first CSV it contains.

    Args:
        url: Address of the ZIP archive to download.
        zip_path: Local path the downloaded archive is written to.
        extract_dir: Directory the archive is extracted into.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        pandas.DataFrame read from the alphabetically first ``.csv`` file found
        directly inside ``extract_dir``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        FileNotFoundError: If no CSV file is present after extraction.
    """
    # Download the ZIP file from the URL. Fail loudly on an HTTP error instead
    # of saving an error page to disk and crashing later in zipfile.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    with open(zip_path, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded {zip_path}")

    # Unzip the file
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    print(f"Extracted to {extract_dir}")

    # Find the CSV files inside the extracted folder. os.listdir returns names
    # in arbitrary order, so sort to make "the first CSV" deterministic.
    csv_files = sorted(f for f in os.listdir(extract_dir) if f.endswith('.csv'))
    if not csv_files:
        raise FileNotFoundError("No CSV file found in the extracted content.")

    # Load the first CSV file found
    csv_path = os.path.join(extract_dir, csv_files[0])
    data = pd.read_csv(csv_path)
    print(f"Loaded data from {csv_path}")

    return data

# Location of the zipped NVIDIA documentation Q&A pairs in the project repository
url = 'https://github.com/sayid-alt/eleutherai-finetuned-nvidia-faq-llm/raw/main/datasets/NvidiaDocumentationQandApairs.zip'

# Download the archive and keep only the two columns used for fine-tuning
dataset = load_nvidia_faq_data(url)[['question', 'answer']]
display(dataset)

### **Data Preparation**

In [None]:
# @title **Preparing Finetuning Dataset**
# Prompt wrapper applied to every question; the answer text is appended after
# "### Answer:" when the examples are tokenized.
prompt_template = """### Question:
{question}

### Answer:"""

# One record per row: the prompt-wrapped question and the untouched answer.
finetuning_dataset = Dataset.from_list([
    {"question": prompt_template.format(question=raw_question), "answer": raw_answer}
    for raw_question, raw_answer in zip(dataset['question'], dataset['answer'])
])
finetuning_dataset

In [None]:
# Tokenize the first prompt+answer pair to inspect what the model will see.
first_example = finetuning_dataset[0]
sample_text = first_example['question'] + first_example['answer']
sample_tokenized = tokenizer(sample_text, return_tensors='pt')
sample_tokenized['input_ids'][0]

In [None]:
# @title Tokenize Dataset

def tokenize_function(examples):
  """Tokenize a batch of prompt/answer pairs into fixed-length model inputs.

  Args:
    examples: Batch mapping with parallel lists under "question" and "answer",
      as passed by `Dataset.map(..., batched=True)`.

  Returns:
    The tokenizer output for the batch: one row per example, each padded or
    truncated to 512 tokens.
  """
  # Concatenate every prompt with its answer. Processing the whole batch
  # (rather than only element 0) keeps the function correct for any batch_size.
  texts = [
      question + answer
      for question, answer in zip(examples["question"], examples["answer"])
  ]

  # The tokenizer gets its padding token from EOS here; set it before padding.
  tokenizer.pad_token = tokenizer.eos_token
  # Over-long examples lose tokens from the start, keeping the end of the answer.
  tokenizer.truncation_side = 'left'
  tokenized_input = tokenizer(
      texts,
      padding='max_length',
      truncation=True,
      max_length=512,
      return_tensors='pt'
  )

  return tokenized_input


# Tokenize the dataset. The function is passed directly (no wrapper lambda)
# and is invoked on batches of exactly one example.
tokenized_dataset = finetuning_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True,
    # the 'question' / 'answer' text columns are intentionally kept here
)

In [None]:
def _labels_with_padding_ignored(input_ids, attention_mask, ignore_index=-100):
  """Build causal-LM labels for one example, excluding padding from the loss.

  Positions whose attention mask is 0 are set to `ignore_index` (-100, the
  value the Hugging Face loss skips). The first padding position after the
  last real token is kept: padding uses the EOS token, so that single position
  is what teaches the model to stop after the answer.
  """
  labels = [
      token if keep else ignore_index
      for token, keep in zip(input_ids, attention_mask)
  ]
  last_real = max((pos for pos, keep in enumerate(attention_mask) if keep), default=-1)
  first_trailing_pad = last_real + 1
  if first_trailing_pad < len(labels):
    labels[first_trailing_pad] = input_ids[first_trailing_pad]
  return labels


# Copying input_ids verbatim would train on hundreds of padding tokens per
# example; mask them so the loss reflects the prompt/answer text only.
tokenized_dataset = tokenized_dataset.add_column(
    "labels",
    [
        _labels_with_padding_ignored(ids, mask)
        for ids, mask in zip(tokenized_dataset["input_ids"], tokenized_dataset["attention_mask"])
    ],
)

In [None]:
# @title Split Dataset
# 80/20 train/test split with a fixed seed so the split is reproducible.
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=25)
train_dataset, test_dataset = split_dataset['train'], split_dataset['test']

train_dataset, test_dataset

In [None]:
# Check that every training example has the same number of tokens
# (looks at all rows, not just two samples).
len({len(ids) for ids in train_dataset['input_ids']}) == 1

In [None]:
# Round-trip the first training example through the tokenizer as a sanity check.
example_encoded = train_dataset[0]['input_ids']
example_decoded = tokenizer.decode(example_encoded, skip_special_tokens=True)

print(example_encoded, '\n', example_decoded)

In [None]:
# Upload both splits to the Hugging Face Hub under the repository name held in
# `dataset_hf_name`.
# NOTE(review): later cells read it back from the hard-coded 'paacamo/' namespace —
# confirm that matches the logged-in account.
split_dataset.push_to_hub(dataset_hf_name)

### Training

In [None]:
!pip install wandb -q

In [None]:
# # !wandb login

# # Login to wandb using kaggle notebook
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# secret_value_0 = user_secrets.get_secret("wandb_api_key")

In [None]:
%env WANDB_PROJECT=eleutherai-nvidia-faq-fine-tuned
%env WANDB_WATCH=true
%env WANDB_LOG_MODEL=end

In [None]:
# @title Load Dataset
dataset_path_hf = f'paacamo/{dataset_hf_name}'
dataset = load_dataset(dataset_path_hf)

# Drop the raw text columns so only tokenized fields reach the Trainer.
# `remove_columns` does this directly, without pushing every row through an
# identity `map`.
text_columns = ['question', 'answer']
train_dataset = dataset['train'].remove_columns(text_columns)
test_dataset = dataset['test'].remove_columns(text_columns)

train_dataset, test_dataset

In [None]:
# Print the GPU status reported by the NVIDIA driver for this runtime.
!nvidia-smi

In [None]:
# Use CUDA when at least one GPU is visible, otherwise fall back to the CPU.
device_count = torch.cuda.device_count()
print(device_count)
device = torch.device("cuda" if device_count > 0 else "cpu")

base_model.to(device)

In [None]:
training_args = TrainingArguments(
    # Output location; existing contents are NOT overwritten
    output_dir=output_dir,
    overwrite_output_dir=False,

    # Optimisation
    optim="adafactor",
    learning_rate=2e-5,
    warmup_steps=1,  # learning-rate warmup steps
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,

    # Pass dataset columns through to the collator unchanged
    remove_unused_columns=False,

    # Step-based logging, evaluation and checkpointing
    logging_strategy="steps",
    logging_steps=1,
    eval_strategy="steps",
    eval_steps=100,  # update steps between two evaluations
    save_strategy='steps',
    save_steps=100,  # update steps between two checkpoints
    save_total_limit=1,
    disable_tqdm=False,  # progress bars stay enabled

    # Best-checkpoint selection: the lowest eval_loss is reloaded at the end
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Reporting and publishing
    report_to='wandb',
    run_name=model_finetuned_name,
    push_to_hub=True,
)

In [None]:
# @title Trainer
from transformers import DataCollatorWithPadding

# Padding reuses the EOS token as the pad token.
tokenizer.pad_token = tokenizer.eos_token
# Collator that batches examples via the tokenizer's padding logic.
# NOTE(review): examples are already padded to a fixed length during
# tokenization, so this presumably adds no further padding — confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)
# The Trainer fine-tunes `base_model` on the train split and evaluates on the test split.
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    data_collator=data_collator
)

In [None]:
# Run fine-tuning with the Trainer configured for `base_model`.
trainer.train()

In [None]:
# @title Save model
# Write the trained model to a "final" sub-directory of the Trainer output directory.
save_dir = f'{output_dir}/final'
trainer.save_model(save_dir)
print(f'model saved to {save_dir}')

In [None]:
# Evaluate on the `eval_dataset` given to the Trainer (the test split).
trainer.evaluate()

## **Inference & Evaluation**

In [None]:
# @title load Fine-Tuned Model
# Hub repository the fine-tuned checkpoint is loaded from.
finetuned_model_name = f'paacamo/{output_dir}'
device = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenizer first; padding reuses the EOS token.
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_name)
tokenizer.pad_token = tokenizer.eos_token

# Then the model, moved to the selected device.
finetuned_model = AutoModelForCausalLM.from_pretrained(finetuned_model_name)
finetuned_model.to(device)

In [None]:
def inference(text, model, tokenizer, max_input_token=1000, max_output_token=500):
  """Generate a continuation for `text` and return only the newly generated part.

  Args:
    text: Prompt string.
    model: Causal language model used for generation (must provide `.device`
      and `.generate`).
    tokenizer: Tokenizer matching `model`.
    max_input_token: Prompts longer than this many tokens are truncated from
      the left.
    max_output_token: Maximum number of newly generated tokens.

  Returns:
    The decoded generated text, without the prompt.
  """
  # Tokenize; left truncation keeps the end of an over-long prompt.
  tokenizer.truncation_side = 'left'
  input_ids = tokenizer.encode(
      text,
      return_tensors='pt',
      padding=True,
      truncation=True,
      max_length=max_input_token
  )

  # Generate with the model that was passed in, not a notebook global.
  # `max_new_tokens` bounds only the generated part, so a prompt longer than
  # `max_output_token` tokens no longer exhausts the whole length budget.
  input_ids = input_ids.to(model.device)
  output_ids = model.generate(
      input_ids=input_ids,
      max_new_tokens=max_output_token
  )

  # Strip the prompt by token count. Slicing the decoded string by len(text)
  # is wrong once the prompt was truncated or does not decode to itself.
  prompt_length = input_ids.shape[1]
  generated_text_answer = tokenizer.decode(
      output_ids[0][prompt_length:],
      skip_special_tokens=True
  )
  return generated_text_answer

In [None]:
from tqdm import tqdm
from pprint import pprint

# Pull the published test split and try the model on one fixed example.
dataset_infer = load_dataset(f"paacamo/{dataset_hf_name}", split='test')

sample = dataset_infer[90]
text = sample['question']
answer = sample['answer']

print(f'question: {text}')

# Reference answer next to the model's prediction for the same prompt.
predictions = dict(
    answer=answer,
    prediction=inference(text, finetuned_model, tokenizer),
)

pprint(predictions)

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Build a text-generation pipeline from the fine-tuned hub repository and try a free-form question.
# NOTE(review): this question is not wrapped in the "### Question: / ### Answer:"
# prompt used for fine-tuning — confirm that is intended.
pipe = pipeline("text-generation", model=finetuned_model_name)
pipe("what is the purpose of using CUDA rather than cpu?")

## Interface

In [None]:
!pip install gradio langchain langchain-core langchain langchain_huggingface langchain-community langchain_google_genai python-dotenv -q

In [None]:
# Write your code here
# Feel free to add new code block as needed

import gradio as gr
from transformers import pipeline

# Text-generation pipelines already built, keyed by model name, so the model
# is loaded once instead of on every question.
_CHATBOT_PIPELINES = {}


def chatbot(question):
    """
    This function takes a question as input and returns the chatbot's response.

    The text-generation pipeline for `finetuned_model_name` is created on the
    first call and reused afterwards.

    Args:
        question: Text entered by the user.

    Returns:
        The `generated_text` of the first result returned by the pipeline.
    """
    pipe = _CHATBOT_PIPELINES.get(finetuned_model_name)
    if pipe is None:
        pipe = pipeline('text-generation', model=finetuned_model_name)
        _CHATBOT_PIPELINES[finetuned_model_name] = pipe
    response = pipe(question)[0]['generated_text']
    return response


# Create the Gradio interface: one text box in, one text box out, backed by `chatbot`.
# NOTE(review): the title mentions Langchain, but `chatbot` only calls the
# transformers pipeline — confirm the wording.
iface = gr.Interface(
    fn=chatbot,
    inputs="text",
    outputs="text",
    title="Simple Chatbot with Langchain and Gradio",
    description="Ask me anything!",
)

# Launch the Gradio interface; debug=True is passed through to `launch`.
iface.launch(debug=True)