# Install

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U einops
!pip install -q -U safetensors
!pip install -q -U torch
!pip install -q -U xformers
!pip install -q -U datasets

In [None]:
# NOTE(review): redundant -- `datasets` is already installed (with -U) by the
# install cell at the top of the notebook; this cell can probably be removed.
!pip3 install datasets

In [None]:
from datasets import load_dataset
from huggingface_hub import list_datasets

# Search the Hugging Face Hub for aviation-related datasets.
# `datasets.list_datasets` is deprecated (and no longer available in newer
# `datasets` releases), so the Hub client is queried directly. Passing `search`
# lets the Hub filter server-side instead of downloading the full dataset list.
# The local substring check keeps the original "name contains 'aviation'" rule.
aviation_datasets = [
    dataset.id
    for dataset in list_datasets(search="aviation")
    if 'aviation' in dataset.id.lower()
]

# Print the names of the aviation-related datasets
print(aviation_datasets)

  all_datasets = list_datasets()


['elihoole/asrs-aviation-reports', 'sakharamg/AviationQA', 'temesgen21/aviation_leases', 'temesgen21/aviation_agreements', 'meghtedari/AviationRadio']


In [None]:
# from datasets import load_dataset

# # Replace 'dataset_name' with the name of the dataset you want to load
# #dataset_name = 'sakharamg/AviationQA'
# dataset_name="truthful_qa"

# # Load the dataset
# dataset = load_dataset(dataset_name,'generation')

# # Print information about the dataset
# print(dataset)

In [None]:
# Workaround taken from the Stack Overflow thread on Colab shell commands not working:
# https://stackoverflow.com/questions/56081324/why-are-google-colab-shell-commands-not-working
import locale

_FORCED_ENCODING = "UTF-8"


def getpreferredencoding(do_setlocale = True):
    """Replacement for ``locale.getpreferredencoding`` that always returns "UTF-8".

    ``do_setlocale`` is accepted only to keep the original call signature; it
    is ignored.
    """
    return _FORCED_ENCODING


# Monkey-patch the stdlib function so every later lookup reports UTF-8.
locale.getpreferredencoding = getpreferredencoding

# Import

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import transformers
import torch
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import AutoTokenizer

# Load Quantized Model

In [None]:
# Sharded Falcon-7B-Instruct checkpoint (sharded model by vilsonrodrigues).
model_id = "vilsonrodrigues/falcon-7b-instruct-sharded"

# Tokenizer for the checkpoint; the EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization and bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized model. The device_map sends the root module ("") to
# device 0; trust_remote_code lets the checkpoint's own modelling code run.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=True,
)

In [None]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing, then pass the quantized model through PEFT's
# k-bit training preparation step and keep the returned model.
# NOTE(review): prepare_model_for_kbit_training may already enable gradient
# checkpointing by default, which would make the explicit call redundant --
# confirm against the installed PEFT version.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Object exposing ``named_parameters()`` whose parameters provide
            ``numel()`` and ``requires_grad`` (e.g. a ``torch.nn.Module``).

    The printed line contains the trainable count, the total count and the
    trainable percentage. A model with no parameters reports ``trainable%: 0.0``
    instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: an empty model would otherwise divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings for causal language modelling: adapters are attached
# to the "query_key_value" modules with rank 16, alpha 32 and 5% dropout;
# bias terms are left out of the adapter.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapters and report how much of it is trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

# Prepare Data

In [None]:
from datasets import load_dataset

# Aviation question/answer dataset from the Hugging Face Hub.
# (An earlier experiment loaded ("truthful_qa", "generation") instead.)
data = load_dataset(path="sakharamg/AviationQA")

In [None]:
# Display the loaded dataset object to inspect what was loaded.
data

In [None]:

# Use the EOS token as the padding token (same setting as when the tokenizer was loaded).
tokenizer.pad_token = tokenizer.eos_token


def _join_question_answer(example):
    """Build the training text for one record: question, newline, answer.

    A falsy field (e.g. None) is replaced by an empty string so the
    concatenation cannot fail.
    """
    question = example['Question'] or ''
    answer = example['Answer'] or ''
    return {"input_text": question + "\n" + answer}


# First 100 rows of the 'validation' split, each extended with an "input_text" column.
# NOTE(review): training rows come from the 'validation' split, and the
# before/after demo also reads data['validation'][0], so that demo example is
# part of the training data -- confirm this is intended.
validation_subset = data['validation'].select(range(100))
train_dataset = validation_subset.map(_join_question_answer)

# Tokenize the datasets: pad the batch, truncate to at most 256 tokens, return PyTorch tensors.
train_encodings = tokenizer(
    train_dataset['input_text'],
    max_length=256,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

In [None]:
class TextDataset(Dataset):
    """Map-style dataset over a mapping of tokenizer encodings.

    Args:
        encodings: Mapping from field name (must include ``"input_ids"``) to an
            indexable batch, either tensors or nested lists. Indexing a field
            with an integer must yield one example.

    Each item is a dict holding an independent tensor copy of every field plus
    a ``"labels"`` entry that is a copy of ``"input_ids"``.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``.

        Tensors are copied with ``clone().detach()``; calling ``torch.tensor``
        on an existing tensor emits a copy-construct UserWarning for every
        item. Anything else (lists, numbers) goes through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        # The labels start as a copy of the input ids.
        item["labels"] = item["input_ids"].clone()
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])


In [None]:
# Wrap the tokenized encodings in a PyTorch dataset.
# Note: this rebinds `train_dataset`, which until now held the mapped Hugging Face dataset.
train_dataset = TextDataset(train_encodings)

# Example Before Fine Tuning

In [None]:
# Inspect the first record of the validation split.
data['validation'][0]

In [None]:
def generate(index):
    """Print the model's answer to one validation question next to the reference answer.

    Args:
        index: Row index into ``data['validation']``.

    Reads the notebook-level ``data``, ``tokenizer`` and ``model`` objects and
    prints the question, the decoded model output (which includes the prompt)
    and the dataset's reference answer. Returns nothing.
    """
    # Fetch the row once instead of indexing the dataset per field.
    example = data['validation'][index]
    example_text = example['Question']
    correct_answer = example['Answer']

    print("Question:")
    print(example_text)

    # Put the inputs on whichever device the model lives on.
    encoding = tokenizer(example_text, return_tensors="pt").to(model.device)

    # Greedy decoding is requested explicitly with do_sample=False. Emulating
    # it by sampling at temperature 1e-6 divides the logits by a tiny number,
    # which is numerically fragile. Gradients are not needed for inference.
    with torch.no_grad():
        output = model.generate(
            input_ids=encoding.input_ids,
            attention_mask=encoding.attention_mask,
            max_new_tokens=100,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            # Explicit pad id avoids the "Setting pad_token_id" warning on every call.
            pad_token_id=tokenizer.eos_token_id,
        )

    print("Answer:")
    print(tokenizer.decode(output[0], skip_special_tokens=True))

    print("Best Answer:")
    print(correct_answer)

    print()

In [None]:
# Baseline: answer validation example 0 with the model before fine-tuning.
generate(0)

# Training

In [None]:
# Training hyperparameters: 30 epochs, per-device batch size 16 with 4
# gradient-accumulation steps, cosine schedule with 5% warmup, paged 8-bit AdamW.
# NOTE(review): fp16=True is used here while the quantization config sets
# bnb_4bit_compute_dtype=torch.bfloat16 -- confirm the mixed-precision settings
# are meant to differ.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    num_train_epochs=30,
    # max_steps=100,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type='cosine',
    warmup_ratio=0.05,
    optim="paged_adamw_8bit",
    fp16=True,
    logging_steps=1,
)

# Language-modelling collator with masked-LM turned off (mlm=False).
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=val_dataset,
    data_collator=causal_lm_collator,
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

# Example After Fine Tuning

In [None]:
# Back to inference settings: re-enable the cache that was switched off for
# training, and put the model in evaluation mode.
model.config.use_cache = True
model.eval()

In [None]:
# Same validation example as the baseline run, now answered by the fine-tuned model.
generate(0)

# Merge Adapters and Save Model to Hub

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login widget; presumably required so the later
# push_to_hub calls can authenticate.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:

# Save the LoRA adapters locally, then upload them to the Hub.
# save_pretrained no longer receives push_to_hub=True: the explicit
# push_to_hub call below already uploads to the same repository name, so the
# flag only asked for the same upload a second time.
model.save_pretrained("/content/falcon-7b-instruct-ft-adapters-v2") # save LORA adapters
model.push_to_hub("falcon-7b-instruct-ft-adapters-v2")

In [None]:
import os
# Set the CUDA allocator option max_split_size_mb to 512 via the environment.
# NOTE(review): presumably this only takes effect if it is set before CUDA is
# first initialised in the process (i.e. in a fresh runtime) -- confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

In [None]:
### Run this cell only (to avoid memory issues)
from transformers import AutoModelForCausalLM, PretrainedConfig
import torch

# Reload the base model in float16, without a quantization config
# (you might need pro subscription for this).
model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-7b-instruct",
    torch_dtype=torch.float16,
    device_map={"": 0},
    trust_remote_code=True,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/667 [00:00<?, ?B/s]

Downloading (…)/configuration_RW.py:   0%|          | 0.00/2.61k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b-instruct:
- configuration_RW.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)main/modelling_RW.py:   0%|          | 0.00/47.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b-instruct:
- modelling_RW.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:
from peft import PeftModel

# Load a PEFT model: the reloaded base model plus the new adapters from the Hub.
# (An earlier version pointed at "vrsen/falcon-7b-instruct-ft-adapters".)
adapter_repo = "Phoenixsymbol/falcon-7b-instruct-ft-adapters-v2"
model = PeftModel.from_pretrained(model, adapter_repo)

Downloading (…)/adapter_config.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

Downloading adapter_model.bin:   0%|          | 0.00/18.9M [00:00<?, ?B/s]

In [None]:
model = model.merge_and_unload() # merge the adapters into the base model and keep the merged model

In [None]:

# Upload the merged model to the "falcon-7b-instruct-ft-v2" Hub repository.
model.push_to_hub("falcon-7b-instruct-ft-v2")

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.89G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Phoenixsymbol/falcon-7b-instruct-ft-v2/commit/227f6fd0b2078221b81c4821b96dafa44c14f541', commit_message='Upload RWForCausalLM', commit_description='', oid='227f6fd0b2078221b81c4821b96dafa44c14f541', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from transformers import AutoTokenizer

# Upload the base model's tokenizer next to the merged model so the Hub
# repository is loadable on its own.
# The deprecated `use_auth_token=True` argument is dropped: the credentials
# stored by notebook_login() are picked up by default, exactly as in the
# model.push_to_hub call for the same repository.
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct", trust_remote_code=True)
tokenizer.push_to_hub("falcon-7b-instruct-ft-v2")

Downloading (…)okenizer_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]



CommitInfo(commit_url='https://huggingface.co/Phoenixsymbol/falcon-7b-instruct-ft-v2/commit/a7a587d1b119a50090388bb7de464de7ed2fc7d5', commit_message='Upload tokenizer', commit_description='', oid='a7a587d1b119a50090388bb7de464de7ed2fc7d5', pr_url=None, pr_revision=None, pr_num=None)