## Installing libraries

In [1]:
!pip install "transformers==4.35" "peft==0.4.0" "accelerate==0.21.0" "bitsandbytes==0.40.2" "trl==0.4.7" "safetensors>=0.3.1" "tiktoken"
import transformers

print(transformers.__version__)

Collecting transformers==4.35
  Downloading transformers-4.35.0-py3-none-any.whl.metadata (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.1/123.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting peft==0.4.0
  Downloading peft-0.4.0-py3-none-any.whl.metadata (21 kB)
Collecting accelerate==0.21.0
  Downloading accelerate-0.21.0-py3-none-any.whl.metadata (17 kB)
Collecting bitsandbytes==0.40.2
  Downloading bitsandbytes-0.40.2-py3-none-any.whl.metadata (9.8 kB)
Collecting trl==0.4.7
  Downloading trl-0.4.7-py3-none-any.whl.metadata (10 kB)
Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting tokenizers<0.15,>=0.14 (from transformers==4.35)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers==4.35)
  Downloading huggingface_hub-0.17.3

In [2]:
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from random import randrange
from peft import LoraConfig, get_peft_model, AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

2024-05-22 13:55:39.903504: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-22 13:55:39.903619: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-22 13:55:40.074857: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Load dataset from local

In [3]:
# Dataset
from datasets import load_dataset
train_dataset = load_dataset("eli5_category",split="train[:50]")
validation_dataset = load_dataset("eli5_category",split="validation1[:5]")


Downloading builder script:   0%|          | 0.00/4.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/62.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.00M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.76M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.85M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/4 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/91772 [00:00<?, ? examples/s]

Generating validation1 split:   0%|          | 0/5446 [00:00<?, ? examples/s]

Generating validation2 split:   0%|          | 0/2375 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5411 [00:00<?, ? examples/s]

In [4]:
def organize_dataset(data):
    questions = []
    answers_list = []
    scores_list = []

    for item in data:
        answers = item['answers']['text']
        scores = item['answers'].get('score', [])

        questions.append(item['title'])
        answers_list.append(answers)
        scores_list.append(scores)
    return {
        "questions": questions,
        "answers": answers_list,
        "scores": scores_list
    }

train_data = organize_dataset(train_dataset)
train_data = {key: value for key, value in train_data.items()}


val_data = organize_dataset(validation_dataset)
val_data = {key: value for key, value in val_data.items()}


# Join all answers

In [5]:
concatenate_train_answer = []
for answe_list in train_data['answers']:
    concatenate_train_answer.append(' '.join(answe_list))

concatenate_val_answer = []
for answe_list in val_data['answers']:
    concatenate_val_answer.append(' '.join(answe_list))

    
print(len(concatenate_train_answer))
print(len(concatenate_val_answer))

50
5


In [6]:
train_data['answers'] = concatenate_train_answer
train_data['answers'][0]

"the rotation of the earth is not a constant. in fact the rotation of the earth is slowing down, which means that a full day is getting slightly longer. without leap seconds our clocks would slowly drift ever so slightly out of sync with the actual day. we could deal with this by redefining how how long 1 second is, making it slightly longer so that one day is still exactly 24*60*60 seconds. but in practice that is really inconvenient for a lot of our technology which relies on very precise timing. its easier to just move us ahead one second every couple of years or so. The Earth's rotation is not regular. It varies a bit, so sometimes we add a second. We do this to ensure that noon is always going to be sometime around mid-day. If we did not add leap seconds, over a very long period of time where the Earth's rotation slowly changed, noon could end up being at dusk. We want to keep 7am in the morning, noon at mid-day, 7pm around evening, etc. Though we have never had one, it's also pos

In [7]:
val_data['answers'] = concatenate_val_answer
val_data['answers'][0]

"Whilst I'm no expert I listened to a very good segment on LBC recently (uk talkback radio) that explained that the men are more likely to commit such abuse but the percentages of women abusers are skewed lower due to several cultural issues such as: 1. Women are automatically perceived as peaceful nurturers where as men are aggressive physicals. 2. Children are less likely to call abuse against female family members as they are less likely to be believed. 3. Female abuse is (wrongly) perceived in many cases as being invited especially where the victim is slightly older i.e. Boys must like it right. This is the one cultural aspect I hate the most because a male teacher/person in trust (for example) is automatically in the wrong and a female teacher is more likely to go free / have no charges / have no abuse claim made. Don't get me wrong - male abusers are still the largest statistic but to deny there is an issue with female abuse harms the ability of our children to come forward. It i

In [8]:
train_data_df = pd.DataFrame(train_data)
val_data_df  = pd.DataFrame(val_data)

display(train_data_df.head(5))
display(val_data_df.head(5))

Unnamed: 0,questions,answers,scores
0,Why there was a 'leap second' added to the end...,the rotation of the earth is not a constant. i...,"[44, 5, 4]"
1,How do you claim undiscovered land?,Imagine you are out walking in the woods near ...,"[195, 39, 5]"
2,Why do we fail to do realistic human CGI (like...,It's more that we're really good at picking up...,"[34, 11, 7, 7]"
3,Why is it that we calm down when we take a dee...,Anxiety/stress are the result of your sympathe...,[8]
4,Why does 1080p on a 4k TV look better than 108...,In a 1080p screen each pixel is represented by...,"[9, 4]"


Unnamed: 0,questions,answers,scores
0,why is paedophilia so much more common in men ...,Whilst I'm no expert I listened to a very good...,"[22, 12, 6, 5, 4]"
1,Why is it okay to make fun of people who are f...,Honestly I don't know but as a southerner I fi...,[5]
2,"Why do we, as humans, crave social interaction...",Lots of people prefer to do things on their ow...,[3]
3,"What was Nietzche's philosophy, exactly?","Nietzsche's ""Ubermensch"" is the goal that soci...","[686, 172, 69, 35, 26, 10, 7, 7, 6, 3, 3]"
4,The Political Spectrum,The political spectrum varies widely from coun...,[11]


In [9]:
# Concatenate 'Question' and 'Answer' columns into a new column 'text'
train_data_df['text'] = 'Question:\n' + train_data_df['questions'] + '\n\nAnswer:\n' + train_data_df['answers']
val_data_df['text'] = 'Question:\n' + val_data_df['questions'] + '\n\nAnswer:\n' + val_data_df['answers']
train_data_df.drop(labels=['questions','answers','scores'],axis=1,inplace=True)
val_data_df.drop(labels=['questions','answers','scores'],axis=1,inplace=True)

In [10]:
display(train_data_df.head(5))
display(val_data_df.head(5))

Unnamed: 0,text
0,Question:\nWhy there was a 'leap second' added...
1,Question:\nHow do you claim undiscovered land?...
2,Question:\nWhy do we fail to do realistic huma...
3,Question:\nWhy is it that we calm down when we...
4,Question:\nWhy does 1080p on a 4k TV look bett...


Unnamed: 0,text
0,Question:\nwhy is paedophilia so much more com...
1,Question:\nWhy is it okay to make fun of peopl...
2,"Question:\nWhy do we, as humans, crave social ..."
3,"Question:\nWhat was Nietzche's philosophy, exa..."
4,Question:\nThe Political Spectrum\n\nAnswer:\n...


In [14]:
print(train_data_df['text'].loc[0])

Question:
Why there was a 'leap second' added to the end of 2016?

Answer:
the rotation of the earth is not a constant. in fact the rotation of the earth is slowing down, which means that a full day is getting slightly longer. without leap seconds our clocks would slowly drift ever so slightly out of sync with the actual day. we could deal with this by redefining how how long 1 second is, making it slightly longer so that one day is still exactly 24*60*60 seconds. but in practice that is really inconvenient for a lot of our technology which relies on very precise timing. its easier to just move us ahead one second every couple of years or so. The Earth's rotation is not regular. It varies a bit, so sometimes we add a second. We do this to ensure that noon is always going to be sometime around mid-day. If we did not add leap seconds, over a very long period of time where the Earth's rotation slowly changed, noon could end up being at dusk. We want to keep 7am in the morning, noon at mid

In [15]:
print(val_data_df['text'].loc[0])

Question:
why is paedophilia so much more common in men than women?

Answer:
Whilst I'm no expert I listened to a very good segment on LBC recently (uk talkback radio) that explained that the men are more likely to commit such abuse but the percentages of women abusers are skewed lower due to several cultural issues such as: 1. Women are automatically perceived as peaceful nurturers where as men are aggressive physicals. 2. Children are less likely to call abuse against female family members as they are less likely to be believed. 3. Female abuse is (wrongly) perceived in many cases as being invited especially where the victim is slightly older i.e. Boys must like it right. This is the one cultural aspect I hate the most because a male teacher/person in trust (for example) is automatically in the wrong and a female teacher is more likely to go free / have no charges / have no abuse claim made. Don't get me wrong - male abusers are still the largest statistic but to deny there is an iss

In [13]:
#convert to Huggingface Datasets format
train = Dataset.from_pandas(train_data_df)
validation = Dataset.from_pandas(val_data_df)
display(train)
display(validation)

Dataset({
    features: ['text'],
    num_rows: 50
})

Dataset({
    features: ['text'],
    num_rows: 5
})

# Fine-Tuning

In [16]:
model_id = "genaitraining/llama-2-7b-domain-tuned"

In [17]:
# Get the type
compute_dtype = getattr(torch, "float16")

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype
)


In [18]:
%%time
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Downloading tokenizer_config.json:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

CPU times: user 134 ms, sys: 16.9 ms, total: 151 ms
Wall time: 851 ms


In [19]:
%%time
# Load the pretrained model
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map="auto")

Downloading config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

CPU times: user 23.2 s, sys: 22.6 s, total: 45.8 s
Wall time: 2min 26s


In [20]:
# LoRA config based on QLoRA paper
peft_config = LoraConfig(
                          lora_alpha=16,
                          lora_dropout=0.1,
                          r=64,
                          bias="none",
                          task_type="CAUSAL_LM"
                        )

In [23]:
# Define the training arguments. For full list of arguments, check
#https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
args = TrainingArguments(
    output_dir='my_model',
    num_train_epochs=2, # adjust based on the data size
    per_device_train_batch_size=2, # use 4 if you have more GPU RAM
    save_strategy="steps", #steps
    save_steps=20,
    evaluation_strategy="steps",
    logging_steps=20,
    eval_steps = 20,
    prediction_loss_only=True,
    learning_rate=2e-4,
    fp16=True,
    seed=42,
    save_total_limit=2,
    load_best_model_at_end =True,
    #group_by_length= True
)

In [24]:
# Create the trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train,
    eval_dataset=validation,
    dataset_text_field='text',
    peft_config=peft_config,
    
    max_seq_length=1042,
    tokenizer=tokenizer,
    args=args,
    packing=True,
)

In [25]:
!pip install -q wandb
import wandb
key_api = '541b559339e102d32d9c98eb3ad2918f7703c4b1'
wandb.login(key=key_api)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [26]:
# train
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mamirifard-mo[0m ([33mmohammadamirifard[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
20,2.4736,2.270272




TrainOutput(global_step=26, training_loss=2.4407525062561035, metrics={'train_runtime': 211.839, 'train_samples_per_second': 0.472, 'train_steps_per_second': 0.236, 'total_flos': 2075944729804800.0, 'train_loss': 2.4407525062561035, 'epoch': 1.52})

In [27]:
# save model in local
trainer.save_model()

# Clear Cashe

In [28]:
# Empty VRAM
del model
del trainer
import gc
gc.collect()
gc.collect()

0

In [29]:
torch.cuda.empty_cache()

In [30]:
gc.collect()

0

# Load Model from directory

In [31]:
%%time
from peft import AutoPeftModelForCausalLM

model_directory = '/kaggle/working/my_model'
new_model = AutoPeftModelForCausalLM.from_pretrained(
    model_directory,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

CPU times: user 23.2 s, sys: 4.22 s, total: 27.4 s
Wall time: 26.6 s


# Test the model

## Test1

In [32]:
question_index_to_ask = 0


prompt = validation_dataset['title'][question_index_to_ask]
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
outputs = new_model.generate(input_ids=input_ids,
                         max_new_tokens=150,
                        #  do_sample=True,
                        #  top_p=0.9,
                         temperature=0.2)
result = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
print(result)



Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


why is paedophilia so much more common in men than women?

It is not accurate to say that pedophilia is more common in men than women. According to the American Psychiatric Association, pedophilia is a mental disorder that affects both men and women, although the prevalence of pedophilia may be higher among men.

It is important to note that pedophilia is a rare disorder, and it is not accurate to make generalizations about the prevalence of pedophilia based on gender. The prevalence of pedophilia is not well understood, and there is ongoing research in this area.

It is also important to recognize that pedophilia is a complex and multifaceted disorder that is influenced by a


In [33]:
print('True Answer From dataset is: ',)
print('*'*100)
for index,answer in enumerate(validation_dataset['answers'][question_index_to_ask]['text']):
    print(f'True Answer {index}:\n {answer}')
    print('----'*4)

True Answer From dataset is: 
****************************************************************************************************
True Answer 0:
 Whilst I'm no expert I listened to a very good segment on LBC recently (uk talkback radio) that explained that the men are more likely to commit such abuse but the percentages of women abusers are skewed lower due to several cultural issues such as: 1. Women are automatically perceived as peaceful nurturers where as men are aggressive physicals. 2. Children are less likely to call abuse against female family members as they are less likely to be believed. 3. Female abuse is (wrongly) perceived in many cases as being invited especially where the victim is slightly older i.e. Boys must like it right. This is the one cultural aspect I hate the most because a male teacher/person in trust (for example) is automatically in the wrong and a female teacher is more likely to go free / have no charges / have no abuse claim made. Don't get me wrong - ma

## Test2

In [34]:
question_index_to_ask = 1


prompt = validation_dataset['title'][question_index_to_ask]
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
outputs = new_model.generate(input_ids=input_ids,
                         max_new_tokens=150,
                        #  do_sample=True,
                        #  top_p=0.9,
                         temperature=0.2)

result = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
print(result)

Why is it okay to make fun of people who are from the southern United States?

It is not okay to make fun of people based on their geographical location. Making fun of someone's accent, culture, or background is a form of bullying and can be hurtful and disrespectful. Everyone deserves to be treated with dignity and respect, regardless of where they are from.

It is important to recognize that people from the southern United States have a unique culture and heritage that is worth celebrating. They have made significant contributions to American society, including in fields such as music, art, literature, and cuisine.

Rather than making fun of people from the south, we should strive to understand and appreciate their culture and traditions. We can


In [35]:
print('True Answer From dataset is: ',)
print('*'*100)
for index,answer in enumerate(validation_dataset['answers'][question_index_to_ask]['text']):
    print(f'True Answer {index}:\n {answer}')
    print('----'*4)

True Answer From dataset is: 
****************************************************************************************************
True Answer 0:
 Honestly I don't know but as a southerner I find most south jokes funny, except for a few that take it a tad to far.
----------------
