# Installing Packages

In [2]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu117/
!pip install -q -U transformers accelerate optimum
!pip install peft https://github.com/huggingface/peft.git@9f7492577ff91c51077308f98dade45bf32c268a
!pip install loralib==0.1.1
!pip install optimum
!pip install autogptq
!pip install -q datasets bitsandbytes einops trl

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting https://github.com/huggingface/peft.git@9f7492577ff91c51077308f98dade45bf32c268a
[31m  ERROR: HTTP error 404 while getting https://github.com/huggingface/peft.git@9f7492577ff91c51077308f98dade45bf32c268a[0m[31m
[0m[31mERROR: Could not install requirement https://github.com/huggingface/peft.git@9f7492577ff91c51077308f98dade45bf32c268a because of HTTP error 404 Client Error: Not Found for url: https://github.com/huggingface/peft.git@9f7492577ff91c51077308f98dade45bf32c268a for URL https://github.com/huggingface/peft.git@9f7492577ff91c51077308f98dade45bf32c268a[0m[31m
[31mERROR: Could not find a version that satisfies the requirement autogptq (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for autogptq[0m[31m
[0m

# Import Modules

In [3]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel, prepare_model_for_kbit_training
import torch
import optimum
import peft
import accelerate
import auto_gptq

# Model Selection and Quantizing

In [4]:
# Defining the name of the Falcon model
model_name = "ybelkada/falcon-7b-sharded-bf16"

# Configuring the BitsAndBytes quantization
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
)

# Loading the Falcon model with quantization configuration
model = AutoModelForCausalLM.from_pretrained(
model_name,
quantization_config=bnb_config,
trust_remote_code=True
)

# Disabling cache usage in the model configuration
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

# Preparing Data

In [5]:
data1 = load_dataset("ttbui/html_alpaca", split="train")

data = data1.remove_columns(['input','response'])
print(data)


Dataset({
    features: ['instruction', 'output'],
    num_rows: 636
})


In [6]:

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
def tokenize_function(examples):
    if "instruction" in examples and "output" in examples:
      text = examples["instruction"][0] + examples["output"][0]


    tokenizer.pad_token = tokenizer.eos_token
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding=True,
    )

    max_length = min(
        tokenized_inputs["input_ids"].shape[1],
        2048
    )
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length
    )

    return tokenized_inputs

In [7]:
finetuning_dataset_loaded = load_dataset("ttbui/html_alpaca",split="train")

tokenized_dataset = finetuning_dataset_loaded.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)

print(tokenized_dataset)

final_tokenized_dataset = tokenized_dataset.remove_columns(['input','response'])
print(final_tokenized_dataset)

Dataset({
    features: ['response', 'input', 'instruction', 'output', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 636
})
Dataset({
    features: ['instruction', 'output', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 636
})


In [8]:
#splitting into Testing and training sets
split_dataset = final_tokenized_dataset.train_test_split(test_size=0.2, shuffle=True, seed=123)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 508
    })
    test: Dataset({
        features: ['instruction', 'output', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 128
    })
})


# Training and Fine Tuning

In [8]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [9]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [10]:
model = prepare_model_for_kbit_training(model)

lora_alpha = 16
lora_dropout = 0.1
lora_rank = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_rank,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ]
)

peft_model = get_peft_model(model, peft_config)

In [11]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
tokenizer.pad_token = tokenizer.eos_token
training_args = TrainingArguments(
    gradient_accumulation_steps=4,

    per_device_train_batch_size=1,
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=4,
    logging_steps=25,
    output_dir="output_dir",
    save_strategy='epoch',
    optim="paged_adamw_8bit",
    lr_scheduler_type = 'cosine',
    warmup_ratio = 0.05,
)

trainer = Trainer(
    model=peft_model,
    train_dataset=split_dataset["train"],

    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [12]:
import warnings
warnings.filterwarnings("ignore")

In [13]:
trainer.train()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,0.7931
50,0.6518
75,0.5846
100,0.6144
125,0.5987
150,0.4853
175,0.4649
200,0.4495
225,0.4559
250,0.5043


TrainOutput(global_step=381, training_loss=0.4929381017609844, metrics={'train_runtime': 672.2597, 'train_samples_per_second': 2.267, 'train_steps_per_second': 0.567, 'total_flos': 1.197506199212928e+16, 'train_loss': 0.4929381017609844, 'epoch': 3.0})

# Evaluation

In [26]:
#Saving the Fine-tuned model
from transformers import AutoModelForCausalLM
output_dir = "/content/output_dir"


final_model = AutoModelForCausalLM.from_pretrained(
output_dir, local_files_only=True,
quantization_config=bnb_config,
trust_remote_code=True
)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [12]:
print(final_model)

FalconForCausalLM(
  (transformer): FalconModel(
    (word_embeddings): Embedding(65024, 4544)
    (h): ModuleList(
      (0-31): 32 x FalconDecoderLayer(
        (self_attention): FalconAttention(
          (rotary_emb): FalconRotaryEmbedding()
          (query_key_value): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4544, out_features=4672, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4544, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=64, out_features=4672, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (dense): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4544, out_features=4544, bias=False)
            (lora_dropout): M

In [None]:
import os

# Get the current working directory
current_directory = os.getcwd()

# List all files in the current directory
files_in_directory = os.listdir(current_directory)

# Print the list of files
print("Files in the current directory:")
for file_name in files_in_directory:
    print(file_name)

Files in the current directory:
.config
output_dir
sample_data


In [16]:
split_dataset["test"]

Dataset({
    features: ['instruction', 'output', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 128
})

In [18]:
import torch
import pandas as pd
from tqdm import tqdm

# Define the inference function
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens
    )

    # Ensure that pad_token_id is set for open-end generation
    model.config.pad_token_id = model.config.eos_token_id

    # Generate sequences while explicitly setting attention_mask
    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids.to(model.device),
        max_length=max_output_tokens,
        pad_token_id=model.config.eos_token_id,
        attention_mask=input_ids.to(model.device)  # Setting attention mask
    )

    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)
    generated_text_answer = generated_text_with_prompt[0][len(text):]

    # Tokenize the generated text
    generated_tokens_answer = tokenizer.encode(generated_text_answer, return_tensors="pt").squeeze()

    return generated_tokens_answer

# Define a function to calculate the loss between predicted and target outputs
def calculate_loss(predicted_tokens, target_tokens):
    # Calculate the loss using a suitable loss function (e.g., CrossEntropyLoss for token-level comparison)
    loss_function = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    loss = loss_function(predicted_tokens, target_tokens)

    return loss.item()  # Return the loss value as a scalar


evaluation_dataset = split_dataset["test"]

# Define an empty list to store loss values
losses = []
num_samples_to_process = 10
# Evaluate the loss for each item in the test dataset
for i, item in tqdm(enumerate(evaluation_dataset[:num_samples_to_process])):
    print("i Evaluating: " + str(item))
    question = item['instruction']
    answer = item['output']

    try:
        predicted_tokens = inference(question, final_model, tokenizer)
        target_tokens = tokenizer.encode(answer, return_tensors="pt").squeeze()
        loss_value = calculate_loss(predicted_tokens, target_tokens)
        losses.append(loss_value)
    except:
        continue

# Calculate the average loss across all predictions
average_loss = sum(losses) / len(losses) if losses else 0
print(f"Average Loss: {average_loss}")


0it [00:00, ?it/s]

i Evaluating: instruction





TypeError: ignored

In [21]:
import pandas as pd
from tqdm import tqdm

# Define a function to check exact match between answers
def is_exact_match(a, b):
    return a.strip() == b.strip()

# Define a function for inference
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    tokenizer.pad_token = tokenizer.eos_token
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens
    )

    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids.to(model.device),
        max_length=max_output_tokens
    )

    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)
    generated_text_answer = generated_text_with_prompt[0][len(text):]

    return generated_text_answer

# Load the evaluation dataset

evaluation_dataset = split_dataset["test"]

# Modify the evaluation loop for your split test dataset
metrics = {'exact_matches': []}
predictions = []

for i, item in tqdm(enumerate(evaluation_dataset)):
    question = item['instruction']
    answer = item['output']

    try:
        predicted_answer = inference(question, final_model, tokenizer)  # Use your trained peft_model and tokenizer
    except:
        continue

    predictions.append([predicted_answer, answer])
    exact_match = is_exact_match(predicted_answer, answer)
    metrics['exact_matches'].append(exact_match)

print('Number of exact matches: ', sum(metrics['exact_matches']))
df = pd.DataFrame(predictions, columns=["predicted_output", "target_output"])
print(df)


0it [00:00, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
1it [00:06,  6.21s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
2it [00:13,  6.59s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
3it [00:17,  5.69s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable r

Number of exact matches:  0
                                      predicted_output  \
0    <html>\n<head>\n    <title>My Website</title>\...   
1    \n<div>\n  <h1>My Website</h1>\n</div>\n\n You...   
2     It should also include a motivational quote a...   
3     The table should have a header row and a foot...   
4    .<!DOCTYPE html>\n<html>\n<head>\n    <title>U...   
..                                                 ...   
123   The page should have a centered title and a c...   
124  \n...[login to view URL] [login to view URL] [...   
125  .<html>\n  <head>\n    <title>Page Counter</ti...   
126   The heading should say "Welcome to my website...   
127   It should also have a section for 'Parent Tip...   

                                         target_output  
0    <html>\n<head>\n <title>My Website</title>\n</...  
1    <!DOCTYPE html>\n<html>\n  <head> \n    <title...  
2    <!DOCTYPE html>\n<html>\n<head>\n<title>Wellne...  
3    <html>\n  <head>\n    <title>Interactive T




In [19]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
  # Tokenize
  tokenizer.pad_token = tokenizer.eos_token
  input_ids = tokenizer.encode(
      text,
      return_tensors="pt",
      truncation=True,
      max_length=max_input_tokens
  )

  # Generate
  ddevice = model.device
  generated_tokens_with_prompt = model.generate(
    input_ids=input_ids.to(device),
    max_length=max_output_tokens
  )

In [22]:
#Generating one HTML code using the model and comapring with Actual data set
test_question = split_dataset['test']['instruction'][1]
generated_answer = inference(test_question, final_model, tokenizer)
print(test_question)
print(generated_answer)
answer = split_dataset['test']["output"][1]
print(answer)



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Rewrite this HTML code so that it is valid HTML5.

<div>
  <h1>My Website</h1>
</div>

 You can use the following code to make it valid HTML5.<div>
  <h1>My Website</h1>
</div>

 I have added the missing tags for you.<div>
  <h1>My Website</h1>
</div>

 You can use the following code
<!DOCTYPE html>
<html>
  <head> 
    <title>Example website</title>
  </head> 
  <body>
    <h1>My example website</h1>
    <p>This is some example text</p>
  </body> 
</html>
