# WCC Implementaion, Fine-Tuning

## Imports

In [3]:
!pip install -q accelerate peft bitsandbytes transformers trl

In [4]:
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import pandas as pd

import matplotlib.pyplot as plt

from pathlib import Path

In [5]:
DATA_PATH = Path("/kaggle/input/nlp-project")
OUTPUT_PATH = Path("/kaggle/working/")

## Model

In [27]:
model_name = "MBZUAI/LaMini-Neo-125M"
new_model = "LaMini-Neo-125M-wcc_2"
max_context = 4096 # Update according to a model you use``

## Data Preparation

In [8]:
sentences = pd.read_csv(DATA_PATH / "sentences.csv")
sentences['Sentence'] = sentences['Sentence'].astype(str)
print(f"Total sentences before filtering: {len(sentences)}")
sentences = sentences[sentences["Word Count"] < 25]
print(f"Total sentences after filtering: {len(sentences)}")
sentences.head()

Total sentences before filtering: 456027
Total sentences after filtering: 339393


Unnamed: 0,Sentence,Word Count,Paragraph ID
0,A magazine supplement with an image of Adolf H...,19,0
1,"No law bans “Mein Kampf” in Germany, but the g...",19,0
2,(Thomas Peter/REUTERS),2,0
4,"What it doesn’t have, nor has it since 1945, a...",22,0
11,"“Maybe it was necessary once, but now it’s ove...",13,0


In [9]:
def add_wcc_token(sentence, word_count):
     return f"<{word_count}>{sentence}"

In [10]:
sentences['Sentence_with_WCC'] = sentences.apply(lambda row: add_wcc_token(row['Sentence'], row['Word Count']), axis=1)

In [11]:
paragraphs = sentences.groupby('Paragraph ID').agg({
    'Sentence_with_WCC': ' '.join,
    'Word Count': 'sum',
}).reset_index()

In [13]:
max_len = sentences["Word Count"].max()

In [17]:
print(f"Total paragraphs before filtering: {len(paragraphs)}")
paragraphs = paragraphs[paragraphs["Word Count"] < max_context]
print(f"Total paragraphs after filtering: {len(paragraphs)}")

Total paragraphs before filtering: 9985
Total paragraphs after filtering: 9936


In [18]:
def add_wcc_token_to_examples(examples):
    examples['Sentence'] = [add_wcc_token(sentence, wc) for sentence, wc in zip(examples['Sentence'], examples['Word Count'])]
    return examples

In [19]:
dataset = Dataset.from_pandas(paragraphs.drop(columns=["Paragraph ID"]))
next(iter(dataset))

{'Sentence_with_WCC': "<19>A magazine supplement with an image of Adolf Hitler and the title 'The Unreadable Book' is pictured in Berlin. <19>No law bans “Mein Kampf” in Germany, but the government of Bavaria, holds the copyright and guards it ferociously. <2>(Thomas Peter/REUTERS) <22>What it doesn’t have, nor has it since 1945, are copies of Hitler’s autobiography and political manifesto, “Mein Kampf,” in its bookstores. <13>“Maybe it was necessary once, but now it’s over, it makes no sense. <6>You can find it so easily.” <16>The publisher of the excerpts, London-based Albertus, has said it will appeal the Bavarian government’s injunction. <13>This idea is just naive,” said Alexander Luckow, a spokesman for the publisher. <15>“In a free country, you need to discuss these very bad parts of German history.” <22>Still, he said, there are limits, and using Hitler’s words as inspiration, not as historical artifact, is where it crosses the line. <17>“The danger is allowing right-wing peopl

## Define Training Parameters

In [20]:
lora_r = 128


lora_alpha = 32


lora_dropout = 0.05

In [21]:
use_4bit = True


bnb_4bit_compute_dtype = "float16"

bnb_4bit_quant_type = "nf4"

use_nested_quant = False

In [23]:
output_dir = OUTPUT_PATH / "results"


num_train_epochs = 10


fp16 = False
bf16 = False


per_device_train_batch_size = 16


per_device_eval_batch_size = 1


gradient_accumulation_steps = 2


gradient_checkpointing = True


max_grad_norm = 0.3


learning_rate = 1e-4


weight_decay = 0.001


optim = "paged_adamw_32bit"


lr_scheduler_type = "cosine"


max_steps = 1_000

warmup_ratio = 0.03


group_by_length = True


save_steps = 500


logging_steps = 25

max_seq_length = max_context


packing = False

device_map = "auto"

In [24]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

In [28]:
print("Initial tokenizer size:", len(tokenizer))
print("Max len: ", max_len)

additional_tokens = [f"<{i}>" for i in range(max_len + 1)]
tokenizer.add_tokens(additional_tokens)

print("Final tokenizer size:", len(tokenizer))

Initial tokenizer size: 50283
Max len:  24
Final tokenizer size: 50283


In [None]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.resize_token_embeddings(len(tokenizer))

In [31]:
print(model)

GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50283, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPTNeoBlock(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear4bit(in_features=768, out_features=768, bias=False)
            (v_proj): Linear4bit(in_features=768, out_features=768, bias=False)
            (q_proj): Linear4bit(in_features=768, out_features=768, bias=False)
            (out_proj): Linear4bit(in_features=768, out_features=768, bias=True)
          )
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear4bit(in_features=768, out_features=3072, bias=True)
          (c

In [34]:
model.config.use_cache = False
model.config.pretraining_tp = 1

In [35]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [36]:
# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

In [37]:
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="Sentence_with_WCC",  # this is the text column in dataset
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

## Train Model and Save

In [None]:
trainer.train()

Step,Training Loss


In [None]:
n = 4
batch = tokenizer(f"<{n}>", return_tensors='pt')
batch = {k: v.to('cuda') for k, v in batch.items()}

num_beams = 10 
length_penalty = 6
max_length = 100
repetition_penalty = 4.0

with torch.cuda.amp.autocast():
  output = model.generate(
    **batch, 
    max_length=max_length, 
    num_beams=num_beams, 
    length_penalty=length_penalty,
    repetition_penalty=repetition_penalty, 
    )

print('\n\n', tokenizer.decode(output[0], skip_special_tokens=True))

In [None]:
# Save trained model
trainer.model.save_pretrained(OUTPUT_PATH / new_model)