In [2]:
import os
import shutil
if os.path.exists('../fine-tuned-models'):
    shutil.rmtree('../fine-tuned-models')
os.makedirs('../fine-tuned-models')

In [104]:
!ls ../working

test.json		    text-generation-lora-model-v2.zip  val.json
text-generation-lora-model  train.json			       wandb


In [4]:
!pip install torch
!pip install transformers
!pip install datasets
!pip install pandas
!pip install transformers[torch]



In [5]:
!pip install peft



In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from datasets import load_dataset, Dataset, DatasetDict
from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model
import pandas as pd
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [68]:
MODEL_NAME = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [8]:
dataset = load_dataset("BrightData/IMDb-Media")
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'popularity', 'genres', 'presentation', 'credit', 'videos', 'photos', 'top_cast', 'details_release_date', 'details_countries_of_origin', 'details_official_site', 'details_language', 'details_also_known_as', 'details_filming_locations', 'details_production_companies', 'specs_color', 'specs_sound_mix', 'specs_aspect_ratio', 'url', 'media_type', 'imdb_rating', 'poster_url', 'imdb_rating_count', 'awards', 'critics_review_count', 'episode_count', 'review_count', 'review_rating', 'featured_review', 'storyline', 'boxoffice_budget'],
        num_rows: 249303
    })
})

In [9]:
rows = []
for item in dataset["train"]:
    rows.append(
        {
            "synopsis": item["storyline"],
        }
    )
df = pd.DataFrame(rows)

In [10]:
df.head()

Unnamed: 0,synopsis
0,The untold story of the first woman freedom fi...
1,Somewhere on the internet is a land where comm...
2,"Follows Amy Powney, a daughter of environmenta..."
3,"With a little help from an angel, Mercedes Wri..."
4,"On the heels of the COVID-19 pandemic, we foll..."


In [11]:
df.isnull().value_counts()

synopsis
False       201629
True         47674
Name: count, dtype: int64

In [12]:
df = df.dropna(subset=['synopsis'])
df.isnull().value_counts()

synopsis
False       201629
Name: count, dtype: int64

In [13]:
df = df[:5000]
df.size

5000

In [14]:
from sklearn.model_selection import train_test_split

train, temp = train_test_split(df, test_size=0.2)
val, test = train_test_split(temp, test_size=0.2)

In [15]:
len(train), len(val), len(test)

(4000, 800, 200)

In [16]:
train.sample(n=4000).to_json("train.json", orient="records", lines=True)
val.sample(n=500).to_json("val.json", orient="records", lines=True)
test.sample(n=100).to_json("test.json", orient="records", lines=True)

dataset = load_dataset(
    "json",
    data_files={"train": "train.json", "validation": "val.json", "test": "test.json"},
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['synopsis'],
        num_rows: 4000
    })
    validation: Dataset({
        features: ['synopsis'],
        num_rows: 500
    })
    test: Dataset({
        features: ['synopsis'],
        num_rows: 100
    })
})

In [69]:
def tokenize_function(example):
  tokenized_inputs = tokenizer(
      example["synopsis"],
      truncation=True,
      max_length=128,
      return_overflowing_tokens=True
  )

  return tokenized_inputs

In [70]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token':'[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=8, remove_columns=list(df.columns))
tokenized_dataset = tokenized_dataset.remove_columns('overflow_to_sample_mapping')
tokenized_dataset

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 7228
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 864
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 184
    })
})

In [71]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)
data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
), mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [72]:
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
)

In [73]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 147,456 || all params: 82,060,032 || trainable%: 0.1797




In [74]:
lr = 2e-5
num_epochs = 20

training_args = TrainingArguments(
    output_dir="../fine-tuned-models",
    learning_rate=lr,
    num_train_epochs=num_epochs,
    eval_accumulation_steps = 5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [33]:
from kaggle_secrets import UserSecretsClient
import wandb
user_secrets = UserSecretsClient()
wnb_token = user_secrets.get_secret("wnb")

wandb.login(key=wnb_token)
run = wandb.init(
    project="Text-Generation-LLM-Voicera",
    job_type="training",
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mrathink4[0m ([33mrathink4_[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [75]:
import torch
torch.cuda.empty_cache()
torch.cuda.memory_summary(device=None, abbreviated=False)



In [76]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [77]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-5): 6 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
         

In [78]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,3.451479
2,3.738200,3.405215
3,3.649400,3.385736
4,3.642800,3.374015
5,3.615900,3.365163
6,3.607800,3.358436
7,3.602100,3.352974
8,3.577800,3.349013
9,3.590600,3.34567
10,3.584800,3.342798




TrainOutput(global_step=9040, training_loss=3.5990492272166024, metrics={'train_runtime': 2093.7817, 'train_samples_per_second': 69.043, 'train_steps_per_second': 4.318, 'total_flos': 4737962185850880.0, 'train_loss': 3.5990492272166024, 'epoch': 20.0})

In [79]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-5): 6 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
         

In [91]:
model.save_pretrained("../text-generation-lora-model-v2")

In [85]:
pwd

'/kaggle/working'

In [98]:
!ls ..

fine-tuned-models  input  lib  outputs	text-generation-lora-model-v2  working


In [96]:
import os
import shutil
if os.path.exists('../text-generation-lora-model'):
    shutil.rmtree('../text-generation-lora-model')

In [105]:
!zip -r text-generation-lora-modl-v2.zip . ../text-generation-lora-model-v2

  adding: test.json (deflated 73%)
  adding: .virtual_documents/ (stored 0%)
  adding: train.json (deflated 75%)
  adding: text-generation-lora-model-v2.zip (stored 0%)
  adding: text-generation-lora-model/ (stored 0%)
  adding: text-generation-lora-model/adapter_model.safetensors (deflated 7%)
  adding: text-generation-lora-model/README.md (deflated 66%)
  adding: text-generation-lora-model/adapter_config.json (deflated 54%)
  adding: wandb/ (stored 0%)
  adding: wandb/run-20250319_004345-xivn5uo5/ (stored 0%)
  adding: wandb/run-20250319_004345-xivn5uo5/files/ (stored 0%)
  adding: wandb/run-20250319_004345-xivn5uo5/files/wandb-metadata.json (deflated 55%)
  adding: wandb/run-20250319_004345-xivn5uo5/files/config.yaml (deflated 75%)
  adding: wandb/run-20250319_004345-xivn5uo5/files/requirements.txt (deflated 56%)
  adding: wandb/run-20250319_004345-xivn5uo5/files/wandb-summary.json (deflated 46%)
  adding: wandb/run-20250319_004345-xivn5uo5/files/output.log (deflated 90%)
  adding: 

In [87]:
mv ../text-generation-lora-model-v2 /kaggle/outputs/

In [3]:
from peft import AutoPeftModelForCausalLM, PeftConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_NAME = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

trained_model = AutoPeftModelForCausalLM.from_pretrained('text-generation-lora-model-v2')

trained_model.to("cpu")


prompt = "A short film about a father and son"
inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

outputs = trained_model.generate(
    inputs.input_ids,
    max_new_tokens = 100,
    do_sample=True,
    top_k=50,
    top_p=0.95,
)

output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
output_str

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


["A short film about a father and son, 'All In', that makes his life so much easier in the world. The father is a scientist who learns that his wife is the one who will make the difference - it will help him make some difference. —Lois N. Domingo (Lois N. Domingo)A short film about a father and son, 'All In', that makes his life so much easier in the world. The father is a scientist who learns that his wife is the one who will make"]