In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import transformers

from tqdm import tqdm
import os
from torch.utils.tensorboard import SummaryWriter
import datetime

from transformers import AutoTokenizer, AutoModelForCausalLM
# --- Config ---
model_dir = "meta-llama/Llama-3.2-1B-Instruct"
device = torch.device("cuda:5" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True

batch_size = 8
num_epochs = 1
learning_rate = 5e-5
max_length = 512
gradient_accumulation_steps = 4
save_steps = 1000
log_interval = 10  # Log every 10 iterations
output_dir = "charllama-finetuned"  # Directory to save checkpoints
log_dir = os.path.join("runs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
writer = SummaryWriter(log_dir=log_dir)

# --- Dataset class ---
class CharDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.sentences = self._load_and_preprocess(file_path)

    def _load_and_preprocess(self, file_path):
        with open(file_path, "r") as f:
            lines = f.readlines()
        return [line.strip().upper() for line in lines if len(line.strip()) > 10]

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        sentence = " ".join(sentence)
        tokens = self.tokenizer(
                        sentence,
                        max_length=max_length,
                        padding='max_length',
                        truncation=True,
                        return_tensors='pt',
                        add_special_tokens=False  # <-- This disables special tokens
                    )
        input_ids = tokens['input_ids'].squeeze(0)
        attention_mask = tokens['attention_mask'].squeeze(0)
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# --- Load tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(model_dir)
tokenizer.pad_token = tokenizer.eos_token


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Load model, optimizer, scheduler, and scaler from checkpoint (if available) ---
checkpoint_path = None  # Set to the checkpoint directory if you want to resume training
model = AutoModelForCausalLM.from_pretrained(model_dir).to(device)
# Count trainable parameters
num_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {num_params / 1e6:.2f}M")

# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last 2 transformer layers
transformer_layers = model.model.layers  # Access the transformer block list
num_layers = len(transformer_layers)

for i in range(num_layers - 2, num_layers):
    for param in transformer_layers[i].parameters():
        param.requires_grad = True

# Also unfreeze the final language model head (optional but often useful)
for param in model.lm_head.parameters():
    param.requires_grad = True

for param in model.model.norm.parameters():
    param.requires_grad = True
    

# Count trainable parameters
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {num_params / 1e6:.2f}M")



optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=learning_rate
)
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # Will be updated if loading from checkpoint
    num_training_steps=1 # Initialize with a non-zero value to avoid the check if loading
)
scaler = torch.cuda.amp.GradScaler()
global_step = 0
start_epoch = 0


Model parameters: 1235.81M
Trainable parameters: 384.31M


  scaler = torch.cuda.amp.GradScaler()


In [3]:
os.makedirs(output_dir, exist_ok=True)
print("Starting training from scratch.")
train_dataset_temp = CharDataset("/raid/home/rajivratn/hemant_rajivratn/last/data/txt/train.wrd", tokenizer, max_length)
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=len(DataLoader(train_dataset_temp, batch_size=batch_size)) // 10,
    num_training_steps=len(DataLoader(train_dataset_temp, batch_size=batch_size)) * num_epochs // gradient_accumulation_steps
    )


Starting training from scratch.


In [None]:
# --- Create dataset and dataloader ---
train_file_path = "/raid/home/rajivratn/hemant_rajivratn/last/data/txt/train_norm.txt"
dataset = CharDataset(train_file_path, tokenizer, max_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)

loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)

# --- Training Loop ---
model.zero_grad()
for epoch in range(start_epoch, num_epochs):
    total_loss = 0
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), initial=global_step % len(dataloader) if start_epoch == epoch else 0, desc=f"Epoch {epoch + 1}/{num_epochs}")
    for step, batch in progress_bar:
        if start_epoch == epoch and step < global_step % len(dataloader):
            continue  # Skip steps already done in the previous run

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.cuda.amp.autocast():
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            )
            logits = outputs.logits[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        loss = loss / gradient_accumulation_steps
        scaler.scale(loss).backward()

        if (step + 1) % gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()
            global_step += 1

            # Log loss and learning rate to TensorBoard every log_interval steps
            if global_step % log_interval == 0:
                writer.add_scalar('loss/step', loss.item() * gradient_accumulation_steps, global_step)
                writer.add_scalar('learning_rate', scheduler.get_last_lr()[0], global_step)

            if global_step % save_steps == 0:
                checkpoint_dir = os.path.join(output_dir, f"checkpoint")
                os.makedirs(checkpoint_dir, exist_ok=True)
                model.save_pretrained(checkpoint_dir)
                tokenizer.save_pretrained(checkpoint_dir)
                torch.save(optimizer.state_dict(), os.path.join(checkpoint_dir, 'optimizer.pt'))
                torch.save(scheduler.state_dict(), os.path.join(checkpoint_dir, 'scheduler.pt'))
                torch.save(scaler.state_dict(), os.path.join(checkpoint_dir, 'scaler.pt'))
                print(f"Checkpoint saved at step {global_step} to {checkpoint_dir}")

        total_loss += loss.item() * gradient_accumulation_steps
        progress_bar.set_postfix({"loss": f"{total_loss / (step + 1):.4f}", "lr": f"{scheduler.get_last_lr()[0]:.6f}", "step": global_step + 1})

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1} - Average Loss: {avg_loss:.4f}")
    writer.add_scalar('loss/epoch', avg_loss, epoch + 1)

# --- Save the final trained model ---
final_output_dir = os.path.join(output_dir, "final-model")
os.makedirs(final_output_dir, exist_ok=True)
model.save_pretrained(final_output_dir)
tokenizer.save_pretrained(final_output_dir)
print(f"Final model saved to {final_output_dir}")

# --- Close TensorBoard writer ---
writer.close()
print(f"TensorBoard logs saved to {log_dir}")
print("To view TensorBoard logs, run: `tensorboard --logdir runs` from your terminal.")

  with torch.cuda.amp.autocast():
Epoch 1/1:   0%|          | 80/35132 [00:20<2:18:57,  4.20it/s, loss=2.2199, lr=0.000000, step=21]

tensor([    46,    452,    256,    362,    445,    445,    256,    393,    507,
           445,    358,    350,    358,    356,    362,    445,    256,    362,
           452,    423,    256,    445,    358,    350,    469,    432,    362,
           432,    816,    256,   1229,    549,    469,    328,    350,    358,
           507,    452,    328,    256,    350,    473,    469,    256,    386,
           358,    452,    358,    328,    350,    469,    432,    256,    468,
           362,    328,    256,    350,    473,    469,    256,    507,    432,
           362,    356,    445,    469,    256,    507,    435,    256,    473,
           358,    328,    256,    507,    468,    452,    256,    356,    358,
           432,    356,    445,    469,    256,    358,    350,    256,    468,
           362,    328,    256,    393,    507,    393,    549,    445,    362,
           432,    445,    816,    256,    432,    469,    386,    362,    432,
           735,    469,    423,    256, 

'O N   A L L   P O L I T I C A L   A N D   L I T E R A R Y   Q U E S T I O N S   T H E   M I N I S T E R   W A S   T H E   O R A C L E   O F   H I S   O W N   C I R C L E   I T   W A S   P O P U L A R L Y   R E M A R K E D   D U R I N G   M A N Y   Y E A R S   T H A T   A N   E M I N E N T   D I S S E N T I N G   M I N I S T E R   H A D   O N L Y   T O   M A K E   H I S   S O N   A N   A T T O R N E Y   O R   A   P H Y S I C I A N'

512

217

'ON ALL POLITICAL AND LITERARY QUESTIONS THE MINISTER WAS THE ORACLE OF HIS OWN CIRCLE IT WAS POPULARLY REMARKED DURING MANY YEARS THAT AN EMINENT DISSENTING MINISTER HAD ONLY TO MAKE HIS SON AN ATTORNEY OR A PHYSICIAN'

In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

       
model_name="meta-llama/Llama-3.2-1B-Instruct"
# Initialize the model and tokenizer 
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)
# Freeze LLM parameters
# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last 2 transformer layers
transformer_layers = model.model.layers  # Access the transformer block list
num_layers = len(transformer_layers)

for i in range(num_layers - 2, num_layers):
    for param in transformer_layers[i].parameters():
        param.requires_grad = True

# Also unfreeze the final language model head (optional but often useful)
for param in model.lm_head.parameters():
    param.requires_grad = True

for param in model.model.norm.parameters():
    param.requires_grad = True
    
    
# device = torch.device("cpu")
# model.to(device)
# model.train()

In [8]:
for name, param in model.named_parameters():
    print(name, param.requires_grad)


model.embed_tokens.weight True
model.layers.0.self_attn.q_proj.weight False
model.layers.0.self_attn.k_proj.weight False
model.layers.0.self_attn.v_proj.weight False
model.layers.0.self_attn.o_proj.weight False
model.layers.0.mlp.gate_proj.weight False
model.layers.0.mlp.up_proj.weight False
model.layers.0.mlp.down_proj.weight False
model.layers.0.input_layernorm.weight False
model.layers.0.post_attention_layernorm.weight False
model.layers.1.self_attn.q_proj.weight False
model.layers.1.self_attn.k_proj.weight False
model.layers.1.self_attn.v_proj.weight False
model.layers.1.self_attn.o_proj.weight False
model.layers.1.mlp.gate_proj.weight False
model.layers.1.mlp.up_proj.weight False
model.layers.1.mlp.down_proj.weight False
model.layers.1.input_layernorm.weight False
model.layers.1.post_attention_layernorm.weight False
model.layers.2.self_attn.q_proj.weight False
model.layers.2.self_attn.k_proj.weight False
model.layers.2.self_attn.v_proj.weight False
model.layers.2.self_attn.o_proj.

In [24]:
input_texts = [
    "Hemant is my name",
    "Heeemmmannnnt iiiis mmmmy n naaame ee",
    "DRNOT TL KTCRNT ST RCT S C CDR C T DGT RM RTCETPCR RT T DC GESDT RESDE R DR GCECGCORGT EFLERC NCGKE EDES EDCD LTKT R DERT TS F DNSL GRKTSNG RKRDGCTRLS CG NSTGT RT CNTR LTERTEPRDRLSHS DEL O T RTS TN T GL ETGTGS ET GCG DLS T GD GRGSTLSKST TC T RDRETD T L TOLGSNLTREGTE SRTGR SRE CN GT NKR RLGR CG DT G RDG CP DESCTGRT CEDT SLTRS T G GLG G RG TP D LE VDV ET LRU CR CSE GTL T LT RTDTG CRTS R DTSC DT NE"
]

input_texts = [" ".join(i.upper()) for i in input_texts ]
input_texts

['H E M A N T   I S   M Y   N A M E',
 'H E E E M M M A N N N N T   I I I I S   M M M M Y   N   N A A A M E   E E',
 'D R N O T   T L   K T C R N T   S T   R C T   S   C   C D R   C   T   D G T   R M   R T C E T P C R   R T   T   D C   G E S D T   R E S D E   R   D R   G C E C G C O R G T   E F L E R C   N C G K E   E D E S   E D C D   L T K T   R   D E R T   T S   F   D N S L   G R K T S N G   R K R D G C T R L S   C G   N S T G T   R T   C N T R   L T E R T E P R D R L S H S   D E L   O   T   R T S   T N   T   G L   E T G T G S   E T   G C G   D L S   T   G D   G R G S T L S K S T   T C   T   R D R E T D   T   L   T O L G S N L T R E G T E   S R T G R   S R E   C N   G T   N K R   R L G R   C G   D T   G   R D G   C P   D E S C T G R T   C E D T   S L T R S   T   G   G L G   G   R G   T P   D   L E   V D V   E T   L R U   C R   C S E   G T L   T   L T   R T D T G   C R T S   R   D T S C   D T   N E']

In [25]:
o  = tokenizer(
            input_texts[0],
            max_length=256,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            add_special_tokens=False  # <-- This disables special tokens
        )
o, tokenizer.decode(o['input_ids'][0].tolist() , skip_special_tokens=True)

({'input_ids': tensor([[    39,    469,    386,    362,    452,    350,    256,    358,    328,
             256,    386,    816,    256,    452,    362,    386,    469, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
          128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
  

In [29]:
len("HEMANT IS MY NAME")

17

In [31]:
len([    39,    469,    386,    362,    452,    350,    256,    358,    328, 256,    386,    816,    256,    452,    362,    386,    469])

17