Download dataset and save in the drive and then mount the drive to colab for better workflow than uploading file in every runtime.


In [1]:
# Mount Google Drive to save/load model
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Install required libraries
!pip install torch tokenizers accelerate -U
!pip install rouge_score
!pip install torch tokenizers

# Install ngrok for tunneling
!pip install pyngrok

# Install required packages
!pip install streamlit tokenizers rouge-score

# Install localtunnel
!npm install -g localtunnel

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K
added 22 packages in 2s
[1G[0K⠧[1G[0K
[1G[0K⠧[1G[0K3 packages are looking for funding
[1G[0K⠧[1G[0K  run `npm fund` for details
[1G[0K⠧[1G[0K

# Small Model with less accuracy (30 Epochs)

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
import math
import os
from rouge_score import rouge_scorer

# Step 1: Load and prepare the dataset
df = pd.read_csv('/content/drive/MyDrive/Transformer/mtsamples.csv')
df = df[['transcription', 'description']].dropna()
df = df.rename(columns={'transcription': 'text', 'description': 'summary'})

# Step 2: Train a WordPiece tokenizer on the dataset
texts = list(df['text']) + list(df['summary'])
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = WordPieceTrainer(vocab_size=30000, special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
tokenizer.train_from_iterator(texts, trainer=trainer)

# Save tokenizer
tokenizer_path = "/content/drive/MyDrive/Transformer"
os.makedirs(tokenizer_path, exist_ok=True)
tokenizer.save(os.path.join(tokenizer_path, "tokenizer.json"))

# Encode dataset
def encode_texts(texts, summaries, max_input_length=512, max_target_length=128):
    input_encodings = []
    target_encodings = []
    for text, summary in zip(texts, summaries):
        input_ids = tokenizer.encode(text).ids
        target_ids = tokenizer.encode(summary).ids
        if len(input_ids) > max_input_length:
            input_ids = input_ids[:max_input_length]
        if len(target_ids) > max_target_length:
            target_ids = target_ids[:max_target_length]
        input_encodings.append(input_ids)
        target_encodings.append(target_ids)
    return input_encodings, target_encodings

input_encodings, target_encodings = encode_texts(df['text'], df['summary'])

# Pad sequences
def pad_sequences(sequences, max_length, pad_token_id):
    padded = []
    for seq in sequences:
        if len(seq) < max_length:
            seq = seq + [pad_token_id] * (max_length - len(seq))
        padded.append(seq[:max_length])
    return padded

max_input_length = 512
max_target_length = 128
pad_token_id = tokenizer.token_to_id("[PAD]")
input_encodings = pad_sequences(input_encodings, max_input_length, pad_token_id)
target_encodings = pad_sequences(target_encodings, max_target_length, pad_token_id)

# Convert to tensors
input_tensors = torch.tensor(input_encodings, dtype=torch.long)
target_tensors = torch.tensor(target_encodings, dtype=torch.long)

# Split into train and validation (80-20)
train_size = int(0.8 * len(input_tensors))
train_inputs, val_inputs = input_tensors[:train_size], input_tensors[train_size:]
train_targets, val_targets = target_tensors[:train_size], target_tensors[train_size:]

# Step 3: Define Transformer model from scratch
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=256, nhead=8, num_encoder_layers=3, num_decoder_layers=3, dim_feedforward=1024, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        src = self.embedding(src) * math.sqrt(self.d_model)
        tgt = self.embedding(tgt) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        tgt = self.pos_encoder(tgt)
        output = self.transformer(src, tgt, src_mask, tgt_mask)
        output = self.fc_out(output)
        return output

    def generate_square_subsequent_mask(self, sz):
        mask = torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
        return mask

# Instantiate model
vocab_size = tokenizer.get_vocab_size()
model = TransformerModel(vocab_size=vocab_size).cuda()

# Step 4: Training setup
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)
scaler = GradScaler()

# Create dataset and dataloader
class MedicalDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'labels': self.targets[idx]}

train_dataset = MedicalDataset(train_inputs, train_targets)
val_dataset = MedicalDataset(val_inputs, val_targets)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=4)

# Step 5: Training loop
def train_epoch(model, loader, optimizer, criterion, scaler):
    model.train()
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        src = batch['input_ids'].cuda()
        tgt = batch['labels'].cuda()
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).cuda()

        with autocast():
            output = model(src, tgt_input, tgt_mask=tgt_mask)
            loss = criterion(output.reshape(-1, vocab_size), tgt_output.reshape(-1))

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            src = batch['input_ids'].cuda()
            tgt = batch['labels'].cuda()
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]
            tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).cuda()

            with autocast():
                output = model(src, tgt_input, tgt_mask=tgt_mask)
                loss = criterion(output.reshape(-1, vocab_size), tgt_output.reshape(-1))
            total_loss += loss.item()

            # Decode for ROUGE
            preds = torch.argmax(output, dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(tgt_output.cpu().numpy())
    return total_loss / len(loader), all_preds, all_labels

# Step 6: Train the model
num_epochs = 50  # More epochs due to training from scratch
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, scaler)
    val_loss, val_preds, val_labels = evaluate(model, val_loader, criterion)

    # Compute ROUGE scores
    decoded_preds = [tokenizer.decode(pred, skip_special_tokens=True) for pred in val_preds]
    decoded_labels = [tokenizer.decode(label, skip_special_tokens=True) for label in val_labels]
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = [scorer.score(label, pred) for pred, label in zip(decoded_preds, decoded_labels)]
    rouge1 = sum(score['rouge1'].fmeasure for score in rouge_scores) / len(rouge_scores)

    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, ROUGE-1: {rouge1:.4f}")

# Step 7: Save the model
model_path = "/content/drive/MyDrive/Transformer"
os.makedirs(model_path, exist_ok=True)
torch.save(model.state_dict(), os.path.join(model_path, "model.pt"))
print(f"Model saved to: {model_path}")

  scaler = GradScaler()
  with autocast():
  with autocast():


Epoch 1, Train Loss: 6.7389, Val Loss: 6.0394, ROUGE-1: 0.1594
Epoch 2, Train Loss: 5.4838, Val Loss: 5.3924, ROUGE-1: 0.1895
Epoch 3, Train Loss: 4.7665, Val Loss: 4.9051, ROUGE-1: 0.1689
Epoch 4, Train Loss: 4.1942, Val Loss: 4.5177, ROUGE-1: 0.2152
Epoch 5, Train Loss: 3.6936, Val Loss: 4.1553, ROUGE-1: 0.1724
Epoch 6, Train Loss: 3.2391, Val Loss: 3.8263, ROUGE-1: 0.2582
Epoch 7, Train Loss: 2.8137, Val Loss: 3.4976, ROUGE-1: 0.2463
Epoch 8, Train Loss: 2.4272, Val Loss: 3.2177, ROUGE-1: 0.2285
Epoch 9, Train Loss: 2.0613, Val Loss: 2.9594, ROUGE-1: 0.2733
Epoch 10, Train Loss: 1.7543, Val Loss: 2.6839, ROUGE-1: 0.2976
Epoch 11, Train Loss: 1.4668, Val Loss: 2.4353, ROUGE-1: 0.3854
Epoch 12, Train Loss: 1.2387, Val Loss: 2.2166, ROUGE-1: 0.3432
Epoch 13, Train Loss: 1.0515, Val Loss: 2.0342, ROUGE-1: 0.5252
Epoch 14, Train Loss: 0.8718, Val Loss: 1.8536, ROUGE-1: 0.3287
Epoch 15, Train Loss: 0.7507, Val Loss: 1.7251, ROUGE-1: 0.3973
Epoch 16, Train Loss: 0.6383, Val Loss: 1.5869, R

In [None]:
import torch
from tokenizers import Tokenizer
import os
from IPython.display import display, HTML
from ipywidgets import widgets

# Load tokenizer
tokenizer_path = "/content/drive/MyDrive/Transformer/tokenizer.json"
tokenizer = Tokenizer.from_file(tokenizer_path)

# Define Transformer model (same as training)
class PositionalEncoding(torch.nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

class TransformerModel(torch.nn.Module):
    def __init__(self, vocab_size, d_model=256, nhead=8, num_encoder_layers=3, num_decoder_layers=3, dim_feedforward=1024, dropout=0.1):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer = torch.nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.fc_out = torch.nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        src = self.embedding(src) * torch.sqrt(torch.tensor(self.d_model, dtype=torch.float))
        tgt = self.embedding(tgt) * torch.sqrt(torch.tensor(self.d_model, dtype=torch.float))
        src = self.pos_encoder(src)
        tgt = self.pos_encoder(tgt)
        output = self.transformer(src, tgt, src_mask, tgt_mask)
        output = self.fc_out(output)
        return output

    def generate_square_subsequent_mask(self, sz):
        mask = torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
        return mask

# Load model
model_path = "/content/drive/MyDrive/Transformer/model.pt"
vocab_size = tokenizer.get_vocab_size()
model = TransformerModel(vocab_size=vocab_size).cuda()
model.load_state_dict(torch.load(model_path))
model.eval()

# Function to generate summary
def generate_summary(text, max_length=128):
    # Encode input
    input_ids = tokenizer.encode(text).ids
    input_ids = input_ids[:512]  # Truncate to max input length
    input_ids = input_ids + [tokenizer.token_to_id("[PAD]")] * (512 - len(input_ids))
    input_tensor = torch.tensor([input_ids], dtype=torch.long).cuda()

    # Generate output
    generated_ids = [tokenizer.token_to_id("[CLS]")]
    for _ in range(max_length):
        tgt_tensor = torch.tensor([generated_ids], dtype=torch.long).cuda()
        tgt_mask = model.generate_square_subsequent_mask(tgt_tensor.size(1)).cuda()
        with torch.no_grad():
            output = model(input_tensor, tgt_tensor, tgt_mask=tgt_mask)
        next_token = torch.argmax(output[:, -1, :], dim=-1).item()
        generated_ids.append(next_token)
        if next_token == tokenizer.token_to_id("[SEP]"):
            break

    # Decode output
    summary = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return summary

# Option 1: Colab Form Input (recommended for presentation)
text_input = widgets.Textarea(
    value='',
    placeholder='Paste or type your medical transcript here...',
    description='Input Text:',
    layout={'width': '600px', 'height': '200px'}
)
output_label = widgets.Label(value="Generated Summary: ")
output_text = widgets.Textarea(
    value='',
    disabled=True,
    layout={'width': '600px', 'height': '100px'}
)
button = widgets.Button(description="Generate Summary")

def on_button_clicked(b):
    if text_input.value.strip():
        summary = generate_summary(text_input.value)
        output_text.value = summary
    else:
        output_text.value = "Please enter some text to summarize."

button.on_click(on_button_clicked)

# Display form
display(text_input, button, output_label, output_text)

Textarea(value='', description='Input Text:', layout=Layout(height='200px', width='600px'), placeholder='Paste…

Button(description='Generate Summary', style=ButtonStyle())

Label(value='Generated Summary: ')

Textarea(value='', disabled=True, layout=Layout(height='100px', width='600px'))

# Large Model with more accuracy (100 Epochs)

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
import math
import os
from rouge_score import rouge_scorer
from torch.optim.lr_scheduler import CosineAnnealingLR

# Step 1: Load and prepare the dataset
df = pd.read_csv('/content/drive/MyDrive/Transformer/mtsamples.csv')
df = df[['transcription', 'description']].dropna()
df = df.rename(columns={'transcription': 'text', 'description': 'summary'})

# Append [SEP] to summaries for consistent training
df['summary'] = df['summary'].apply(lambda x: x + ' [SEP]')

# Step 2: Train a WordPiece tokenizer on the dataset
texts = list(df['text']) + list(df['summary'])
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = WordPieceTrainer(vocab_size=30000, special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
tokenizer.train_from_iterator(texts, trainer=trainer)

# Save tokenizer
tokenizer_path = "/content/drive/MyDrive/Transformer"
os.makedirs(tokenizer_path, exist_ok=True)
tokenizer.save(os.path.join(tokenizer_path, "tokenizer.json"))

# Encode dataset
def encode_texts(texts, summaries, max_input_length=512, max_target_length=256):
    input_encodings = []
    target_encodings = []
    for text, summary in zip(texts, summaries):
        input_ids = tokenizer.encode(text).ids
        target_ids = tokenizer.encode(summary).ids
        input_ids = input_ids[:max_input_length]
        target_ids = target_ids[:max_target_length]
        input_encodings.append(input_ids)
        target_encodings.append(target_ids)
    return input_encodings, target_encodings

input_encodings, target_encodings = encode_texts(df['text'], df['summary'])

# Pad sequences
def pad_sequences(sequences, max_length, pad_token_id):
    padded = []
    for seq in sequences:
        if len(seq) < max_length:
            seq = seq + [pad_token_id] * (max_length - len(seq))
        padded.append(seq[:max_length])
    return padded

max_input_length = 512
max_target_length = 256
pad_token_id = tokenizer.token_to_id("[PAD]")
input_encodings = pad_sequences(input_encodings, max_input_length, pad_token_id)
target_encodings = pad_sequences(target_encodings, max_target_length, pad_token_id)

# Convert to tensors
input_tensors = torch.tensor(input_encodings, dtype=torch.long)
target_tensors = torch.tensor(target_encodings, dtype=torch.long)

# Split into train and validation (80-20)
train_size = int(0.8 * len(input_tensors))
train_inputs, val_inputs = input_tensors[:train_size], input_tensors[train_size:]
train_targets, val_targets = target_tensors[:train_size], target_tensors[train_size:]

# Step 3: Define Transformer model from scratch
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        src = self.embedding(src) * math.sqrt(self.d_model)
        tgt = self.embedding(tgt) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        tgt = self.pos_encoder(tgt)
        output = self.transformer(src, tgt, src_mask, tgt_mask)
        output = self.fc_out(output)
        return output

    def generate_square_subsequent_mask(self, sz):
        mask = torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
        return mask

# Instantiate model
vocab_size = tokenizer.get_vocab_size()
model = TransformerModel(vocab_size=vocab_size).cuda()

# Step 4: Training setup
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id, label_smoothing=0.1)
scaler = GradScaler()
scheduler = CosineAnnealingLR(optimizer, T_max=100)

# Create dataset and dataloader
class MedicalDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'labels': self.targets[idx]}

train_dataset = MedicalDataset(train_inputs, train_targets)
val_dataset = MedicalDataset(val_inputs, val_targets)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=4)

# Step 5: Training loop
def train_epoch(model, loader, optimizer, criterion, scaler, scheduler):
    model.train()
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        src = batch['input_ids'].cuda()
        tgt = batch['labels'].cuda()
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).cuda()

        with autocast():
            output = model(src, tgt_input, tgt_mask=tgt_mask)
            loss = criterion(output.reshape(-1, vocab_size), tgt_output.reshape(-1))

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    scheduler.step()
    return total_loss / len(loader)

def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            src = batch['input_ids'].cuda()
            tgt = batch['labels'].cuda()
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]
            tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).cuda()

            with autocast():
                output = model(src, tgt_input, tgt_mask=tgt_mask)
                loss = criterion(output.reshape(-1, vocab_size), tgt_output.reshape(-1))
            total_loss += loss.item()

            preds = torch.argmax(output, dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(tgt_output.cpu().numpy())
    return total_loss / len(loader), all_preds, all_labels

# Step 6: Train the model
num_epochs = 100
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, scaler, scheduler)
    val_loss, val_preds, val_labels = evaluate(model, val_loader, criterion)

    decoded_preds = [tokenizer.decode(pred, skip_special_tokens=True) for pred in val_preds]
    decoded_labels = [tokenizer.decode(label, skip_special_tokens=True) for label in val_labels]
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = [scorer.score(label, pred) for pred, label in zip(decoded_preds, decoded_labels)]
    rouge1 = sum(score['rouge1'].fmeasure for score in rouge_scores) / len(rouge_scores)

    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, ROUGE-1: {rouge1:.4f}")

# Step 7: Save the model
model_path = "/content/drive/MyDrive/Transformer"
os.makedirs(model_path, exist_ok=True)
torch.save(model.state_dict(), os.path.join(model_path, "model.pt"))
print(f"Model saved to: {model_path}")

  scaler = GradScaler()
  with autocast():
  with autocast():


Epoch 1, Train Loss: 7.2693, Val Loss: 7.1810, ROUGE-1: 0.0000
Epoch 2, Train Loss: 7.0758, Val Loss: 7.1733, ROUGE-1: 0.0000
Epoch 3, Train Loss: 7.0671, Val Loss: 7.1746, ROUGE-1: 0.0000
Epoch 4, Train Loss: 7.0607, Val Loss: 7.1676, ROUGE-1: 0.0000
Epoch 5, Train Loss: 7.0545, Val Loss: 7.1583, ROUGE-1: 0.0000
Epoch 6, Train Loss: 7.0524, Val Loss: 7.1676, ROUGE-1: 0.0000
Epoch 7, Train Loss: 7.0482, Val Loss: 7.1510, ROUGE-1: 0.0000
Epoch 8, Train Loss: 7.0468, Val Loss: 7.1586, ROUGE-1: 0.0000
Epoch 9, Train Loss: 7.0473, Val Loss: 7.1486, ROUGE-1: 0.0000
Epoch 10, Train Loss: 7.0120, Val Loss: 7.1789, ROUGE-1: 0.0000
Epoch 11, Train Loss: 7.0109, Val Loss: 7.2699, ROUGE-1: 0.0000
Epoch 12, Train Loss: 6.9970, Val Loss: 7.2615, ROUGE-1: 0.0000
Epoch 13, Train Loss: 6.9787, Val Loss: 7.3851, ROUGE-1: 0.0000
Epoch 14, Train Loss: 6.9649, Val Loss: 7.4183, ROUGE-1: 0.0000
Epoch 15, Train Loss: 6.9592, Val Loss: 7.3545, ROUGE-1: 0.0000
Epoch 16, Train Loss: 6.9485, Val Loss: 7.5118, R

# Trial 4

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
import math
import os
from rouge_score import rouge_scorer

# =================================== 1. Load data ===================================
df = pd.read_csv('/content/drive/MyDrive/Transformer/mtsamples.csv')
df = df[['transcription', 'description']].dropna()
df = df.rename(columns={'transcription': 'text', 'description': 'summary'})
print(f"Loaded {len(df)} samples")

# =================================== 2. Train tokenizer ===================================
texts = list(df['text']) + list(df['summary'])
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = WordPieceTrainer(vocab_size=30000, special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
tokenizer.train_from_iterator(texts, trainer)

save_dir = "/content/drive/MyDrive/Transformer"
os.makedirs(save_dir, exist_ok=True)
tokenizer.save(os.path.join(save_dir, "tokenizer4.json"))

CLS_ID = tokenizer.token_to_id("[CLS]")
SEP_ID = tokenizer.token_to_id("[SEP]")
PAD_ID = tokenizer.token_to_id("[PAD]")
print(f"Special tokens → CLS: {CLS_ID}, SEP: {SEP_ID}, PAD: {PAD_ID}")

# =================================== 3. CORRECT ENCODING (THIS IS THE FIX) ===================================
def encode_pair(text, summary):
    src = tokenizer.encode(text).ids[:512]
    tgt_ids = tokenizer.encode(summary).ids
    tgt = [CLS_ID] + tgt_ids + [SEP_ID]           # BOS + summary + EOS
    tgt = tgt[:256]
    return src, tgt

inputs, targets = [], []
for text, summary in zip(df['text'], df['summary']):
    src, tgt = encode_pair(text, summary)
    inputs.append(src + [PAD_ID] * (512 - len(src)))
    targets.append(tgt + [PAD_ID] * (256 - len(tgt)))

inputs = torch.tensor(inputs, dtype=torch.long)
targets = torch.tensor(targets, dtype=torch.long)

# Train/val split
train_size = int(0.8 * len(inputs))
train_in, val_in = inputs[:train_size], inputs[train_size:]
train_tgt, val_tgt = targets[:train_size], targets[train_size:]

# =================================== 4. Model (same as Streamlit) ===================================
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0)/d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer('pe', pe.unsqueeze(0))
    def forward(self, x): return x + self.pe[:, :x.size(1)]

class TransformerSummarizer(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=8, num_layers=6):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pos = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead,
                                          num_encoder_layers=num_layers,
                                          num_decoder_layers=num_layers,
                                          dim_feedforward=2048, dropout=0.1,
                                          batch_first=True)
        self.out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def forward(self, src, tgt, tgt_mask=None):
        src = self.embed(src) * math.sqrt(self.d_model)
        tgt = self.embed(tgt) * math.sqrt(self.d_model)
        src = self.pos(src)
        tgt = self.pos(tgt)
        out = self.transformer(src, tgt, tgt_mask=tgt_mask)
        return self.out(out)

    def generate_mask(self, sz):
        return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

model = TransformerSummarizer(vocab_size=tokenizer.get_vocab_size()).cuda()
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID, label_smoothing=0.1)
scaler = GradScaler()

# =================================== 5. Training loop ===================================
@torch.no_grad()
def validate():
    model.eval()
    total_loss = 0
    scorer = rouge_scorer.RougeScorer(['rouge1','rougeL'])
    r1 = []
    for i in range(0, len(val_in), 4):
        src = val_in[i:i+4].cuda()
        tgt = val_tgt[i:i+4].cuda()
        tgt_in = tgt[:, :-1]
        with autocast():
            out = model(src, tgt_in, model.generate_mask(tgt_in.size(1)).cuda())
            loss = criterion(out.reshape(-1, out.size(-1)), tgt[:, 1:].reshape(-1))
        total_loss += loss.item()
        pred = out.argmax(-1)
        for p, t in zip(pred, tgt):
            r1.append(scorer.score(tokenizer.decode(t[1:].tolist(), skip_special_tokens=True),
                                 tokenizer.decode(p.tolist(), skip_special_tokens=True))['rouge1'].fmeasure)
    return total_loss / (len(val_in)//4), sum(r1)/len(r1)

for epoch in range(1, 31):
    model.train()
    total = 0
    for i in range(0, len(train_in), 4):
        optimizer.zero_grad()
        src = train_in[i:i+4].cuda()
        tgt = train_tgt[i:i+4].cuda()
        tgt_in = tgt[:, :-1]
        mask = model.generate_mask(tgt_in.size(1)).cuda()
        with autocast():
            out = model(src, tgt_in, mask)
            loss = criterion(out.reshape(-1, out.size(-1)), tgt[:, 1:].reshape(-1))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total += loss.item()
    val_loss, rouge1 = validate()
    print(f"Epoch {epoch:3d} | Train Loss: {total/(len(train_in)//4):.4f} | Val Loss: {val_loss:.4f} | ROUGE-1: {rouge1:.4f}")

# =================================== 6. Save ===================================
torch.save(model.state_dict(), os.path.join(save_dir, "model4.pt"))
print("FINAL MODEL & TOKENIZER SAVED — READY FOR STREAMLIT")

Loaded 4966 samples
Special tokens → CLS: 2, SEP: 3, PAD: 0


  scaler = GradScaler()
  with autocast():
  with autocast():


Epoch   1 | Train Loss: 7.2881 | Val Loss: 7.0654 | ROUGE-1: 0.0793
Epoch   2 | Train Loss: 6.8368 | Val Loss: 7.0387 | ROUGE-1: 0.0645
Epoch   3 | Train Loss: 6.7462 | Val Loss: 7.0307 | ROUGE-1: 0.0641
Epoch   4 | Train Loss: 6.8028 | Val Loss: 7.0553 | ROUGE-1: 0.0350
Epoch   5 | Train Loss: 6.6807 | Val Loss: 7.0373 | ROUGE-1: 0.0752
Epoch   6 | Train Loss: 6.6993 | Val Loss: 7.2443 | ROUGE-1: 0.0342
Epoch   7 | Train Loss: 6.8457 | Val Loss: 7.2347 | ROUGE-1: 0.0617
Epoch   8 | Train Loss: 6.8278 | Val Loss: 7.2899 | ROUGE-1: 0.0350
Epoch   9 | Train Loss: 6.8518 | Val Loss: 7.2959 | ROUGE-1: 0.0767
Epoch  10 | Train Loss: 6.8469 | Val Loss: 7.4409 | ROUGE-1: 0.0000
Epoch  11 | Train Loss: 6.9917 | Val Loss: 7.4249 | ROUGE-1: 0.0000
Epoch  12 | Train Loss: 6.9918 | Val Loss: 7.5114 | ROUGE-1: 0.0000
Epoch  13 | Train Loss: 6.9680 | Val Loss: 7.4982 | ROUGE-1: 0.0000
Epoch  14 | Train Loss: 6.9241 | Val Loss: 7.5216 | ROUGE-1: 0.0000
Epoch  15 | Train Loss: 6.9311 | Val Loss: 7.526

# Greedy Seach

In [None]:
# FINAL TRAINING NOTEBOOK — 100% WORKING (syntax fixed)
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
import math
import os
from rouge_score import rouge_scorer

# ====================== 1. Load Data ======================
df = pd.read_csv('/content/drive/MyDrive/Transformer/mtsamples.csv')
df = df[['transcription', 'description']].dropna()
df = df.rename(columns={'transcription': 'text', 'description': 'summary'})
print(f"Loaded {len(df)} samples")

# ====================== 2. Train Tokenizer ======================
texts = list(df['text']) + list(df['summary'])
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = WordPieceTrainer(vocab_size=30000, special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
tokenizer.train_from_iterator(texts, trainer)

save_dir = "/content/drive/MyDrive/Transformer"
os.makedirs(save_dir, exist_ok=True)
tokenizer.save(os.path.join(save_dir, "tokenizer5.json"))

CLS_ID = tokenizer.token_to_id("[CLS]")
SEP_ID = tokenizer.token_to_id("[SEP]")
PAD_ID = tokenizer.token_to_id("[PAD]")
print(f"Special tokens → [CLS]: {CLS_ID}, [SEP]: {SEP_ID}, [PAD]: {PAD_ID}")   # ← Fixed!

# ====================== 3. CORRECT ENCODING (THIS IS THE KEY FIX) ======================
def encode_pair(text, summary):
    src = tokenizer.encode(text).ids[:512]
    tgt_ids = tokenizer.encode(summary).ids
    tgt = [CLS_ID] + tgt_ids + [SEP_ID]          # BOS + summary + EOS
    tgt = tgt[:256]
    src += [PAD_ID] * (512 - len(src))
    tgt += [PAD_ID] * (256 - len(tgt))
    return src, tgt

inputs, targets = zip(*[encode_pair(t, s) for t, s in zip(df['text'], df['summary'])])
inputs = torch.tensor(inputs, dtype=torch.long)
targets = torch.tensor(targets, dtype=torch.long)

# Train/val split
train_size = int(0.8 * len(inputs))
train_in, val_in = inputs[:train_size], inputs[train_size:]
train_tgt, val_tgt = targets[:train_size], targets[train_size:]

# ====================== 4. Model ======================
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer('pe', pe.unsqueeze(0))
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class TransformerModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        d_model = 512
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pos = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(d_model=d_model, nhead=8,
                                          num_encoder_layers=6, num_decoder_layers=6,
                                          dim_feedforward=2048, dropout=0.1,
                                          batch_first=True)
        self.out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def forward(self, src, tgt, tgt_mask=None):
        src = self.embed(src) * math.sqrt(self.d_model)
        tgt = self.embed(tgt) * math.sqrt(self.d_model)
        src = self.pos(src)
        tgt = self.pos(tgt)
        return self.out(self.transformer(src, tgt, tgt_mask=tgt_mask))

    def generate_mask(self, sz):
        return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

# ====================== 5. Training ======================
model = TransformerModel(vocab_size=tokenizer.get_vocab_size()).cuda()
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID, label_smoothing=0.1)
scaler = GradScaler()

print("Starting training...")
for epoch in range(1, 41):
    model.train()
    total = 0.0
    for i in range(0, len(train_in), 4):
        optimizer.zero_grad()
        src = train_in[i:i+4].cuda()
        tgt = train_tgt[i:i+4].cuda()
        tgt_in = tgt[:, :-1]
        mask = model.generate_mask(tgt_in.size(1)).cuda()

        with autocast():
            out = model(src, tgt_in, mask)
            loss = criterion(out.reshape(-1, out.size(-1)), tgt[:, 1:].reshape(-1))

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total += loss.item()

    # Quick validation ROUGE-1
    model.eval()
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'])
    r1_scores = []
    with torch.no_grad():
        for i in range(0, len(val_in), 4):
            src = val_in[i:i+4].cuda()
            tgt = val_tgt[i:i+4].cuda()
            tgt_in = tgt[:, :-1]
            out = model(src, tgt_in, model.generate_mask(tgt_in.size(1)).cuda())
            pred = out.argmax(-1)
            for p, t in zip(pred, tgt):
                ref = tokenizer.decode(t[1:].tolist(), skip_special_tokens=True)
                gen = tokenizer.decode(p.tolist(), skip_special_tokens=True)
                r1_scores.append(scorer.score(ref, gen)['rouge1'].fmeasure)

    rouge1 = sum(r1_scores) / len(r1_scores) if r1_scores else 0
    print(f"Epoch {epoch:3d} | TrainLoss {total/(len(train_in)//4):.4f} | ValROUGE-1 {rouge1:.4f}")

# ====================== 6. Save ======================
torch.save(model.state_dict(), os.path.join(save_dir, "model5.pt"))
print("Training finished! model.pt and tokenizer.json saved to /content/drive/MyDrive/Transformer")

Loaded 4966 samples
Special tokens → [CLS]: 2, [SEP]: 3, [PAD]: 0


  scaler = GradScaler()
  with autocast():


Starting training...
Epoch   1 | TrainLoss 7.2776 | ValROUGE-1 0.0353
Epoch   2 | TrainLoss 6.8599 | ValROUGE-1 0.0000
Epoch   3 | TrainLoss 6.8630 | ValROUGE-1 0.0768
Epoch   4 | TrainLoss 6.8787 | ValROUGE-1 0.0000
Epoch   5 | TrainLoss 6.9434 | ValROUGE-1 0.0000
Epoch   6 | TrainLoss 7.0075 | ValROUGE-1 0.0000
Epoch   7 | TrainLoss 6.9683 | ValROUGE-1 0.0000
Epoch   8 | TrainLoss 6.9790 | ValROUGE-1 0.0000
Epoch   9 | TrainLoss 6.9446 | ValROUGE-1 0.0000
Epoch  10 | TrainLoss 6.9363 | ValROUGE-1 0.0000
Epoch  11 | TrainLoss 6.9261 | ValROUGE-1 0.0000
Epoch  12 | TrainLoss 6.9265 | ValROUGE-1 0.0000
Epoch  13 | TrainLoss 6.9385 | ValROUGE-1 0.0000
Epoch  14 | TrainLoss 7.0194 | ValROUGE-1 0.0000
Epoch  15 | TrainLoss 7.1193 | ValROUGE-1 0.0000
Epoch  16 | TrainLoss 7.1418 | ValROUGE-1 0.0000
Epoch  17 | TrainLoss 7.0998 | ValROUGE-1 0.0000
Epoch  18 | TrainLoss 7.0870 | ValROUGE-1 0.0000
Epoch  19 | TrainLoss 7.1111 | ValROUGE-1 0.0000
Epoch  20 | TrainLoss 7.0882 | ValROUGE-1 0.0000

# Test 6

In [None]:
# training_colab.py — Colab-ready
import os
import math
import time
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from rouge_score import rouge_scorer

# ========== CONFIG ==========
DATA_CSV = "/content/drive/MyDrive/Transformer/mtsamples.csv"   # update if needed
SAVE_DIR = "/content/drive/MyDrive/Transformer"
TOKENIZER_NAME = "tokenizer6.json"
MODEL_NAME = "model6.pt"

VOCAB_SIZE = 30000
MAX_SRC_LEN = 512
MAX_TGT_LEN = 256
BATCH_SIZE = 4
NUM_EPOCHS = 100
LR = 1e-4
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

os.makedirs(SAVE_DIR, exist_ok=True)
print(f"[config] DEVICE={DEVICE} SAVE_DIR={SAVE_DIR}")

# ========== 1. Load data ==========
if not os.path.exists(DATA_CSV):
    raise FileNotFoundError(f"Data CSV not found at {DATA_CSV}. Upload to Drive or change DATA_CSV.")
df = pd.read_csv(DATA_CSV)
df = df[['transcription', 'description']].dropna()
df = df.rename(columns={'transcription': 'text', 'description': 'summary'})
print(f"[data] loaded {len(df)} rows. Examples:")
print(df.head(2).to_dict(orient='records'))

# ========== 2. Train tokenizer (WordPiece) ==========
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = WordPieceTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

print("[tokenizer] training... (this may take a while)")
# Use generator to avoid building a giant intermediate list in memory
def iter_texts():
    for t,s in zip(df['text'].astype(str), df['summary'].astype(str)):
        yield t
        yield s

tokenizer.train_from_iterator(iter_texts(), trainer=trainer)
tokenizer_path = os.path.join(SAVE_DIR, TOKENIZER_NAME)
tokenizer.save(tokenizer_path)
print(f"[tokenizer] saved to {tokenizer_path}")

# Check special token ids exist and are valid ints
CLS_ID = tokenizer.token_to_id("[CLS]")
SEP_ID = tokenizer.token_to_id("[SEP]")
PAD_ID = tokenizer.token_to_id("[PAD]")
UNK_ID = tokenizer.token_to_id("[UNK]")

if any(x is None for x in (CLS_ID, SEP_ID, PAD_ID, UNK_ID)):
    print("ERROR: One or more special tokens missing after training. Tokenizer vocab size:", tokenizer.get_vocab_size())
    print("Token ids:", {"CLS":CLS_ID,"SEP":SEP_ID,"PAD":PAD_ID,"UNK":UNK_ID})
    raise SystemExit("Special tokens not present. Re-run tokenizer training with special_tokens configured.")

print(f"[tokenizer] ids -> CLS:{CLS_ID} SEP:{SEP_ID} PAD:{PAD_ID} UNK:{UNK_ID}  vocab_size={tokenizer.get_vocab_size()}")

# Quick tokenization sanity-check on a small example
sample_text = df['text'].iloc[0]
sample_summary = df['summary'].iloc[0]
print("[sanity] sample text (truncated):", sample_text[:200])
print("[sanity] sample summary (truncated):", sample_summary[:200])
enc_src = tokenizer.encode(sample_text).ids[:50]
enc_tgt = tokenizer.encode(sample_summary).ids[:50]
print("[sanity] encoded src ids (first 50):", enc_src[:50])
print("[sanity] encoded tgt ids (first 50):", enc_tgt[:50])
# Decoding the encoded tokens back
print("[sanity] decode src:", tokenizer.decode(enc_src, skip_special_tokens=True)[:200])
print("[sanity] decode tgt:", tokenizer.decode(enc_tgt, skip_special_tokens=True)[:200])

# ========== 3. Encoding helper ==========
def encode_pair(text: str, summary: str):
    src_ids = tokenizer.encode(str(text)).ids[:MAX_SRC_LEN]
    tgt_ids = tokenizer.encode(str(summary)).ids
    tgt = [CLS_ID] + tgt_ids + [SEP_ID]
    src_ids = src_ids[:MAX_SRC_LEN]
    tgt = tgt[:MAX_TGT_LEN]
    src_ids += [PAD_ID] * (MAX_SRC_LEN - len(src_ids))
    tgt += [PAD_ID] * (MAX_TGT_LEN - len(tgt))
    return src_ids, tgt

print("[encode] building tensors (this may take a moment)...")
pairs = [encode_pair(t, s) for t, s in zip(df['text'], df['summary'])]
srcs, tgts = zip(*pairs)
src_tensor = torch.tensor(srcs, dtype=torch.long)
tgt_tensor = torch.tensor(tgts, dtype=torch.long)
print("[encode] done. shapes:", src_tensor.shape, tgt_tensor.shape)

# train/val split
train_size = int(0.8 * len(src_tensor))
train_src, val_src = src_tensor[:train_size], src_tensor[train_size:]
train_tgt, val_tgt = tgt_tensor[:train_size], tgt_tensor[train_size:]
print(f"[split] train {len(train_src)} val {len(val_src)}")

# ========== 4. Model ==========
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=8,
                 num_encoder_layers=6, num_decoder_layers=6,
                 dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model, padding_idx=PAD_ID)
        self.pos = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead,
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers,
                                          dim_feedforward=dim_feedforward,
                                          dropout=dropout,
                                          batch_first=True)
        self.out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt, tgt_mask=None):
        src_emb = self.embed(src) * math.sqrt(self.d_model)
        tgt_emb = self.embed(tgt) * math.sqrt(self.d_model)
        src_emb = self.pos(src_emb)
        tgt_emb = self.pos(tgt_emb)
        out = self.transformer(src_emb, tgt_emb, tgt_mask=tgt_mask)
        logits = self.out(out)
        return logits

    @staticmethod
    def generate_mask(sz):
        return torch.triu(torch.ones(sz, sz, device=DEVICE) * float('-inf'), diagonal=1)

# instantiate and move to device
vocab_size = tokenizer.get_vocab_size()
model = TransformerModel(vocab_size=vocab_size).to(DEVICE)
print(f"[model] vocab_size={vocab_size} device={DEVICE}")

# ========== 5. Training utilities ==========
optimizer = optim.Adam(model.parameters(), lr=LR, betas=(0.9, 0.98), eps=1e-9)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID, label_smoothing=0.1)
scaler = GradScaler()

class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, srcs, tgts):
        self.srcs = srcs
        self.tgts = tgts
    def __len__(self):
        return len(self.srcs)
    def __getitem__(self, idx):
        return {'src': self.srcs[idx], 'tgt': self.tgts[idx]}

train_ds = SimpleDataset(train_src, train_tgt)
val_ds = SimpleDataset(val_src, val_tgt)
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = torch.utils.data.DataLoader(val_ds, batch_size=BATCH_SIZE, num_workers=2)

# ========== 6. Training loop ==========
print("[train] starting training...")
start_time = time.time()
for epoch in range(1, NUM_EPOCHS + 1):
    model.train()
    train_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        optimizer.zero_grad()
        src = batch['src'].to(DEVICE)
        tgt = batch['tgt'].to(DEVICE)
        tgt_in = tgt[:, :-1]
        tgt_out = tgt[:, 1:]

        tgt_mask = model.generate_mask(tgt_in.size(1)).to(DEVICE)

        with autocast():
            logits = model(src, tgt_in, tgt_mask=tgt_mask)
            loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_out.reshape(-1))

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()
        num_batches += 1

    avg_train_loss = train_loss / max(1, num_batches)

    # quick validation - compute ROUGE-1 on greedy pred
    model.eval()
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    r1_scores = []
    with torch.no_grad():
        for batch in val_loader:
            src = batch['src'].to(DEVICE)
            tgt = batch['tgt'].to(DEVICE)
            tgt_in = tgt[:, :-1]
            tgt_mask = model.generate_mask(tgt_in.size(1)).to(DEVICE)
            logits = model(src, tgt_in, tgt_mask=tgt_mask)
            preds = logits.argmax(dim=-1)

            for p, t in zip(preds, tgt):
                ref_ids = t[1:].tolist()
                if SEP_ID in ref_ids:
                    ref_ids = ref_ids[:ref_ids.index(SEP_ID)]
                ref_ids = [x for x in ref_ids if x != PAD_ID]
                gen_ids = p.tolist()
                if SEP_ID in gen_ids:
                    gen_ids = gen_ids[:gen_ids.index(SEP_ID)]
                # decode
                ref_text = tokenizer.decode(ref_ids, skip_special_tokens=True).strip()
                gen_text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
                if ref_text and gen_text:
                    r1_scores.append(scorer.score(ref_text, gen_text)['rouge1'].fmeasure)

    val_rouge1 = sum(r1_scores) / len(r1_scores) if r1_scores else 0.0
    elapsed = time.time() - start_time
    print(f"Epoch {epoch:3d} | TrainLoss {avg_train_loss:.4f} | ValROUGE-1 {val_rouge1:.4f} | elapsed {elapsed/60:.1f} min")

# ========== 7. Save ==========
torch.save(model.state_dict(), os.path.join(SAVE_DIR, MODEL_NAME))
print(f"[save] model saved to {os.path.join(SAVE_DIR, MODEL_NAME)}")
print("Training finished.")

[config] DEVICE=cuda SAVE_DIR=/content/drive/MyDrive/Transformer
[data] loaded 4966 rows. Examples:
[{'text': 'SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa wa

  scaler = GradScaler()
  with autocast():


Epoch   1 | TrainLoss 7.1215 | ValROUGE-1 0.1288 | elapsed 1.8 min
Epoch   2 | TrainLoss 6.3297 | ValROUGE-1 0.1706 | elapsed 3.7 min
Epoch   3 | TrainLoss 5.8160 | ValROUGE-1 0.2025 | elapsed 5.6 min
Epoch   4 | TrainLoss 5.3713 | ValROUGE-1 0.2335 | elapsed 7.4 min
Epoch   5 | TrainLoss 4.9465 | ValROUGE-1 0.2794 | elapsed 9.3 min
Epoch   6 | TrainLoss 4.5680 | ValROUGE-1 0.3116 | elapsed 11.1 min
Epoch   7 | TrainLoss 4.2314 | ValROUGE-1 0.3416 | elapsed 13.0 min
Epoch   8 | TrainLoss 3.9272 | ValROUGE-1 0.3693 | elapsed 14.9 min
Epoch   9 | TrainLoss 3.6620 | ValROUGE-1 0.4061 | elapsed 16.7 min
Epoch  10 | TrainLoss 3.4233 | ValROUGE-1 0.4644 | elapsed 18.6 min
Epoch  11 | TrainLoss 3.2140 | ValROUGE-1 0.4633 | elapsed 20.4 min
Epoch  12 | TrainLoss 3.0325 | ValROUGE-1 0.5122 | elapsed 22.3 min
Epoch  13 | TrainLoss 2.8743 | ValROUGE-1 0.5357 | elapsed 24.1 min
Epoch  14 | TrainLoss 2.7545 | ValROUGE-1 0.5632 | elapsed 26.0 min
Epoch  15 | TrainLoss 2.6594 | ValROUGE-1 0.5877 | el

# Trial 7

In [None]:
# train_transformer_fixed_decode.py (Colab-ready cell)
# Requirements: tokenizers, torch, rouge_score, pandas
import os
import math
import time
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from rouge_score import rouge_scorer

# ========== CONFIG ==========
DATA_CSV = "/content/drive/MyDrive/Transformer/mtsamples.csv"   # change if needed
SAVE_DIR = "/content/drive/MyDrive/Transformer"
TOKENIZER_NAME = "tokenizer7.json"
MODEL_NAME = "model7.pt"

VOCAB_SIZE = 30000
MAX_SRC_LEN = 512
MAX_TGT_LEN = 256
BATCH_SIZE = 4
NUM_EPOCHS = 20    # set low to test; increase later
LR = 1e-4
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

os.makedirs(SAVE_DIR, exist_ok=True)

# ========== 1. Load data ==========
if not os.path.exists(DATA_CSV):
    raise FileNotFoundError(f"Data CSV not found at {DATA_CSV}. Upload to Drive or change DATA_CSV.")
df = pd.read_csv(DATA_CSV)
df = df[['transcription', 'description']].dropna().rename(columns={'transcription':'text','description':'summary'})
print(f"[data] {len(df)} rows. sample:")
print(df.head(2).to_dict(orient='records'))

# ========== 2. Train tokenizer (WordPiece) ==========
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = WordPieceTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

print("[tokenizer] training (may take a while)...")
def iter_texts():
    for t,s in zip(df['text'].astype(str), df['summary'].astype(str)):
        yield t
        yield s
tokenizer.train_from_iterator(iter_texts(), trainer=trainer)
tokenizer_path = os.path.join(SAVE_DIR, TOKENIZER_NAME)
tokenizer.save(tokenizer_path)
print(f"[tokenizer] saved to {tokenizer_path}")

CLS_ID = tokenizer.token_to_id("[CLS]")
SEP_ID = tokenizer.token_to_id("[SEP]")
PAD_ID = tokenizer.token_to_id("[PAD]")
UNK_ID = tokenizer.token_to_id("[UNK]")
print(f"[tokenizer ids] CLS:{CLS_ID} SEP:{SEP_ID} PAD:{PAD_ID} UNK:{UNK_ID}  vocab_size={tokenizer.get_vocab_size()}")
assert None not in (CLS_ID, SEP_ID, PAD_ID, UNK_ID), "Missing special token — retrain with special_tokens set."

# ========== 3. Encode helper ==========
def encode_pair(text: str, summary: str):
    src_ids = tokenizer.encode(str(text)).ids[:MAX_SRC_LEN]
    tgt_ids = tokenizer.encode(str(summary)).ids
    tgt = [CLS_ID] + tgt_ids + [SEP_ID]
    src_ids = src_ids[:MAX_SRC_LEN]
    tgt = tgt[:MAX_TGT_LEN]
    src_ids += [PAD_ID] * (MAX_SRC_LEN - len(src_ids))
    tgt += [PAD_ID] * (MAX_TGT_LEN - len(tgt))
    return src_ids, tgt

print("[encode] encoding dataset...")
pairs = [encode_pair(t, s) for t, s in zip(df['text'], df['summary'])]
srcs, tgts = zip(*pairs)
src_tensor = torch.tensor(srcs, dtype=torch.long)
tgt_tensor = torch.tensor(tgts, dtype=torch.long)
print("[encode] shapes:", src_tensor.shape, tgt_tensor.shape)

# split
train_size = int(0.8 * len(src_tensor))
train_src, val_src = src_tensor[:train_size], src_tensor[train_size:]
train_tgt, val_tgt = tgt_tensor[:train_size], tgt_tensor[train_size:]
print(f"[split] train {len(train_src)} val {len(val_src)}")

# ========== 4. Model (with proper mask support) ==========
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer('pe', pe.unsqueeze(0))
    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=8,
                 num_encoder_layers=6, num_decoder_layers=6,
                 dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model, padding_idx=PAD_ID)
        self.pos = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead,
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers,
                                          dim_feedforward=dim_feedforward,
                                          dropout=dropout,
                                          batch_first=True)
        self.out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt, tgt_mask=None,
                src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        # embed + positional
        src_emb = self.embed(src) * math.sqrt(self.d_model)
        tgt_emb = self.embed(tgt) * math.sqrt(self.d_model)
        src_emb = self.pos(src_emb)
        tgt_emb = self.pos(tgt_emb)
        out = self.transformer(src_emb, tgt_emb,
                               tgt_mask=tgt_mask,
                               src_key_padding_mask=src_key_padding_mask,
                               tgt_key_padding_mask=tgt_key_padding_mask,
                               memory_key_padding_mask=memory_key_padding_mask)
        logits = self.out(out)
        return logits

    @staticmethod
    def subsequent_mask(sz, device):
        return torch.triu(torch.ones(sz, sz, device=device) * float('-inf'), diagonal=1)

# instantiate
vocab_size = tokenizer.get_vocab_size()
model = TransformerModel(vocab_size=vocab_size).to(DEVICE)
print(f"[model] vocab_size={vocab_size} device={DEVICE}")

# ========== 5. Training utilities ==========
optimizer = optim.Adam(model.parameters(), lr=LR, betas=(0.9,0.98), eps=1e-9)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID, label_smoothing=0.1)
scaler = GradScaler()

class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, srcs, tgts):
        self.srcs = srcs
        self.tgts = tgts
    def __len__(self):
        return len(self.srcs)
    def __getitem__(self, idx):
        return {'src': self.srcs[idx], 'tgt': self.tgts[idx]}

train_ds = SimpleDataset(train_src, train_tgt)
val_ds = SimpleDataset(val_src, val_tgt)
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_ds, batch_size=BATCH_SIZE)

# ========== 6. Greedy decode function (used in validation) ==========
def greedy_decode(model, src_tensor, tokenizer, max_len=MAX_TGT_LEN, device=DEVICE):
    # src_tensor: (B, S)
    model.eval()
    batch = src_tensor.size(0)
    results = []
    src_key_padding = (src_tensor == PAD_ID)  # (B,S)
    with torch.no_grad():
        for i in range(batch):
            src = src_tensor[i:i+1].to(device)
            src_key_pad = src_key_padding[i:i+1].to(device)
            generated = [CLS_ID]
            for _ in range(max_len - 1):
                # build tgt full padded sequence
                tgt_full = generated + [PAD_ID] * (max_len - len(generated))
                tgt_tensor = torch.tensor([tgt_full], dtype=torch.long, device=device)
                tgt_key_pad = (tgt_tensor == PAD_ID)  # (1, max_len)
                tgt_mask = TransformerModel.subsequent_mask(max_len, device=device)
                logits = model(src, tgt_tensor,
                               tgt_mask=tgt_mask,
                               src_key_padding_mask=src_key_pad,
                               tgt_key_padding_mask=tgt_key_pad,
                               memory_key_padding_mask=src_key_pad)
                # pick logit at last generated position
                pos = len(generated) - 1
                next_token = int(logits[0, pos].argmax().item())
                generated.append(next_token)
                if next_token == SEP_ID:
                    break
                if len(generated) >= max_len:
                    break
            # trim at SEP
            if SEP_ID in generated:
                generated = generated[:generated.index(SEP_ID)+1]
            results.append(generated)
    # decode each — NOTE: do NOT pass unsupported keyword args
    decoded = [tokenizer.decode(r, skip_special_tokens=True).strip() for r in results]
    return decoded

# ========== 7. Training loop ==========
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
print("[train] starting...")
start = time.time()
for epoch in range(1, NUM_EPOCHS + 1):
    model.train()
    running_loss = 0.0
    batches = 0
    for batch in train_loader:
        optimizer.zero_grad()
        src = batch['src'].to(DEVICE)
        tgt = batch['tgt'].to(DEVICE)
        tgt_in = tgt[:, :-1]
        tgt_out = tgt[:, 1:]

        src_key_pad = (src == PAD_ID)
        tgt_key_pad = (tgt_in == PAD_ID)
        tgt_mask = TransformerModel.subsequent_mask(tgt_in.size(1), device=DEVICE)

        with autocast():
            logits = model(src, tgt_in,
                           tgt_mask=tgt_mask,
                           src_key_padding_mask=src_key_pad,
                           tgt_key_padding_mask=tgt_key_pad,
                           memory_key_padding_mask=src_key_pad)
            loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_out.reshape(-1))

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()
        batches += 1

    avg_loss = running_loss / max(1, batches)

    # validation using greedy decode
    model.eval()
    r1s = []
    with torch.no_grad():
        for batch in val_loader:
            src = batch['src'].to(DEVICE)
            tgt = batch['tgt'].to(DEVICE)
            preds = greedy_decode(model, src, tokenizer, max_len=MAX_TGT_LEN, device=DEVICE)  # list of strings
            # get references
            for p_str, t in zip(preds, tgt):
                # build reference string (drop starting CLS and anything after SEP)
                ref_ids = t[1:].tolist()
                if SEP_ID in ref_ids:
                    ref_ids = ref_ids[:ref_ids.index(SEP_ID)]
                ref_ids = [x for x in ref_ids if x != PAD_ID]
                # decode (no unsupported kwargs)
                ref_text = tokenizer.decode(ref_ids, skip_special_tokens=True).strip()
                gen_text = p_str.strip()
                if ref_text and gen_text:
                    r1s.append(scorer.score(ref_text, gen_text)['rouge1'].fmeasure)

    val_rouge1 = sum(r1s)/len(r1s) if r1s else 0.0
    elapsed = (time.time() - start) / 60.0
    print(f"Epoch {epoch:3d} | TrainLoss {avg_loss:.4f} | ValROUGE-1 {val_rouge1:.4f} | elapsed {elapsed:.1f} min")

# ========== 8. Save ==========
torch.save(model.state_dict(), os.path.join(SAVE_DIR, MODEL_NAME))
print(f"[save] model saved to {os.path.join(SAVE_DIR, MODEL_NAME)}")
print("Training finished.")

[data] 4966 rows. sample:
[{'text': 'SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear

  scaler = GradScaler()
  with autocast():


Epoch   1 | TrainLoss 7.2001 | ValROUGE-1 0.0660 | elapsed 69.5 min


KeyboardInterrupt: 

# Test 8

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
import math
import os
from rouge_score import rouge_scorer

# --- Configuration (MUST MATCH Frontend) ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
D_MODEL = 512
DIM_FEEDFORWARD = 2048
N_LAYERS = 4
N_HEADS = 8
BATCH_SIZE = 8
NUM_EPOCHS = 50
WARMUP_STEPS = 2000 # Critical for Transformer stability
GRADIENT_CLIP_NORM = 1.0
# -------------------------------------------

# --- Step 1 & 2: Data Preparation (Keeping data prep logic consistent) ---
df = pd.read_csv('/content/drive/MyDrive/Transformer/mtsamples.csv')
df = df[['transcription', 'description']].dropna()
df = df.rename(columns={'transcription': 'text', 'description': 'summary'})

texts = list(df['text']) + list(df['summary'])
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = WordPieceTrainer(vocab_size=30000, special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
tokenizer.train_from_iterator(texts, trainer=trainer)

tokenizer_path = "/content/drive/MyDrive/Transformer"
os.makedirs(tokenizer_path, exist_ok=True)
tokenizer.save(os.path.join(tokenizer_path, "tokenizer8.json")) # Updated filename

def encode_texts(texts, summaries, max_input_length=512, max_target_length=128):
    input_encodings = []
    target_encodings = []
    cls_token = tokenizer.token_to_id("[CLS]")
    sep_token = tokenizer.token_to_id("[SEP]")

    for text, summary in zip(texts, summaries):
        # Input: [CLS] + Text tokens
        input_ids = [cls_token] + tokenizer.encode(text).ids
        # Target: [CLS] + Summary tokens
        target_ids = [cls_token] + tokenizer.encode(summary).ids

        # Apply truncation and add [SEP]
        if len(input_ids) > max_input_length - 1:
            input_ids = input_ids[:max_input_length-1]
        input_ids += [sep_token]

        if len(target_ids) > max_target_length - 1:
            target_ids = target_ids[:max_target_length-1]
        target_ids += [sep_token]

        # Final safety truncation
        input_encodings.append(input_ids[:max_input_length])
        target_encodings.append(target_ids[:max_target_length])

    return input_encodings, target_encodings

input_encodings, target_encodings = encode_texts(df['text'], df['summary'])

def pad_sequences(sequences, max_length, pad_token_id):
    padded = []
    for seq in sequences:
        if len(seq) < max_length:
            seq = seq + [pad_token_id] * (max_length - len(seq))
        padded.append(seq[:max_length])
    return padded

max_input_length = 512
max_target_length = 128
pad_token_id = tokenizer.token_to_id("[PAD]")
input_encodings = pad_sequences(input_encodings, max_input_length, pad_token_id)
target_encodings = pad_sequences(target_encodings, max_target_length, pad_token_id)

input_tensors = torch.tensor(input_encodings, dtype=torch.long)
target_tensors = torch.tensor(target_encodings, dtype=torch.long)

train_size = int(0.8 * len(input_tensors))
train_inputs, val_inputs = input_tensors[:train_size], input_tensors[train_size:]
train_targets, val_targets = target_tensors[:train_size], target_tensors[train_size:]

# --- Step 3: Define Transformer model (Keeping architecture) ---

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :].to(x.device)

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=D_MODEL, nhead=N_HEADS, num_encoder_layers=N_LAYERS, num_decoder_layers=N_LAYERS, dim_feedforward=DIM_FEEDFORWARD, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def forward(self, src, tgt, src_mask=None, tgt_mask=None, src_key_padding_mask=None, tgt_key_padding_mask=None):
        src = self.embedding(src) * math.sqrt(self.d_model)
        tgt = self.embedding(tgt) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        tgt = self.pos_encoder(tgt)

        output = self.transformer(
            src,
            tgt,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask
        )
        output = self.fc_out(output)
        return output

    def generate_square_subsequent_mask(self, sz):
        mask = torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
        return mask.to(device)

# --- NEW: Transformer Learning Rate Scheduler ---
class NoamOpt:
    def __init__(self, model_size, warmup_steps, optimizer):
        self.model_size = model_size
        self.warmup_steps = warmup_steps
        self.optimizer = optimizer
        self._step = 0
        self._rate = 0

    def step(self):
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        if step is None:
            step = self._step
        if step == 0:
            return 1e-6

        # Formula: d_model^(-0.5) * min(step^(-0.5), step * warmup_steps^(-1.5))
        return self.model_size**(-0.5) * min(step**(-0.5), step * self.warmup_steps**(-1.5))

    def zero_grad(self):
        self.optimizer.zero_grad()


# Instantiate model
vocab_size = tokenizer.get_vocab_size()
model = TransformerModel(vocab_size=vocab_size).to(device)

# --- Step 4: Training setup (Modified for NoamOpt) ---
base_optimizer = optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
optimizer = NoamOpt(D_MODEL, WARMUP_STEPS, base_optimizer) # Use NoamOpt wrapper

criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)
scaler = GradScaler()

class MedicalDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'labels': self.targets[idx]}

train_dataset = MedicalDataset(train_inputs, train_targets)
val_dataset = MedicalDataset(val_inputs, val_targets)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE)

# --- Step 5: Training loop (Modified for NoamOpt & Gradient Clipping) ---
def train_epoch(model, loader, optimizer, criterion, scaler):
    model.train()
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        src = batch['input_ids'].to(device)
        tgt = batch['labels'].to(device)
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).to(device)
        src_padding_mask = (src == pad_token_id).to(device)
        tgt_padding_mask = (tgt_input == pad_token_id).to(device)

        with autocast():
            output = model(
                src,
                tgt_input,
                tgt_mask=tgt_mask,
                src_key_padding_mask=src_padding_mask,
                tgt_key_padding_mask=tgt_padding_mask
            )
            loss = criterion(output.reshape(-1, vocab_size), tgt_output.reshape(-1))

        # 1. Backprop and scaling
        scaler.scale(loss).backward()

        # 2. Unscale, Clip, Step (Crucial sequence for stability)
        scaler.unscale_(optimizer.optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=GRADIENT_CLIP_NORM) # Apply clipping

        scaler.step(optimizer.optimizer)
        scaler.update()
        optimizer.step() # Update learning rate using Noam schedule

        total_loss += loss.item()
    return total_loss / len(loader)

# Evaluation remains the same (without Noam step)
def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0
    # ... (evaluation logic, same as before)
    # The evaluation logic is extensive, refer to the previous provided 'evaluate' function
    # Ensure you are also using the padding masks in evaluate()
    with torch.no_grad():
        for batch in loader:
            # ... (load data, create masks)
            src = batch['input_ids'].to(device)
            tgt = batch['labels'].to(device)
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).to(device)
            src_padding_mask = (src == pad_token_id).to(device)
            tgt_padding_mask = (tgt_input == pad_token_id).to(device)

            with autocast():
                output = model(
                    src,
                    tgt_input,
                    tgt_mask=tgt_mask,
                    src_key_padding_mask=src_padding_mask,
                    tgt_key_padding_mask=tgt_padding_mask
                )
                loss = criterion(output.reshape(-1, vocab_size), tgt_output.reshape(-1))
            total_loss += loss.item()
            # ... (rest of ROUGE logic)

    return total_loss / len(loader), None, None # Returning only loss for simplicity

# --- Step 6: Train the model ---
print(f"Starting training on device: {device}")
for epoch in range(NUM_EPOCHS):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, scaler)
    current_lr = optimizer._rate
    val_loss, _, _ = evaluate(model, val_loader, criterion)

    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, LR: {current_lr:.6e}")


# --- Step 7: Save the model ---
model_path = "/content/drive/MyDrive/Transformer"
os.makedirs(model_path, exist_ok=True)
torch.save(model.state_dict(), os.path.join(model_path, "model8.pt")) # Updated filename
print(f"Model saved to: {model_path}")

  scaler = GradScaler()
  with autocast():


Starting training on device: cuda


  with autocast():


Epoch 1/50, Train Loss: nan, Val Loss: nan, LR: 2.455706e-04
Epoch 2/50, Train Loss: nan, Val Loss: nan, LR: 4.911412e-04
Epoch 3/50, Train Loss: nan, Val Loss: nan, LR: 7.367119e-04
Epoch 4/50, Train Loss: nan, Val Loss: nan, LR: 9.822825e-04
Epoch 5/50, Train Loss: nan, Val Loss: nan, LR: 8.865471e-04
Epoch 6/50, Train Loss: nan, Val Loss: nan, LR: 8.093031e-04
Epoch 7/50, Train Loss: nan, Val Loss: nan, LR: 7.492691e-04
Epoch 8/50, Train Loss: nan, Val Loss: nan, LR: 7.008770e-04
Epoch 9/50, Train Loss: nan, Val Loss: nan, LR: 6.607932e-04
Epoch 10/50, Train Loss: nan, Val Loss: nan, LR: 6.268835e-04
Epoch 11/50, Train Loss: nan, Val Loss: nan, LR: 5.977099e-04
Epoch 12/50, Train Loss: nan, Val Loss: nan, LR: 5.722637e-04
Epoch 13/50, Train Loss: nan, Val Loss: nan, LR: 5.498132e-04
Epoch 14/50, Train Loss: nan, Val Loss: nan, LR: 5.298132e-04
Epoch 15/50, Train Loss: nan, Val Loss: nan, LR: 5.118482e-04
Epoch 16/50, Train Loss: nan, Val Loss: nan, LR: 4.955949e-04
Epoch 17/50, Trai

# Testing frontend in colab for gpu

In [3]:
# Install required libraries
!pip install streamlit torch tokenizers pandas pyngrok --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m95.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
!wget https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.zip
!unzip ngrok-v3-stable-linux-amd64.zip
!mv ngrok /usr/local/bin/

--2025-11-21 05:25:41--  https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 13.248.244.96, 35.71.179.82, 75.2.60.68, ...
Connecting to bin.equinox.io (bin.equinox.io)|13.248.244.96|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9378086 (8.9M) [application/octet-stream]
Saving to: ‘ngrok-v3-stable-linux-amd64.zip’


2025-11-21 05:25:42 (61.5 MB/s) - ‘ngrok-v3-stable-linux-amd64.zip’ saved [9378086/9378086]

Archive:  ngrok-v3-stable-linux-amd64.zip
  inflating: ngrok                   


In [7]:
import re
from pyngrok import ngrok

# --- Configuration ---
# NOTE: The file names MUST match the files you uploaded in the previous step.
COLAB_TOKENIZER_PATH = "/content/drive/MyDrive/Transformer/tokenizer2.json"
COLAB_MODEL_PATH = "/content/drive/MyDrive/Transformer/model2.pt"
STREAMLIT_APP_FILE = "/content/drive/MyDrive/Transformer/app2.py"

# Replace with your actual ngrok token
NGROK_TOKEN = "35kJKjks8eW0SstgS7ceeKcJDkY_2bfXGWWyKq4AFGRvYAcgq"
ngrok.set_auth_token(NGROK_TOKEN)

# 1. Read and update the original Streamlit code
with open(STREAMLIT_APP_FILE, "r") as f:
    code = f.read()

# Use regex to replace the old Windows paths with the Colab paths
code = re.sub(r'tokenizer_path\s*=\s*".*?"', f'tokenizer_path = "{COLAB_TOKENIZER_PATH}"', code, count=1)
code = re.sub(r'model_path\s*=\s*".*?"', f'model_path = "{COLAB_MODEL_PATH}"', code, count=1)

# 2. Write the modified code to a new temporary file
COLAB_APP_FILE = "app2_colab.py"
with open(COLAB_APP_FILE, "w") as f:
    f.write(code)

print(f"✅ Paths in {STREAMLIT_APP_FILE} updated to use Colab paths.")
print(f"✅ ngrok token set up.")

✅ Paths in /content/drive/MyDrive/Transformer/app2.py updated to use Colab paths.
✅ ngrok token set up.


In [9]:
import threading
import time
from pyngrok import ngrok

# Define the port Streamlit will run on
PORT = 8501
COLAB_APP_FILE = "app2_colab.py" # The file generated in the previous step

# Function to run the Streamlit app
def run_streamlit():
    # Run the modified app using Streamlit
    !streamlit run $COLAB_APP_FILE --server.port $PORT --server.enableCORS false --server.enableXsrfProtection false

# Start Streamlit in a separate thread
print("⚙️ Starting Streamlit server...")
threading.Thread(target=run_streamlit, daemon=True).start()

# Wait a few seconds for Streamlit to start
time.sleep(8)

# Start ngrok tunnel
public_url = ngrok.connect(PORT)
print(f"\n\n🎉 Your Streamlit App is now public at the URL below. Click the link to open it:\n")
print(f"🔗 Public URL: {public_url}\n\n")

# To keep the script running and the tunnel open, the cell must continue to run.
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    ngrok.kill()
    print("Tunnel closed.")

⚙️ Starting Streamlit server...

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.138.173.130:8501[0m
[0m


🎉 Your Streamlit App is now public at the URL below. Click the link to open it:

🔗 Public URL: NgrokTunnel: "https://a6be12c2b333.ngrok-free.app" -> "http://localhost:8501"


Tunnel closed.


# Test 9

In [14]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
import math
import os
from rouge_score import rouge_scorer
import re

# Define model path at the beginning
model_path = "/content/drive/MyDrive/Transformer"
os.makedirs(model_path, exist_ok=True)

# Step 1: Load and prepare the dataset
df = pd.read_csv('/content/drive/MyDrive/Transformer/mtsamples.csv')
df = df[['transcription', 'description']].dropna()

# Clean and filter data
def clean_text(text):
    # Remove special characters and extra whitespace
    text = re.sub(r'[^a-zA-Z0-9\s\.\,\!\?\;\:]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['text'] = df['transcription'].apply(clean_text)
df['summary'] = df['description'].apply(clean_text)

# Filter out very short entries
df = df[(df['text'].str.len() > 50) & (df['summary'].str.len() > 10)]

df = df.rename(columns={'transcription': 'text', 'description': 'summary'})

# Step 2: Train a WordPiece tokenizer
texts = list(df['text']) + list(df['summary'])
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = WordPieceTrainer(
    vocab_size=30000,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    min_frequency=2  # Only include tokens appearing at least twice
)
tokenizer.train_from_iterator(texts, trainer=trainer)

# Save tokenizer
tokenizer.save(os.path.join(model_path, "tokenizer9.json"))

# Encode dataset
def encode_texts(texts, summaries, max_input_length=512, max_target_length=128):
    input_encodings = []
    target_encodings = []
    for text, summary in zip(texts, summaries):
        input_ids = tokenizer.encode(text).ids
        target_ids = tokenizer.encode(summary).ids
        # Add start and end tokens
        input_ids = [tokenizer.token_to_id("[CLS]")] + input_ids + [tokenizer.token_to_id("[SEP]")]
        target_ids = [tokenizer.token_to_id("[CLS]")] + target_ids + [tokenizer.token_to_id("[SEP]")]

        if len(input_ids) > max_input_length:
            input_ids = input_ids[:max_input_length]
        if len(target_ids) > max_target_length:
            target_ids = target_ids[:max_target_length]
        input_encodings.append(input_ids)
        target_encodings.append(target_ids)
    return input_encodings, target_encodings

input_encodings, target_encodings = encode_texts(df['text'], df['summary'])

# Pad sequences
def pad_sequences(sequences, max_length, pad_token_id):
    padded = []
    for seq in sequences:
        if len(seq) < max_length:
            seq = seq + [pad_token_id] * (max_length - len(seq))
        padded.append(seq[:max_length])
    return padded

max_input_length = 512
max_target_length = 128
pad_token_id = tokenizer.token_to_id("[PAD]")
input_encodings = pad_sequences(input_encodings, max_input_length, pad_token_id)
target_encodings = pad_sequences(target_encodings, max_target_length, pad_token_id)

# Convert to tensors
input_tensors = torch.tensor(input_encodings, dtype=torch.long)
target_tensors = torch.tensor(target_encodings, dtype=torch.long)

# Split into train and validation (80-20)
train_size = int(0.8 * len(input_tensors))
train_inputs, val_inputs = input_tensors[:train_size], input_tensors[train_size:]
train_targets, val_targets = target_tensors[:train_size], target_tensors[train_size:]

# Step 3: Define Enhanced Transformer model
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=16, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, tgt, src_key_padding_mask=None, tgt_mask=None):
        src = self.embedding(src) * math.sqrt(self.d_model)
        tgt = self.embedding(tgt) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        tgt = self.pos_encoder(tgt)
        output = self.transformer(
            src, tgt,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask
        )
        output = self.fc_out(output)
        return output

    def generate_square_subsequent_mask(self, sz):
        mask = torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
        return mask

# Instantiate model
vocab_size = tokenizer.get_vocab_size()
model = TransformerModel(
    vocab_size=vocab_size,
    d_model=512,
    nhead=16,
    num_encoder_layers=6,
    num_decoder_layers=6,
    dim_feedforward=2048,
    dropout=0.1
).cuda()

# Step 4: Training setup
optimizer = optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.01)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50, eta_min=1e-7)
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)
scaler = GradScaler()

# Create dataset and dataloader
class MedicalDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx],
            'labels': self.targets[idx],
            'src_padding_mask': (self.inputs[idx] == pad_token_id)
        }

train_dataset = MedicalDataset(train_inputs, train_targets)
val_dataset = MedicalDataset(val_inputs, val_targets)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=2)

# Step 5: Training loop
def train_epoch(model, loader, optimizer, criterion, scaler, scheduler):
    model.train()
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        src = batch['input_ids'].cuda()
        tgt = batch['labels'].cuda()
        src_padding_mask = batch['src_padding_mask'].cuda()

        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).cuda()

        with autocast():
            output = model(src, tgt_input, src_key_padding_mask=src_padding_mask, tgt_mask=tgt_mask)
            loss = criterion(output.reshape(-1, vocab_size), tgt_output.reshape(-1))

        scaler.scale(loss).backward()
        # Gradient clipping
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        # Scheduler step AFTER optimizer step
        scheduler.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            src = batch['input_ids'].cuda()
            tgt = batch['labels'].cuda()
            src_padding_mask = batch['src_padding_mask'].cuda()

            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]
            tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).cuda()

            with autocast():
                output = model(src, tgt_input, src_key_padding_mask=src_padding_mask, tgt_mask=tgt_mask)
                loss = criterion(output.reshape(-1, vocab_size), tgt_output.reshape(-1))
            total_loss += loss.item()

            # Decode for ROUGE
            preds = torch.argmax(output, dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(tgt_output.cpu().numpy())
    return total_loss / len(loader), all_preds, all_labels

# Step 6: Train the model
best_val_loss = float('inf')
patience_counter = 0
patience = 5

for epoch in range(100):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, scaler, scheduler)
    val_loss, val_preds, val_labels = evaluate(model, val_loader, criterion)

    # Compute ROUGE scores
    decoded_preds = []
    decoded_labels = []
    for pred, label in zip(val_preds, val_labels):
        # Remove padding and special tokens
        pred = pred[~(pred == pad_token_id)]
        label = label[~(label == pad_token_id)]
        decoded_preds.append(tokenizer.decode(pred, skip_special_tokens=True))
        decoded_labels.append(tokenizer.decode(label, skip_special_tokens=True))

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = [scorer.score(label, pred) for pred, label in zip(decoded_preds, decoded_labels)]
    rouge1 = sum(score['rouge1'].fmeasure for score in rouge_scores) / len(rouge_scores)

    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, ROUGE-1: {rouge1:.4f}")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # Save best model
        torch.save(model.state_dict(), os.path.join(model_path, "best_model9.pt"))
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered")
            break

# Load best model for final save
model.load_state_dict(torch.load(os.path.join(model_path, "best_model9.pt")))
torch.save(model.state_dict(), os.path.join(model_path, "model9.pt"))
print(f"Model saved to: {model_path}")

  scaler = GradScaler()
  with autocast():
  with autocast():


Epoch 1, Train Loss: 3.4551, Val Loss: 3.4668, ROUGE-1: 0.0000
Epoch 2, Train Loss: 3.4414, Val Loss: 1.2334, ROUGE-1: 0.0156
Epoch 3, Train Loss: 1.8301, Val Loss: 0.7729, ROUGE-1: 0.0156
Epoch 4, Train Loss: 0.8970, Val Loss: 0.6506, ROUGE-1: 0.0157
Epoch 5, Train Loss: 0.7224, Val Loss: 0.6506, ROUGE-1: 0.0157
Epoch 6, Train Loss: 0.6919, Val Loss: 0.6061, ROUGE-1: 0.0157
Epoch 7, Train Loss: 0.5459, Val Loss: 0.5674, ROUGE-1: 1.0000
Epoch 8, Train Loss: 0.8589, Val Loss: 0.5568, ROUGE-1: 0.0000
Epoch 9, Train Loss: 0.5967, Val Loss: 0.5409, ROUGE-1: 0.0000
Epoch 10, Train Loss: 0.5370, Val Loss: 0.4857, ROUGE-1: 0.0161
Epoch 11, Train Loss: 0.5636, Val Loss: 0.4806, ROUGE-1: 0.0157
Epoch 12, Train Loss: 0.5192, Val Loss: 0.4144, ROUGE-1: 0.0194
Epoch 13, Train Loss: 0.4296, Val Loss: 0.3743, ROUGE-1: 1.0000
Epoch 14, Train Loss: 0.4457, Val Loss: 0.3340, ROUGE-1: 0.0270
Epoch 15, Train Loss: 0.4225, Val Loss: 0.2839, ROUGE-1: 0.0317
Epoch 16, Train Loss: 0.3583, Val Loss: 0.2275, R

In [15]:
# Create the frontend file
frontend_code = '''
import streamlit as st
import torch
from tokenizers import Tokenizer
import os
import math

# Streamlit page configuration
st.set_page_config(page_title="Medical Transcript Summarizer", layout="wide")

# Load tokenizer
tokenizer_path = "/content/drive/MyDrive/Transformer/tokenizer9.json"
if not os.path.exists(tokenizer_path):
    st.error("Tokenizer file not found. Ensure tokenizer.json is in your Google Drive.")
    st.stop()
tokenizer = Tokenizer.from_file(tokenizer_path)

# Define Enhanced Transformer model
class PositionalEncoding(torch.nn.Module):
    def __init__(self, d_model, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

class TransformerModel(torch.nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=16, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer = torch.nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.fc_out = torch.nn.Linear(d_model, vocab_size)
        self.d_model = d_model
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, src, tgt, src_key_padding_mask=None, tgt_mask=None):
        src = self.embedding(src) * math.sqrt(self.d_model)
        tgt = self.embedding(tgt) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        tgt = self.pos_encoder(tgt)
        output = self.transformer(
            src, tgt,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask
        )
        output = self.fc_out(output)
        return output

    def generate_square_subsequent_mask(self, sz):
        mask = torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
        return mask

# Load model
model_path = "/content/drive/MyDrive/Transformer/model9.pt"
if not os.path.exists(model_path):
    st.error("Model file not found. Ensure model.pt is in your Google Drive.")
    st.stop()
vocab_size = tokenizer.get_vocab_size()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerModel(
    vocab_size=vocab_size,
    d_model=512,
    nhead=16,
    num_encoder_layers=6,
    num_decoder_layers=6,
    dim_feedforward=2048,
    dropout=0.1
).to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

# Function to generate summary
def generate_summary(text, max_length=128, temperature=0.7):
    try:
        # Encode input text
        input_ids = tokenizer.encode(text).ids
        input_ids = input_ids[:510]  # Leave space for [CLS] and [SEP]
        input_ids = [tokenizer.token_to_id("[CLS]")] + input_ids + [tokenizer.token_to_id("[SEP]")]
        input_ids = input_ids + [tokenizer.token_to_id("[PAD]")] * (512 - len(input_ids))
        input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)

        # Create source padding mask
        src_padding_mask = (input_tensor == tokenizer.token_to_id("[PAD]"))

        # Initialize target with start token
        tgt_ids = [tokenizer.token_to_id("[CLS]")]

        for _ in range(max_length):
            tgt_tensor = torch.tensor([tgt_ids], dtype=torch.long).to(device)
            tgt_mask = model.generate_square_subsequent_mask(tgt_tensor.size(1)).to(device)

            with torch.no_grad():
                output = model(input_tensor, tgt_tensor, src_key_padding_mask=src_padding_mask, tgt_mask=tgt_mask)

            # Apply temperature scaling
            next_token_logits = output[0, -1, :] / temperature
            next_token_probs = torch.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(next_token_probs, num_samples=1).item()

            # Break if end token is generated
            if next_token == tokenizer.token_to_id("[SEP]"):
                break

            tgt_ids.append(next_token)

            # Stop if max length reached
            if len(tgt_ids) >= max_length:
                break

        # Decode the generated sequence
        summary = tokenizer.decode(tgt_ids, skip_special_tokens=True)
        return summary
    except Exception as e:
        return f"Error: {str(e)}"

# Streamlit UI
st.title("Medical Transcript Summarizer")
st.markdown("Enter a medical transcript below to generate a concise summary using a Transformer model.")

# Initialize session state
if "generated_summary" not in st.session_state:
    st.session_state.generated_summary = ""
if "download_data" not in st.session_state:
    st.session_state.download_data = ""

# Input section
with st.form(key="transcript_form"):
    transcript = st.text_area(
        "Medical Transcript",
        placeholder="Paste or type your medical transcript here...",
        height=200,
        key="transcript"
    )
    word_count = len(transcript.split()) if transcript.strip() else 0
    st.caption(f"Word count: {word_count}")
    submit_button = st.form_submit_button("Generate Summary")

# Output section - Fixed to remove key conflict
st.subheader("Generated Summary")
# Removed the key parameter to avoid session state conflicts
summary_output = st.text_area(
    "Summary",
    value=st.session_state.generated_summary,
    height=100,
    disabled=True
)

# Status and actions
status = st.empty()
col1, col2, col3 = st.columns([1, 1, 1])
with col1:
    clear_button = st.button("Clear", key="clear")
with col2:
    copy_button = st.button("Copy Summary", key="copy")
with col3:
    download_button = st.download_button(
        label="Download Summary",
        data=st.session_state.download_data,
        file_name="summary.txt",
        mime="text/plain",
        key="download",
        disabled=not st.session_state.download_data
    )

# Handle form submission
if submit_button:
    if transcript.strip():
        status.info("Generating summary...")
        summary = generate_summary(transcript)
        st.session_state.generated_summary = summary
        st.session_state.download_data = summary
        status.success("Summary generated successfully!")
    else:
        status.error("Please enter a transcript.")
        st.session_state.generated_summary = ""
        st.session_state.download_data = ""

# Handle clear button
if clear_button:
    st.session_state.transcript = ""
    st.session_state.generated_summary = ""
    st.session_state.download_data = ""
    status.empty()
    st.rerun()

# Handle copy button
if copy_button and st.session_state.generated_summary:
    st.write("<script>navigator.clipboard.writeText(`{}`)</script>".format(st.session_state.generated_summary), unsafe_allow_html=True)
    status.success("Summary copied to clipboard!")
elif copy_button:
    status.error("No summary to copy.")

# CSS for styling
st.markdown("""
<style>
    .stTextArea textarea {
        width: 100%;
        max-width: 700px;
    }
    .stButton>button {
        width: 150px;
    }
    .stForm {
        background-color: #f5f5f5;
        padding: 20px;
        border-radius: 10px;
        box-shadow: 0 4px 8px rgba(0,0,0,0.1);
    }
</style>
""", unsafe_allow_html=True)
'''

with open("app.py", "w") as f:
    f.write(frontend_code)

# Install pyngrok if not already installed
!pip install pyngrok

# Mount Google Drive to access model files
from google.colab import drive
drive.mount('/content/drive')

# Set your ngrok authtoken here
from pyngrok import ngrok

# Replace 'YOUR_AUTHTOKEN_HERE' with your actual ngrok authtoken
ngrok.set_auth_token("35kJKjks8eW0SstgS7ceeKcJDkY_2bfXGWWyKq4AFGRvYAcgq")

# Run the app
import subprocess
import time
import threading

def run_streamlit():
    subprocess.run(['streamlit', 'run', 'app.py', '--server.port', '8501'])

# Start Streamlit in a separate thread
thread = threading.Thread(target=run_streamlit)
thread.start()

# Wait a moment for the server to start
time.sleep(3)

# Create ngrok tunnel
public_url = ngrok.connect(8501)
print(f"Streamlit app is running at: {public_url}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Streamlit app is running at: NgrokTunnel: "https://99684eb3c7f6.ngrok-free.app" -> "http://localhost:8501"


# Test 10

In [16]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
import math
import os
from rouge_score import rouge_scorer

# Step 1: Load and prepare the dataset
df = pd.read_csv('/content/drive/MyDrive/Transformer/mtsamples.csv')
df = df[['transcription', 'description']].dropna()
df = df.rename(columns={'transcription': 'text', 'description': 'summary'})

# Step 2: Train a WordPiece tokenizer on the dataset
texts = list(df['text']) + list(df['summary'])
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = WordPieceTrainer(vocab_size=30000, special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
tokenizer.train_from_iterator(texts, trainer=trainer)

# Save tokenizer
tokenizer_path = "/content/drive/MyDrive/Transformer"
os.makedirs(tokenizer_path, exist_ok=True)
tokenizer.save(os.path.join(tokenizer_path, "tokenizer10.json"))

# Encode dataset
def encode_texts(texts, summaries, max_input_length=512, max_target_length=128):
    input_encodings = []
    target_encodings = []
    for text, summary in zip(texts, summaries):
        input_ids = tokenizer.encode(text).ids
        target_ids = tokenizer.encode(summary).ids
        if len(input_ids) > max_input_length:
            input_ids = input_ids[:max_input_length]
        if len(target_ids) > max_target_length:
            target_ids = target_ids[:max_target_length]
        input_encodings.append(input_ids)
        target_encodings.append(target_ids)
    return input_encodings, target_encodings

input_encodings, target_encodings = encode_texts(df['text'], df['summary'])

# Pad sequences
def pad_sequences(sequences, max_length, pad_token_id):
    padded = []
    for seq in sequences:
        if len(seq) < max_length:
            seq = seq + [pad_token_id] * (max_length - len(seq))
        padded.append(seq[:max_length])
    return padded

max_input_length = 512
max_target_length = 128
pad_token_id = tokenizer.token_to_id("[PAD]")
input_encodings = pad_sequences(input_encodings, max_input_length, pad_token_id)
target_encodings = pad_sequences(target_encodings, max_target_length, pad_token_id)

# Convert to tensors
input_tensors = torch.tensor(input_encodings, dtype=torch.long)
target_tensors = torch.tensor(target_encodings, dtype=torch.long)

# Split into train and validation (80-20)
train_size = int(0.8 * len(input_tensors))
train_inputs, val_inputs = input_tensors[:train_size], input_tensors[train_size:]
train_targets, val_targets = target_tensors[:train_size], target_tensors[train_size:]

# Step 3: Define Transformer model from scratch
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=256, nhead=8, num_encoder_layers=3, num_decoder_layers=3, dim_feedforward=1024, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        src = self.embedding(src) * math.sqrt(self.d_model)
        tgt = self.embedding(tgt) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        tgt = self.pos_encoder(tgt)
        output = self.transformer(src, tgt, src_mask, tgt_mask)
        output = self.fc_out(output)
        return output

    def generate_square_subsequent_mask(self, sz):
        mask = torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
        return mask

# Instantiate model
vocab_size = tokenizer.get_vocab_size()
model = TransformerModel(vocab_size=vocab_size).cuda()

# Step 4: Training setup
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)
scaler = GradScaler()

# Create dataset and dataloader
class MedicalDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'labels': self.targets[idx]}

train_dataset = MedicalDataset(train_inputs, train_targets)
val_dataset = MedicalDataset(val_inputs, val_targets)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=4)

# Step 5: Training loop
def train_epoch(model, loader, optimizer, criterion, scaler):
    model.train()
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        src = batch['input_ids'].cuda()
        tgt = batch['labels'].cuda()
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).cuda()

        with autocast():
            output = model(src, tgt_input, tgt_mask=tgt_mask)
            loss = criterion(output.reshape(-1, vocab_size), tgt_output.reshape(-1))

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            src = batch['input_ids'].cuda()
            tgt = batch['labels'].cuda()
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]
            tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).cuda()

            with autocast():
                output = model(src, tgt_input, tgt_mask=tgt_mask)
                loss = criterion(output.reshape(-1, vocab_size), tgt_output.reshape(-1))
            total_loss += loss.item()

            # Decode for ROUGE
            preds = torch.argmax(output, dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(tgt_output.cpu().numpy())
    return total_loss / len(loader), all_preds, all_labels

# Step 6: Train the model
num_epochs = 100  # More epochs due to training from scratch
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, scaler)
    val_loss, val_preds, val_labels = evaluate(model, val_loader, criterion)

    # Compute ROUGE scores
    decoded_preds = [tokenizer.decode(pred, skip_special_tokens=True) for pred in val_preds]
    decoded_labels = [tokenizer.decode(label, skip_special_tokens=True) for label in val_labels]
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = [scorer.score(label, pred) for pred, label in zip(decoded_preds, decoded_labels)]
    rouge1 = sum(score['rouge1'].fmeasure for score in rouge_scores) / len(rouge_scores)

    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, ROUGE-1: {rouge1:.4f}")

# Step 7: Save the model
model_path = "/content/drive/MyDrive/Transformer"
os.makedirs(model_path, exist_ok=True)
torch.save(model.state_dict(), os.path.join(model_path, "model10.pt"))
print(f"Model saved to: {model_path}")

  scaler = GradScaler()
  with autocast():
  with autocast():


Epoch 1, Train Loss: 6.7383, Val Loss: 6.0683, ROUGE-1: 0.1567
Epoch 2, Train Loss: 5.4853, Val Loss: 5.3811, ROUGE-1: 0.1923
Epoch 3, Train Loss: 4.7729, Val Loss: 4.9049, ROUGE-1: 0.1910
Epoch 4, Train Loss: 4.2098, Val Loss: 4.5147, ROUGE-1: 0.1838
Epoch 5, Train Loss: 3.6870, Val Loss: 4.1572, ROUGE-1: 0.2083
Epoch 6, Train Loss: 3.2272, Val Loss: 3.8064, ROUGE-1: 0.1485
Epoch 7, Train Loss: 2.7963, Val Loss: 3.4718, ROUGE-1: 0.2326
Epoch 8, Train Loss: 2.4013, Val Loss: 3.1770, ROUGE-1: 0.1899
Epoch 9, Train Loss: 2.0462, Val Loss: 2.8929, ROUGE-1: 0.2047
Epoch 10, Train Loss: 1.7241, Val Loss: 2.6538, ROUGE-1: 0.2512
Epoch 11, Train Loss: 1.4509, Val Loss: 2.3964, ROUGE-1: 0.2683
Epoch 12, Train Loss: 1.2181, Val Loss: 2.2055, ROUGE-1: 0.2766
Epoch 13, Train Loss: 1.0291, Val Loss: 2.0071, ROUGE-1: 0.2592
Epoch 14, Train Loss: 0.8620, Val Loss: 1.8126, ROUGE-1: 0.2896
Epoch 15, Train Loss: 0.7505, Val Loss: 1.7140, ROUGE-1: 0.2686
Epoch 16, Train Loss: 0.6345, Val Loss: 1.5619, R