# 🧠 Transformer Paraphrasing (Colab Demo)
Colab notebook for running inference using a custom Transformer-based paraphrasing model.

---
**Features:**
- Downloads the model & vocab from the Hugging Face Hub using `requests`
- Runs beam search decoding
- Supports token-level attention

---

In [33]:
# 📦 Install required libraries
!pip install -q nltk
import nltk
# Fetch the Punkt tokenizer data that `word_tokenize` depends on.
# Recent NLTK releases load the tokenizer tables from the separate 'punkt_tab'
# resource instead of the legacy pickled 'punkt' package, so request both to
# keep the notebook working across NLTK versions. An unknown resource id is
# only reported by the downloader (it returns False), it does not raise.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [34]:
# ✅ Imports and global variables
import os, torch, pickle
import numpy as np
import torch.nn as nn
from nltk.tokenize import word_tokenize
from urllib.request import urlretrieve

# Sequence length that sentences are padded/truncated to; also the number of
# positions in the model's positional parameter.
MAX_LEN = 20
# Width of the GloVe vectors and of the Transformer (`d_model`).
EMBED_DIM = 100
HIDDEN_DIM = 256
# Text file holding the pretrained 100-dimensional GloVe vectors.
GLOVE_PATH = "glove.6B.100d.txt"
# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [35]:
# 🔣 Tokenization and GloVe loading
def tokenize(text):
    """Lower-case ``text`` and split it into tokens with NLTK's ``word_tokenize``."""
    lowered = text.lower()
    return word_tokenize(lowered)

def sentence_to_indices(sentence, vocab):
    """Convert ``sentence`` into a list of vocabulary ids.

    The sentence is tokenized with ``tokenize``; any token that is not a key
    of ``vocab`` is replaced by the id stored under ``'<unk>'``.
    """
    indices = []
    for token in tokenize(sentence):
        indices.append(vocab.get(token, vocab['<unk>']))
    return indices

def load_glove_embeddings(vocab):
    """Build an embedding matrix for ``vocab`` initialised from GloVe vectors.

    If ``GLOVE_PATH`` does not exist, the GloVe 6B archive is downloaded and
    extracted into the current working directory first.

    Args:
        vocab: mapping from token to row index; must contain ``'<pad>'``.

    Returns:
        A float tensor of shape ``(len(vocab), EMBED_DIM)``. Rows of tokens
        found in the GloVe file hold the pretrained vector, the ``'<pad>'``
        row is all zeros, and every other row is drawn uniformly from
        ``[-0.1, 0.1)``.

    Raises:
        ValueError: if a vector for a vocabulary token does not have exactly
            ``EMBED_DIM`` components (e.g. ``GLOVE_PATH`` points at a file of
            a different dimensionality).
    """
    if not os.path.exists(GLOVE_PATH):
        print("⏬ Downloading GloVe embeddings...")
        urlretrieve("http://nlp.stanford.edu/data/glove.6B.zip", "glove.zip")
        import zipfile
        with zipfile.ZipFile("glove.zip", "r") as zip_ref:
            zip_ref.extractall()

    matrix = np.random.uniform(-0.1, 0.1, (len(vocab), EMBED_DIM))
    matrix[vocab['<pad>']] = 0

    with open(GLOVE_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            # Fields are separated by a single ASCII space. Calling split()
            # with no argument would also split on Unicode whitespace (for
            # example a non-breaking space) and tear apart tokens that
            # contain such characters, so split on ' ' explicitly.
            w, *vec = line.rstrip().split(' ')
            if w not in vocab:
                continue
            if len(vec) != EMBED_DIM:
                raise ValueError(
                    f"Expected {EMBED_DIM} values for token {w!r} in {GLOVE_PATH}, got {len(vec)}"
                )
            matrix[vocab[w]] = np.array(vec, dtype=np.float32)
    return torch.tensor(matrix, dtype=torch.float)

In [36]:
# 🧠 Transformer model class
class TransformerParaphraser(nn.Module):
    """Encoder-decoder Transformer that maps a source sentence to a paraphrase.

    Source and target tokens have separate embedding tables (initialised from
    the given pretrained matrices and fine-tuned), share one learned
    positional parameter of ``MAX_LEN`` positions, and are decoded to target
    vocabulary logits by a final linear layer.

    Args:
        input_vocab: token -> id mapping of the source side; its ``'<pad>'``
            id is used to build the source padding mask.
        target_vocab: token -> id mapping of the target side; its size sets
            the number of output logits.
        emb1: pretrained embedding matrix for the source vocabulary.
        emb2: pretrained embedding matrix for the target vocabulary.
    """

    def __init__(self, input_vocab, target_vocab, emb1, emb2):
        super().__init__()
        self.src_embed = nn.Embedding.from_pretrained(emb1, freeze=False)
        self.tgt_embed = nn.Embedding.from_pretrained(emb2, freeze=False)
        self.pos_enc = nn.Parameter(torch.rand(1, MAX_LEN, EMBED_DIM))
        self.tr = nn.Transformer(d_model=EMBED_DIM, nhead=4, num_encoder_layers=2, num_decoder_layers=2,
                                dim_feedforward=512, batch_first=True)
        self.fc = nn.Linear(EMBED_DIM, len(target_vocab))
        # Kept under its original name for existing callers.
        self.pad_idx = target_vocab['<pad>']
        # The source mask must be built from the *source* vocabulary's pad id;
        # the two ids only coincide when both vocabularies were built alike.
        self.src_pad_idx = input_vocab['<pad>']

    def forward(self, src, tgt):
        """Return target-vocabulary logits for every position of ``tgt``.

        Args:
            src: integer tensor of source token ids, ``(batch, src_len)``.
            tgt: integer tensor of target input ids, ``(batch, tgt_len)``.
                Both lengths must not exceed ``MAX_LEN``.

        Returns:
            Tensor of shape ``(batch, tgt_len, len(target_vocab))``.
        """
        src_mask = src == self.src_pad_idx
        # Build the causal mask on the same device as the inputs rather than
        # on a module-level global, so the model works wherever it was moved.
        tgt_mask = self.generate_square_subsequent_mask(tgt.size(1), device=tgt.device)
        src_emb = self.src_embed(src) + self.pos_enc[:, :src.size(1)]
        tgt_emb = self.tgt_embed(tgt) + self.pos_enc[:, :tgt.size(1)]
        out = self.tr(src_emb, tgt_emb, tgt_mask=tgt_mask, src_key_padding_mask=src_mask)
        return self.fc(out)

    def generate_square_subsequent_mask(self, sz, device=None):
        """Return an ``sz x sz`` causal mask: ``-inf`` above the diagonal, 0 elsewhere.

        Args:
            sz: sequence length.
            device: optional device to create the mask on (default: CPU,
                matching the previous behaviour).
        """
        return torch.triu(torch.full((sz, sz), float('-inf'), device=device), diagonal=1)

In [37]:
# 📥 Safe download (Hugging Face Hub) with verification
import requests

def safe_download(url, filename):
    """Download ``url`` to ``filename`` and verify it before keeping it.

    The payload is first written to ``<filename>.part`` and only moved onto
    ``filename`` once the download (and, for ``.pkl`` files, a trial
    unpickle) succeeded, so a failed attempt never leaves a corrupt file at
    the destination or clobbers an existing good one.

    Args:
        url: address to fetch (redirects are followed, 10 s timeout).
        filename: destination path.

    Raises:
        RuntimeError: if the request fails, the status code is not 200, or
            the downloaded ``.pkl`` file cannot be unpickled. The original
            exception is attached as ``__cause__``.
    """
    tmp_path = f"{filename}.part"
    try:
        r = requests.get(url, allow_redirects=True, timeout=10)
        if r.status_code != 200:
            raise Exception(f"Download failed: {url}, status code {r.status_code}")
        with open(tmp_path, 'wb') as f:
            f.write(r.content)
        # Additional check: try loading to validate pickle file
        if filename.endswith('.pkl'):
            # SECURITY: unpickling executes code embedded in the file. Only
            # point this function at sources you trust.
            with open(tmp_path, 'rb') as test_f:
                pickle.load(test_f)
        os.replace(tmp_path, filename)
    except Exception as e:
        # Best-effort removal of the partial download before reporting.
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError:
            pass
        raise RuntimeError(f"Download or verification failed for {filename}: {str(e)}") from e

# Fetch the vocabulary first, then the trained weights.
for _url, _dest in (
    ("https://huggingface.co/datasets/shubham27eu/paraphraser/resolve/main/vocab.pkl", "vocab.pkl"),
    ("https://huggingface.co/datasets/shubham27eu/paraphraser/resolve/main/paraphrase_model.pt", "paraphrase_model.pt"),
):
    safe_download(_url, _dest)

Exception: Failed to download vocab.pkl: status code 401

In [None]:
# Load the (input_vocab, target_vocab) pair.
# SECURITY: pickle.load executes code embedded in the file; vocab.pkl must
# come from a trusted source.
with open("vocab.pkl", "rb") as f:
    input_vocab, target_vocab = pickle.load(f)
# Reverse lookup used to turn predicted ids back into words.
idx2word = {i: w for w, i in target_vocab.items()}
input_emb = load_glove_embeddings(input_vocab)
target_emb = load_glove_embeddings(target_vocab)
model = TransformerParaphraser(input_vocab, target_vocab, input_emb, target_emb).to(DEVICE)
# The checkpoint is a plain state_dict, so restrict unpickling to tensors and
# basic containers (weights_only=True) instead of allowing arbitrary objects
# from a downloaded file.
model.load_state_dict(torch.load("paraphrase_model.pt", map_location=DEVICE, weights_only=True))
model.eval()

In [None]:
# 🔁 Beam decoding
def beam_decode(model, sentence, input_vocab, target_vocab, idx2word, beam_width=5, max_len=MAX_LEN, alpha=0.7):
    """Paraphrase ``sentence`` with length-normalised beam search.

    Hypotheses that emit ``'<eos>'`` are collected as finished candidates
    instead of being returned immediately, so a low-ranked ``'<eos>'`` in some
    beam's top-k can no longer cut decoding short. The search stops once
    ``beam_width`` hypotheses have finished, no live hypothesis remains, or
    the step limit is reached; the finished hypothesis with the best
    length-normalised score wins (falling back to the best live beam when
    nothing finished).

    Args:
        model: a ``TransformerParaphraser``.
        sentence: raw input text.
        input_vocab: source token -> id mapping (needs ``'<pad>'``, ``'<unk>'``).
        target_vocab: target token -> id mapping (needs ``'<sos>'``).
        idx2word: target id -> token mapping.
        beam_width: number of hypotheses kept per step.
        max_len: maximum number of generated tokens; capped at the number of
            positions the model's positional parameter provides.
        alpha: exponent of the length penalty ``((5 + len) / 6) ** alpha``.

    Returns:
        The generated words joined by spaces (without ``'<eos>'``); an empty
        string when ``sentence`` contains no tokens.
    """
    model.eval()
    # Decode on whatever device the model lives on.
    device = model.pos_enc.device

    src_tokens = sentence_to_indices(sentence, input_vocab)[:MAX_LEN]
    if not src_tokens:
        # An all-padding source would mask every encoder position.
        return ''
    pad_idx = input_vocab['<pad>']
    padded = src_tokens + [pad_idx] * (MAX_LEN - len(src_tokens))
    src_tensor = torch.tensor(padded, device=device).unsqueeze(0)
    src_mask = src_tensor == pad_idx

    def normalised(score, length):
        # Same length penalty as before: ((5 + length) / 6) ** alpha.
        return score / ((5 + length) ** alpha / 6 ** alpha)

    # The decoder input never may grow beyond the available positions.
    max_steps = min(max_len, model.pos_enc.size(1))
    beams = [(torch.tensor([target_vocab['<sos>']], device=device), [], 0.0)]
    finished = []  # (words, cumulative log-prob including '<eos>')

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        memory = model.src_embed(src_tensor) + model.pos_enc[:, :src_tensor.size(1)]
        memory = model.tr.encoder(memory, src_key_padding_mask=src_mask)

        for _ in range(max_steps):
            candidates = []
            for tokens, words, score in beams:
                tgt_mask = model.generate_square_subsequent_mask(tokens.size(0)).to(device)
                tgt_emb = model.tgt_embed(tokens.unsqueeze(0)) + model.pos_enc[:, :tokens.size(0)]
                decoder_out = model.tr.decoder(tgt_emb, memory, tgt_mask=tgt_mask)
                log_probs = torch.log_softmax(model.fc(decoder_out[:, -1]), dim=-1)[0]
                # topk fails when k exceeds the vocabulary size.
                k = min(beam_width, log_probs.size(0))
                top = torch.topk(log_probs, k)
                for value, idx in zip(top.values.tolist(), top.indices.tolist()):
                    word = idx2word.get(idx, '<unk>')
                    new_score = score + value
                    if word == '<eos>':
                        finished.append((words, new_score))
                        continue
                    new_tokens = torch.cat([tokens, torch.tensor([idx], device=device)])
                    candidates.append((new_tokens, words + [word], new_score))

            if not candidates:
                break
            beams = sorted(candidates, key=lambda b: normalised(b[2], len(b[1])), reverse=True)[:beam_width]
            if len(finished) >= beam_width:
                break

    if finished:
        # +1 accounts for the '<eos>' token that the score already includes.
        best_words, _ = max(finished, key=lambda h: normalised(h[1], len(h[0]) + 1))
        return ' '.join(best_words)
    return ' '.join(beams[0][1])

In [None]:
# 🧪 Try out paraphrasing
# Read sentences until the user types 'exit', the input stream ends, or the
# cell is interrupted.
while True:
    try:
        sentence = input("\nEnter sentence to paraphrase (or type 'exit'): ")
    except (EOFError, KeyboardInterrupt):
        # End the session quietly instead of surfacing a traceback.
        break
    if sentence.lower().strip() == 'exit':
        break
    if not sentence.strip():
        # Nothing to paraphrase; ask again.
        continue
    print("Paraphrase:", beam_decode(model, sentence, input_vocab, target_vocab, idx2word))

In [None]:
!wget -q https://raw.githubusercontent.com/vaibhavkl/Quora-Question-Pairs-Dataset/master/quora_duplicate_questions.tsv
import pandas as pd

# Keep only the two question columns and the label, dropping incomplete rows.
df = (
    pd.read_csv("quora_duplicate_questions.tsv", sep='\t')
    .loc[:, ['question1', 'question2', 'is_duplicate']]
    .dropna()
)
# (question1, question2) tuples for the rows labelled as duplicates.
pairs = list(
    df.loc[df['is_duplicate'] == 1, ['question1', 'question2']].itertuples(index=False, name=None)
)
print("Total duplicate pairs:", len(pairs))


In [None]:
from collections import Counter

def build_vocab(sentences, min_freq=2):
    """Create a token -> id mapping from ``sentences``.

    Ids 0-3 are reserved for ``'<pad>'``, ``'<sos>'``, ``'<eos>'`` and
    ``'<unk>'``. Every token occurring at least ``min_freq`` times gets the
    next free id, in order of first appearance.
    """
    frequencies = Counter(token for sentence in sentences for token in tokenize(sentence))
    vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
    frequent_tokens = [token for token, count in frequencies.items() if count >= min_freq]
    for token in frequent_tokens:
        vocab[token] = len(vocab)
    return vocab

# Separate the pairs into source-side and target-side sentences.
all_q1 = [pair[0] for pair in pairs]
all_q2 = [pair[1] for pair in pairs]
# One vocabulary per side, stored together as a single pickled tuple.
input_vocab, target_vocab = build_vocab(all_q1), build_vocab(all_q2)
with open("vocab.pkl", "wb") as vocab_file:
    pickle.dump((input_vocab, target_vocab), vocab_file)


In [None]:
from torch.utils.data import Dataset, DataLoader

class ParaphraseDataset(Dataset):
    """Dataset of (source, target) sentence pairs encoded as fixed-length id tensors.

    Args:
        pairs: sequence of ``(source_sentence, target_sentence)`` tuples.
        input_vocab: source token -> id mapping (needs ``'<pad>'``).
        target_vocab: target token -> id mapping (needs ``'<pad>'``,
            ``'<sos>'`` and ``'<eos>'``).
        max_len: length every sequence is padded/truncated to; defaults to
            the module-level ``MAX_LEN``.
    """

    def __init__(self, pairs, input_vocab, target_vocab, max_len=None):
        self.pairs = pairs
        self.input_vocab = input_vocab
        self.target_vocab = target_vocab
        self.max_len = MAX_LEN if max_len is None else max_len

    def __len__(self):
        return len(self.pairs)

    @staticmethod
    def _fit(ids, length, pad_idx):
        """Truncate ``ids`` to ``length`` items, or right-pad it with ``pad_idx``."""
        ids = ids[:length]
        return ids + [pad_idx] * (length - len(ids))

    def _encode_target(self, ids):
        """Wrap ``ids`` in ``<sos>``/``<eos>`` and fit the result to ``max_len``.

        The sentence body is shortened *before* the markers are added so that
        long targets still end in ``<eos>``; truncating afterwards would cut
        the end marker off.
        """
        body = ids[:max(self.max_len - 2, 0)]
        wrapped = [self.target_vocab['<sos>']] + body + [self.target_vocab['<eos>']]
        return self._fit(wrapped, self.max_len, self.target_vocab['<pad>'])

    def __getitem__(self, idx):
        """Return ``(source_ids, target_ids)`` tensors, each of length ``max_len``."""
        q1, q2 = self.pairs[idx]
        x = self._fit(sentence_to_indices(q1, self.input_vocab), self.max_len, self.input_vocab['<pad>'])
        y = self._encode_target(sentence_to_indices(q2, self.target_vocab))
        return torch.tensor(x), torch.tensor(y)

# Train on the first 50,000 duplicate pairs, in shuffled mini-batches of 64.
train_data = ParaphraseDataset(
    pairs=pairs[:50000],
    input_vocab=input_vocab,
    target_vocab=target_vocab,
)
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)


In [None]:
# GloVe-initialised embedding matrices for both vocabularies.
input_emb = load_glove_embeddings(input_vocab)
target_emb = load_glove_embeddings(target_vocab)

model = TransformerParaphraser(input_vocab, target_vocab, input_emb, target_emb)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Padding positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=target_vocab['<pad>'])

for epoch in range(5):
    model.train()
    total_loss = 0
    for x, y in train_loader:
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        # Teacher forcing: the decoder sees tokens [0, n-1) and has to
        # predict tokens [1, n).
        tgt_inp, tgt_out = y[:, :-1], y[:, 1:]

        optimizer.zero_grad()
        # Collapse batch and time so every position is one classification.
        pred = model(x, tgt_inp).flatten(0, 1)
        tgt_out = tgt_out.flatten()

        loss = criterion(pred, tgt_out)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


In [None]:
# Persist only the learned parameters (state_dict), not the whole module.
torch.save(obj=model.state_dict(), f="paraphrase_model.pt")
print("✅ Saved vocab.pkl and paraphrase_model.pt!")
