In [3]:
!pip -q install -U "transformers>=4.44" "datasets>=2.20" "peft>=0.12" accelerate evaluate rouge-score scikit-learn sentencepiece "pyarrow<20.0.0a0"

/bin/bash: line 1: /home/ruksad/Documents/learn/AIandContent/GL-selflearning-sem2/flan-t5-finetuning/projects/bin/pip: cannot execute: required file not found


## Load model and tokenizer
We'll use the small FLAN-T5 model to keep things light.
- Tokenizer converts text ↔ tokens
- Model generates outputs given the tokens

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint to load for both the tokenizer and the model.
model_name = "google/flan-t5-small"

# Prefer the GPU when one is visible, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the weights and move them to the chosen device in one step.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Display which device was picked.
device

  from .autonotebook import tqdm as notebook_tqdm


'cuda'

In [5]:
# Dump the full model configuration.
print(model.config)

T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "dtype": "float32",
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
     

In [6]:
# Key architecture sizes read from the config, one per output line.
# The last entry multiplies the head count by the per-head width to get
# the full width of the Q/K/V projections.
print(
    f"Hidden size (d_model): {model.config.d_model}",
    f"Encoder layers: {model.config.num_layers}",
    f"Decoder layers: {model.config.num_decoder_layers}",
    f"Number of attention heads: {model.config.num_heads}",
    f"Key-value dimension per head: {model.config.d_kv}",
    f"Total Q/K/V dimension: {model.config.num_heads * model.config.d_kv}",
    sep="\n",
)

Hidden size (d_model): 512
Encoder layers: 8
Decoder layers: 8
Number of attention heads: 6
Key-value dimension per head: 64
Total Q/K/V dimension: 384


In [7]:
# List the self-attention query-projection (q) weights and their shapes.
# Match the exact ".SelfAttention.q." path segment: a bare `'q' in name` test
# would also fire on any parameter whose name merely contains the letter "q".
for name, param in model.named_parameters():
    if '.SelfAttention.q.' in name:
        print(f"{name}: {param.shape}")
        

encoder.block.0.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
encoder.block.1.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
encoder.block.2.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
encoder.block.3.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
encoder.block.4.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
encoder.block.5.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
encoder.block.6.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
encoder.block.7.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
decoder.block.0.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
decoder.block.1.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
decoder.block.2.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
decoder.block.3.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
decoder.block.4.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
decoder.block.5.layer.0.SelfAttention.q.weight: torch.Size([384, 512])
decode

In [8]:
# Count every weight in the model.
total_params = sum(weight.numel() for weight in model.parameters())
print(f"Total parameters: {total_params:,}")  # 76,961,152 for flan-t5-small

# Count only the weights that receive gradients. Nothing has been frozen at
# this point, so this figure equals the total above.
trainable = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)

print(f"trainable parameters: {trainable:,}")

Total parameters: 76,961,152
trainable parameters: 76,961,152


In [None]:
# Check a specific attention layer: the self-attention module of the first
# encoder block.
encoder_attn = model.encoder.block[0].layer[0].SelfAttention

# q/k/v weights print as (384, 512); the output projection o is the transpose,
# (512, 384). 384 = num_heads * d_kv and 512 = d_model, per the config above.
print("Query weight shape:", encoder_attn.q.weight.shape)  # (384, 512)
print("Key weight shape:", encoder_attn.k.weight.shape)    # (384, 512)
print("Value weight shape:", encoder_attn.v.weight.shape)  # (384, 512)
print("Output weight shape:", encoder_attn.o.weight.shape) # (512, 384)


Query weight shape: torch.Size([384, 512])
Key weight shape: torch.Size([384, 512])
Value weight shape: torch.Size([384, 512])
Output weight shape: torch.Size([512, 384])


## Zero-shot classification (SST-2 style)
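FLAN-T5 follows natural-language instructions. Here we prompt it SST-2 style with `sst2: <text>` and hope to get `positive` or `negative` back; as the output below shows, the off-the-shelf small checkpoint does not reliably return those labels for this prompt.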
We'll write a tiny helper to classify one or more texts.

In [10]:
def classify(texts, max_new_tokens=4, prompt_template="sst2: {text}"):
    """Predict a one-word label for each input text with the seq2seq model.

    Args:
        texts: A single string, or an iterable of strings.
        max_new_tokens: Upper bound on the number of tokens generated per input.
        prompt_template: ``str.format`` template containing a ``{text}``
            placeholder, used to build each prompt. The default reproduces the
            original ``sst2: <text>`` prompt.

    Returns:
        A list with one string per input: the first whitespace-delimited word
        of the decoded generation, lower-cased, or ``""`` when the decoded text
        is empty. Empty input yields an empty list.
    """
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # Nothing to classify; skip tokenization and generation entirely.
        return []
    prompts = [prompt_template.format(text=t) for t in texts]
    # Pad so prompts of different lengths can share one batch tensor.
    enc = tokenizer(prompts, return_tensors='pt', padding=True).to(device)
    with torch.no_grad():
        out = model.generate(**enc, max_new_tokens=max_new_tokens)
    decoded = tokenizer.batch_decode(out, skip_special_tokens=True)
    labels = []
    for d in decoded:
        # split() with no argument handles any whitespace and empty strings.
        words = d.split()
        labels.append(words[0].lower() if words else "")
    return labels

# Sample reviews for a quick sanity check of classify().
examples = [
    "I absolutely loved this movie. It was fantastic!",
    "The plot was predictable and the acting was bad.",
    "Not great, not terrible."
]
# One batched call; classify() returns one label per input.
preds = classify(examples)
# Show each input next to the label predicted for it.
for t, p in zip(examples, preds):
    print(f"Text: {t}\nPrediction: {p}")

Text: I absolutely loved this movie. It was fantastic!
Prediction: i
Text: The plot was predictable and the acting was bad.
Prediction: sst
Text: Not great, not terrible.
Prediction: sst


## Zero-shot classification (SST-2 style)
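FLAN-T5 follows natural-language instructions. Here we prompt it SST-2 style with `sst2: <text>` and hope to get `positive` or `negative` back; as the output below shows, the off-the-shelf small checkpoint does not reliably return those labels for this prompt.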
We'll write a tiny helper to classify one or more texts.

In [13]:
def classify(texts, max_new_tokens=4, prompt_template="sst2: {text}"):
    """Predict a one-word label for each input text with the seq2seq model.

    Args:
        texts: A single string, or an iterable of strings.
        max_new_tokens: Upper bound on the number of tokens generated per input.
        prompt_template: ``str.format`` template containing a ``{text}``
            placeholder, used to build each prompt. The default reproduces the
            original ``sst2: <text>`` prompt.

    Returns:
        A list with one string per input: the first whitespace-delimited word
        of the decoded generation, lower-cased, or ``""`` when the decoded text
        is empty. Empty input yields an empty list.
    """
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # Nothing to classify; skip tokenization and generation entirely.
        return []
    prompts = [prompt_template.format(text=t) for t in texts]
    # Pad so prompts of different lengths can share one batch tensor.
    enc = tokenizer(prompts, return_tensors='pt', padding=True).to(device)
    with torch.no_grad():
        out = model.generate(**enc, max_new_tokens=max_new_tokens)
    decoded = tokenizer.batch_decode(out, skip_special_tokens=True)
    labels = []
    for d in decoded:
        # split() with no argument handles any whitespace and empty strings.
        words = d.split()
        labels.append(words[0].lower() if words else "")
    return labels

# Sample reviews for a quick sanity check of classify().
examples = [
    "I absolutely loved this movie. It was fantastic!",
    "The plot was predictable and the acting was bad.",
    "Not great, not terrible."
]

# One batched call; classify() returns one label per input.
preds = classify(examples)
# Show each input next to the label predicted for it.
for t, p in zip(examples, preds):
    print(f"Text: {t}\nPrediction: {p}")

Text: I absolutely loved this movie. It was fantastic!
Prediction: i
Text: The plot was predictable and the acting was bad.
Prediction: sst
Text: Not great, not terrible.
Prediction: sst


## Zero-shot summarization
For summarization, prefix the input with `summarize:` and provide the content (e.g., a short dialogue).

In [15]:
def summarize(text, max_new_tokens=80):
    """Generate a summary of ``text`` using a ``summarize:`` prompt.

    Args:
        text: The content to summarize.
        max_new_tokens: Upper bound on the number of tokens generated.

    Returns:
        The decoded generation as a string, with special tokens removed.
    """
    model_inputs = tokenizer(f'summarize: {text}', return_tensors='pt').to(device)
    # Generation only; no gradients are needed.
    with torch.no_grad():
        generated = model.generate(**model_inputs, max_new_tokens=max_new_tokens)
    # A single prompt was passed in, so only the first sequence is decoded.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# A short three-turn dialogue; the adjacent string literals are concatenated
# into one newline-separated transcript.
dialogue = (
    "John: Let's meet at 5 pm.\n"
    "Jane: Can we do 6 pm instead?\n"
    "John: Sure. See you then."
)
# Print the zero-shot summary of the dialogue.
print(summarize(dialogue))

John and Jane will meet at 5 pm.
