# Mini Project

In [None]:
# =====================================================
# STEP 1 ‚Äî Install dependencies
# =====================================================
!pip install -q --upgrade transformers==4.44.2 datasets sentencepiece accelerate

In [None]:
# =====================================================
# STEP 2 — Imports and setup
# =====================================================
import logging
import os

# Disable telemetry and W&B.
# These variables are set BEFORE the Hugging Face libraries are imported:
# huggingface_hub reads HF_HUB_DISABLE_TELEMETRY when it is first imported,
# so assigning it after `import datasets` / `import transformers` has no effect.
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)

# Silence every log record at WARNING level and below, process-wide.
# NOTE(review): this also hides genuinely useful library warnings; remove it when debugging.
logging.disable(logging.WARNING)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# =====================================================
# STEP 3 — Create simple dataset
# =====================================================
# Single source of truth for the corpus size, so the abstract and title lists
# can never get out of step with each other.
NUM_EXAMPLES = 1000

# Synthetic (abstract, title) pairs: example i differs from the others only by its index.
abstracts = [
    f"This paper introduces a new technique to enhance transformer performance for NLP task {i}."
    for i in range(NUM_EXAMPLES)
]
titles = [f"Transformer Enhancement for NLP Task {i}" for i in range(NUM_EXAMPLES)]
assert len(abstracts) == len(titles), "every abstract needs exactly one title"

df = pd.DataFrame({"abstract": abstracts, "title": titles})

# Hold out 10% of the rows for evaluation; the fixed random_state keeps the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Wrap both splits as Hugging Face datasets for tokenization and training.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
# =====================================================
# STEP 4 — Load model and tokenizer
# =====================================================
# Checkpoint identifier shared by the tokenizer and the model.
model_name = "t5-small"

# Tokenizer first, so that text can be encoded independently of the model.
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Load the pretrained weights, then place the model on the selected device.
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

In [None]:
# =====================================================
# STEP 5 — Tokenization (FIXED)
# =====================================================
def preprocess_function(examples):
    """Tokenize a batch of abstracts (inputs) and titles (labels) for T5.

    Args:
        examples: A batch mapping with an "abstract" list and a "title" list
            of equal length (the shape `Dataset.map(..., batched=True)` passes).

    Returns:
        The tokenizer output for the prefixed abstracts, padded/truncated to
        256 tokens, with an added "labels" entry: the title token ids
        padded/truncated to 64 tokens, where every padding position is -100.
    """
    # T5 is prompted with a task prefix; keep it identical at training and inference time.
    inputs = ["summarize: " + text for text in examples["abstract"]]
    model_inputs = tokenizer(
        inputs,
        max_length=256,
        truncation=True,
        padding="max_length"  # fixed length so every example has the same shape
    )

    labels = tokenizer(
        text_target=examples["title"],
        max_length=64,
        truncation=True,
        padding="max_length"  # fixed length so label tensors can be stacked
    )

    # Padding positions must not contribute to the loss. -100 is the index the
    # cross-entropy loss ignores, so swap every pad token id in the labels for it;
    # otherwise the model is mostly trained to predict padding.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Tokenize both splits in batches: the training split first, then the test split.
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# =====================================================
# STEP 6 — Training arguments
# =====================================================
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",          # checkpoints are written here
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    predict_with_generate=True,
    num_train_epochs=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,                # report the training loss every 50 optimizer steps
    save_total_limit=1,              # keep only the most recent checkpoint on disk
    # Schedule an evaluation pass on eval_dataset at the end of every epoch.
    # The default strategy is "no", in which case the held-out split passed to
    # the trainer is never used during training, even with do_eval=True.
    eval_strategy="epoch",
    do_eval=True,
    report_to=[],  # disables WandB/TensorBoard
)

In [None]:
# =====================================================
# STEP 7 — Trainer
# =====================================================
# Collect the trainer's inputs in one place, then build it in a single call.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_test,
    "tokenizer": tokenizer,
}
trainer = Seq2SeqTrainer(**trainer_kwargs)

  trainer = Seq2SeqTrainer(


In [None]:

# =====================================================
# STEP 8 — Train model
# =====================================================
# Run fine-tuning. The returned training summary is the cell's last
# expression, so the notebook displays it below the loss table.
train_output = trainer.train()
train_output

Step,Training Loss
50,4.8703
100,0.2917
150,0.0518
200,0.021
250,0.0176
300,0.0098
350,0.0069
400,0.0077
450,0.0115
500,0.0058


TrainOutput(global_step=900, training_loss=0.29678790864017274, metrics={'train_runtime': 2807.571, 'train_samples_per_second': 0.641, 'train_steps_per_second': 0.321, 'total_flos': 121807621324800.0, 'train_loss': 0.29678790864017274, 'epoch': 2.0})

In [None]:
# =====================================================
# STEP 9 — Generate sample titles
# =====================================================
samples = [
    "This research presents a transformer-based model for multilingual sentiment analysis.",
    "A deep learning framework is introduced for early detection of lung cancer using CT scans.",
    "We explore optimization techniques for improving neural network efficiency."
]

# Switch to inference mode before generating: after fine-tuning the module can
# still be in training mode, in which case dropout stays active and makes the
# generated titles noisier and non-deterministic.
model.eval()

print("\n================ Generated Titles ================\n")
for text in samples:
    # Same "summarize: " prefix as used when building the training inputs.
    inputs = tokenizer("summarize: " + text, return_tensors="pt", truncation=True, padding=True).to(device)
    # Beam search with 4 beams, capped at 20 generated tokens.
    outputs = model.generate(**inputs, max_length=20, num_beams=4, early_stopping=True)
    print(f"Abstract:\n{text}\n")
    print(f"Generated Title: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")
    print("--------------------------------------------------")



Abstract:
This research presents a transformer-based model for multilingual sentiment analysis.

Generated Title: Transformer-based model for multilingual sentiment analysis.
--------------------------------------------------
Abstract:
A deep learning framework is introduced for early detection of lung cancer using CT scans.

Generated Title: A deep learning framework is introduced for early detection of lung cancer using CT scans.
--------------------------------------------------
Abstract:
We explore optimization techniques for improving neural network efficiency.

Generated Title: Optimizement for improving neural network efficiency.
--------------------------------------------------


In [None]:
# @title
# =====================================================
# STEP 10 — Test the trained model interactively
# =====================================================

# Make sure dropout is disabled for inference, independent of what ran before this cell.
model.eval()

print("\nModel ready! You can now input any abstract and get a predicted title.\n")

while True:
    try:
        user_input = input("Enter an abstract (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the kernel was interrupted: leave the loop quietly
        # instead of ending the cell with a traceback.
        break

    if user_input.lower() == "exit":
        break
    if not user_input:
        # Nothing to summarize; prompt again rather than generating from the bare prefix.
        continue

    # Same "summarize: " prefix as used when building the training inputs.
    inputs = tokenizer("summarize: " + user_input,
                       return_tensors="pt",
                       truncation=True,
                       padding=True).to(device)

    # Beam search with 4 beams, capped at 20 generated tokens.
    outputs = model.generate(**inputs, max_length=20, num_beams=4, early_stopping=True)
    generated_title = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"\n📝 Predicted Title: {generated_title}\n")
