In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq

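# NOTE: the cells below (preprocessing, data collator, encoder freezing, trainer) use
# `model` and `tokenizer`, but no live code above them defines those names. On a fresh
# kernel, uncomment the three lines below (or load an equivalent checkpoint) first.
# model_name = "fine_tuned_t5_2"
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)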

In [2]:
from datasets import Dataset

In [3]:
import pandas as pd

In [4]:
# Read the article/summary pairs from the CSV in the working directory.
data = pd.read_csv(filepath_or_buffer="cleaned_data.csv")

In [5]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,News,Summary
0,Ad sales boost Time Warner profit Quarterly pr...,TimeWarner said fourth quarter sales rose 2% t...
1,Dollar gains on Greenspan speech The dollar ha...,The dollar has hit its highest level against t...
2,Yukos unit buyer faces loan claim The owners o...,Yukos' owner Menatep Group says it will ask Ro...
3,High fuel prices hit BA's profits British Airw...,"Rod Eddington, BA's chief executive, said the ..."
4,Pernod takeover talk lifts Domecq Shares in UK...,Pernod has reduced the debt it took on to fund...


In [6]:
# Keep only the source text and its reference summary; every other column is dropped.
data = data.loc[:, ['News', 'Summary']]

In [7]:
# Report (rows, columns) of the frame after the column selection.
data.shape

(1714, 2)

In [8]:
# Wrap the pandas frame in a `datasets.Dataset` so it can be mapped and split below.
dataset = Dataset.from_pandas(df=data)

In [9]:
def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: Batch mapping with a "News" entry (iterable of article
            strings) and a "Summary" entry (the matching summaries).

    Returns:
        The tokenizer output for the prefixed articles, with the token ids of
        the summaries stored under the "labels" key.
    """
    task_prefix = "summarize: "

    # Prepend the task prefix to every article in the batch.
    prompts = []
    for article in examples["News"]:
        prompts.append(task_prefix + article)

    # Articles are truncated to 2400 tokens, summaries to 1100 tokens.
    encoded = tokenizer(prompts, max_length=2400, truncation=True)
    target_ids = tokenizer(
        text_target=examples["Summary"], max_length=1100, truncation=True
    )["input_ids"]

    encoded["labels"] = target_ids
    return encoded

# Run the preprocessing over the dataset in batches (each call receives a dict of column lists).
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/1714 [00:00<?, ? examples/s]

In [10]:
# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [11]:
# Hold out 10% of the examples for validation. A fixed seed keeps the shuffle, and
# therefore the train/validation membership, identical across kernel restarts.
split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split['train']
val_dataset = split['test']

In [12]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [13]:
# Turn off gradient tracking for every parameter found under the encoder.
for param in model.base_model.encoder.parameters():
    param.requires_grad_(False)

In [14]:
training_args = Seq2SeqTrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="fine_tuned_model_v2",
    logging_dir="./logs",
    # Checkpoint cadence and retention.
    save_steps=500,
    save_total_limit=2,
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=5e-6,
    warmup_steps=200,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Evaluate once per epoch.
    eval_strategy="epoch",
)

In [15]:
trainer = Seq2SeqTrainer(
    # What is trained and how.
    model=model,
    args=training_args,
    # Data: the 90/10 split produced earlier.
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Batching and tokenization helpers.
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [16]:
# Run fine-tuning with the trainer configured above.
trainer.train()

# Save the model and the tokenizer into the same directory so that both can be
# reloaded from "./fine_tuned_t5_final" later.
model.save_pretrained("./fine_tuned_t5_final")
tokenizer.save_pretrained("./fine_tuned_t5_final")

  0%|          | 0/2313 [00:00<?, ?it/s]

{'loss': 1.3824, 'grad_norm': 2.4882595539093018, 'learning_rate': 4.290108849976337e-06, 'epoch': 0.65}


  0%|          | 0/86 [00:00<?, ?it/s]

{'eval_loss': 0.6548317074775696, 'eval_runtime': 92.5563, 'eval_samples_per_second': 1.858, 'eval_steps_per_second': 0.929, 'epoch': 1.0}
{'loss': 0.985, 'grad_norm': 2.1617467403411865, 'learning_rate': 3.106956933270232e-06, 'epoch': 1.3}
{'loss': 0.9032, 'grad_norm': 1.950774073600769, 'learning_rate': 1.923805016564127e-06, 'epoch': 1.95}


  0%|          | 0/86 [00:00<?, ?it/s]

{'eval_loss': 0.5445525646209717, 'eval_runtime': 85.5524, 'eval_samples_per_second': 2.01, 'eval_steps_per_second': 1.005, 'epoch': 2.0}
{'loss': 0.849, 'grad_norm': 2.110037088394165, 'learning_rate': 7.406530998580218e-07, 'epoch': 2.59}


  0%|          | 0/86 [00:00<?, ?it/s]

{'eval_loss': 0.5193162560462952, 'eval_runtime': 85.3749, 'eval_samples_per_second': 2.015, 'eval_steps_per_second': 1.007, 'epoch': 3.0}
{'train_runtime': 5991.1663, 'train_samples_per_second': 0.772, 'train_steps_per_second': 0.386, 'train_loss': 1.001284205547618, 'epoch': 3.0}


('./fine_tuned_t5_final\\tokenizer_config.json',
 './fine_tuned_t5_final\\special_tokens_map.json',
 './fine_tuned_t5_final\\spiece.model',
 './fine_tuned_t5_final\\added_tokens.json',
 './fine_tuned_t5_final\\tokenizer.json')

In [1]:
from evaluate import load

rouge = load("rouge")

# NOTE: this cell needs `trainer`, `tokenizer` and `val_dataset` from the training
# cells to exist in the current kernel; on a fresh kernel it raises NameError.

# Seq2SeqTrainer.predict() only returns generated token ids when
# predict_with_generate is enabled; otherwise `.predictions` holds logits, which
# cannot be passed to tokenizer.decode().
trainer.args.predict_with_generate = True
prediction_output = trainer.predict(val_dataset)
# NOTE(review): generation length falls back to the model's generation config here;
# pass e.g. max_length=... to predict() if summaries come out truncated.

# Negative ids (the -100 padding value) are not valid vocabulary entries, so drop
# them before decoding.
predicted_summaries = [
    tokenizer.decode([int(tok) for tok in seq if tok >= 0], skip_special_tokens=True)
    for seq in prediction_output.predictions
]
reference_summaries = [ex["Summary"] for ex in val_dataset]

results = rouge.compute(
    predictions=predicted_summaries,
    references=reference_summaries,
)
print(results)

NameError: name 'trainer' is not defined

In [2]:
import nltk
from rouge_score import rouge_scorer
import absl

In [5]:
# Reload the fine-tuned tokenizer and model from the directory written after training.
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_t5_final")
model = AutoModelForSeq2SeqLM.from_pretrained("./fine_tuned_t5_final")

In [6]:
def summarize_text(input_text, max_length=1500, min_length=200, num_beams=4, max_input_length=2400):
    """Generate a summary of ``input_text`` with the module-level model and tokenizer.

    Args:
        input_text: Article text to summarize (a string).
        max_length: Upper bound on the generated sequence length, in tokens.
        min_length: Lower bound on the generated sequence length, in tokens.
        num_beams: Number of beams used for beam search.
        max_input_length: Token limit applied when truncating the input. The
            default of 2400 matches the limit used when tokenizing the training
            articles, so inference sees inputs of the same maximum length.

    Returns:
        The decoded summary string with special tokens removed.
    """
    encoded = tokenizer(
        "summarize: " + input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
    )

    # Put the inputs on the model's device so generation also works after the
    # model has been moved off the CPU.
    device = model.device
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Beam-search generation; the attention mask is passed explicitly rather
    # than left for generate() to infer.
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=num_beams,
        early_stopping=True,
    )

    # Only the first (and only) returned sequence is decoded.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [7]:
# Sample article used to exercise summarize_text below.
# NOTE(review): the two repeated "(ICC - X)" lines look like a duplicated photo caption — confirm they are intended.
input_text = """ Right-handed batter Gongadi Trisha scripted history as she became the first centurion in the history of Women's U19 T20 World Cup. On Tuesday, the youngster smashed a ton off just 53 balls in India's Super Six clash against Scotland at the Bayuemas Oval in Kuala Lumpur.

Right-handed batter Gongadi Trisha scripted history as she became the first centurion in the history of Women's U19 T20 World Cup(ICC - X)
Right-handed batter Gongadi Trisha scripted history as she became the first centurion in the history of Women's U19 T20 World Cup(ICC - X)
She eventually remained unbeaten on 110 off just 59 balls. Her innings was studded with 13 fours and 4 sixes. She and Sanika Chalke powered India to a commanding total of 208/1 in the allotted twenty overs.
Earlier, Scotland had won the toss and opted to field. India openers Kamalini G and Trisha Gongadi started the innings with a bang, hitting boundaries constantly. The duo put on 67 runs in the powerplay.

The opening batters put on a stand of 147 runs for the first wicket. Kamalini eventually lost her wicket in the 14th over of the innings as she walked back to the hut after scoring 51 off 42 balls.

The opening stand of 147 runs is the highest partnership for any wicket in the ongoing Women's U19 T20 World Cup.

Sanika Chalke also remained unbeaten on 29 as India posted more than 200 runs on the board.

India register a commanding 150-run win
Aayushi Shukla, Vaishnavi Sharma and Gongadi Trisha took all the ten wickets between themselves to bundle out Scotland for 58 inside 14 overs.
Aayushi took four wickets, while Vaishnavi and Trisha took three wickets each. For Scotland, opening batters Pippy Kelly and Emma Walsingham were the only ones who seemed cut out to handle the pressure situation.

The rest of the batters collapsed in quick succession, and India ultimately registered a comprehensive win. Gongadi Trisha was adjudged as Player of the Match for her perfect all-round effort.

India have already qualified for the semi-finals and the defending champions will play their semi-final on Friday, January 31.

The final of the tournament is scheduled for Sunday, February 2.
"""
# Summarize the sample article with the default generation settings.
summary = summarize_text(input_text)

# Print the source text and then the model output, each under its own heading.
print("Input Text:", input_text, sep="\n")
print("\nGenerated Summary:", summary, sep="\n")

Input Text:
 Right-handed batter Gongadi Trisha scripted history as she became the first centurion in the history of Women's U19 T20 World Cup. On Tuesday, the youngster smashed a ton off just 53 balls in India's Super Six clash against Scotland at the Bayuemas Oval in Kuala Lumpur.

Right-handed batter Gongadi Trisha scripted history as she became the first centurion in the history of Women's U19 T20 World Cup(ICC - X)
Right-handed batter Gongadi Trisha scripted history as she became the first centurion in the history of Women's U19 T20 World Cup(ICC - X)
She eventually remained unbeaten on 110 off just 59 balls. Her innings was studded with 13 fours and 4 sixes. She and Sanika Chalke powered India to a commanding total of 208/1 in the allotted twenty overs.
Earlier, Scotland had won the toss and opted to field. India openers Kamalini G and Trisha Gongadi started the innings with a bang, hitting boundaries constantly. The duo put on 67 runs in the powerplay.

The opening batters put o