In [None]:
# Source: https://huggingface.co/learn/nlp-course/en/chapter7/5?fw=pt

In [1]:
!pip install transformers==4.30
!pip install accelerate -U
!pip install rouge_score
!pip install datasets
!pip install evaluate

Collecting transformers==4.30
  Downloading transformers-4.30.0-py3-none-any.whl.metadata (113 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.6/113.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.30.0-py3-none-any.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m62.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m82.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
    Uninstalling to

In [10]:
import pandas as pd
import torch
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset
import numpy as np
import nltk
import evaluate
# Load the ROUGE metric through the `evaluate` library; compute_metrics calls it.
rouge_score = evaluate.load("rouge")
# Download NLTK's "punkt" data package (sent_tokenize is imported from nltk below).
nltk.download("punkt")
from transformers import DataCollatorForSeq2Seq
from nltk.tokenize import sent_tokenize
import csv

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

In [4]:
def _load_split(csv_path):
    """Read one CSV split, keep the Article/Summary columns and drop rows with missing values."""
    frame = pd.read_csv(csv_path)
    pairs = frame[["Article", "Summary"]].dropna()
    return Dataset.from_pandas(pairs)


# The three splits live side by side in the same input directory.
train_data = _load_split("/kaggle/input/dataset-base/train.csv")
val_data = _load_split("/kaggle/input/dataset-base/val.csv")
test_data = _load_split("/kaggle/input/dataset-base/test.csv")

In [5]:
# Identifier of the pretrained checkpoint; tokenizer and model are loaded from the same place.
# NOTE(review): the IndicBART model card loads its tokenizer with
# do_lower_case=False, use_fast=False, keep_accents=True — confirm the defaults used here are intended.
BASE_CHECKPOINT = "ai4bharat/IndicBART"

tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/832 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.90M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/221 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/976M [00:00<?, ?B/s]

In [13]:
# Directory the fine-tuned tokenizer and model are loaded from.
FINETUNED_DIR = "/kaggle/input/models-textrank"

# Rebinds `tokenizer` and `model`, replacing whatever these names held before this cell ran.
tokenizer = AutoTokenizer.from_pretrained(FINETUNED_DIR)
model = AutoModelForSeq2SeqLM.from_pretrained(FINETUNED_DIR)
model = model.to(device)

In [6]:
def compute_metrics(eval_pred):
    """Score generated summaries against reference summaries with ROUGE.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated token ids.

    Returns:
        dict: ROUGE metric name -> score multiplied by 100 and rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some models hand back extra outputs next to the generated ids; keep the ids only.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be decoded.
    # It may appear in the predictions as well as the labels, so replace it in both.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode generated and reference summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # NOTE(review): the data appears to be Hindi, while `use_stemmer=True` and the
    # metric's default tokenization are English-oriented — confirm these scores are
    # meaningful for this data before reporting them.
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Report the scores as percentages, rounded to four decimals.
    return {key: round(value * 100, 4) for key, value in result.items()}

In [7]:
def tokenize_helper(dat):
    """Tokenize a batch of articles and attach the tokenized summaries as labels.

    Args:
        dat: Mapping with "Article" and "Summary" entries.

    Returns:
        The article encoding with an extra "labels" entry holding the
        summary token ids.
    """
    length_cap = 1024  # articles and summaries are truncated to the same limit
    encoded_articles = tokenizer(dat["Article"], max_length=length_cap, truncation=True)
    encoded_summaries = tokenizer(dat["Summary"], max_length=length_cap, truncation=True)
    encoded_articles["labels"] = encoded_summaries["input_ids"]
    return encoded_articles

# Tokenize the training and validation splits, passing examples to the helper in batches.
tokenized_train = train_data.map(function=tokenize_helper, batched=True)
tokenized_val = val_data.map(function=tokenize_helper, batched=True)

Map:   0%|          | 0/19102 [00:00<?, ? examples/s]

Map:   0%|          | 0/1061 [00:00<?, ? examples/s]

In [8]:
# Batch collator for seq2seq training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Drop the raw dataset columns so only the tokenizer outputs remain.
# Only columns that are still present are removed, so re-running this cell
# does not fail on columns that an earlier run already stripped.
tokenized_train = tokenized_train.remove_columns(
    [name for name in train_data.column_names if name in tokenized_train.column_names]
)
tokenized_val = tokenized_val.remove_columns(
    [name for name in val_data.column_names if name in tokenized_val.column_names]
)

In [9]:
# Training hyperparameters.
batch_size = 4  # used for both the training and the evaluation batch size
epochs = 10

args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/",
    num_train_epochs=epochs,
    learning_rate=1e-3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    # Evaluate on generated sequences; compute_metrics decodes them as summaries.
    predict_with_generate=True,
    save_total_limit=3,
    report_to="none",
)

In [10]:
# Wire model, arguments, tokenized splits, collator and metric function into one trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [11]:
# Run training; the value returned by train() is shown as the cell output.
trainer.train()

You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,1.0488,0.948492,15.3662,5.0464,14.9023,15.0378
2,0.7744,0.881916,16.8517,6.3322,16.2327,16.3434




TrainOutput(global_step=6368, training_loss=0.9972178828177141, metrics={'train_runtime': 6060.4683, 'train_samples_per_second': 6.304, 'train_steps_per_second': 1.051, 'total_flos': 3.833158954873651e+16, 'train_loss': 0.9972178828177141, 'epoch': 2.0})

In [8]:
# Run evaluation with the trainer. `trainer` must already exist in the current
# kernel session, i.e. the cell that constructs it has to be executed first.
trainer.evaluate()

NameError: name 'trainer' is not defined

In [14]:
def get_summary(article):
    """Generate a summary for ``article`` with the currently loaded model.

    The text is tokenized (truncated to at most 1024 tokens) and moved to
    ``device``. Generation uses 5 beams with a 150-token cap, and the first
    generated sequence is returned as text with special tokens removed.
    """
    encoding = tokenizer(
        article,
        return_tensors='pt',
        max_length=1024,
        truncation=True,
        padding=True,
    )
    input_ids = encoding.input_ids.to(device)
    generated = model.generate(
        input_ids,
        num_beams=5,
        max_length=150,
        length_penalty=1.2,
        early_stopping=True,
    )
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [15]:
# Quick check: summarize the first test article and print the result.
first_article = test_data["Article"][0]
print(get_summary(first_article))

अभी भी उल्कापिंड जब धरती पर गिरते हैं, तो इनकी चमक इतनी ज्यादा होती है कि लगभग 200 से 300 किलोमीटर के दायरे के लोग आसमान में इसे देख सकते हैं। इस फुटेज को महाराष्ट्र के नागपुर और मध्य प्रदेश के झाबुआ एवं बड़वानी जिलों में देखे जाने की खबर है। खगोलविदों का मानना है कि यह घटना 'चीनी रॉकेट चरण का पुन: प्रवेश' थी, जिसे फरवरी 2021 में लॉन्च किया गया था।


In [None]:
# Summarize every test article, keeping each article next to its generated summary.
test_summaries = [[article, get_summary(article)] for article in test_data["Article"]]

# newline="" lets the csv module control line endings itself, as its documentation requires.
# The encoding is set explicitly so the non-ASCII text is written the same way
# regardless of the platform's default encoding.
with open("/kaggle/working/test_base.csv", "w", newline="", encoding="utf-8") as f:
    csvwriter = csv.writer(f)
    csvwriter.writerow(["Article", "Generated Summary"])
    csvwriter.writerows(test_summaries)