In [None]:
# Source: https://huggingface.co/learn/nlp-course/en/chapter7/5?fw=pt

In [1]:
!pip install transformers==4.30
!pip install accelerate -U
!pip install rouge_score
!pip install datasets
!pip install evaluate

Collecting transformers==4.30
  Downloading transformers-4.30.0-py3-none-any.whl.metadata (113 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.6/113.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.30.0-py3-none-any.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m71.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
 

In [2]:
import pandas as pd
import torch
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset
import numpy as np
import nltk
import evaluate
rouge_score = evaluate.load("rouge")
nltk.download("punkt")
from transformers import DataCollatorForSeq2Seq
from nltk.tokenize import sent_tokenize

2024-04-25 03:18:48.705671: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-25 03:18:48.705799: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-25 03:18:48.957295: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
train_data = Dataset.from_pandas(pd.read_csv("/kaggle/input/dataset12/hindi_train_final_TFIDF.csv")[["Article", "Summary"]].dropna())
val_data = Dataset.from_pandas(pd.read_csv("/kaggle/input/dataset12/hindi_val_final_TFIDF.csv")[["Article", "Summary"]].dropna())
test_data = Dataset.from_pandas(pd.read_csv("/kaggle/input/dataset12/hindi_test_final_TFIDF.csv")[["Article", "Summary"]].dropna())

In [6]:
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBART")
model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/IndicBART")

tokenizer_config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/832 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.90M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/221 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/976M [00:00<?, ?B/s]

In [8]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

In [9]:
def tokenize_helper(dat):
    model_inputs = tokenizer(
        dat["Article"],
        truncation = True,
        max_length=1024
    )
    true_sent = tokenizer(dat["Summary"], truncation = True, max_length=1024)
    model_inputs["labels"] = true_sent["input_ids"]
    return model_inputs

tokenized_train = train_data.map(tokenize_helper, batched=True)
tokenized_val = val_data.map(tokenize_helper, batched=True)

Map:   0%|          | 0/18507 [00:00<?, ? examples/s]

Map:   0%|          | 0/1035 [00:00<?, ? examples/s]

In [10]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model = model)
tokenized_train = tokenized_train.remove_columns(train_data.column_names)
tokenized_val = tokenized_val.remove_columns(val_data.column_names)

In [11]:
batch_size = 4
epochs = 10
args = Seq2SeqTrainingArguments(
    output_dir = "/kaggle/working/",
    evaluation_strategy = "epoch",
    learning_rate = 1e-3,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    save_total_limit=3,
    predict_with_generate=True,
    num_train_epochs = epochs,
    report_to="none"
)

In [12]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [14]:
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,1.7966,1.637821,13.2004,4.3305,12.7785,12.8664
2,1.527,1.567023,14.0599,4.6605,13.5192,13.617
3,1.3203,1.565878,14.9752,5.6739,14.3626,14.5618
4,1.1332,1.573497,15.5246,5.9472,15.0631,15.2146
5,0.9674,1.63958,14.6852,5.5157,14.174,14.2452
6,0.8092,1.696684,15.827,6.2175,15.1769,15.2791
7,0.7075,1.778143,15.2587,5.9976,14.6826,14.7489
8,0.5825,1.857941,16.1618,6.3795,15.5415,15.5941
9,0.5087,1.95907,16.3906,6.6327,15.7939,15.8918
10,0.4344,2.042619,16.2226,6.3906,15.6885,15.7218




TrainOutput(global_step=23140, training_loss=0.9853582190972726, metrics={'train_runtime': 16384.6236, 'train_samples_per_second': 11.295, 'train_steps_per_second': 1.412, 'total_flos': 8.508692666366362e+16, 'train_loss': 0.9853582190972726, 'epoch': 10.0})

In [15]:
trainer.evaluate()

{'eval_loss': 2.042618751525879,
 'eval_rouge1': 16.2226,
 'eval_rouge2': 6.3906,
 'eval_rougeL': 15.6885,
 'eval_rougeLsum': 15.7218,
 'eval_runtime': 76.4527,
 'eval_samples_per_second': 13.538,
 'eval_steps_per_second': 1.7,
 'epoch': 10.0}

In [17]:
temp = tokenizer(test_data["Article"][0], truncation = True, padding=True, return_tensors='pt').input_ids.to(device)
out = model.generate(temp, max_length=150, num_beams=4, early_stopping=True) 
print(tokenizer.decode(out[0], skip_special_tokens=True))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


उल्कापिंड जब धरती पर गिरते हैं, तो इनकी चमक इतनी ज्यादा होती है कि लगभग 200 से 300 किलोमीटर के दायरे के लोग आसमान में इसे देख सकते हैं। हालांकि इस घटना ने सबको आश्चर्य में डाल दिया है। दरअसल, उल्कापिंड जब धरती पर गिरते हैं, तो इनकी चमक इतनी ज्यादा होती है कि लगभग 200 से 300 किलोमीटर के दायरे के लोग आसमान में इसे देख सकते हैं।
