In [5]:
!pip install datasets rouge
!pip install accelerate -U
import pandas as pd
import torch

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TrainerCallback

from datasets import load_dataset
from rouge import Rouge

model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-large-cnn')
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-cnn')

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
[0m  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rouge, pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6 rouge

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [6]:
def preprocess_data(data_to_process, tokenizer=tokenizer, max_input=1024, max_target=150):

  inputs = [dialogue for dialogue in data_to_process['article']]
  model_inputs = tokenizer(inputs,  max_length=max_input, padding='max_length', truncation=True)

  with tokenizer.as_target_tokenizer():
    targets = tokenizer(data_to_process['highlights'], max_length=max_target, padding='max_length', truncation=True)

  model_inputs['labels'] = targets['input_ids']

  return model_inputs

In [7]:
train_path = "/content/train.csv"
test_path = "/content/test.csv"


import pandas as pd
train_data = pd.read_csv("/content/train.csv")
test_data = pd.read_csv("/content/test.csv")
#train_data.to_csv('train.csv')
#train_data.to_csv('test.csv')


dataset = load_dataset('csv', data_files={'train': '/content/train.csv', 'test': '/content/test.csv'})

tokenized_data = dataset.map(preprocess_data, batched = True)

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]



Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [8]:
train_data.head()

Unnamed: 0,id,article,highlights
0,61df4979ac5fcc2b71be46ed6fe5a46ce7f071c3,"Sally Forrest, an actress-dancer who graced th...","Sally Forrest, an actress-dancer who graced th..."
1,21c0bd69b7e7df285c3d1b1cf56d4da925980a68,A middle-school teacher in China has inked hun...,Works include pictures of Presidential Palace ...
2,56f340189cd128194b2e7cb8c26bb900e3a848b4,A man convicted of killing the father and sist...,"Iftekhar Murtaza, 29, was convicted a year ago..."
3,00a665151b89a53e5a08a389df8334f4106494c2,Avid rugby fan Prince Harry could barely watch...,Prince Harry in attendance for England's crunc...
4,9f6fbd3c497c4d28879bebebea220884f03eb41a,A Triple M Radio producer has been inundated w...,Nick Slater's colleagues uploaded a picture to...


In [9]:

train_data.shape

(13368, 3)

In [10]:

test_data.head()

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [11]:

test_data.shape

(11490, 3)

In [12]:
def compute_rouge(predictions, references):
    rouge = Rouge()
    scores = rouge.get_scores(predictions, references, avg=True)
    return {
        'rouge-1': scores['rouge-1']['f'],
        'rouge-2': scores['rouge-2']['f'],
        'rouge-l': scores['rouge-l']['f']
    }

class RougeCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        eval_results = trainer.evaluate()
        rouge_scores = compute_rouge(eval_results['predictions'], eval_results['references'])
        print(f"Epoch {state.epoch} - ROUGE Scores: {rouge_scores}")

In [13]:
# !pip install tensorflow --upgrade
# !pip install transformers --upgrade
for name, child in model.named_children():
    if name == 'model':
        for param in child.encoder.parameters():
            param.requires_grad = False

args = Seq2SeqTrainingArguments(
    output_dir='./bart-base-fine-tuned-decoder',
    overwrite_output_dir=True,
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    predict_with_generate=True,
    eval_accumulation_steps=3,
    fp16=True
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
    tokenizer=tokenizer
)

In [14]:
trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,1.027518
2,1.225000,1.010635
3,0.972900,1.001307
4,0.910400,0.994825
5,0.873100,0.997529


TrainOutput(global_step=2090, training_loss=0.9897561944842909, metrics={'train_runtime': 9445.9631, 'train_samples_per_second': 7.076, 'train_steps_per_second': 0.221, 'total_flos': 1.4484927160516608e+17, 'train_loss': 0.9897561944842909, 'epoch': 5.0})

In [15]:
results = trainer.evaluate()

print(results)

{'eval_loss': 0.997528612613678, 'eval_runtime': 723.4477, 'eval_samples_per_second': 15.882, 'eval_steps_per_second': 0.498, 'epoch': 5.0}


In [16]:
# model.config.save_pretrained('./model-weights')
# model.save_pretrained('./model-weights')

output_dir = './model-weights'
model.save_pretrained(output_dir)
# Save the tokenizer
tokenizer.save_pretrained(output_dir)

('./model-weights/tokenizer_config.json',
 './model-weights/special_tokens_map.json',
 './model-weights/vocab.json',
 './model-weights/merges.txt',
 './model-weights/added_tokens.json',
 './model-weights/tokenizer.json')

In [17]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_path = '/content/model-weights'
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)



In [18]:
def predict(model,tokenizer,input_text):

  # Tokenize the input text
  input_ids = tokenizer.encode(input_text, return_tensors="pt")

  # Generate output
  output_ids = model.generate(input_ids, max_length=50, num_beams=5, length_penalty=0.6, early_stopping=True)
  output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
  return output_text


In [19]:


real_highlights = test_data.highlights.head(100).to_list()
real_articles = test_data.article.head(100).to_list()

from rouge import Rouge

def compute_rouge(predictions, references):
    rouge = Rouge()
    scores = rouge.get_scores(predictions, references, avg=True)
    return {
        'rouge-1': scores['rouge-1']['f'],
        'rouge-2': scores['rouge-2']['f'],
        'rouge-l': scores['rouge-l']['f']
    }

predictions = []
except_indexes = []
for i in range(len(real_articles)):
  try:
    predictions.append(predict(model,tokenizer,real_articles[i]))
  except:
    except_indexes.append(i)
    continue



Token indices sequence length is longer than the specified maximum sequence length for this model (1041 > 1024). Running this sequence through the model will result in indexing errors


In [20]:
except_highlights = [real_highlights[i] for i in except_indexes]

for i in except_highlights:
  real_highlights.remove(i)

In [21]:
len(predictions)

79

In [22]:
rouge_scores = compute_rouge(predictions, real_highlights)
print("ROUGE Scores:", rouge_scores)

ROUGE Scores: {'rouge-1': 0.3887882255526148, 'rouge-2': 0.1815083371363095, 'rouge-l': 0.36617290977476286}
