In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartForConditionalGeneration, BartTokenizer, AdamW, get_scheduler
from datasets import load_metric, list_metrics

In [2]:
train = pd.read_csv('/kaggle/input/clickbait-task2/train_data.csv')
val = pd.read_csv('/kaggle/input/clickbait-task2/val_data.csv')

In [3]:
train.head()

Unnamed: 0,text,output,labels
0,['wes welker wanted dinner with tom brady but ...,['how about that morning we go throw'],['passage']
1,['nasa sets date for full recovery of ozone ho...,['2070'],['phrase']
2,['this is what makes employees happy and its ...,['intellectual stimulation'],['phrase']
3,['passion is overrated \xa07 work habits you n...,['purpose connects us to something bigger and ...,['multi']
4,['the perfect way to cook rice so that its per...,['in a rice cooker'],['phrase']


In [4]:
val.head()

Unnamed: 0,text,output,labels
0,['five nights at freddys sequel delayed for we...,['some of the plot elements are so disturbing ...,['passage']
1,['why arizona sheriff joe arpaios fate could h...,"['intentionally', 'could transform a court cas...",['multi']
2,['heres how much you should be tipping your ha...,['20'],['phrase']
3,"['harry potter alums reunite for new movie', '...","['alan rickman rupert grint', 'cbgb']",['multi']
4,['a man swallowed a microsd card and you wont ...,['a man who swallowed a 64gb microsd card and ...,['passage']


In [5]:
train_inputs = train['text'].astype(str).tolist()
train_outputs = train['output'].astype(str).tolist()

In [6]:
val_inputs = val['text'].astype(str).tolist()
val_outputs = val['text'].astype(str).tolist()

In [7]:
model_name = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [8]:
train_encodings = tokenizer(train_inputs, padding=True, truncation=True, return_tensors="pt", max_length=512)
train_labels = tokenizer(train_outputs, padding=True, truncation=True, return_tensors="pt", max_length=256)

val_encodings = tokenizer(val_inputs, padding=True, truncation=True, max_length=512)
val_labels = tokenizer(val_outputs, padding=True, truncation=True, max_length=256)

In [9]:
class BartlargecnnDataset(Dataset):
    def __init__(self,encodings,labels):
        self.encodings = encodings
        self.labels = labels
    
    def __len__(self):
        return len(self.encodings['input_ids'])
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

In [10]:
train_dataset = BartlargecnnDataset(train_encodings, train_labels)
val_dataset = BartlargecnnDataset(val_encodings, val_labels)

In [11]:
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

In [12]:
lr = 5e-5
epochs = 10

optimizer = AdamW(model.parameters(), lr = lr)

num_training_steps = epochs * len(train_loader)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)




In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.to(device)
print('Using device:', device)

Using device: cuda


In [14]:
pip install -U nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.4
    Uninstalling nltk-3.2.4:
      Successfully uninstalled nltk-3.2.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.8.1 which is incompatible.[0m[31m
[0mSuccessfully installed nltk-3.8.1
Note: you may need to restart the kernel to use updated packages.


In [15]:
import nltk
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
for epoch in range(epochs):
  model.train()
  for batch in train_loader:
        
    batch = {k: v.to(device) for k, v in batch.items()}
    
    optimizer.zero_grad()

    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()

    optimizer.step()
    lr_scheduler.step()

  print(f"Epoch: {epoch +1} / {epochs}, Loss: {loss.item()}")

  model.eval()
  predictions = []
  references = []
  val_loss = 0

  for batch in val_loader:
      batch = {k: v.to(device) for k, v in batch.items()}
      with torch.no_grad():
          outputs = model(**batch)
          loss = outputs.loss
          val_loss += loss.item()
          input_ids = batch['input_ids']
          inputs = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
          generate_outputs = model.generate(input_ids)
          preds = tokenizer.batch_decode(generate_outputs, skip_special_tokens=True)
          labels = batch['labels']
          refs = tokenizer.batch_decode(labels, skip_special_tokens=True)
          predictions.extend(preds)
          references.extend(refs)
  
  average_val_loss = val_loss / len(val_loader)
  print(f"Validation Loss: {average_val_loss}")

  tokenized_predictions = [word_tokenize(pred) for pred in predictions]
  tokenized_references = [word_tokenize(ref) for ref in references]

  meteor_scores = [meteor_score([ref], pred) for pred, ref in zip(tokenized_predictions, tokenized_references)]
  average_meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0
  print(f"METEOR Score: {average_meteor}")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels['input_ids'][idx])


Epoch: 1 / 10, Loss: 0.047888290137052536
Validation Loss: 0.2975501385331154
METEOR Score: 0.21864447254359573
Epoch: 2 / 10, Loss: 0.006322705186903477
Validation Loss: 0.3339382611215115
METEOR Score: 0.27483293907383166
Epoch: 3 / 10, Loss: 0.11850340664386749
Validation Loss: 0.44221220090985297
METEOR Score: 0.2230323903366123
Epoch: 4 / 10, Loss: 0.030949953943490982
Validation Loss: 0.4217720675468445
METEOR Score: 0.21691047218971252
Epoch: 5 / 10, Loss: 0.005358950700610876
Validation Loss: 0.4937809407711029
METEOR Score: 0.20160970400757383
Epoch: 6 / 10, Loss: 0.002858840860426426
Validation Loss: 0.5765578031539917
METEOR Score: 0.2141425166717923
Epoch: 7 / 10, Loss: 0.0011548403417691588
Validation Loss: 0.5470558378100395
METEOR Score: 0.21700375683777778
Epoch: 8 / 10, Loss: 0.0005930097540840507
Validation Loss: 0.5334890785813332
METEOR Score: 0.21553154612047457
Epoch: 9 / 10, Loss: 0.004765598569065332
Validation Loss: 0.6139420327544213
METEOR Score: 0.2182595709

In [17]:
torch.save(model.state_dict(), '/kaggle/working/Bart_large_cnn_model_weights.pth')

In [18]:
tokenizer.save_pretrained('/kaggle/working/')

('/kaggle/working/tokenizer_config.json',
 '/kaggle/working/special_tokens_map.json',
 '/kaggle/working/vocab.json',
 '/kaggle/working/merges.txt',
 '/kaggle/working/added_tokens.json')

In [19]:
  test = pd.read_csv('/kaggle/input/clickbait-task2/test.csv')

In [20]:
test.head()

Unnamed: 0,id,features
0,0,['he tackles a nurse at the hospital then you ...
1,1,"['why you should be selfish at work', 'were al..."
2,2,['the one strange trick that will make you liv...
3,3,['nerd wins scrabble championship with word yo...
4,4,['the bizarre new way to eat eggs that has eve...


In [21]:
test_inputs = test['features'].astype(str).tolist()

In [22]:
test_encodings = tokenizer(test_inputs, truncation=True, padding=True, max_length=512)

In [23]:
class TestDataset(Dataset):
  def __init__(self, encodings):
        self.encodings = encodings

  def __len__(self):
        return len(self.encodings['input_ids'])

  def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

In [24]:
test_data = TestDataset(test_encodings)
test_loader = DataLoader(test_data, batch_size= 4 , shuffle = False)

In [25]:
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        input_ids = batch['input_ids']
        outputs = model.generate(input_ids)
        preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        predictions.extend(preds)

In [26]:
pred_df = pd.DataFrame(columns = ['id', 'spoiler'])

In [27]:
pred_df['id'] = test['id']
pred_df['spoiler'] = predictions

In [28]:
pred_df.head()

Unnamed: 0,id,spoiler
0,0,['gave him an experimental kidney transplant a...
1,1,['giving at the expense of your own wellbeing ...
2,2,['meditating inside a beautiful stockphoto roo...
3,3,"['braconid', 'any of numerous wasps of the fam..."
4,4,['cured egg yolks are deliciousbut strong beca...


In [29]:
pred_df['spoiler'] = pred_df['spoiler'].apply(lambda x: ', '.join(x.strip("[]'").split("', '")))

In [30]:
pred_df.to_csv('/kaggle/working/output_BART_lr5e-5_large_cnn_file.csv', index = False)

In [None]:
accumulation_steps = 4 
total_loss = 0.0

for epoch in range(epochs):
  model.train()
  total_loss = 0.0 
    
  for step, batch in enumerate(train_loader):
        
    batch = {k: v.to(device) for k, v in batch.items()}
    if (step + 1) % accumulation_steps == 0:
        optimizer.zero_grad()

    outputs = model(**batch)
    loss = outputs.loss /accumulation_steps
    loss.backward()
    total_loss += loss.item()
    
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()
        lr_scheduler.step()
        
  avg_loss = total_loss / len(train_loader) 
  print(f"Epoch: {epoch +1} / {num_epochs}, Training_Loss: {avg_loss}")

  model.eval()
  predictions = []
  references = []
  val_loss = 0.0

  for batch in val_loader:
      batch = {k: v.to(device) for k, v in batch.items()}
      outputs = model(**batch)
      loss = outputs.loss
      val_loss += loss.item()
      input_ids = batch['input_ids']
      inputs = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
      generate_outputs = model.generate(input_ids)
      preds = tokenizer.batch_decode(generate_outputs, skip_special_tokens=True)
      labels = batch['labels']
      refs = tokenizer.batch_decode(labels, skip_special_tokens=True)
      predictions.extend(preds)
      references.extend(refs)
    
  average_val_loss = val_loss / len(val_loader)
  print(f"Validation Loss: {average_val_loss}")
        
  tokenized_predictions = [word_tokenize(pred) for pred in predictions]
  tokenized_references = [word_tokenize(ref) for ref in references]

  meteor_scores = [meteor_score([ref], pred) for pred, ref in zip(tokenized_predictions, tokenized_references)]
  average_meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0
  print(f"METEOR Score: {average_meteor}")