In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartForConditionalGeneration, BartTokenizer, AdamW, get_scheduler
from datasets import load_metric, list_metrics

# Data Loading

In [2]:
train = pd.read_csv('/kaggle/input/clickbait-task2/train_data.csv')

In [3]:
val = pd.read_csv('/kaggle/input/clickbait-task2/val_data.csv')

In [4]:
train.head()

Unnamed: 0,text,output,labels
0,['wes welker wanted dinner with tom brady but ...,['how about that morning we go throw'],['passage']
1,['nasa sets date for full recovery of ozone ho...,['2070'],['phrase']
2,['this is what makes employees happy and its ...,['intellectual stimulation'],['phrase']
3,['passion is overrated \xa07 work habits you n...,['purpose connects us to something bigger and ...,['multi']
4,['the perfect way to cook rice so that its per...,['in a rice cooker'],['phrase']


In [5]:
val.head()

Unnamed: 0,text,output,labels
0,['five nights at freddys sequel delayed for we...,['some of the plot elements are so disturbing ...,['passage']
1,['why arizona sheriff joe arpaios fate could h...,"['intentionally', 'could transform a court cas...",['multi']
2,['heres how much you should be tipping your ha...,['20'],['phrase']
3,"['harry potter alums reunite for new movie', '...","['alan rickman rupert grint', 'cbgb']",['multi']
4,['a man swallowed a microsd card and you wont ...,['a man who swallowed a 64gb microsd card and ...,['passage']


In [6]:
train_inputs = train['text'].astype(str).tolist()
train_outputs = train['output'].astype(str).tolist()

In [7]:
len(train_inputs)

3200

In [8]:
val_inputs = val['text'].astype(str).tolist()
val_outputs = val['output'].astype(str).tolist()

# Model Initialization

In [9]:
model_name = 'facebook/bart-base'
tokenizer = BartTokenizer.from_pretrained(model_name)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

# Data Processing

In [10]:
train_encodings = tokenizer(train_inputs, truncation=True, padding=True, max_length=512)
train_labels = tokenizer(train_outputs, truncation=True, padding=True, max_length=128)

val_encodings = tokenizer(val_inputs, truncation=True, padding=True, max_length=512)
val_labels = tokenizer(val_outputs, truncation=True, padding=True, max_length=128)


In [11]:
print(len(train_inputs), len(train_outputs))
print(len(val_inputs), len(val_outputs))

3200 3200
400 400


In [12]:
class CustomDataset(Dataset):
  def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

  def __len__(self):
        return len(self.encodings['input_ids'])

  def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

In [13]:
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

In [14]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [15]:
print(len(train_encodings['input_ids']), len(train_labels['input_ids']))
print(len(val_encodings['input_ids']), len(val_labels['input_ids']))

3200 3200
400 400


# Initialization of Hyperparameters

In [16]:
lr = 1e-5

num_epochs = 10

model = BartForConditionalGeneration.from_pretrained(model_name)

optimizer = AdamW(model.parameters(), lr = lr)

num_training_steps = num_epochs * len(train_loader)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]



In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.to(device)
print('Using device:', device)

Using device: cuda


In [18]:
pip install -U nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.4
    Uninstalling nltk-3.2.4:
      Successfully uninstalled nltk-3.2.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.8.1 which is incompatible.[0m[31m
[0mSuccessfully installed nltk-3.8.1
Note: you may need to restart the kernel to use updated packages.


In [19]:
import nltk
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Model Training

In [20]:
for epoch in range(num_epochs):
  model.train()
  for batch in train_loader:
        
    batch = {k: v.to(device) for k, v in batch.items()}
    
    optimizer.zero_grad()

    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()

    optimizer.step()
    lr_scheduler.step()

  print(f"Epoch: {epoch +1} / {num_epochs}, Loss: {loss.item()}")

  model.eval()
  predictions = []
  references = []

  for batch in val_loader:
      batch = {k: v.to(device) for k, v in batch.items()}
      input_ids = batch['input_ids']
      inputs = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
      outputs = model.generate(input_ids)
      preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
      labels = batch['labels']
      refs = tokenizer.batch_decode(labels, skip_special_tokens=True)
      predictions.extend(preds)
      references.extend(refs)
        
  tokenized_predictions = [word_tokenize(pred) for pred in predictions]
  tokenized_references = [word_tokenize(ref) for ref in references]

  meteor_scores = [meteor_score([ref], pred) for pred, ref in zip(tokenized_predictions, tokenized_references)]
  average_meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0
  print(f"METEOR Score: {average_meteor}")

Epoch: 1 / 10, Loss: 0.4587289094924927




METEOR Score: 0.3153205749970845
Epoch: 2 / 10, Loss: 0.23644901812076569
METEOR Score: 0.343803944146498
Epoch: 3 / 10, Loss: 0.1984047144651413
METEOR Score: 0.3687793297898461
Epoch: 4 / 10, Loss: 0.2187434732913971
METEOR Score: 0.38094758759765457
Epoch: 5 / 10, Loss: 0.2370542585849762
METEOR Score: 0.38513954964922603
Epoch: 6 / 10, Loss: 0.16499093174934387
METEOR Score: 0.39680796725247197
Epoch: 7 / 10, Loss: 0.1084742546081543
METEOR Score: 0.3964216466789368
Epoch: 8 / 10, Loss: 0.1856965869665146
METEOR Score: 0.3924531891959302
Epoch: 9 / 10, Loss: 0.1198950782418251
METEOR Score: 0.3946389070310965
Epoch: 10 / 10, Loss: 0.05544552952051163
METEOR Score: 0.39750671925230563


In [21]:
torch.save(model.state_dict(), '/kaggle/working/Bart_lr_1e-5_model_weights.pth')

# Test Data Loading and Processing

In [22]:
  test = pd.read_csv('/kaggle/input/clickbait-task2/test.csv')

In [23]:
test.head()

Unnamed: 0,id,features
0,0,['he tackles a nurse at the hospital then you ...
1,1,"['why you should be selfish at work', 'were al..."
2,2,['the one strange trick that will make you liv...
3,3,['nerd wins scrabble championship with word yo...
4,4,['the bizarre new way to eat eggs that has eve...


In [24]:
test_inputs = test['features'].astype(str).tolist()

In [25]:
test_encodings = tokenizer(test_inputs, truncation=True, padding=True, max_length=512)

In [26]:
class TestDataset(Dataset):
  def __init__(self, encodings):
        self.encodings = encodings

  def __len__(self):
        return len(self.encodings['input_ids'])

  def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

In [27]:
test_data = TestDataset(test_encodings)
test_loader = DataLoader(test_data, batch_size= 8 , shuffle = False)

# Test Prediction

In [28]:
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        input_ids = batch['input_ids']
        outputs = model.generate(input_ids)
        preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        predictions.extend(preds)



In [29]:
clean_predictions = [pred.replace("[", "").replace("]", "").replace("'", "").strip() for pred in predictions]

In [30]:
pred_df = pd.DataFrame({
    'id': test['id'],
    'spoiler': clean_predictions
})

In [31]:
pred_df.head()

Unnamed: 0,id,spoiler
0,0,danny kolzow
1,1,"1 prioritise say yes when it matters most, 2 ..."
2,2,meditation inside a beautiful stockphoto room ...
3,3,braconid
4,4,cured egg yolks are deliciousbut strong becaus...


In [32]:
pred_df.to_csv('/kaggle/working/output_BART_lr_1e-5_file.csv',index=False)