### Pre-Trained Model with Fine-Tuning - BART

In [None]:
from kaggle_secrets import UserSecretsClient
import wandb

# Read the W&B API key from the Kaggle secret store (secret label "wandb_api")
# instead of writing the key into the notebook source.
user_secrets = UserSecretsClient()
my_secret = user_secrets.get_secret("wandb_api")
# Authenticate this session with Weights & Biases using that key.
wandb.login(key=my_secret)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
import warnings
warnings.filterwarnings("ignore")

import random
import os
import pandas as pd
import numpy as np
import json
import re
import requests
import string
import matplotlib.pyplot as plt
from IPython.display import display, HTML

import spacy
from textblob import TextBlob
tokens = spacy.load("en_core_web_sm")
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from functools import reduce
from collections import Counter
from itertools import chain
from sklearn.model_selection import train_test_split

import nltk
# Download the NLTK resources this notebook requests, in the original order.
for _resource in ("stopwords", "brown", "punkt", "averaged_perceptron_tagger", "wordnet"):
    nltk.download(_resource)
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import norm
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.metrics import edit_distance

import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

import datasets
from datasets import load_metric
from datasets import Dataset

# Select the compute device: GPU when CUDA is available, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package brown to /usr/share/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Using device: cuda


In [None]:
# Load the raw Target/Clues table, then (in this order) keep the first row per
# distinct Target, the first row per distinct Clues, and drop rows with any NaN.
data = (
    pd.read_csv('/kaggle/input/final-data/final_data (1).csv')
    .drop_duplicates(['Target'])
    .drop_duplicates(['Clues'])
    .dropna(axis=0)
)

# 80/20 split of the raw frame with a fixed seed.
data_train, data_test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:
def cleantext(text):
    """Normalize a raw string for the Target/Clues columns.

    Steps, applied in order:
      1. lower-case the text,
      2. delete every parenthesised fragment, e.g. "(fruit)",
      3. replace every character that is not an ASCII letter or '.' with a space.

    Whitespace is neither collapsed nor stripped, so removed characters leave
    spaces behind.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string.
    """
    # Start from the input itself. The previous version read an unassigned local
    # (`string`) first and only lower-cased `text` at the very end, which raised
    # UnboundLocalError and would have thrown away both substitutions anyway.
    string = text.lower()
    string = re.sub(r'\([^)]*\)', '', string)
    string = re.sub("[^a-zA-Z.]", " ", string)
    return string

# Clean both text columns of the raw frame.
ctargets = [cleantext(raw_target) for raw_target in data['Target']]
cclues = [cleantext(raw_clue) for raw_clue in data['Clues']]

# Assemble the cleaned columns into a fresh frame (new default integer index).
cdata = pd.DataFrame({'Target': ctargets, 'Clues': cclues})

cdata.sample(10)

Unnamed: 0,Target,Clues
35353,hub,parental supervision focal point event revolve...
10950,pomatomus saltatrix,pomatomus percoid fish bluefish
33784,ground level,elevation floor
24173,conservation of energy,conservation law of thermodynamics
34932,historic period,era history
6167,gastrocybe,secotiaceae fungus genus gastrocybe lateritia
39957,manner of speaking,expressive style paralanguage
15487,allowance account,reserve account
41657,muff,handwear blunder
51607,spelter,zinc


In [None]:
# Drop rows whose cleaned Target or Clues carries no text at all.
# cleantext() turns removed characters into spaces, so a value such as "123"
# becomes "   " rather than ""; matching whitespace-only strings (which also
# covers the empty string) catches those rows too.
cdata = cdata.replace(r'^\s*$', np.nan, regex=True).dropna(axis=0)

cdata.shape

(57933, 2)

In [None]:
# Checkpoint name of the pretrained BART model used throughout the notebook.
model_id = 'facebook/bart-base'
# Tokenizer matching that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# ROUGE metric object; calculate_metrics() calls metric.compute on it.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
metric = load_metric("rouge")
# Load the seq2seq model for the checkpoint and move it to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
def tokenize_input(data, max_len=400):
    """Tokenize the 'Target' field of `data` as the model input.

    Args:
        data: mapping-like object indexed with the key 'Target'.
        max_len: maximum token length; longer sequences are truncated.

    Returns:
        Whatever the module-level `tokenizer` returns for that text.
    """
    source_text = data['Target']
    encoded = tokenizer(source_text, max_length=max_len, truncation=True)
    return encoded

def tokenize_target(data, max_len=50):
    """Tokenize the 'Clues' field of `data` as the label (target) text.

    Args:
        data: mapping-like object indexed with the key 'Clues'.
        max_len: maximum token length; longer sequences are truncated.

    Returns:
        The tokenizer output for the target text; callers read its
        "input_ids" entry to build the labels.
    """
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager: the text is encoded as labels in a single call.
    return tokenizer(text_target=data['Clues'], max_length=max_len, truncation=True)

def datapreprocessing(data):
    """Build model features for `data`: tokenized input plus a "labels" entry.

    The input encoding comes from tokenize_input(); the "input_ids" of
    tokenize_target() are attached to it under the key "labels".

    Args:
        data: mapping-like object with 'Target' and 'Clues' entries.

    Returns:
        The input encoding, extended with "labels".
    """
    features = tokenize_input(data)
    features["labels"] = tokenize_target(data)["input_ids"]
    return features

# 75/25 split of the cleaned frame with a fixed seed.
cdata_train, cdata_test = train_test_split(
    cdata,
    test_size=0.25,
    random_state=42,
)

# Wrap each pandas split in a `datasets.Dataset` (train first, then test).
data_train, data_test = (
    Dataset.from_pandas(split_frame) for split_frame in (cdata_train, cdata_test)
)

# Smoke-test the preprocessing on a one-row batch.
datapreprocessing(data_train[:1])

{'input_ids': [[0, 8687, 9230, 2918, 2]], 'attention_mask': [[1, 1, 1, 1, 1]], 'labels': [[0, 27739, 2918, 2]]}

In [None]:
# Tokenize every row of both splits; batched=True passes datapreprocessing a
# batch of rows per call instead of one row at a time.
data_train_tokenized = data_train.map(datapreprocessing, batched=True)
data_test_tokenized = data_test.map(datapreprocessing, batched=True)

# Inspect the first tokenized training example: its column names, then its values.
print(data_train_tokenized[0].keys())
print(data_train_tokenized[0])

dict_keys(['Target', 'Clues', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'])
{'Target': 'teamsters union', 'Clues': 'industrial union', '__index_level_0__': 12974, 'input_ids': [0, 8687, 9230, 2918, 2], 'attention_mask': [1, 1, 1, 1, 1], 'labels': [0, 27739, 2918, 2]}


In [None]:
def textgen(type, text, min_length=1, max_length=5):
    """Run a generation pipeline on `text` and display inputs next to outputs.

    Args:
        type: callable pipeline; invoked as
            ``type(text, min_length=..., max_length=...)``. (The name shadows
            the builtin but is kept so existing callers keep working.)
        text: a string or list of strings fed to the pipeline; shown in the
            "Target" column.
        min_length: minimum generation length forwarded to the pipeline.
        max_length: maximum generation length forwarded to the pipeline.

    Returns:
        None. An HTML table with columns "Clues" (generated text) and
        "Target" (the input) is displayed.

    Raises:
        KeyError: if a pipeline result carries none of the known text fields.
    """
    def _generated_text(result):
        # Some pipelines return a list of candidates per input; use the first.
        if isinstance(result, (list, tuple)):
            result = result[0]
        # The field name depends on the pipeline task; the original hard-coded
        # 'text', which neither of the first two kinds of result provides.
        for key in ("summary_text", "generated_text", "text"):
            if key in result:
                return result[key]
        raise KeyError("pipeline result has no generated-text field: %r" % (result,))

    outputs = type(text, min_length = min_length, max_length = max_length)
    target = [_generated_text(e) for e in outputs]
    display(HTML(pd.DataFrame({"Clues":target, "Target":text}).to_html()))

# BART is an encoder-decoder model, so it is served through the seq2seq
# "summarization" pipeline (whose results carry a `summary_text` field) rather
# than the causal-LM "text-generation" pipeline.
# Use GPU 0 only when CUDA was detected; -1 selects the CPU.
pretrain = pipeline("summarization", model = model, tokenizer = tokenizer,
                    device = 0 if device == "cuda" else -1)

# NOTE: `input` shadows the builtin of the same name for the rest of the notebook.
input = "Apple Mango Melon"
cinput = cleantext(input)

# Baseline generation from the model before fine-tuning.
pretrain(cinput, min_length = 3, max_length = 5)

[{'summary_text': 'apple mango'}]

In [None]:
def decode_predictions(preds):
    """Decode predicted token ids into text, one sentence per line.

    Args:
        preds: batch of token-id sequences accepted by tokenizer.batch_decode.

    Returns:
        A list of strings; each decoded prediction is stripped, split into
        sentences with nltk.sent_tokenize and re-joined with newlines.
    """
    texts = tokenizer.batch_decode(preds, skip_special_tokens=True)
    formatted = []
    for text in texts:
        sentences = nltk.sent_tokenize(text.strip())
        formatted.append("\n".join(sentences))
    return formatted

def decode_targets(tgts):
    """Decode label token ids into text, one sentence per line.

    Args:
        tgts: array of label ids in which -100 marks positions to ignore.

    Returns:
        A list of strings; each decoded target is stripped, split into
        sentences with nltk.sent_tokenize and re-joined with newlines.
    """
    # -100 is not a real token id; swap it for the pad id before decoding.
    pad_id = tokenizer.pad_token_id
    restored = np.where(tgts != -100, tgts, pad_id)
    texts = tokenizer.batch_decode(restored, skip_special_tokens=True)
    formatted = []
    for text in texts:
        sentences = nltk.sent_tokenize(text.strip())
        formatted.append("\n".join(sentences))
    return formatted

def calculate_metrics(decoded_preds, decoded_tgts):
    """Score decoded predictions against decoded references with `metric`.

    Args:
        decoded_preds: list of predicted strings.
        decoded_tgts: list of reference strings.

    Returns:
        Dict mapping each score name returned by metric.compute to its
        `mid.fmeasure` value multiplied by 100.
    """
    scores = metric.compute(
        predictions=decoded_preds,
        references=decoded_tgts,
        use_stemmer=True,
    )
    percentages = {}
    for name, aggregate in scores.items():
        percentages[name] = aggregate.mid.fmeasure * 100
    return percentages

def compute_average_length(preds):
    """Return the mean number of non-pad token ids per prediction.

    Args:
        preds: iterable of token-id arrays.

    Returns:
        np.mean over the per-sequence counts of ids that differ from
        tokenizer.pad_token_id.
    """
    pad_id = tokenizer.pad_token_id
    token_counts = []
    for sequence in preds:
        token_counts.append(np.count_nonzero(sequence != pad_id))
    return np.mean(token_counts)

def compute_metrics(evaluations):
    """Metric callback for the trainer: ROUGE scores plus mean generated length.

    Args:
        evaluations: pair-like object that unpacks into (predictions, targets).

    Returns:
        Dict of metric name -> value rounded to 4 decimals; the values from
        calculate_metrics() plus "gen_len" from compute_average_length().
    """
    predictions, targets = evaluations

    # Keep only the first element when predictions arrive as a tuple.
    # NOTE(review): presumably the generated ids come first — confirm for this model.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # Targets already get -100 replaced before decoding; do the same for
    # predictions so that any -100 padding neither reaches the tokenizer
    # (it is not a valid token id) nor counts towards gen_len.
    # NOTE(review): whether predictions contain -100 depends on the installed
    # transformers version — confirm.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)

    processed_preds = decode_predictions(predictions)
    processed_targets = decode_targets(targets)
    metrics_result = calculate_metrics(processed_preds, processed_targets)

    avg_length = compute_average_length(predictions)
    metrics_result["gen_len"] = avg_length

    return {k: round(v, 4) for k, v in metrics_result.items()}

In [None]:
# Collator built from the tokenizer and model, passed to the trainer below.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training configuration; checkpoints go to the 'text-gen-bart' directory.
args = Seq2SeqTrainingArguments(
    output_dir='text-gen-bart',
    num_train_epochs=3,
    evaluation_strategy="epoch",
    logging_steps=100,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    save_total_limit=3,
    predict_with_generate=True,
    fp16=True,
)

# Trainer wiring together model, data, collator, tokenizer and metric callback.
model_trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=data_train_tokenized,
    eval_dataset=data_test_tokenized,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

model

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=

In [None]:
# Run fine-tuning with the trainer configured above (3 epochs, evaluated after each epoch).
model_trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mamaanvoraa[0m ([33mnlp-amaan[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,4.3204,3.962089,19.4002,2.9019,19.0711,19.0778,6.6521
2,4.0305,3.851192,19.173,3.1418,18.8204,18.8276,6.7483
3,3.9579,3.822068,19.398,3.2312,19.017,19.0347,6.7755


TrainOutput(global_step=6519, training_loss=4.191124352122912, metrics={'train_runtime': 2698.4987, 'train_samples_per_second': 48.304, 'train_steps_per_second': 2.416, 'total_flos': 659920626892800.0, 'train_loss': 4.191124352122912, 'epoch': 3.0})

In [None]:
# Serve the fine-tuned encoder-decoder model through the seq2seq
# "summarization" pipeline instead of the causal-LM "text-generation" one.
# Use GPU 0 only when CUDA was detected; -1 selects the CPU.
generator = pipeline("summarization", model = model, tokenizer = tokenizer,
                     device = 0 if device == "cuda" else -1)

# Draw three random test rows (unseeded, so the rows differ between runs).
idx = random.sample(range(0, len(data_test)), k=3)
# The model was trained with 'Target' as input and 'Clues' as labels, and
# textgen() labels its input column "Target", so feed the targets here.
sdata = [data_test[i]['Target'] for i in idx]
textgen(generator, sdata)

Unnamed: 0,Clues,Target
0,monocot genus spanish moss bromeliaceae,monoc
1,traveler framework shoe,traveler
2,calamity,calam


In [None]:
# Directory that receives the fine-tuned model.
PATH = '/kaggle/working/text-gen-bart/' + 'bart'

# Treat the model as saved only when its config file exists: a bare directory
# check would report 'Model Present' after an interrupted or failed save.
if not os.path.isfile(os.path.join(PATH, 'config.json')):
    # makedirs also creates a missing parent directory, where os.mkdir would
    # raise FileNotFoundError; exist_ok tolerates a leftover empty directory.
    os.makedirs(PATH, exist_ok=True)
    model.save_pretrained(PATH)
else:
    print('Model Present')