In [0]:
# Standard Libraries
import os
import pickle
import re

import mlflow

# Data Libraries
import pandas as pd
import numpy as np
import swifter

# Data Preprocessing
import nltk
from nltk.util import ngrams
from cleantext import clean

# Metrics
from rouge import Rouge 

# Data Visualisation
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML

# To Call the API
import requests
import json

# Functionality
from typing import List, Dict, Union

from datasets import load_dataset
from datasets import Dataset

import evaluate
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

import huggingface_hub

# 1. Log in to the Hugging Face Hub (required to upload the fine-tuned model)

In [0]:
# Authenticate against the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable so that the
# secret never lives in the notebook source or its saved outputs.
# NOTE(security): a token was previously hard-coded in this cell; any token that
# has been committed in a notebook must be treated as leaked and revoked.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token "
        "before running this notebook."
    )
huggingface_hub.login(token=hf_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


# 2. Curate data for fine-tuning

In [0]:
# Download the dataset by name; the result is a DatasetDict keyed by split.
dataset_name = "billsum"
dataset = load_dataset(path=dataset_name)
dataset



  0%|          | 0/3 [00:00<?, ?it/s]

Out[7]: DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 18949
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 3269
    })
    ca_test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1237
    })
})

In [0]:
# Inspect the first record of the train split to see the raw fields.
dataset["train"][0]

Out[8]: {'text': "SECTION 1. LIABILITY OF BUSINESS ENTITIES PROVIDING USE OF FACILITIES \n              TO NONPROFIT ORGANIZATIONS.\n\n    (a) Definitions.--In this section:\n            (1) Business entity.--The term ``business entity'' means a \n        firm, corporation, association, partnership, consortium, joint \n        venture, or other form of enterprise.\n            (2) Facility.--The term ``facility'' means any real \n        property, including any building, improvement, or appurtenance.\n            (3) Gross negligence.--The term ``gross negligence'' means \n        voluntary and conscious conduct by a person with knowledge (at \n        the time of the conduct) that the conduct is likely to be \n        harmful to the health or well-being of another person.\n            (4) Intentional misconduct.--The term ``intentional \n        misconduct'' means conduct by a person with knowledge (at the \n        time of the conduct) that the conduct is harmful to the health \n    

In [0]:
# Convert every split to a pandas DataFrame, keeping only the model input
# ("text") and the target ("summary").
# Split mapping used by this notebook: the dataset's `test` split becomes the
# validation frame and `ca_test` becomes the held-out test frame.
# For a quick smoke run, append `.sample(n=32, random_state=1)` to each line.
data_train = pd.DataFrame(dataset["train"])[["text", "summary"]]
data_val = pd.DataFrame(dataset["test"])[["text", "summary"]]
data_test = pd.DataFrame(dataset["ca_test"])[["text", "summary"]]

print(data_train.shape)
data_train.head()

(18949, 2)


Unnamed: 0,text,summary
0,SECTION 1. LIABILITY OF BUSINESS ENTITIES PROV...,Shields a business entity from civil liability...
1,SECTION 1. SHORT TITLE.\n\n This Act may be...,Human Rights Information Act - Requires certai...
2,SECTION 1. SHORT TITLE.\n\n This Act may be...,Jackie Robinson Commemorative Coin Act - Direc...
3,SECTION 1. NONRECOGNITION OF GAIN WHERE ROLLOV...,Amends the Internal Revenue Code to provide (t...
4,SECTION 1. SHORT TITLE.\n\n This Act may be...,Native American Energy Act - (Sec. 3) Amends t...


In [0]:
def replacer(match):
  """Pad a matched run of edge punctuation with a single space.

  Meant to be used as the replacement callback for a pattern with two capture
  groups: group 1 holds punctuation found at the start of a word, group 2 holds
  punctuation found at the end of a word.

  match: Match object whose group 1 or group 2 contains the matched text.

  Example usage:
    match = <re.Match object; span=(0, 1), match='('>   (group 1 matched)
    return: '( '

    match = <re.Match object; span=(8, 9), match=')'>   (group 2 matched)
    return: ' )'

  return: Group 1 followed by a space when group 1 matched, otherwise a space
  followed by group 2.
  """

  leading = match.group(1)
  if leading is None:
    # Only the trailing alternative matched: put the space in front of it.
    return f' {match.group(2)}'
  return f'{leading} '

def clean_text(txt:str)->str:
  """Space out punctuation at word edges, lowercase, and run cleantext's `clean`.

  The text is split on whitespace. For every word, a run of non-word characters
  at the very start gets a space appended and a run at the very end gets a
  space prepended (see `replacer`). The words are re-joined with single spaces,
  lowercased, and the result is passed through `clean`.

  txt: String text.

  Example usage:
    txt = 'March 18 (Reuters) - Hotel revenue slumped globally in February'
    string handed to `clean`:
      'march 18 ( reuters ) -  hotel revenue slumped globally in february'
    (the final value is whatever `clean` returns for that string, e.g. it may
    collapse the double space -- confirm against the installed cleantext version)

  return: Whatever `clean` returns for the lowercased, punctuation-spaced text.
  """

  edge_punctuation = re.compile(r'^(\W+)|(\W+)$')
  spaced_words = (edge_punctuation.sub(replacer, token) for token in txt.split())
  joined = " ".join(spaced_words)
  return clean(joined.lower())

In [0]:
def _clean_text_columns(frame):
  """Return a new frame whose `text` and `summary` columns went through clean_text.

  frame: DataFrame with string columns `text` and `summary`.

  return: New DataFrame with exactly the columns ['text', 'summary'] and the
  same index as `frame`. The input frame is not modified.
  """
  return pd.DataFrame(
      {
          "text": frame["text"].swifter.apply(clean_text),
          "summary": frame["summary"].swifter.apply(clean_text),
      },
      index=frame.index,
  )

# One shared recipe for all three splits instead of three copy-pasted variants
# that write temporary `*_processed` columns into the source frames.
data_train = _clean_text_columns(data_train)
data_test  = _clean_text_columns(data_test)
data_val   = _clean_text_columns(data_val)

Pandas Apply:   0%|          | 0/18949 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/18949 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/32 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/32 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/32 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/32 [00:00<?, ?it/s]

In [0]:
# Convert the cleaned frames to `datasets.Dataset` objects for tokenization/training.
# preserve_index=False keeps the pandas index out of the dataset: when the
# frames are sub-sampled their non-default index otherwise shows up as an extra
# `__index_level_0__` column (visible in the trainer's "ignored columns" log).
data_val   = Dataset.from_pandas(data_val, preserve_index=False)
data_test  = Dataset.from_pandas(data_test, preserve_index=False)
data_train = Dataset.from_pandas(data_train, preserve_index=False)

In [0]:
# Show the cleaned summary of the first training example (row first, then field).
data_train[0]["summary"]

Out[13]: "shields a business entity from civil liability relating to any injury or death occurring at a facility of that entity in connection with a use of such facility by a nonprofit organization if : ( 1 ) the use occurs outside the scope of business of the business entity ; ( 2 ) such injury or death occurs during a period that such facility is used by such organization ; and ( 3 ) the business entity authorized the use of such facility by the organization . makes this act inapplicable to an injury or death that results from an act or omission of a business entity that constitutes gross negligence or intentional misconduct , including misconduct that : ( 1 ) constitutes a hate crime or a crime of violence or act of international terrorism for which the defendant has been convicted in any court ; or ( 2 ) involves a sexual offense for which the defendant has been convicted in any court or misconduct for which the defendant has been found to have violated a federal or state civil rig

# 2. Train

### 2.1 Tokenize: words to tokens

In [0]:
from transformers import AutoTokenizer

# Base checkpoint to fine-tune; the same name is used later to load the model weights.
model_name = "facebook/bart-large-cnn"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [0]:
# Sanity check: tokenize a short sentence and display the resulting encoding.
inputs = tokenizer(text = "I loved reading the Hunger Games!")
inputs

Out[15]: {'input_ids': [0, 100, 2638, 2600, 5, 27689, 3100, 328, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [0]:
# Map the ids back to token strings to see how the sentence was split.
tokenizer.convert_ids_to_tokens(inputs.input_ids)

Out[17]: ['<s>', 'I', 'Ġloved', 'Ġreading', 'Ġthe', 'ĠHunger', 'ĠGames', '!', '</s>']

In [0]:
def preprocess_function(examples, max_input_length=1024, max_target_length=128):
    """Tokenize a batch of documents and their reference summaries.

    Written for `Dataset.map(..., batched=True)`: `examples` is a mapping whose
    "text" and "summary" entries are equally long sequences of strings.

    examples: Batch with the keys "text" (model inputs) and "summary" (targets).
    max_input_length: Token limit for the documents; longer inputs are truncated.
    max_target_length: Token limit for the summaries; longer targets are truncated.

    return: The tokenizer output for the documents with an extra "labels" entry
    holding the token ids of the summaries.
    """
    model_inputs = tokenizer(
        list(examples["text"]), max_length=max_input_length, truncation=True
    )
    labels = tokenizer(
        examples["summary"], max_length=max_target_length, truncation=True
    )
    # Only the ids of the targets are needed; they become the training labels.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [0]:
data_train = data_train.map(preprocess_function, batched=True)
data_test  = data_test.map(preprocess_function, batched=True)
data_val   = data_val.map(preprocess_function, batched=True)

data_train

Map:   0%|          | 0/18949 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Out[19]: Dataset({
    features: ['text', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 18949
})

### 2.2 Define evaluation function

In [0]:
# ROUGE metric object loaded through the `evaluate` library; used by compute_metrics.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    eval_pred: Pair `(predictions, labels)` of token-id arrays. `predictions`
        may also be a tuple whose first element is the array of generated ids.

    return: Dict with the ROUGE scores returned by `rouge.compute` plus
    "gen_len" (mean number of non-padding tokens per prediction), every value
    rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the generated ids; keep the ids only.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is an ignore/padding marker, not a vocabulary id, so it cannot be
    # decoded. Swap it for the pad token in BOTH arrays before decoding
    # (predictions can carry it too when batches of different lengths are
    # concatenated -- presumably by the trainer; confirm for the installed version).
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Length statistics count real tokens only (padding excluded).
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

### 2.3 Define model and data-collator

In [0]:
# Load the pretrained seq2seq weights of the base checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# The collator pads inputs and labels per batch. Its `model` argument must be the
# model *object* (not the checkpoint name string) so the collator can use the
# model's own helper to prepare decoder inputs from the labels.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

### 2.4 Call model train

In [0]:
# NOTE: `model_name` is rebound here. Until now it held the base checkpoint id
# ("facebook/bart-large-cnn"); from this cell on it names the fine-tuned model
# and is used for the training output directory and the save/upload paths.
# Re-running the earlier model-loading cells after this one would therefore
# look up the wrong name.
model_name = "bart-large-cnn-billsum"
# Configure MLflow's PyTorch autologging. It is called with disable=True, i.e.
# autologging is requested to be off for this run -- TODO confirm the exact
# effect of the remaining flags against the installed MLflow version.
mlflow.pytorch.autolog(
    log_input_examples=False,
    log_model_signatures=False,
    log_models=False,
    disable=True,
    exclusive=True,
    disable_for_unsupported_versions=True,
    silent=True
)

In [0]:
# Hyper-parameters of the fine-tuning run; outputs are written under `model_name`.
training_args = Seq2SeqTrainingArguments(
    output_dir=model_name,
    # optimisation
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    fp16=True,
    # evaluation: once per epoch, scoring generated summaries
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # checkpointing / publishing
    save_total_limit=3,
    push_to_hub=False,
)

# Wire model, data, collator and metric function together.
# Note that the validation frame (`data_val`) is the evaluation set here.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=data_train,
    eval_dataset=data_val,
    compute_metrics=compute_metrics,
)

trainer.train()

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: __index_level_0__, summary, text. If __index_level_0__, summary, text are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 32
  Num Epochs = 4
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 64


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,1.763327,0.5003,0.2393,0.3014,0.3026,140.6562
2,No log,1.30888,0.5415,0.2758,0.3291,0.3305,141.7812
3,No log,1.046858,0.5596,0.3031,0.3379,0.3376,139.875
4,No log,0.907518,0.5725,0.3128,0.3502,0.3492,141.3438


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: __index_level_0__, summary, text. If __index_level_0__, summary, text are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 2
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: __index_level_0__, summary, text. If __index_level_0__, summary, text are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 2
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: __index_level_0__, summary, text. If __index_level_0__, summary, text are not expected by `Bart

# 3. Save model

In [0]:
# trainer.push_to_hub(model_name, use_auth_token=os.environ["HF_TOKEN"])  # never paste the token itself here
# model.push_to_hub(model_name, use_temp_dir=True)
# tokenizer.push_to_hub(model_name, use_temp_dir=True)

### 3.1 Save model to local disk

In [0]:
# Write the model config and weights to the local directory saved_<model_name>/.
# NOTE(review): `repo_name` looks like a push-to-hub option; nothing is pushed
# by this call, so it is presumably ignored -- confirm and drop if so.
model.save_pretrained(f"saved_{model_name}", repo_name=model_name)

Configuration saved in saved_bart-large-cnn-billsum/config.json
Model weights saved in saved_bart-large-cnn-billsum/pytorch_model.bin


In [0]:
# Save the tokenizer next to the model and keep the returned tuple of file
# paths; the upload loop further down iterates over it.
# NOTE(review): `repo_name` looks like a push-to-hub option; nothing is pushed
# by this call, so it is presumably ignored -- confirm and drop if so.
tokenizer_files_to_save = tokenizer.save_pretrained(f"saved_{model_name}", repo_name=model_name)
tokenizer_files_to_save

tokenizer config file saved in saved_bart-large-cnn-billsum/tokenizer_config.json
Special tokens file saved in saved_bart-large-cnn-billsum/special_tokens_map.json
Out[25]: ('saved_bart-large-cnn-billsum/tokenizer_config.json',
 'saved_bart-large-cnn-billsum/special_tokens_map.json',
 'saved_bart-large-cnn-billsum/vocab.json',
 'saved_bart-large-cnn-billsum/merges.txt',
 'saved_bart-large-cnn-billsum/added_tokens.json',
 'saved_bart-large-cnn-billsum/tokenizer.json')

### 3.2 Install git-lfs and create the Hub repository

In [0]:
# -y answers apt's confirmation prompt up front; the notebook shell has no
# interactive stdin, so a prompt could not be answered here.
!sudo apt-get install -y git-lfs
# Set up the git-lfs hooks globally; --skip-repo because this is not run inside a git repository.
!git lfs install --skip-repo

Reading package lists... 0%Reading package lists... 0%Reading package lists... 0%Reading package lists... 4%Reading package lists... 4%Reading package lists... 5%Reading package lists... 5%Reading package lists... 48%Reading package lists... 48%Reading package lists... 48%Reading package lists... 48%Reading package lists... 60%Reading package lists... 60%Reading package lists... 63%Reading package lists... 69%Reading package lists... 69%Reading package lists... 74%Reading package lists... 74%Reading package lists... 74%Reading package lists... 74%Reading package lists... 74%Reading package lists... 74%Reading package lists... 74%Reading package lists... 74%Reading package lists... 84%Reading package lists... 84%Reading package lists... 92%Reading package lists... 92%Reading package lists... 96%Reading package lists... 96%Reading package lists... 96%Reading package lists... 96%Reading package lists... 98%Reading package 

In [0]:
from huggingface_hub import HfApi
api = HfApi()
# Create the model repository under the logged-in account.
# exist_ok=True makes the cell safe to re-run: an already existing repository
# is reused instead of raising an error.
api.create_repo(repo_id=model_name, exist_ok=True)

# RepoUrl('https://huggingface.co/sapthrishi007/bart-large-cnn-billsum', endpoint='https://huggingface.co', repo_type='model', repo_id='sapthrishi007/bart-large-cnn-billsum')

### 3.3 Upload to Huggingface model hub

In [0]:
from huggingface_hub import HfApi
api = HfApi()

# Upload the saved model configuration to the root of the Hub repository.
api.upload_file(
    repo_id=f"sapthrishi007/{model_name}",
    path_in_repo="config.json",
    path_or_fileobj=f"saved_{model_name}/config.json",
)

Out[29]: 'https://huggingface.co/sapthrishi007/bart-large-cnn-billsum/blob/main/config.json'

In [0]:
# Upload the fine-tuned weights to the root of the same Hub repository.
api.upload_file(
    repo_id=f"sapthrishi007/{model_name}",
    path_in_repo="pytorch_model.bin",
    path_or_fileobj=f"saved_{model_name}/pytorch_model.bin",
)

Out[30]: 'https://huggingface.co/sapthrishi007/bart-large-cnn-billsum/blob/main/pytorch_model.bin'

In [0]:
# Upload every tokenizer file returned by `tokenizer.save_pretrained` to the
# root of the Hub repository.
for tok_file in tokenizer_files_to_save:
    # Use the real file name instead of splitting on "/" at a fixed position,
    # which breaks for nested save directories or other path separators.
    file_name = os.path.basename(tok_file)

    # added_tokens.json is deliberately skipped -- presumably because it is not
    # always written to disk; verify before removing this guard.
    if file_name == "added_tokens.json":
        continue

    api.upload_file(
        path_or_fileobj = tok_file,
        path_in_repo    = file_name,
        repo_id         = f"sapthrishi007/{model_name}",
    )

# 4. Score

In [0]:
from transformers import pipeline

# NOTE: `model_name` is rebound once more, now to the full Hub repository id of
# the uploaded fine-tuned model.
model_name   = "sapthrishi007/bart-large-cnn-billsum"

# Build a summarization pipeline from that repository.
summarizer   = pipeline("summarization", model=model_name)

loading configuration file https://huggingface.co/sapthrishi007/bart-large-cnn-billsum/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/8038f16b1cca45c2e7b3b007ea262551609ce0dafa4baebf2588b54453a92892.f3e34561a5941e2e7d8f7bd3b39329501c6c5ebab51e82a90707cde99be7d3a6
Model config BartConfig {
  "_name_or_path": "sapthrishi007/bart-large-cnn-billsum",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  

In [0]:
input_text_1 = """Orchestration section right won't be decided for the example. Note we'll keep there is a there is currently a readme and then like feature notebook and model notebook.  Uh validations plus score, right? So we have these notebooks. And then  Actually, there are two different separate notebooks. So in the when we when we deploy this in an MLA environment, right? But we we decided that the way we will make users run these notebooks is through an orchestration job, right?  Uh-huh.  Like a like a job or a A just wanted wanted to know the plan. Are we? Are we achieving that in this 1/4 we are not? No, no. OK. So we, the community, the conversation I had with John and Lakshmi yesterday, that this quarter if we can just push this notebooks out there and if users are able to use it by are you reading going through the README file, create the silver table and then, you know, set up their own jobs. Yeah. Uh, then that is what we would do instead of like, this can be. And once we learn from that, then maybe in the next quarter we can optimize this process to have job orchestration in databricks. Maybe we can bring data pipeline or we might have to change data pipeline little but like we can do that as well. To. So that can be the future quarter effort because it will be right now if we do a job, OK, station, it would be still be like users creating their own silver table and then running the job orchestration to run the feature model build and model pipeline makes sense. To predict what we were. Yeah.  What we were thinking is in our orchestration section in the README notebook, we'll actually give them instructions on how to create a job and use a workflow just like how our data scientists are doing. Right. OK. They they create that workflow and we can have an example workflow already created in the environment where they can go see that workflow and then they create they can create themselves. OK. They. Yeah. 
Well, they won't see the workflow because we would demo it right in that integration. Yeah, read demo it. We demo it. We we would actually like in that room, but they're doing a video recording like we've done for other MLAs we would show them how to build to Lakshmi's point how to build a workflow which they could build on their own. Because what the the reality is, is that that there's a there's a notebook that happens in between feature, right. But prior to feature which is gonna be them creating the data sets in the format that we need, right? No. So they're basically creating the silver tables that we require and then we we don't know what that's at. We don't know what that's gonna be like cause they can onboard first party data in whatever format it's possible, right?"""

In [0]:
# Summarize the sample transcript; the result is a list with one dict holding 'summary_text'.
summarizer(input_text_1)

Out[34]: [{'summary_text': 'There is a there is currently a readme and then like feature notebook and model notebook and then validations plus score, right? And then  there are two different separate notebooks in the when we when we deploy this in an MLA environment. But we we decided that the way we will make users run these notebooks is through an orchestration job. Like a like a job or a A just wanted to know the plan.'}]