In [1]:
dataset_id = "dat_pred"
from huggingface_hub import notebook_login
# notebook_login()

In [2]:
!pip install evaluate
!pip install rouge_score

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.1
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l- done
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=7c50621d605c26e70d66f15376c2c53de33a6bee046def9df440ee382bce2871
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [3]:
!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('<huggingface_key>')"

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id="google/flan-t5-large"

# Load tokenizer of FLAN-t5
tokenizer = AutoTokenizer.from_pretrained(model_id)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [5]:
#read json
import json
import pandas as pd
from datasets import Dataset

with open('/kaggle/input/salient/test_data.json') as f:
    test = json.load(f)
with open('/kaggle/input/salient-train/chatgpt_gen_date.json') as f:
    train = json.load(f)
    
import random
random.shuffle(train)
random.shuffle(test)

print("original train size = ", len(train), ":: original test size = ", len(test))
train.extend(test[:50])
data_train = train
data_test = test[50:]
print("train size = ", len(data_train), ":: test size = ", len(data_test))

data_train_processed = []
data_test_processed = []
# base_prompt = """Given the above transcript and today's day and date, tell me after how many days will customer be able to pay?
# Return 'NA' if its not possible to infer that. Just output the number of days or 'NA' in your response and nothing else."""
base_prompts_list = {
    "label": "Given the above transcript and today's day and date, give me the date when the customer is expected to make their payment in the format 'dd/mm/yyyy'. Return 'NA' if its not possible to infer this information from the conversation. just return the date or NA and nothing else.", 
    "days_diff": "Given the above transcript and today's day and date, give me the number of days after which the customer will be able to pay. Return 0 if its not possible to infer this information from the conversation.Just return the number of days or 0 if not inferrable and nothing else."
}
label_type_list = ['label', 'days_diff']

label_type = label_type_list[1] 
base_prompt = base_prompts_list[label_type]

for d in data_train:
    data_train_processed.append(
        {
            'prompt': d['conversation'] + "\n todays date (dd/mm/yyyy) = " + d['conversation_date'] + "\n" + base_prompt,
            'label': str(d[label_type])
        }
    )
for d in data_test:
    data_test_processed.append(
        {
            'prompt': d['conversation'] + "\n todays date (dd/mm/yyyy) = " + d['conversation_date'] + "\n" + base_prompt,
            'label': str(d[label_type])
        }
    )

data_train_processed = Dataset.from_pandas(pd.DataFrame(data=data_train_processed))
data_test_processed = Dataset.from_pandas(pd.DataFrame(data=data_test_processed))

# dataset = {"train": data_train_processed, "test":data_test_processed}

original train size =  109 :: original test size =  113
train size =  159 :: test size =  63


In [6]:
from datasets import concatenate_datasets

# The maximum total input sequence length after tokenization. 
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([data_train_processed, data_train_processed]).map(lambda x: tokenizer(x["prompt"], truncation=True), batched=True, remove_columns=["prompt", "label"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization. 
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([data_train_processed, data_train_processed]).map(lambda x: tokenizer(x["label"], truncation=True), batched=True, remove_columns=["prompt", "label"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

  0%|          | 0/1 [00:00<?, ?ba/s]

Max source length: 175


  0%|          | 0/1 [00:00<?, ?ba/s]

Max target length: 3


In [7]:
def preprocess_function(sample,padding="max_length"):
    # add prefix to the input for t5
    inputs = [item for item in sample["prompt"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["label"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = data_train_processed.map(preprocess_function, batched=True, remove_columns=["prompt", "label"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset.features)}")

  0%|          | 0/1 [00:00<?, ?ba/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [8]:
from transformers import AutoModelForSeq2SeqLM

# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]



Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [10]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [11]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Hugging Face repository id 
repository_id = "results/" + model_id.split("/")[1]

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=7e-5,
    num_train_epochs=200,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=1000,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=False,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    compute_metrics=compute_metrics,
)

In [12]:
# freeze the backbone and only finetune the decoder's second half.
for param in model.parameters():
    param.requires_grad = False

# train end of encoder
for param in model.encoder.block[23:].parameters(): # total 11 block in base encoder, 23 in large
    param.requires_grad = True
for param in model.encoder.final_layer_norm.parameters():
    param.requires_grad = True
    
# train end of decoder
for param in model.decoder.block[23:].parameters(): #total 11 block in base decoder, 23 in large
    param.requires_grad = True
for param in model.decoder.final_layer_norm.parameters():
    param.requires_grad = True
    
for param in model.lm_head.parameters():
    param.requires_grad = True

print("trainable_params / total_params")
print(sum(p.numel() for p in model.parameters() if p.requires_grad), "/",sum(p.numel() for p in model.parameters()))


trainable_params / total_params
62790656 / 783150080


In [13]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,1.796482,1.8868,0.0,1.8868,1.8868,2.0
2,No log,1.666331,4.4025,0.0,4.4025,4.4025,2.025157
3,No log,1.59852,12.5786,0.0,12.5786,12.5786,2.050314
4,No log,1.573319,12.5786,0.0,12.5786,12.5786,2.062893
5,No log,1.551305,13.8365,0.0,13.8365,13.8365,2.069182
6,No log,1.553611,11.3208,0.0,11.3208,11.3208,2.050314
7,No log,1.522587,12.5786,0.0,12.5786,12.5786,2.056604
8,No log,1.506019,15.0943,0.0,15.0943,15.0943,2.069182
9,No log,1.475803,18.8679,0.0,18.8679,18.8679,2.106918
10,No log,1.45605,20.1258,0.0,20.1258,20.1258,2.106918


TrainOutput(global_step=4000, training_loss=0.5698620491027832, metrics={'train_runtime': 4257.631, 'train_samples_per_second': 7.469, 'train_steps_per_second': 0.939, 'total_flos': 2.51939946627072e+16, 'train_loss': 0.5698620491027832, 'epoch': 200.0})

In [14]:
trainer.evaluate()

{'eval_loss': 0.0051579102873802185,
 'eval_rouge1': 100.0,
 'eval_rouge2': 0.0,
 'eval_rougeL': 100.0,
 'eval_rougeLsum': 100.0,
 'eval_gen_len': 2.1320754716981134,
 'eval_runtime': 9.2083,
 'eval_samples_per_second': 17.267,
 'eval_steps_per_second': 2.172,
 'epoch': 200.0}

In [15]:
# Save our tokenizer and create model card
tokenizer.save_pretrained("salient-ai")
model.save_pretrained("salient-ai")

In [16]:
model_name_on_hub = "Salient_ai" + model_id.split("/")[1] + "_" + label_type
tokenizer.push_to_hub(model_name_on_hub)
model.push_to_hub(model_name_on_hub)

pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/pratt3000/Salient_aiflan-t5-large_days_diff/commit/f3cac2b7852230c74846ee14e8b7357d79de0dd8', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='f3cac2b7852230c74846ee14e8b7357d79de0dd8', pr_url=None, pr_revision=None, pr_num=None)

# Inference

In [17]:
from transformers import AutoModel
model_path = "pratt3000/" + model_name_on_hub

model = AutoModel.from_pretrained(model_path)



Downloading (…)lve/main/config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

In [18]:
from transformers import pipeline
from random import randrange        

# load model and tokenizer from huggingface hub with pipeline
model = pipeline(model=model_path)
    
# select a random test sample
sample = data_test_processed[randrange(len(data_test_processed))]
print(f"dialogue: \n{sample['prompt']}\n---------------")

# summarize dialogue
res = model(sample["prompt"])

print(f"RESULT: \n{res}, ACTUAL: {sample['label']}")

Downloading (…)neration_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

dialogue: 
Agent: ...Is payment the only concern today?
Customer: Actually, no. I noticed an unfamiliar charge in my statement. Can you clarify what it's for?
 todays date (dd/mm/yyyy) = 2020-05-26, Tuesday
Given the above transcript and today's day and date, give me the number of days after which the customer will be able to pay. Return 0 if its not possible to infer this information from the conversation.Just return the number of days or 0 if not inferrable and nothing else.
---------------
RESULT: 
[{'generated_text': '0'}], ACTUAL: 0


In [19]:
model = pipeline(model = model_path)

for id, d in enumerate(data_test_processed):
    if id<50:
        continue
    
    print(d['prompt'])

    res = model(d["prompt"])

    print(f"RESULT: \n{res}, ACTUAL: {d['label']} \n\n")

    if id > 60:
        break

Agent: Hi, I'm Taylor, calling from Westlake Financial on a recorded line. Unfortunately, we did not receive your monthly payment! Would you be able to make a payment today?
Customer: yeah how much is the payment
Agent: Your monthly payment is $322.55. Are you able to make this payment today?
Customer: yeah i can make a payment sorry my wife normally does this but i yeah i can do a payment right now right now
 todays date (dd/mm/yyyy) = 2023-03-23, Thursday
Given the above transcript and today's day and date, give me the number of days after which the customer will be able to pay. Return 0 if its not possible to infer this information from the conversation.Just return the number of days or 0 if not inferrable and nothing else.
RESULT: 
[{'generated_text': '0'}], ACTUAL: 0 


Agent: Could you please specify the date when you can make the payment?
Customer: the nineth
 todays date (dd/mm/yyyy) = 2020-12-20, Sunday
Given the above transcript and today's day and date, give me the number of

In [20]:
from datetime import datetime

def date_difference_in_days(date_str1, date_str2):
    
    if label_type == "days_diff":
        return abs(int(date_str1) - int(date_str2))
    
    # Define the format of the date string
    date_format = "%Y-%m-%d"

    # Parse the date strings into datetime objects
    date1 = datetime.strptime(date_str1, date_format)
    date2 = datetime.strptime(date_str2, date_format)

    # Calculate the difference in days
    delta = date2 - date1
    return abs(delta.days)

In [21]:
from tqdm import tqdm
same = 0
cur_dist = 0
num_exceptions= 0
for id, d in tqdm(enumerate(data_train_processed)):
    res = model(d["prompt"])
    
    if res[0]['generated_text'] == str(d['label']):
        same += 1
        cur_dist += 0
    elif res[0]['generated_text'] == 'NA' or str(d['label']) == 'NA':
        cur_dist += 10 # arbitrary 10 day error added
    else:
        try:
            cur_dist += date_difference_in_days(res[0]['generated_text'], str(d['label']))
        except Exception as e:
            print(e)
            print(res[0]['generated_text'], str(d['label']))

print("ACCURACY (train) = ", same/(len(data_train_processed)-num_exceptions))
print("avg_deviation (train) = ", cur_dist/(len(data_train_processed)-num_exceptions))

from tqdm import tqdm
same = 0
cur_dist = 0
num_exceptions= 0
for id, d in tqdm(enumerate(data_test_processed)):
    res = model(d["prompt"])
    
    if res[0]['generated_text'] == str(d['label']):
        same += 1
        cur_dist += 0
    elif res[0]['generated_text'] == 'NA' or str(d['label']) == 'NA':
        print("Wrong prediction: ")
        print("Generated date", res[0]['generated_text'],":: ground truth date", str(d['label']))
        cur_dist += 10 # arbitrary 10 day error added
    else:
        try:
            print("Wrong prediction: ")
            print("Generated date", res[0]['generated_text'],":: ground truth date", str(d['label']))
            cur_dist += date_difference_in_days(res[0]['generated_text'], str(d['label']))
        except Exception as e:
            print(e)
            print("Generated date", res[0]['generated_text'],":: ground truth date", str(d['label']))

print("ACCURACY (test) = ", same/(len(data_test_processed) - num_exceptions))
print("avg_deviation (test) = ", cur_dist/(len(data_test_processed) - num_exceptions))

159it [02:10,  1.22it/s]


ACCURACY (train) =  1.0
avg_deviation (train) =  0.0


2it [00:01,  1.18it/s]

Wrong prediction: 
Generated date 9 :: ground truth date 0


3it [00:02,  1.22it/s]

Wrong prediction: 
Generated date 2 :: ground truth date 13


4it [00:03,  1.23it/s]

Wrong prediction: 
Generated date 4 :: ground truth date 9


5it [00:04,  1.24it/s]

Wrong prediction: 
Generated date 4 :: ground truth date 3


6it [00:04,  1.26it/s]

Wrong prediction: 
Generated date 10 :: ground truth date 6


7it [00:05,  1.27it/s]

Wrong prediction: 
Generated date 4 :: ground truth date 7


8it [00:06,  1.27it/s]

Wrong prediction: 
Generated date 16 :: ground truth date 15


9it [00:07,  1.26it/s]

Wrong prediction: 
Generated date 2 :: ground truth date 31


10it [00:07,  1.28it/s]

Wrong prediction: 
Generated date 4 :: ground truth date 1


11it [00:08,  1.28it/s]

Wrong prediction: 
Generated date 6 :: ground truth date 8


12it [00:09,  1.29it/s]

Wrong prediction: 
Generated date 29 :: ground truth date 27


13it [00:10,  1.25it/s]

Wrong prediction: 
Generated date 4 :: ground truth date 3


14it [00:11,  1.15it/s]

Wrong prediction: 
Generated date 6 :: ground truth date 0


15it [00:12,  1.17it/s]

Wrong prediction: 
Generated date 5 :: ground truth date 4


16it [00:13,  1.19it/s]

Wrong prediction: 
Generated date 16 :: ground truth date 39


17it [00:13,  1.21it/s]

Wrong prediction: 
Generated date 2 :: ground truth date 12


20it [00:16,  1.20it/s]

Wrong prediction: 
Generated date 6 :: ground truth date 2


21it [00:17,  1.23it/s]

Wrong prediction: 
Generated date 6 :: ground truth date 9


22it [00:17,  1.26it/s]

Wrong prediction: 
Generated date 30 :: ground truth date 37


24it [00:19,  1.22it/s]

Wrong prediction: 
Generated date 29 :: ground truth date 22


26it [00:21,  1.23it/s]

Wrong prediction: 
Generated date 6 :: ground truth date 7


27it [00:22,  1.25it/s]

Wrong prediction: 
Generated date 4 :: ground truth date 7


28it [00:22,  1.24it/s]

Wrong prediction: 
Generated date 29 :: ground truth date 38


29it [00:23,  1.26it/s]

Wrong prediction: 
Generated date 16 :: ground truth date 11


32it [00:25,  1.28it/s]

Wrong prediction: 
Generated date 2 :: ground truth date 0


38it [00:31,  1.16it/s]

Wrong prediction: 
Generated date 6 :: ground truth date 7


43it [00:35,  1.18it/s]

Wrong prediction: 
Generated date 21 :: ground truth date 4


44it [00:36,  1.22it/s]

Wrong prediction: 
Generated date 4 :: ground truth date 8


52it [00:44,  1.01it/s]

Wrong prediction: 
Generated date 9 :: ground truth date 0


53it [00:44,  1.07it/s]

Wrong prediction: 
Generated date 2 :: ground truth date 1


54it [00:45,  1.11it/s]

Wrong prediction: 
Generated date 2 :: ground truth date 20


56it [00:47,  1.12it/s]

Wrong prediction: 
Generated date 6 :: ground truth date 3


58it [00:49,  1.15it/s]

Wrong prediction: 
Generated date 6 :: ground truth date 5


59it [00:50,  1.19it/s]

Wrong prediction: 
Generated date 6 :: ground truth date 11


62it [00:52,  1.14it/s]

Wrong prediction: 
Generated date 2 :: ground truth date 29


63it [00:53,  1.17it/s]

Wrong prediction: 
Generated date 0 :: ground truth date 1
ACCURACY (test) =  0.42857142857142855
avg_deviation (test) =  3.7936507936507935



