In [None]:
# Identifier string for this run; it is not referenced again in the visible part of the notebook.
dataset_id = "dat_pred"
from huggingface_hub import notebook_login
# Interactive Hub login, left disabled.
# notebook_login()

In [None]:
!pip install evaluate
!pip install rouge_score

In [None]:
# Store the Hugging Face access token for later Hub calls without embedding the
# secret in the notebook: read it from the HF_TOKEN environment variable and
# fall back to an interactive (hidden) prompt when the variable is not set.
import os
from getpass import getpass

from huggingface_hub import HfFolder

hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
HfFolder.save_token(hf_token)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub checkpoint that the rest of the notebook fine-tunes.
model_id = "google/flan-t5-large"

# Tokenizer belonging to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Load the raw conversations, build the train/test split and turn every record
# into a {'prompt', 'label'} example for seq2seq fine-tuning.
import json
import random

import pandas as pd
from datasets import Dataset

# Seed for the shuffle below. The split has to be identical across kernel
# restarts; otherwise examples that were trained on can reappear in the test
# split when the fine-tuned model is evaluated in a later session.
SPLIT_SEED = 42
# Number of shuffled test records that are moved into the training set.
N_TEST_MOVED_TO_TRAIN = 50

with open('/kaggle/input/salient/test_data.json') as f:
    test = json.load(f)
with open('/kaggle/input/salient-train/chatgpt_gen_date.json') as f:
    train = json.load(f)

# A private generator keeps the shuffle reproducible without reseeding the
# global `random` state used elsewhere in the notebook.
split_rng = random.Random(SPLIT_SEED)
split_rng.shuffle(train)
split_rng.shuffle(test)

print("original train size = ", len(train), ":: original test size = ", len(test))
train.extend(test[:N_TEST_MOVED_TO_TRAIN])
data_train = train
data_test = test[N_TEST_MOVED_TO_TRAIN:]
print("train size = ", len(data_train), ":: test size = ", len(data_test))

# Instruction appended to every prompt, keyed by the record field used as target.
base_prompts_list = {
    "label": "Given the above transcript and today's day and date, give me the date when the customer is expected to make their payment in the format 'dd/mm/yyyy'. Return 'NA' if its not possible to infer this information from the conversation. just return the date or NA and nothing else.",
    "days_diff": "Given the above transcript and today's day and date, give me the number of days after which the customer will be able to pay. Return 0 if its not possible to infer this information from the conversation.Just return the number of days or 0 if not inferrable and nothing else."
}
label_type_list = ['label', 'days_diff']

# Target currently trained on: index 1 selects 'days_diff'.
label_type = label_type_list[1]
base_prompt = base_prompts_list[label_type]


def _build_examples(records, prompt_suffix, label_key):
    """Convert raw records into prompt/label dicts.

    Args:
        records: iterable of dicts providing 'conversation',
            'conversation_date' and the `label_key` field.
        prompt_suffix: instruction text appended after the conversation date.
        label_key: name of the record field used as the target.

    Returns:
        A list of {'prompt': str, 'label': str} dicts, one per record.
    """
    return [
        {
            'prompt': rec['conversation'] + "\n todays date (dd/mm/yyyy) = " + rec['conversation_date'] + "\n" + prompt_suffix,
            'label': str(rec[label_key]),
        }
        for rec in records
    ]


data_train_processed = Dataset.from_pandas(
    pd.DataFrame(data=_build_examples(data_train, base_prompt, label_type))
)
data_test_processed = Dataset.from_pandas(
    pd.DataFrame(data=_build_examples(data_test, base_prompt, label_type))
)

In [None]:
from datasets import concatenate_datasets

# Both splits are tokenized once, purely to measure sequence lengths, so that the
# padding/truncation lengths derived here cover test examples as well as train
# examples. (Previously the train split was concatenated with itself, so the
# test split never contributed and every train row was tokenized twice.)
all_examples = concatenate_datasets([data_train_processed, data_test_processed])

# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = all_examples.map(lambda x: tokenizer(x["prompt"], truncation=True), batched=True, remove_columns=["prompt", "label"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_targets = all_examples.map(lambda x: tokenizer(x["label"], truncation=True), batched=True, remove_columns=["prompt", "label"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

In [None]:
def preprocess_function(sample,padding="max_length"):
    """Tokenize a batch of prompt/label pairs for seq2seq training.

    Args:
        sample: batch with a "prompt" column and a "label" column.
        padding: padding strategy handed to the tokenizer for both prompts
            and labels.

    Returns:
        The tokenizer output for the prompts with an added "labels" entry
        holding the tokenized label ids. When padding is "max_length", pad
        ids inside the labels are replaced by -100 so the loss ignores them.
    """
    prompts = list(sample["prompt"])
    model_inputs = tokenizer(prompts, max_length=max_source_length, padding=padding, truncation=True)

    # `text_target` makes the tokenizer encode the strings as targets.
    targets = tokenizer(text_target=sample["label"], max_length=max_target_length, padding=padding, truncation=True)
    label_ids = targets["input_ids"]

    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

# Tokenize the training split in batches; the raw text columns are dropped so that
# only the fields returned by preprocess_function remain.
tokenized_dataset = data_train_processed.map(preprocess_function, batched=True, remove_columns=["prompt", "label"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset.features)}")

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained seq2seq checkpoint named by model_id from the Hub.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [None]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
# Tokenizer data for sent_tokenize, which postprocess_text calls.
# NOTE(review): recent NLTK releases may require "punkt_tab" instead — confirm for the installed version.
nltk.download("punkt")

# ROUGE metric object used by compute_metrics.
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    """Normalise decoded predictions and references for ROUGE.

    Every string is stripped of surrounding whitespace and re-joined with one
    sentence per line, which is the layout rougeLSum expects.

    Args:
        preds: decoded prediction strings.
        labels: decoded reference strings.

    Returns:
        A `(preds, labels)` tuple of lists holding the reformatted strings.
    """
    stripped_preds = [text.strip() for text in preds]
    stripped_labels = [text.strip() for text in labels]

    def _one_sentence_per_line(text):
        return "\n".join(sent_tokenize(text))

    return (
        [_one_sentence_per_line(text) for text in stripped_preds],
        [_one_sentence_per_line(text) for text in stripped_labels],
    )

def compute_metrics(eval_preds):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_preds: `(predictions, labels)` pair of token-id arrays as passed
            by the trainer; `predictions` may itself be a tuple, in which case
            its first element is used.

    Returns:
        Dict with the ROUGE scores scaled to percentages (rounded to four
        decimals) plus "gen_len", the mean number of non-pad tokens per
        prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a vocabulary id and cannot be decoded. Labels carry it where
    # padding was masked out of the loss, and predictions can carry it too when
    # generated batches of different lengths are concatenated, so both arrays
    # are mapped back to the pad id before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    # Counted after the -100 replacement so padding never inflates the length.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [None]:
from transformers import DataCollatorForSeq2Seq

# Value the collator uses when padding labels; -100 so that padded label
# positions are ignored in the loss.
label_pad_token_id = -100
# Collator that assembles batches for the trainer; padded lengths are rounded
# up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [None]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Local output directory for checkpoints and logs ("results/<checkpoint name>").
# The same string is also passed as hub_model_id, which only matters if
# push_to_hub is switched on.
repository_id = "results/" + model_id.split("/")[1]

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # generate sequences during evaluation so compute_metrics receives token ids
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=7e-5,
    num_train_epochs=200,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=1000,
    # evaluate and checkpoint once per epoch, keeping at most two checkpoints
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # NOTE(review): with metric_for_best_model left commented out, the "best"
    # checkpoint is chosen by the trainer's default criterion — presumably the
    # evaluation loss; confirm.
    load_best_model_at_end=True,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=False,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Create Trainer instance
# NOTE(review): eval_dataset is the same tokenized training data as
# train_dataset, so the reported evaluation metrics are measured on data the
# model is trained on.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Freeze the whole network, then re-enable gradients only for the last
# N_TRAINABLE_BLOCKS transformer blocks of the encoder and of the decoder,
# their final layer norms, and the LM head.
N_TRAINABLE_BLOCKS = 1  # must be >= 1; a slice of [-0:] would select every block

for param in model.parameters():
    param.requires_grad = False

# Negative slicing picks the last block(s) whatever the depth of the checkpoint.
# For the 24-block large model this is the same selection as block[23:], but it
# no longer selects nothing when model_id points at a shallower checkpoint.
trainable_modules = [
    model.encoder.block[-N_TRAINABLE_BLOCKS:],
    model.encoder.final_layer_norm,
    model.decoder.block[-N_TRAINABLE_BLOCKS:],
    model.decoder.final_layer_norm,
    model.lm_head,
]
for module in trainable_modules:
    for param in module.parameters():
        param.requires_grad = True

n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
n_total = sum(p.numel() for p in model.parameters())
print("trainable_params / total_params")
print(n_trainable, "/", n_total)


In [None]:
# Run fine-tuning with the trainer instance.
trainer.train()

In [None]:
# Evaluate on the trainer's eval_dataset; the returned metrics are shown as the cell output.
trainer.evaluate()

In [None]:
# Write the tokenizer and the model to the local "salient-ai" directory.
tokenizer.save_pretrained("salient-ai")
model.save_pretrained("salient-ai")

In [None]:
# Hub repository name built from the checkpoint name and the label type.
# NOTE(review): there is no separator between "Salient_ai" and the checkpoint
# name — confirm this is the intended repository name.
model_name_on_hub = "Salient_ai" + model_id.split("/")[1] + "_" + label_type
# Upload tokenizer and model under that name.
tokenizer.push_to_hub(model_name_on_hub)
model.push_to_hub(model_name_on_hub)

# Inference

In [None]:
from transformers import AutoModel, AutoModelForSeq2SeqLM

# Hub repository the fine-tuned checkpoint was pushed to.
model_path = "pratt3000/" + model_name_on_hub

# Reload with the seq2seq class: plain AutoModel returns the bare backbone
# without the language-modelling head, which cannot generate text.
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)



In [None]:
from transformers import pipeline
from random import randrange        

# Build an inference pipeline from the Hub repository in model_path.
# Note that this rebinds `model` from a model object to a pipeline callable.
model = pipeline(model=model_path)
    
# select a random test sample
sample = data_test_processed[randrange(len(data_test_processed))]
print(f"dialogue: \n{sample['prompt']}\n---------------")

# run the pipeline on the sample prompt
res = model(sample["prompt"])

print(f"RESULT: \n{res}, ACTUAL: {sample['label']}")

In [None]:
# Print predictions next to the ground truth for a small window of test examples.
PREVIEW_START = 50  # index of the first example shown
PREVIEW_STOP = 62   # one past the last example shown (indices 50..61)

model = pipeline(model = model_path)

# Index the window directly instead of enumerating and skipping the first rows;
# this also avoids a loop variable named `id`, which shadowed the builtin.
preview_stop = min(PREVIEW_STOP, len(data_test_processed))
for example_idx in range(PREVIEW_START, preview_stop):
    d = data_test_processed[example_idx]

    print(d['prompt'])

    res = model(d["prompt"])

    print(f"RESULT: \n{res}, ACTUAL: {d['label']} \n\n")

In [None]:
from datetime import datetime

def date_difference_in_days(date_str1, date_str2, label_kind=None):
    """Return the absolute distance, in days, between a prediction and its label.

    Args:
        date_str1: first value, as a string.
        date_str2: second value, as a string.
        label_kind: "days_diff" to treat both arguments as integer day counts;
            any other value treats them as calendar dates. When omitted, the
            notebook-level `label_type` is used.

    Returns:
        A non-negative int: the difference of the two day counts, or the
        number of days between the two dates.

    Raises:
        ValueError: if a day count is not an integer or a date matches none of
            the accepted formats.
    """
    if label_kind is None:
        label_kind = label_type

    if label_kind == "days_diff":
        return abs(int(date_str1) - int(date_str2))

    def _parse(text):
        # ISO dates were the only accepted form before; 'dd/mm/yyyy' is the
        # form the 'label' prompt asks the model to produce, so accept both.
        for date_format in ("%Y-%m-%d", "%d/%m/%Y"):
            try:
                return datetime.strptime(text.strip(), date_format)
            except ValueError:
                continue
        raise ValueError(f"unrecognised date format: {text!r}")

    delta = _parse(date_str2) - _parse(date_str1)
    return abs(delta.days)

In [None]:
from tqdm import tqdm

# Error, in days, charged when a mismatching pair has 'NA' on one side (arbitrary).
NA_PENALTY_DAYS = 10


def _score_split(dataset, split_name, verbose=False):
    """Run the pipeline in `model` over a split and print accuracy and mean deviation.

    Args:
        dataset: examples with a 'prompt' and a 'label' field.
        split_name: name inserted into the printed summary lines.
        verbose: when True, every wrong prediction is printed as it occurs.

    Returns:
        Tuple `(exact_matches, total_deviation, failed)`, where `failed` counts
        the examples whose deviation could not be computed. Those examples are
        excluded from the denominators of both printed averages.
    """
    exact_matches = 0
    total_deviation = 0
    failed = 0
    for example in tqdm(dataset):
        predicted = model(example["prompt"])[0]['generated_text']
        expected = str(example['label'])

        if predicted == expected:
            exact_matches += 1
            continue

        if verbose:
            print("Wrong prediction: ")
            print("Generated date", predicted, ":: ground truth date", expected)

        if predicted == 'NA' or expected == 'NA':
            total_deviation += NA_PENALTY_DAYS
            continue

        try:
            total_deviation += date_difference_in_days(predicted, expected)
        except Exception as e:
            # Count the failure so that it is removed from the denominators
            # below instead of silently scoring as zero deviation.
            failed += 1
            print(e)
            if verbose:
                print("Generated date", predicted, ":: ground truth date", expected)
            else:
                print(predicted, expected)

    scored = len(dataset) - failed
    accuracy = exact_matches / scored if scored else float("nan")
    avg_deviation = total_deviation / scored if scored else float("nan")
    print("ACCURACY (" + split_name + ") = ", accuracy)
    print("avg_deviation (" + split_name + ") = ", avg_deviation)
    return exact_matches, total_deviation, failed


same, cur_dist, num_exceptions = _score_split(data_train_processed, "train")

same, cur_dist, num_exceptions = _score_split(data_test_processed, "test", verbose=True)