In [1]:
%pip install py7zr

Collecting py7zr
  Downloading py7zr-0.20.5-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting brotli>=1.0.9
  Downloading Brotli-1.0.9-cp39-cp39-manylinux1_x86_64.whl (357 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m357.2/357.2 kB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyzstd>=0.14.4
  Downloading pyzstd-0.15.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (399 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m399.3/399.3 kB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pybcj>=0.6.0
  Downloading pybcj-1.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multivolumefile>=0.2.3
  Downloading multivolumefile-0.2.3-py3-none-any.whl (17 kB)
Collec

In [3]:
# Install/upgrade the libraries used for tokenisation, training and ROUGE evaluation.
# NOTE(review): nothing is pinned and --upgrade takes the newest release at run time,
# so a re-run may not reproduce the recorded results — consider pinning versions.
%pip install --upgrade transformers accelerate datasets pandas scikit-learn scipy  nltk rouge-score evaluate


[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
import pandas as pd 
# Fetch the "punkt" data needed by nltk's sent_tokenize (used when post-processing
# predictions for ROUGE).
nltk.download("punkt")

# model_id: pretrained checkpoint that is fine-tuned below.
# repository_id: used as the Trainer output directory and as the tokenizer save path.
model_id="google/flan-t5-large"
repository_id = 'claradata/emailface-flan-t5-large'


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
# Load the tokenizer that belongs to the checkpoint named by model_id
# ("google/flan-t5-large", set in the setup cell).
tokenizer = AutoTokenizer.from_pretrained(model_id)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [6]:

def _is_junk(row):
    """Return True when no is_* flag is set and every extracted field is 'N/A'."""
    no_flag_set = all(row[k] == False for k in ('is_travel', 'is_ad', 'is_virtual'))
    return no_flag_set and all(
        row[k] == 'N/A' for k in ('who', 'where', 'who_from', 'when', 'summary')
    )


# Load the labelled emails and discard rows that carry no usable label at all.
df = pd.read_parquet('dataset.parquet')
df['is_junk'] = df.apply(_is_junk, axis=1)
df = df[~df['is_junk']]

# Build the prompt dataset: every email is paired with each question below.
# Each entry is (label column, question text); the shared prefix embeds the
# email body through the %s placeholder.
_PROMPT_PREFIX = 'Text : \n\n %s. \n\n '
_PROMPT_SPECS = [
    ('is_travel', 'Yes or No - Does this document contain confirmation or planning for a reservation, booking, flight, hotel, or trip?'),
    ('when', 'What date and time is the reservatioan?'),
    ('where', 'What location is the reservation for?'),
    ('who', 'Who is the reservation for?'),
    ('who_from', 'Who is the message from?'),
    ('summary', 'Summarize the text in one sentence in English: '),
    ('is_ad', 'Yes or no - is this a mass promotional message, or concerning a discount or offer from a store?'),
    ('is_virtual', 'Yes or no - is this about a virtual event?'),
]
questions = [_PROMPT_PREFIX + question for _column, question in _PROMPT_SPECS]
q_index = [column for column, _question in _PROMPT_SPECS]

prompt_dataset = []
for row in df.to_dict('records'):
    for column, template in zip(q_index, questions):
        answer = row[column]
        # Boolean label columns are turned into the literal answers Yes / No.
        if column.startswith('is_'):
            answer = 'Yes' if answer else 'No'
        # Only the first 1250 characters of the email body go into the prompt.
        prompt_dataset.append({
            'text': template % row['content'][:1250],
            'answer': answer,
        })

# Drop prompts whose answer is the "not available" marker (with or without a trailing dot).
df = pd.DataFrame(prompt_dataset)
has_answer = (df['answer'] != 'N/A') & (df['answer'] != 'N/A.')
df = df[has_answer]

# 80/20 train/test split with a fixed seed.
ds = Dataset.from_pandas(df, preserve_index=False).train_test_split(
    test_size=0.2, shuffle=True, seed=42, load_from_cache_file=True
)



In [7]:

def preprocess_function(sample,padding="max_length"):
    """Tokenize a batch of prompts and answers for seq2seq training.

    Args:
        sample: batch mapping with a "text" sequence (prompts) and an
            "answer" sequence (targets).
        padding: padding mode forwarded to both tokenizer calls.

    Returns:
        The tokenizer output for the prompts (truncated to 512 tokens) with an
        added "labels" entry holding the answer token ids (truncated to 64).
        When ``padding == "max_length"`` every pad id inside the labels is
        replaced by -100 so that padding is ignored in the loss.
    """
    encoded = tokenizer(list(sample["text"]), max_length=512, padding=padding, truncation=True)

    # Targets go through the `text_target` keyword argument.
    targets = tokenizer(text_target=sample["answer"], max_length=64, padding=padding, truncation=True)

    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        targets["input_ids"] = [
            [-100 if token == pad_id else token for token in sequence]
            for sequence in targets["input_ids"]
        ]

    encoded["labels"] = targets["input_ids"]
    return encoded


# Tokenize both splits in batches; the raw 'text' and 'answer' columns are dropped
# so that only the tokenizer outputs and the labels remain.
tokenized_dataset = ds.map(preprocess_function, batched=True, remove_columns=['text','answer'])
# Quick check of which columns the tokenized training split ended up with.
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")


Map:   0%|          | 0/48439 [00:00<?, ? examples/s]

Map:   0%|          | 0/12110 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [8]:
# Load the pretrained seq2seq checkpoint named by model_id.
# NOTE(review): device_map={'':0} presumably places the whole model on device 0 — confirm.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map={'':0})

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
# Repeats the punkt download from the setup cell; the recorded output reports the
# package as already up-to-date.
nltk.download("punkt")

# ROUGE metric used by compute_metrics to score decoded predictions against references.
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    """Normalise decoded predictions and references for ROUGE scoring.

    Each string is stripped of surrounding whitespace and split into sentences
    with ``sent_tokenize``; the sentences are re-joined with newlines because
    rougeLSum expects a newline after each sentence.

    Returns:
        A ``(preds, labels)`` tuple of lists of newline-joined strings.
    """
    def _one_sentence_per_line(text):
        return "\n".join(sent_tokenize(text.strip()))

    cleaned_preds = [_one_sentence_per_line(pred) for pred in preds]
    cleaned_labels = [_one_sentence_per_line(label) for label in labels]
    return cleaned_preds, cleaned_labels

def compute_metrics(eval_preds):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. When
            ``predictions`` is a tuple only its first element is used.

    Returns:
        Dict with every score returned by ``metric.compute`` multiplied by 100
        and rounded to 4 decimals, plus ``"gen_len"``: the mean number of
        non-pad tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is the ignore index, not a vocabulary id, so it cannot be decoded.
    # It is present in the labels and can also appear in the predictions when
    # generated sequences are padded, so both arrays are mapped back to the pad
    # id before decoding. This also stops -100 padding from being counted as
    # generated tokens in gen_len below.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing (strip + one sentence per line).
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [10]:
from transformers import DataCollatorForSeq2Seq

# Label positions set to -100 are the ones meant to be ignored in the loss;
# preprocess_function already writes -100 over pad tokens in the labels.
label_pad_token_id = -100
# Collator that batches the tokenized examples for the trainer, padding labels
# with label_pad_token_id and padding lengths to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments


# Optimisation settings: batch sizes, precision, learning rate and run length.
_optimisation_kwargs = dict(
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=False,  # Overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=1,
)

# Logging every 500 steps; evaluation and checkpointing once per epoch.
_reporting_kwargs = dict(
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Training arguments: outputs go to repository_id, and evaluation uses
# generation so that compute_metrics receives generated token ids.
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    predict_with_generate=True,
    **_optimisation_kwargs,
    **_reporting_kwargs,
)

# Trainer wired to the tokenized train/test splits and the ROUGE metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [11]:
# Interactive Hugging Face login widget; run this before the cells that push to the hub.
# NOTE(review): presumably stores a token that the later push_to_hub calls use — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
# Fine-tune the model with the arguments defined above.
trainer.train()

# Save the tokenizer files into the trainer's output directory so they sit next
# to the model files there.
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Trainer.push_to_hub takes the commit message as its first positional
# parameter, not a repository name, so passing the repo name there only produced
# a misleading commit message. The target repository comes from the training
# arguments (the output directory), so it is not repeated here.
trainer.push_to_hub(commit_message="End of training")





huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.4124,0.357396,88.0408,22.5033,87.5886,87.5677,5.278943


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [18]:

# Push the trainer's output directory to the hub again. The first positional
# parameter of Trainer.push_to_hub is the commit message (the repository is
# taken from the training arguments), so it is passed by keyword with a real message.
trainer.push_to_hub(commit_message="End of training")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

/notebooks/claradata/emailface-flan-t5-large is already a clone of https://huggingface.co/claradata/emailface-flan-t5-large. Make sure you pull the latest changes with `repo.git_pull()`.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [20]:
# Save the trainer's model into ./emailface-flan-t5-large (relative to the working directory).
# NOTE(review): the output recorded under this cell lists tokenizer files under
# 'emailface-flat-5-large', which does not match this call — it looks stale.
trainer.save_model('emailface-flan-t5-large') 


('emailface-flat-5-large/tokenizer_config.json',
 'emailface-flat-5-large/special_tokens_map.json',
 'emailface-flat-5-large/spiece.model',
 'emailface-flat-5-large/added_tokens.json',
 'emailface-flat-5-large/tokenizer.json')

In [22]:
# Save the tokenizer files into the same local directory as the saved model so the
# directory can be loaded on its own (see the pipeline cell below).
tokenizer.save_pretrained('emailface-flan-t5-large') 

('emailface-flan-t5-large/tokenizer_config.json',
 'emailface-flan-t5-large/special_tokens_map.json',
 'emailface-flan-t5-large/spiece.model',
 'emailface-flan-t5-large/added_tokens.json',
 'emailface-flan-t5-large/tokenizer.json')

In [26]:
pwd

'/notebooks'

In [29]:
# Load the locally saved model and tokenizer into a text2text-generation pipeline on device 0.
# NOTE(review): the absolute path assumes the notebook runs from /notebooks (as the
# pwd cell shows); the save cells above used the matching relative directory.
from transformers import pipeline
emailface = pipeline(model = '/notebooks/emailface-flan-t5-large/', device=0, task='text2text-generation')


In [44]:
# Email body to query goes between the quotes (empty in this saved copy).
text = '''
'''
# Same prompt templates as the training cell. They must stay identical to the
# training prompts, including the "reservatioan" spelling the model was trained on.
questions = [
     'Text : \n\n %s. \n\n Yes or No - Does this document contain confirmation or planning for a reservation, booking, flight, hotel, or trip?',
     'Text : \n\n %s. \n\n What date and time is the reservatioan?',
     'Text : \n\n %s. \n\n What location is the reservation for?',
     'Text : \n\n %s. \n\n Who is the reservation for?',
     'Text : \n\n %s. \n\n Who is the message from?',
     'Text : \n\n %s. \n\n Summarize the text in one sentence in English: ',
     'Text : \n\n %s. \n\n Yes or no - is this a mass promotional message, or concerning a discount or offer from a store?',
     'Text : \n\n %s. \n\n Yes or no - is this about a virtual event?'
]

# The training prompts embedded only the first 1250 characters of each email;
# use the same cut-off at inference so the inputs match what the model saw
# during fine-tuning (this cell previously used 1500).
MAX_CONTENT_CHARS = 1250
qs = [q % text[:MAX_CONTENT_CHARS] for q in questions]

# One generation per question, in the order of `questions`.
summary = emailface(qs,max_new_tokens=64)



Both `max_new_tokens` (=64) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=64) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=64) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=64) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `ma

In [45]:
# Display the pipeline results: one {'generated_text': ...} dict per prompt in `qs`.
summary


[{'generated_text': 'Yes'},
 {'generated_text': 'Tue 27 June 2023 from 08:00 To 16:00.'},
 {'generated_text': 'The Vobster Team.'},
 {'generated_text': 'Brady Dice.'},
 {'generated_text': 'The Vobster Team'},
 {'generated_text': "Brady Dice's booking for Tue 27 June 2023 from 08:00 To 16:00 has been confirmed."},
 {'generated_text': 'No'},
 {'generated_text': 'No'}]

In [47]:
# Reload the fine-tuned model from the local save directory; this rebinds `model`.
model = AutoModelForSeq2SeqLM.from_pretrained('/notebooks/emailface-flan-t5-large/') 

In [None]:
# Reload the tokenizer from the same local save directory; this rebinds `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained('/notebooks/emailface-flan-t5-large/') 

In [None]:
# Upload the reloaded tokenizer to the hub repository named 'emailface-flan-t5-large'.
tokenizer.push_to_hub('emailface-flan-t5-large') 

In [None]:
# Upload the reloaded model weights to the same hub repository as the tokenizer.
model.push_to_hub('emailface-flan-t5-large')



pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]