In [2]:
%pip install --upgrade transformers accelerate datasets pandas scikit-learn scipy  nltk rouge-score evaluate py7zr ninja nvidia-ml-py3

Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m80.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m82.1 MB/s[0m eta [36m0:00:00[0m
Collecting pandas
  Downloading pandas-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m89.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.

In [2]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
torch.backends.cuda.matmul.allow_tf32 = True

import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
import pandas as pd 
nltk.download("punkt")

model_id="google/flan-t5-large"
repository_id = 'claradata/emailface-flan-t5-large-multilingual'


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [3]:

df = pd.read_parquet('/storage/datasets/translated.parquet')
df['is_junk'] = df.apply(lambda x: all([x[k] == False for k in ['is_travel','is_ad', 'is_virtual']])  and all([x[k]=='N/A' for k in ['who','where','who_from','when','summary']]), axis=1)
df = df[~df['is_junk']]

#build dataset : 
questions = [
     'Text : \n\n %s. \n\n Yes or No - Does this document contain confirmation or planning for a reservation, booking, flight, hotel, or trip?',
     'Text : \n\n %s. \n\n What date and time is the reservatioan?',
     'Text : \n\n %s. \n\n What location is the reservation for?',
     'Text : \n\n %s. \n\n Who is the reservation for?',
     'Text : \n\n %s. \n\n Who is the message from?',
     'Text : \n\n %s. \n\n Summarize the text in one sentence in English: ',
     'Text : \n\n %s. \n\n Yes or no - is this a mass promotional message, or concerning a discount or offer from a store?',
     'Text : \n\n %s. \n\n Yes or no - is this about a virtual event?'
]
q_index = ['is_travel','when','where','who','who_from','summary','is_ad','is_virtual']
prompt_dataset = []
for row in df.to_dict('records') : 
    for i,q in enumerate(questions) : 
        answer = row[q_index[i]] 
        if q_index[i].startswith('is_') : 
            answer = 'Yes' if answer else 'No' 
        rec = [ {
            'text' : q % row[key][:1250] ,
            'answer' : answer 
        } for key in [a for a in row.keys() if 'content' in a ]]
        prompt_dataset.extend(rec) 

df = pd.DataFrame(prompt_dataset).drop_duplicates()
df = df[df['answer']!='N/A']
df = df[df['answer']!='N/A.']
# df = df.sample(int(len(df)*0.6))

ds = Dataset.from_pandas(df, preserve_index=False) 
ds = ds.train_test_split(test_size=0.2 , shuffle=True, seed=42, load_from_cache_file=True)



In [4]:

def preprocess_function(sample,padding="max_length"):
    # add prefix to the input for t5
    inputs = [item for item in sample["text"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=512, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["answer"], max_length=64, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_dataset = ds.map(preprocess_function, batched=True, remove_columns=['text','answer'])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")


Map:   0%|          | 0/156654 [00:00<?, ? examples/s]

Map:   0%|          | 0/39164 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_id,  device_map={'':0})

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [10]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments


# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    tf32=True, # Overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=1,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)


In [11]:
trainer.train() 



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.8555,0.723648,77.882,15.7215,77.4395,77.4525,5.252324


TrainOutput(global_step=7833, training_loss=0.9700805827695627, metrics={'train_runtime': 19019.1191, 'train_samples_per_second': 8.237, 'train_steps_per_second': 0.412, 'total_flos': 3.610511185721426e+17, 'train_loss': 0.9700805827695627, 'epoch': 1.0})

In [12]:
trainer.save_model('/storage/models/emailface-flan-t5-large-multilingual') 
tokenizer.save_pretrained('/storage/models/emailface-flan-t5-large-multilingual') 

('/storage/models/emailface-flan-t5-large-multilingual/tokenizer_config.json',
 '/storage/models/emailface-flan-t5-large-multilingual/special_tokens_map.json',
 '/storage/models/emailface-flan-t5-large-multilingual/spiece.model',
 '/storage/models/emailface-flan-t5-large-multilingual/added_tokens.json',
 '/storage/models/emailface-flan-t5-large-multilingual/tokenizer.json')

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained('/storage/models/emailface-flan-t5-large-multilingual-tf32/') 
tokenizer = AutoTokenizer.from_pretrained('/storage/models/emailface-flan-t5-large-multilingual-tf32/') 
tokenizer.push_to_hub('emailface-flan-t5-large-multilingual-tf32') 
model.push_to_hub('emailface-flan-t5-large-multilingual-tf32')



pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/claradata/emailface-flan-t5-large-multilingual-tf32/commit/429a68da85b0c58ef6f43fdcafea43845d68c34e', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='429a68da85b0c58ef6f43fdcafea43845d68c34e', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
from transformers import pipeline
emailface = pipeline(model = 'claradata/emailface-flan-t5-large-multilingual-tf32', device=0, task='text2text-generation')

Downloading (…)lve/main/config.json:   0%|          | 0.00/825 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [7]:
text = '''
Brady, your order is reserved
You haven’t paid anything yet. See below for when your card will be automatically charged.

To manage your booking on the go, download our app.

June 19, 2023 at 4:45 AM

Warsaw: Full-day Tour to Krakow and Auschwitz by Train

1 Adult (Age 0 - 99) • 15 hours

£163.65

See activity details

Warsaw: Full-day Tour to Krakow and Auschwitz by Train
tickets_large
When you'll get your tickets
Your tickets will be available in the app once your payment of £163.65 is processed on June 16, 2023. You can also pay early if you prefer.

Go to app
checkmark-out-circle_large
Important information
What to bring
Passport or ID card
Comfortable shoes
Not allowed
Luggage or large bags
Know before you go
• This tour is not recommended for people with limited mobility
• The time of pickup depends on the pickup location and the departure time of the train which is subject to change. The exact time will be confirmed a day before the tour
• As per the requirements of Auschwitz-Birkenau Memorial and Museum, all participants are required to provide their full name and contact details as part of the booking
• Entrance may be refused if the name provided on the booking is not identical to the name on the ID provided when entering
• Owing to the requirements, all tickets to the museum are non-refundable. Please consider your purchase carefully
• The pace and duration of the tours are determined by the memorial's visitor service. Unfortunately, GetYourGuide and your guide have no influence on the duration of the break times

pin_large
What to do on the day
exclamation-in-circle_large	
The activity provider will email or phone you with full pickup info the day before the activity.

Pickup location
Sheraton Grand, Bolesława Prusa, Warsaw, Poland

• Please wait in the hotel lobby 10 minutes before your scheduled pickup time. The driver will be holding a sign with your last name on it
• Drivers will wait no longer than 5 minutes after the scheduled pickup time

Where your activity ends
You’ll be dropped off at the same place you were picked up.

Same as the pickup location

calendar-with-arrow_large
Cancellation policy
Cancel before 4:45 AM on June 18th for a full refund

Manage your booking
question-in-circle_large
Your activity provider is AB Everest Travel
Contact them if you have any questions about your activity.

+48 722 261 731
Email
Booking reference
GYGZG3HRBHLV

PIN
KfKu9tQy

Continue your trip in our app
tickets_white_large
Access your tickets offline

chat-bubble_white_large
Get support in just 1 minute

bell_white_large
Get essential updates on the go
'''
questions = [
     'Text : \n\n %s. \n\n Yes or No - Does this document contain confirmation or planning for a reservation, booking, flight, hotel, or trip?',
     'Text : \n\n %s. \n\n What date and time is the reservatioan?',
     'Text : \n\n %s. \n\n What location is the reservation for?',
     'Text : \n\n %s. \n\n Who is the reservation for?',
     'Text : \n\n %s. \n\n Who is the message from?',
     'Text : \n\n %s. \n\n Summarize the text in one sentence in English: ',
     'Text : \n\n %s. \n\n Yes or no - is this a mass promotional message, or concerning a discount or offer from a store?',
     'Text : \n\n %s. \n\n Yes or no - is this about a virtual event?'
]

qs = [q % text[:1250] for q in questions]

summary = emailface(qs,max_new_tokens=64, temperature=0.)



Both `max_new_tokens` (=64) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=64) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=64) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=64) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `ma

In [8]:
summary


[{'generated_text': 'Yes'},
 {'generated_text': 'The reservation is for June 19, 2023 at 4:45 AM.'},
 {'generated_text': 'The locations involved are Warsaw and Krakow.'},
 {'generated_text': 'The reservation is for 1 adult.'},
 {'generated_text': 'The message is from Booking.com.'},
 {'generated_text': 'This document is a confirmation for a train ticket from Warsaw to Krakow and Auschwitz.'},
 {'generated_text': 'No'},
 {'generated_text': 'No'}]