In [None]:
!pip install datasets
!pip install tensorflow-hub

import pandas as pd

import email
from email import policy

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer,  BertTokenizer, BertForSequenceClassification
from datasets import load_dataset, Dataset
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

import tensorflow_hub as hub

import nltk
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('punkt_tab')
nltk.download("book")
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

from sklearn.model_selection import train_test_split

from collections import defaultdict
import unicodedata
import string
import re
import os




[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to /root/nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to /root/nltk_data...
[nltk_data]    |   Package conll2002 is already up-to-date!
[nltk_data]    | Downloading package dependency_treebank to
[nltk_data]    | 

In [None]:
#run just if you are running in colab abd have data in google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#drive or local dir for datasets and results
DIR = "drive/MyDrive/nlp_project/"

### Helper functions for load, process and filter the datsets

In [None]:
def getBodyFromContent(text):
    chunks = []
    for chunk in text.walk():
        if chunk.get_content_type() == "text/plain":
            chunks.append(chunk.get_payload())
    return "".join(chunks).strip()

def clean_subject(text):
    bad_strings = ["FW:", "Fw:", "fw:", "FW", "Fw", "fw", "RE:", "Re:", "re:", "RE", "Re", "re"]
    for bs in bad_strings:
        text = text.replace(bs, "")
    return text

def length_filter(text):
    """
    Returns True if the # of words of the text is greater than 1.
    """
    return len(text.strip().split()) > 1

def chars_filter(text):
    """
    Returns True if the text contains at least one character, False otherwise
    """
    return any([c.isalpha() for c in text])

def no_subject_filter(text):
    return "no subject" not in text.lower()

def subject_filters(text):
    """
    Returns True if the text passes both length and char filters.
    """
    return length_filter(text) and chars_filter(text) and no_subject_filter(text)

In [None]:
def getSignatures(path):
    segments = pd.DataFrame()
    signatures = []

    for filename in os.listdir(path):
        if filename.endswith("body"):
            with open(f"{path}/{filename}") as f:
                lines = f.readlines()
                signature = []
                body = []
                for line in reversed(lines):
                    if line.strip() == "":
                        continue
                    elif line.startswith("#sig#"):
                        signature.insert(0, line[5:].strip())
                    else:
                        body.insert(0, line.strip())
                signature = "\n".join(signature)
                body = "\n".join(body)

                signatures.append(signature)

                body_sentences = nltk.sent_tokenize(body)

                segments = pd.concat([segments, pd.DataFrame({
                    "value": [signature],
                    "label": ["signature"]
                })])

                segments = pd.concat([segments, pd.DataFrame({
                    "value": body_sentences,
                    "label": ["body"] * len(body_sentences)
                })])
    return signatures

In [None]:
def getPleasantries(file_path):
    with open(file_path) as f:
        lines = f.readlines()
    return lines

In [None]:
def parseEnronEmails(file_path, chunk_size=5000, n=None):

    parsed_emails = pd.DataFrame()
    emails = pd.read_csv(file_path)

    while True:

        chunk = emails.sample(n=chunk_size, random_state=42)

        #estraggo subject e text da messaggio e metto in dataframe
        content = list(map(email.message_from_string, chunk['message']))
        subjects = [c["Subject"].strip() for c in content]
        bodies = list(map(getBodyFromContent, content))

        chunk_df = pd.DataFrame({
            "subject": subjects,
            "body": bodies
        })

        chunk_df = chunk_df[chunk_df["body"].str.len() <= 2500]

        chunk_df["subject"] = chunk_df["subject"].apply(clean_subject)

        #filtro i subject
        chunk_df = chunk_df[chunk_df["subject"].apply(subject_filters)]

        #aggiungo al datframe
        parsed_emails = pd.concat([parsed_emails, chunk_df])

        #finchè non ottengo la dimensione desiderata
        if n != None and len(parsed_emails) >= n:
            parsed_emails = parsed_emails.sample(n=n, random_state=42)
            break
    return parsed_emails




In [None]:
def split_body(text):
    res = []
    chunks = re.split("\n{2,}", text)
    for chunk in chunks:
        res.extend(nltk.sent_tokenize(chunk))
    return res

In [None]:
def cleanEnronSignatures(phrase):
    terms = [
        "Enron North America Corp",
        "Enron North America"
        "Enron"
    ]
    for term in terms:
        phrase = phrase.replace(term, "")
    return phrase


### Classes for the first step classification

We look for similarities between sentences in the reference datsets (like pleasantries and signatures) and sentences in the email messages

In [None]:
class PartialPhraseMatcher():

    """
    This class allows to check for matching between sentences based on common
    sequences of words of a specific length (partial string matching)
    """

    def __init__(self, removeStopwords=False, removePunct=True):
        """
        This class performs partial string matching comparing a corpus with a sample of strings.
        """
        self.removeStopwords = removeStopwords
        self.removePunct = removePunct

    def normalize(self, sentences):
        """
        normalize a sentence by removing stopwords and punctuation
        """
        def _normalize(text):
            #uniform characters
            text = unicodedata.normalize("NFD", text)
            text = text.encode("ascii", "ignore").decode("UTF-8")

            #remove punctuation, single spaces and lowercase
            text = text.lower()
            text = re.sub(f"\s+", " ", text)
            text = str.translate(text, str.maketrans("", "", string.punctuation))

            return text.strip()

        return list(map(_normalize, sentences))

    def getNgrams(self, entities, sizes):
        res = []
        for entity in entities:
            entity = entity.strip().split(" ")
            for size in sizes:
                subs = [" ".join(entity[i:i+size]) for i in range(len(entity)-size+1)]
                res.extend(subs)
        return res


    def match(self, entities, corpus, threshold=3, returnIndex=True):
        #normalize both the reference phrases and the corpus
        entities = self.normalize(entities)
        corpus = self.normalize(corpus)

        matches = []
        indexes = []
        substrings = self.getNgrams(entities, [threshold])

        for i, doc in enumerate(corpus):
            if any(s in doc for s in substrings):
                matches.append(doc)
                indexes.append(i)

        if returnIndex:
            return matches, indexes
        else:
            return matches


class BOWPhraseMatcher():

    """
    This class allows to check for matching between sentences based on the
    number common words
    """

    def __init__(self, removeStopwords=False, removePunct=True):
        """
        This class performs partial string matching comparing a corpus with a sample of strings.
        """
        self.removeStopwords = removeStopwords
        self.removePunct = removePunct

    def normalize(self, sentences):
        """
        normalize a sentence by removing stopwords and punctuation
        """
        def _normalize(text):
            #uniform characters
            text = unicodedata.normalize("NFD", text)
            text = text.encode("ascii", "ignore").decode("UTF-8")

            #remove punctuation, single spaces and lowercase
            text = text.lower()
            text = re.sub(f"\s+", " ", text)
            text = str.translate(text, str.maketrans("", "", string.punctuation))

            return text.strip()

        return list(map(_normalize, sentences))

    def getBOW(self, text):

            tokens = nltk.word_tokenize(text)
            tokens = [token for token in tokens if token not in string.punctuation]
            tokens = [token for token in tokens if token not in nltk.corpus.stopwords.words("english")]

            lemmatizer = WordNetLemmatizer()
            tokens = [lemmatizer.lemmatize(token) for token in tokens]

            return set(tokens)


    def match(self, entities, corpus, threshold=3, returnIndex=True):
        #normalize both the reference phrases and the corpus
        entities = self.normalize(entities)
        corpus = self.normalize(corpus)

        matches = []
        indexes = []
        ent_BOW = list(map(self.getBOW, entities))


        for i, doc in enumerate(corpus):

            doc_BOW = self.getBOW(doc)
            for ent in ent_BOW:
                if len(ent.intersection(doc_BOW)) > threshold:
                    matches.append(doc)
                    indexes.append(i)
                    break

        if returnIndex:
            return matches, indexes
        else:
            return matches

class USEPhraseMatcher():

    """
    This class allows to check for matching between sentences based on sentence
    embedding
    """

    def __init__(self, removeStopwords=True, removePunct=True):

        """
        This class performs partial string matching comparing a corpus with a sample of strings.
        Normalization is applied before the comparison
        """

        self.embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

    def match(self, entities, corpus, threshold = 0.7, returnIndex=True):

        matches = []
        indexes = []

        entities_embeddings = self.embed(entities)
        corpus_embeddings = self.embed(corpus)

        similarity_matrix = cosine_similarity(corpus_embeddings, entities_embeddings)


        for i, doc in enumerate(corpus):
            similarity = max(similarity_matrix[i])

            if similarity > threshold:
                matches.append(doc)
                indexes.append(i)

        if returnIndex:
            return matches, indexes
        else:
            return matches



### Get the datsets


In [None]:
CHUNK_SIZE = 10000
DATASET_SIZE = 100000

#email messages
parsed_emails = parseEnronEmails(DIR+"dataset/emails.csv", chunk_size=CHUNK_SIZE, n=DATASET_SIZE)
corpus = parsed_emails["body"].tolist()
subjects = parsed_emails["subject"].tolist()

#reference datasets
pleasantries = getPleasantries(DIR+"dataset/pleasantires.txt")
signatures = getSignatures(DIR+"dataset/signatures")

### Cassify the sentences

In [None]:
#the classes have all the same methods so is sufficient to uncomment the desired Matcher
#matcher = USEPhraseMatcher()
#matcher = BOWPhraseMatcher()
matcher = PartialPhraseMatcher()

results = defaultdict(lambda: [])

for i in range(len(corpus)):
    phrases = split_body(corpus[i])
    _, sig_idx = matcher.match(signatures, phrases)
    _, ple_idx = matcher.match(pleasantries, phrases)
    _, body_idx = matcher.match([subjects[i]], phrases)

    #signatures and pleasantries some times share the same structure (for final
    #greetings or thanks) so we don't consider multiple labels for the same sentence

    sig_idx = list(set(sig_idx).difference(set(ple_idx)))
    results["signature"].extend([phrases[j] for j in sig_idx])
    results["pleasantry"].extend([phrases[j] for j in ple_idx])
    results["body"].extend([phrases[j] for j in body_idx])


In [None]:
#we need to remove domain specific terms not to influence the following training
results["signature"] = list(map(cleanEnronSignatures, results["signature"]))

In [None]:
#size of the datsets
n_samples = min(len(results["pleasantry"]), len(results["signature"]), len(results["body"]))
len(results["pleasantry"]), len(results["signature"]), len(results["body"])

(32107, 7365, 60967)

### Bert fine-tuning

In [None]:
#we fix the labels so we can use them bor both cases
#when we train and when we load the fine tuned model
labels_ = [
    "pleasantry",
    "signature",
    "body"
]

In [None]:
class BertFineTuner:

    """
    This class is an helper for the fine tuning of Bert for sentence classification
    """

    def __init__(self, labels, model_name='bert-base-uncased'):
        # load the tokenizer and the model


        tokenizer_path = "/".join(model_name.split("/")[:-1])
        tokenizer_path = tokenizer_path if tokenizer_path else model_name

        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(labels))
        self.num_labels = len(labels)
        self.labels = labels

        self.label2id = {label: i for i, label in enumerate(labels)}
        self.id2label = {i: label for label, i in self.label2id.items()}

    def _tokenize(self, sentences):
        return self.tokenizer(sentences["sentence"], padding="max_length", truncation=True, return_tensors='pt')


    def train(self, sentences, labels, eval_size=0.2, output_dir='./results', epochs=1, batch_size=8):
        """
        Trains the model
        """
        training_args = TrainingArguments(
            output_dir=output_dir,
            save_total_limit=1,
            evaluation_strategy="epoch",
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=epochs,
            weight_decay=0.01,
        )

        data = {
            "sentence": sentences,
            "label": [self.label2id[label] for label in labels]
        }

        dataset = Dataset.from_dict(data)
        tokenized_dataset = dataset.map(self._tokenize, batched=True)

        tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

        train_test_split = tokenized_dataset.train_test_split(test_size=eval_size)

        train_dataset = train_test_split['train']
        eval_dataset = train_test_split['test']

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset
        )

        self.trainer.train()

        self.model = self.trainer.model
        self.trainer.save_model(output_dir)
        self.tokenizer.save_pretrained(output_dir)

    def predict(self, sentences):
        """Performs the predictions"""
        inputs = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
        dataset = Dataset.from_dict(inputs)
        predictions = self.model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        logits = predictions.logits

        # Trasforma in probabilità e applica la soglia
        max_idx = np.argmax(logits.detach().numpy(), axis=1)
        return [self.id2label[i] for i in max_idx]


In [216]:
import wandb
wandb.init(mode="disabled")
#execute the fine tuning
phrases = []
labels = []

for k, v in results.items():
    phrases.extend(v[:n_samples])
    labels.extend([k] * n_samples)


bert_trainer = BertFineTuner(labels_)
bert_trainer.train(phrases, labels, epochs=1, output_dir=DIR+"results/bert_results")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/22095 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,0.0672,0.049068


Some running examples


In [217]:
emails = [
    """
    Greetings Steve.
    We have had many inquiries regarding our transportation agreements, specifically PGT, Kern River and El Paso.
    Most pressing is PGT's request to market capacity on our behalf. Although I do not believe we need to have PGT perform this function, I do believe we could mitigate some demand charge exposure for "the estate".
    Are you available to meet with Barry and I first thing tomorrow morning? We need to have an answer for PGT's request. We need to know what the contractual rights are for 1) the pipeline when a shipper declares bankruptcy and 2) our ability to mitigate demand charge exposure by releasing capacity on a short term basis (month at a time?).
    Better yet, can we unilaterally turn capacity back to any of these pipes or do we need to reach some type of court approved settlement?
    Obviously, we have many questions. Please advise of your ability to meet ASAP.

    Thanks,
    Stephanie
    """,

    """
    Welcome to UBS Warburg Energy, LLC.
    All UBS Warburg Energy, LLC employees are invited to join us for breakfast and happy hour on
    Friday, February 8, 2002.
    Breakfast, 8:15 a.m., here on the 3rd floor.
    Happy Hour, 3:00 p.m., appetizers and drinks (2),
    Fernando's Hideaway, 824 SW First Avenue (upstairs)
    We hope you will join us!
    Chris and Tim
    """,

    """
    Mark, further to our conversation... Mercado transacted with Enron North America under the terms and conditions of a financial transaction and these transactions have been or will be settled financially, however these transactions could have been set up as physical.  The structure of the transaction changes as a result of a few variables, including the price,day of cash settlement, and curtailment issues on San Jan deliveries which would then automatically convert the fixed price portion to 100% load factor and settlement would be on 100% of the volume regardless as to what physically flowed.

    Please call me if there are any questions regarding this note.

    Barry  Tycholiz
    Vice President, Enron North America
    """
]

In [219]:
results = pd.DataFrame()

bert_trainer = BertFineTuner(labels_, model_name=DIR+"results/bert_results/checkpoint-2210")
tokenizer = BertTokenizer.from_pretrained(DIR+"results/bert_results")

for row in emails:
    sentences = nltk.sent_tokenize(row)
    predictions = bert_trainer.predict(sentences)
    results = pd.concat([results, pd.DataFrame({
        "sentence": sentences,
        "prediction": predictions
    })])

results

Unnamed: 0,sentence,prediction
0,\n Greetings Steve.,signature
1,We have had many inquiries regarding our trans...,body
2,Most pressing is PGT's request to market capac...,body
3,Although I do not believe we need to have PGT ...,body
4,Are you available to meet with Barry and I fir...,body
5,We need to have an answer for PGT's request.,body
6,We need to know what the contractual rights ar...,body
7,"Better yet, can we unilaterally turn capacity ...",body
8,"Obviously, we have many questions.",pleasantry
9,Please advise of your ability to meet ASAP.,body


Can we predict something never seen?


We try to classify some sentences that havo no match in the reference datasets so we know that the model can classify never seen examples.

In [230]:
emails = [
    """
    Good mornig, this is an example of email message that should be classified correctly as a body.
    I'd love to receive a reply soon.
    Thank you for you patience.

    Nicola Rinaldi
    Università degli Studi di Milano
    nicola.rinaldi@studenti.unimi.it
    """
]

In [231]:
results = pd.DataFrame()
for row in emails:
    sentences = nltk.sent_tokenize(row)
    predictions = bert_trainer.predict(sentences)
    results = pd.concat([results, pd.DataFrame({
        "sentence": sentences,
        "prediction": predictions
    })])

results

Unnamed: 0,sentence,prediction
0,"\n Good mornig, this is an example of email...",body
1,I'd love to receive a reply soon.,pleasantry
2,Thank you for you patience.,pleasantry
3,Nicola Rinaldi\n Università degli Studi di ...,signature


## Argument Extraction

Find pair subject-message that are informative using the same strategy for the matching in the first fase

In [None]:
matcher = PartialPhraseMatcher()

valid_dataset = pd.DataFrame()

for j, (i, row) in enumerate(parsed_emails.iterrows()):

    # if j % 1000 == 0:
    #     print(f"{j}/{len(valid_dataset)}")
    if len(valid_dataset) > 100000:
        break

    #we keep only shorts subjects to keep the predictions not to verbose
    #the dataset contains not so much examples with our characteristics so we keep more examples
    if len(row['subject'].split()) > 100:
        continue

    res, _ = matcher.match([row["subject"]],[row["body"]])
    if res != []:
        valid_dataset = pd.concat([valid_dataset, pd.DataFrame({
            "subject": [row["subject"]],
            "body": [row["body"]]
        })])

print(len(valid_dataset))

38257


In [None]:
valid_dataset = parsed_emails.sample(n=30000)
t5_dataset = Dataset.from_pandas(valid_dataset)

# Split the datset for trining and evaluation
train_test_split = t5_dataset.train_test_split(test_size=0.10)

train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

train_dataset.shape, val_dataset.shape

((27000, 3), (3000, 3))

In [None]:
MODEL = 't5-small'
BATCH_SIZE = 48
NUM_PROCS = 16
EPOCHS = 5
OUT_DIR = DIR+'results/results_t5small'
MAX_LENGTH = 256

In [None]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def tokenize_function(examples):
    inputs = [example for example in examples["body"]]
    targets = examples["subject"]

    model_inputs = tokenizer(
        inputs,
        max_length=256,
        padding="max_length",
        truncation=True)

    # Tokenizziamo i target (oggetto)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=128,
            padding="max_length",
            truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [None]:
tok_train = train_dataset.map(tokenize_function, batched=True)
tok_val = val_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/27000 [00:00<?, ? examples/s]



Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")
# Total parameters and trainable parameters.q
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [None]:
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=10,
    evaluation_strategy='steps',
    save_steps=500,
    eval_steps=500,
    load_best_model_at_end=True,
    save_total_limit=5,
    report_to='tensorboard',
    learning_rate=0.0001,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_train,
    eval_dataset=tok_val
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
history = trainer.train()
tokenizer.save_pretrained(DIR+'results/results_t5small')

Step,Training Loss,Validation Loss
500,0.2016,0.208017
1000,0.2147,0.18414
1500,0.1875,0.176734


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


('drive/MyDrive/nlp_project/results/results_t5small/tokenizer_config.json',
 'drive/MyDrive/nlp_project/results/results_t5small/special_tokens_map.json',
 'drive/MyDrive/nlp_project/results/results_t5small/spiece.model',
 'drive/MyDrive/nlp_project/results/results_t5small/added_tokens.json')

In [None]:
model = T5ForConditionalGeneration.from_pretrained(DIR+'results/results_t5small/checkpoint-1688')
tokenizer = T5Tokenizer.from_pretrained(DIR+'results/results_t5small')

In [None]:
def predict_argument(text, model, tokenizer):
    input_text = f"{text}"
    inputs = tokenizer.encode(
        input_text,
        return_tensors='pt',
        max_length=256,
        padding='max_length',
        truncation=True
    )

    # Get correct sentence ids.
    corrected_ids = model.generate(
        inputs,
        max_length=256,
        num_beams=5, # `num_beams=1` indicated temperature sampling.
        early_stopping=True
    )

    # Decode.
    corrected_sentence = tokenizer.decode(
        corrected_ids[0],
        skip_special_tokens=True
    )
    return corrected_sentence

In [None]:
emails = [
    """
   Hi Mark,

    I hope you're doing well! I'm hosting a Christmas party on Saturday at John's, and I'd love for you to join us. It’s going to be a fun evening with great food, drinks, and holiday cheer!

    Let me know if you can make it!

    Best,
    Emily
    """,

    """
    Hi Frank,

    I wanted to knowin on the progress of the project. How are things progressing? Please let me know if there’s anything that needs attention or if we’re on track.

    Thanks in advance!

    Regards,
    Tim
    """,

    """
    Hello July,

    I just wanted to see how you’ve been! It’s been a while since we last caught up, and I’d love to hear how things are going with you.

    Hope to hear from you soon!

    Thanks,
    Sarah
    """
]

In [None]:
for email_ in emails:
    print(predict_argument(email_, model, tokenizer))

Christmas Party
progress of the project
You’ve Been!
