# Set number of sentences to evaluate

In [26]:
# Number of test sentences to score. A falsy value (0 or None) makes the
# evaluation cell fall back to the full test split.
NROWS = 25

# Install / import

In [27]:
import torch

import transformers
from transformers import pipeline

!pip install datasets

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



# Load the data

In [28]:
from datasets import load_dataset

# Load the "osyvokon/pavlick-formality-scores" dataset. The repr recorded in the
# next cell shows a DatasetDict with `train` and `test` splits, each carrying the
# 'domain', 'avg_score' and 'sentence' features.
data = load_dataset("osyvokon/pavlick-formality-scores")

In [29]:
# Show the DatasetDict repr: available splits, their row counts and their features
data

DatasetDict({
    train: Dataset({
        features: ['domain', 'avg_score', 'sentence'],
        num_rows: 9274
    })
    test: Dataset({
        features: ['domain', 'avg_score', 'sentence'],
        num_rows: 2000
    })
})

# Load the model

In [30]:
from transformers import pipeline

# We use Llama-3.2-3B-Instruct due to its compact size
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Build the text-generation pipeline; get_formality_llama() reads it as the
# module-level name `pipe`.
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,  # request bfloat16 for the model weights
    device_map="auto",  # automatic device placement; the recorded run reports cuda:0
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [31]:
# We add a few examples from the train dataset to the prompt

def get_formality_llama(txt):
    """Classify a sentence as formal (1) or informal (0) with the Llama pipeline.

    A few-shot system prompt asks the model to answer only "YES" (formal) or
    "NO" (informal); the reply is then converted to a binary label.

    Parameters
    ----------
    txt : str
        The sentence to classify. It is inserted verbatim into the user message.

    Returns
    -------
    int
        1 if the model answered "YES", 0 if it answered "NO".

    Raises
    ------
    KeyError
        If the reply cannot be read as "YES" or "NO".

    Notes
    -----
    Relies on the module-level ``pipe`` text-generation pipeline.
    NOTE(review): a temperature is passed, so if sampling is enabled by the
    model's generation config the label for a sentence may differ between
    runs - consider greedy decoding or a fixed seed for reproducible metrics.
    """
    prompt = """
    You are an expert linguist trained to assess the formality of sentences. Your task is to evaluate the formality in a given sentence,
    where the value "YES" indicates extremely formal language, such as what might be used in official documents or academic papers,
    and the value "NO" indicates informal language, typical of everyday communication or slang, casual text messages, or colloquial expressions.
    Consider factors such as vocabulary choice, sentence structure, and tone when assessing the formality.

    Examples:
    1) The formal sentence "His platform contains plans to fund drainage projects." should be assigned the value "YES".
    2) The formal sentence "The $30 million lost in Governor Bush's budget request 02-03 was slated for consumers on the wait list." should be assigned the value "YES".
    3) The formal sentence "With police officers dying in large numbers and Maoists carrying out bolder attacks, the debate around the insurgency has sharpened
    in India's intellectual salons and on the opinion pages and talk shows." should be assigned the value "YES".
    4) The formal sentence "Keep up the good work and if we may be of any assistance to you please let us know." should be assigned the value of "YES".
    5) The informal sentence "Watch out for the coaster car though." should be assigned the value "NO".
    6) The informal sentence "no!if you wana chat about it then leave me a cmment" should be assigned the value "NO".
    7)The informal sentence "I still live with my mom and I'm 23... Why not?" should be assigned the value "NO".
    8) The informal sentence "yes makedamnsure is my favorite song by taking back sunday" should be assigned the value "NO".

    Write only "YES" if the sentence is formal or write only "NO" if it is informal.
    """

    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": "Assign YES or NO to this sentence: {0}".format(txt)},
    ]
    outputs = pipe(
        messages,
        max_new_tokens=4,  # the expected answer is a single short word
        temperature=0.5,
        pad_token_id=pipe.tokenizer.eos_token_id
    )

    # With chat-style input the pipeline returns the whole conversation; the
    # last message is the newly generated assistant reply.
    yes_or_no = outputs[0]["generated_text"][-1]["content"]
    return _parse_yes_no(yes_or_no)


def _parse_yes_no(reply):
    """Map a raw model reply to 1 ("YES") or 0 ("NO"), ignoring case, whitespace and punctuation."""
    yn_dict = {"YES": 1, "NO": 0}

    # Replace every non-letter with a space so that replies such as ' yes.',
    # '"NO"' or 'No\n' all reduce to a clean first word.
    letters_only = "".join(ch if ch.isalpha() else " " for ch in reply.upper())
    words = letters_only.split()
    answer = words[0] if words else ""

    if answer not in yn_dict:
        # Same exception type as a plain dict lookup, but with a message that
        # shows what the model actually produced.
        raise KeyError("Unexpected model reply {0!r}; expected YES or NO".format(reply))
    return yn_dict[answer]

# Run and evaluate

In [32]:
# Predicted and gold labels, filled in the same order (1 = formal, 0 = informal).
pred = []
true_values = []

test_split = data["test"]

# A falsy NROWS (0 / None) means "evaluate the whole test split". The value is
# also capped at the split size so an oversized NROWS cannot index past the end.
NROWS = min(NROWS or len(test_split), len(test_split))

for i in range(NROWS):
    if i % 5 == 0:
        print("Processing sentence {0}...".format(i + 1))

    # Fetch the row once and reuse it for both the sentence and its score.
    row = test_split[i]

    pred_score = get_formality_llama(row["sentence"])
    pred.append(pred_score)

    # Gold label: an average score above zero counts as formal.
    true_score = 1 if row["avg_score"] > 0 else 0
    true_values.append(true_score)


# Binary-classification metrics with "formal" (1) as the positive class.
accuracy = accuracy_score(true_values, pred)
precision = precision_score(true_values, pred)
recall = recall_score(true_values, pred)
f1 = f1_score(true_values, pred)

print("""
For Llama-based classifier:
Accuracy - {0}
Precision - {1}
Recall - {2}
F1 - {3}
""".format(accuracy, precision, recall, f1))

Processing sentence 1...
Processing sentence 6...
Processing sentence 11...
Processing sentence 16...
Processing sentence 21...

For Llama-based classifier: 
Accuracy - 0.68
Precision - 0.8571428571428571
Recall - 0.46153846153846156
F1 - 0.6

