# Set number of sentences to evaluate

In [17]:
# Number of test-set sentences to score in the evaluation loop.
# A falsy value (0 or None) makes the run cell fall back to the full test split.
NROWS = 25

# Install / import

In [18]:
%%capture
from google.colab import files

!pip install datetime
from datetime import datetime

import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import torch

import transformers
from transformers import pipeline

!pip install datasets

# Load the data

In [19]:
from huggingface_hub import login

from datasets import load_dataset


# Authenticate against the Hugging Face Hub (renders an interactive login widget).
# NOTE(review): presumably needed for the Llama checkpoint loaded further down — confirm.
login()
# Fetch the formality dataset; the result is a DatasetDict with "train" and "test" splits.
data = load_dataset("osyvokon/pavlick-formality-scores")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
# Show the DatasetDict repr: available splits, their feature columns and row counts
data

DatasetDict({
    train: Dataset({
        features: ['domain', 'avg_score', 'sentence'],
        num_rows: 9274
    })
    test: Dataset({
        features: ['domain', 'avg_score', 'sentence'],
        num_rows: 2000
    })
})

# Load the model

In [21]:
from transformers import pipeline


# Llama-3.2-3B-Instruct is picked because it is a compact checkpoint.
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Chat-capable text-generation pipeline with bfloat16 weights;
# device_map="auto" leaves device placement to the library.
pipe = pipeline(task="text-generation", model=model_id, torch_dtype=torch.bfloat16, device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [22]:
# We add a few examples from the train dataset to the prompt

def _parse_yes_no(raw_answer):
    """Map a raw model reply to 1 ("YES"), 0 ("NO") or None (anything else)."""
    if not isinstance(raw_answer, str):
        return None
    # The reply may carry stray whitespace, quotes, trailing punctuation or
    # different casing (' Yes.', '"no"'); normalise before the lookup so such
    # answers are not discarded.
    normalised = raw_answer.strip().strip("\"'`.,:;!*()[] \n\t").upper()
    return {"YES": 1, "NO": 0}.get(normalised)


def get_formality_llama(txt):
    """
    Classify one sentence as formal or informal with the Llama chat pipeline.

    A few-shot system prompt asks the model to answer only "YES" (formal) or
    "NO" (informal); the last chat message of the generation is then parsed.

    txt: str
        The text to evaluate

    Returns
    -------
    int or None
        1 if the model answered "YES", 0 if it answered "NO", and None when
        the reply is neither (a message naming the text is printed).
    """

    prompt = """
    You are an expert linguist trained to assess the formality of sentences. Your task is to evaluate the formality in a given sentence,
    where the value "YES" indicates extremely formal language, such as what might be used in official documents or academic papers,
    and the value "NO" indicates informal language, typical of everyday communication or slang, casual text messages, or colloquial expressions.
    Consider factors such as vocabulary choice, sentence structure, and tone when assessing the formality.

    Examples:
    1) The formal sentence "His platform contains plans to fund drainage projects." should be assigned the value "YES".
    2) The informal sentence "Watch out for the coaster car though." should be assigned the value "NO".
    3) The informal sentence "no!if you wana chat about it then leave me a cmment" should be assigned the value "NO".
    4) The formal sentence "The $30 million lost in Governor Bush's budget request 02-03 was slated for consumers on the wait list." should be assigned the value "YES".
    5) The informal sentence "yes makedamnsure is my favorite song by taking back sunday" should be assigned the value "NO".
    6) The formal sentence "With police officers dying in large numbers and Maoists carrying out bolder attacks, the debate around the insurgency has sharpened
    in India's intellectual salons and on the opinion pages and talk shows." should be assigned the value "YES".
    7) The formal sentence "Keep up the good work and if we may be of any assistance to you please let us know." should be assigned the value of "YES".
    8) The informal sentence "I still live with my mom and I'm 23... Why not?" should be assigned the value "NO".

    Write only "YES" if the sentence is formal or write only "NO" if it is informal.
    """

    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": "Assign YES or NO to this sentence: {0}".format(txt)},
    ]
    outputs = pipe(
        messages,
        max_new_tokens=4,
        temperature=0.5,
        pad_token_id=pipe.tokenizer.eos_token_id
    )

    # For chat input the pipeline returns the whole conversation; the final
    # message is the newly generated assistant reply.
    reply = outputs[0]["generated_text"][-1]["content"]
    label = _parse_yes_no(reply)
    if label is None:
        print("The model returned neither YES or NO for the following text: {0}".format(txt))
    return label

# Run, evaluate and save results

In [24]:
# Score the first NROWS test sentences, compute metrics against the
# gold labels, and export the per-sentence results as a CSV download.
preds, true_values, processed_txts = [], [], []

test_split = data["test"]

# A falsy NROWS (0 / None) means "use the whole split"; clamp as well so an
# oversized NROWS cannot index past the end of the split.
NROWS = min(NROWS or len(test_split), len(test_split))

for i in range(NROWS):
    if i % 5 == 0:
        print("Processing sentence {0}...".format(i + 1))

    row = test_split[i]
    pred_score = get_formality_llama(row["sentence"])
    if pred_score not in (0, 1):
        # get_formality_llama already reported the unusable reply. Skip the
        # row completely: appending the gold label without a prediction would
        # leave preds and true_values with different lengths and break both
        # the metric calls and the DataFrame construction below.
        continue

    preds.append(pred_score)
    processed_txts.append(row["sentence"])
    # Gold label: a positive average formality score counts as formal (1).
    true_values.append(1 if row["avg_score"] > 0 else 0)

skipped = NROWS - len(preds)
if skipped:
    print("Skipped {0} sentence(s) without a usable YES/NO prediction.".format(skipped))

if not preds:
    raise RuntimeError("No sentence received a usable YES/NO prediction; nothing to evaluate.")

accuracy = accuracy_score(true_values, preds)
precision = precision_score(true_values, preds)
recall = recall_score(true_values, preds)
f1 = f1_score(true_values, preds)

print("""
For Llama-based classifier:
Accuracy - {0}
Precision - {1}
Recall - {2}
F1 - {3}
""".format(accuracy, precision, recall, f1))

# Save results to dataframe and download report
df_results = pd.DataFrame({'processed_text': processed_txts,
              'prediction': preds,
              'class': true_values},
              columns=['processed_text', 'prediction', 'class'])

download_time = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

file_name = "llama_classification_report_{0}.csv".format(download_time)
df_results.to_csv(file_name, encoding="utf-8", sep=";", index=False)

# Colab helper: triggers a browser download of the saved report.
files.download(file_name)

Processing sentence 1...
Processing sentence 6...
Processing sentence 11...
Processing sentence 16...
Processing sentence 21...

For Llama-based classifier:
Accuracy - 0.72
Precision - 0.875
Recall - 0.5384615384615384
F1 - 0.6666666666666666



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>