In [1]:
import os
import torch
import pandas as pd

from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoConfig
from tqdm.auto import tqdm

In [2]:
# Process-wide environment switches: tokenizer parallelism off, only GPU "0" visible.
os.environ.update({
    'TOKENIZERS_PARALLELISM': 'false',
    "CUDA_VISIBLE_DEVICES": "0",  # Replace "0" with the appropriate GPU id
})

# Prefer CUDA when torch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device: {device}")

Device: cuda


In [3]:
# Validation split used for generation and scoring (path is relative to the notebook).
val_csv_path = '../data/dataset/val.csv'
df_val = pd.read_csv(val_csv_path)

In [4]:
# Preview the first five rows to sanity-check the loaded columns.
df_val.head(n=5)

Unnamed: 0,title,question,answer
0,SD za pronájem vozidla,"Dobrý den, chci se zeptat, zda jsem povinen pl...","Dobrý den, předmětem silniční daně je mimo jin..."
1,Vraceni dane,"Dobrý den, 2.12.2019 (datum na smlouvě o přepi...","Dobrý den, Daň z nabytí nemovitých věcí byla z..."
2,Přerušené studium,"Dobrý den, ráda bych se zeptala na placení dan...","Dobrý den, Jestliže pokračujete v dalším studi..."
3,Darovací smlouva,"Dobrý den, jsem matka a chci darovat bezúplatn...","Dobrý den, z Vašeho dotazu těžko usuzovat, jel..."
4,Kde platit sociální a zdravotnípojištění,"Dobrý den, ráda bych se zeptala na platbu soci...","Dobrý den, dle Vašeho dotazu vyplývá, že pokud..."


In [5]:
name = 'BUT-FIT/CSTinyLlama-1.2B'

# NOTE(review): trust_remote_code=True is passed to every loader below; confirm the
# model repository is trusted before running this cell.
config = AutoConfig.from_pretrained(name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    name,
    config=config,
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(name, trust_remote_code=True)
# Reuse the `device` selected in the setup cell instead of hard-coding 'cuda:0',
# so the pipeline is also constructible on a machine without CUDA.
pipe = pipeline('text-generation', model=model, tokenizer=tokenizer, device=device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
from datasets import Dataset

# Text placed before and after each question when building the generation prompt.
context_start = "Odpověz na daňovou otázku jako odborník. Otazka: "
context_end = "Odpoveď: "

# Wrap the validation frame in a Dataset so the generator can be applied with .map().
dataset = Dataset.from_pandas(df_val)

# Define a function to generate an answer for a given question
def generate_answer(example):
    """Generate a model answer for a single dataset row.

    Args:
        example: Mapping with a 'question' key (one row handed over by Dataset.map).

    Returns:
        dict with one key, 'gen_answer': the generated continuation with every run
        of whitespace (including newlines and tabs) collapsed to a single space.
    """
    question = example['question']
    # NOTE(review): the question is concatenated with context_end without any
    # separator; kept as-is so prompts stay identical to earlier runs.
    prompt = context_start + question + context_end
    with torch.autocast('cuda', dtype=torch.float16):
        output = pipe(
            prompt,
            max_new_tokens=256,
            top_p=0.95,
            repetition_penalty=1.05,
            do_sample=True,
            use_cache=True,
            # Ask the pipeline for the continuation only. Slicing the full text by
            # len(prompt) + 1 dropped the first generated character and relied on
            # the decoded prompt matching the input string exactly.
            return_full_text=False,
        )
    gen_answer = output[0]['generated_text']
    # str.split() with no arguments splits on any whitespace, so this both strips
    # the ends and normalises '\n', '\r' and '\t' to single spaces.
    gen_answer = ' '.join(gen_answer.split())
    return {'gen_answer': gen_answer}

PyTorch version 2.2.2 available.


In [None]:
# Run the generator row by row (batched=False passes one example per call)
results = dataset.map(function=generate_answer, batched=False)

# Attach the generated answers to the frame and persist it without the index column
df_val['gen_answer'] = results['gen_answer']
df_val.to_csv(path_or_buf='../output/val_with_gen_answers.csv', index=False)

In [12]:
import gc

# clear gpu memory
# Drop the notebook-level references to the generation pipeline and model first:
# while these names are bound, their tensors stay allocated and nothing can be freed.
# pop() with a default keeps the cell re-runnable when the names are already gone.
for _stale_name in ("pipe", "model"):
    globals().pop(_stale_name, None)

# Collect unreachable objects before emptying the CUDA cache, so memory released by
# the collector can actually be handed back by empty_cache().
freed_objects = gc.collect()
torch.cuda.empty_cache()
freed_objects

4320

In [3]:
from comet import download_model, load_from_checkpoint

# Download the COMET checkpoint and load it (strict=False is forwarded to the loader).
model_path = download_model("Unbabel/wmt22-comet-da")
model = load_from_checkpoint(model_path, strict=False)

# Reload the frame that already contains the generated answers.
df_val = pd.read_csv('../output/val_with_gen_answers.csv')

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
/home/savchart/.cache/pypoetry/virtualenvs/bp-AVCBgW1--py3.10/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [4]:
# One record per row in the src / mt / ref layout passed to model.predict:
# question -> src, generated answer -> mt, reference answer -> ref.
data = [
    {"src": question, "mt": generated, "ref": reference}
    for question, generated, reference in zip(
        df_val['question'], df_val['gen_answer'], df_val['answer']
    )
]

In [5]:
# Predict scores using COMET
# Request a GPU only when one is available so the cell also runs on CPU-only machines.
model_output = model.predict(data, batch_size=8, gpus=1 if torch.cuda.is_available() else 0)

# Add scores to the dataframe
df_val['score'] = model_output.scores
# save
df_val.to_csv('../output/val_with_gen_answers_and_scores.csv', index=False)

# Report the mean COMET score over all rows
print(f"Average COMET score: {sum(model_output.scores) / len(model_output.scores):.2f}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 428/428 [06:11<00:00,  1.15it/s]


Average COMET score: 0.68


In [7]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment
load_dotenv()

# The key is read from the "api_key" variable; a missing variable raises KeyError here.
api_key = os.environ["api_key"]

# Headers sent with every chat-completions request
headers = dict()
headers["Authorization"] = f"Bearer {api_key}"
headers["Content-Type"] = "application/json"

In [21]:
import requests
import json
import re

# Define the function to compare generated and reference answers
def compare_answers(gen_answer, answer, headers):
    """Ask the OpenAI chat-completions API to flag issues in a generated answer.

    The model is asked six YES(1)/NO(0) questions (three on accuracy, three on
    fluency) and told to reply with a bracketed vector such as [1, 0, 1, 0, 1, 0].

    Args:
        gen_answer: Generated answer, inserted into the prompt as the "source text".
        answer: Reference answer, inserted into the prompt as the "reference text".
        headers: HTTP headers sent with the request.

    Returns:
        A (flags, avg_score) pair: the list of integers parsed from the first
        bracketed vector in the reply and their mean. Because 1 means "YES, there
        is a problem", a higher mean means more problems were flagged. On any
        failure (network error, non-200 status, missing or unparsable vector) the
        pair ([0, 0, 0, 0, 0, 0], 0) is returned.
    """
    # NOTE(review): this fallback is indistinguishable from "no problems found", so
    # failed rows pull averages computed by callers towards 0. Kept for compatibility.
    fallback_flags, fallback_score = [0, 0, 0, 0, 0, 0], 0

    prompt = (
        f"""
        Compare the following source text and reference text. 
        Provide the answers to the questions in a single vector of 6 numbers in the format: [1, 0, 1, 0, 1, 0].
        Each question should be answered with YES (1) or NO (0).
        
        Source text: "{gen_answer}"
        Reference text: "{answer}"
        
        Accuracy:
        1. Has any content been inappropriately omitted from the text? (YES (1) or NO (0))
        2. Does the text exhibit any misses that change the intended meaning of the text? (YES (1) or NO (0))
        3. Are there any other accuracy issues that affect the translation? (YES (1) or NO (0))
        
        Fluency:
        4. Are there any grammar problems in the text? (YES (1) or NO (0))
        5. Are there any stylistic problems in the text? (YES (1) or NO (0))
        6. Are there any other fluency problems in the text? (YES (1) or NO (0))
        
        Please output the results in a single vector format, e.g., [1, 0, 1, 0, 1, 0].
        """
    )
    data = {
        "model": "gpt-3.5-turbo",
        "messages": [
            # NOTE(review): context_start is the answer-generation instruction defined
            # in the generation cell; confirm it is the intended system message here.
            {"role": "system", "content": context_start},
            {"role": "user", "content": prompt}
        ]
    }

    try:
        # A timeout keeps one stalled connection from hanging the whole evaluation loop.
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            data=json.dumps(data),
            timeout=60,
        )
    except requests.RequestException as exc:
        print(f"OpenAI API request failed: {exc}")
        return fallback_flags, fallback_score

    if response.status_code != 200:
        print(f"OpenAI API request failed: HTTP {response.status_code} - {response.text}")
        return fallback_flags, fallback_score

    response_text = response.json()["choices"][0]['message']['content'].replace('\n', ' ')
    print(f"Raw response: {response_text}")  # Debugging line

    # Use regex to extract the first bracketed vector; any explanation after it is ignored
    match = re.search(r'\[(.*?)\]', response_text)
    if not match:
        print("No valid vector found in the response.")
        return fallback_flags, fallback_score

    try:
        # Split on the bare comma: int() ignores surrounding whitespace, so
        # "[1,0,1]" and "[1,  0, 1]" parse just like "[1, 0, 1]".
        response_list = [int(item) for item in match.group(1).split(',')]
    except ValueError as ve:
        print(f"ValueError while converting to integers: {ve}")
        return fallback_flags, fallback_score

    avg_score = sum(response_list) / len(response_list)
    return response_list, avg_score

In [22]:
# Accumulators: one flag vector and one mean value per dataframe row
comparison_results = []
avg_scores = []

# Walk the generated/reference answers in lockstep; total= keeps the progress bar sized
answer_pairs = zip(df_val['gen_answer'], df_val['answer'])
for gen_answer, answer in tqdm(answer_pairs, total=len(df_val), desc="Comparing answers", unit="comparison"):
    flags, mean_flag = compare_answers(gen_answer, answer, headers)
    comparison_results.append(flags)
    avg_scores.append(mean_flag)

Comparing answers:   0%|          | 0/3419 [00:00<?, ?comparison/s]

Raw response: [0, 0, 1, 0, 0, 1]
Raw response: [1, 1, 1, 1, 1, 1]   1. An inappropriate omission was made as the source text does not mention the cancellation of the real estate acquisition tax.  2. There are misses that change the intended meaning as the source text discusses tax issues for the year 2019, while the reference text provides updated information for 2020. 3. Other accuracy issues include the lack of translation for specific legal terms and concepts. 4. Grammar problems are present in the source text, such as sentence structure and word choices. 5. Stylistic problems can be seen in the lack of consistency in terminology and tone between the source and reference texts. 6. Other fluency problems include the need for better cohesion and coherence in the translation to accurately convey the legal and taxation-related information.
Raw response: [1, 1, 1, 1, 0, 1]
Raw response: [1, 1, 1, 1, 0, 1]
Raw response: [1, 1, 1, 1, 1, 1]  1. Has any content been inappropriately omitted f

In [23]:
# Attach the per-row flag vectors and their means to the frame
df_val['comparison_result'] = comparison_results
df_val['avg_score'] = avg_scores

# avg_score is the share of YES(1) answers per row, i.e. the fraction of checks
# that flagged a problem, so a higher value means more issues were reported.
print(f"Average COMET score: {sum(df_val['score']) / len(df_val['score']):.2f}")
print(f"Average evaluation score: {sum(df_val['avg_score']) / len(df_val['avg_score']):.2f}")

df_val.to_csv('../output/val_with_gen_answers_scores_and_eval.csv', index=False)

# Keep the preview as the cell's last expression; a bare expression in the middle
# of a cell is evaluated but never rendered.
df_val.head()

Average COMET score: 0.68
Average evaluation score: 0.55
