In [None]:
# Install the third-party packages for this notebook with pip in quiet mode (-q).
# No versions are pinned, so a later run may resolve to different releases.
# NOTE(review): some of these (e.g. gigachain, faiss-cpu, pypdf) are not imported
# in the cells visible here — confirm they are still required.
!pip install gigachain -q
!pip install sentence-transformers -q
!pip install faiss-cpu -q
!pip install gigachain_community -q
!pip install gigachain-core -q
!pip install pypdf -q
!pip install nltk -q
!pip install rouge-score -q
!pip install transformers datasets -q
!pip install torchmetrics -q
!pip install evaluate -q
!pip install sacrebleu -q

In [None]:
import pandas as pd

In [None]:
# Load the evaluation data.
# The question/answer workbook is parsed once and both columns are taken from
# the same frame; reading the file a second time for the second column would
# only repeat the same disk I/O and parsing.
qa_frame = pd.read_excel("q_data.xlsx")
questions_df = qa_frame.Question  # questions column
answers_df = qa_frame.Answer      # answers column (used as the reference texts below)
# Answers stored in the second workbook (used as the LLM outputs below).
llm_answers_df = pd.read_excel("answers_ready.xlsx").Answer

In [None]:
# Materialise the three pandas columns as plain Python lists so that the
# metric cells below can index them by position.
llm_answers = list(llm_answers_df)
answers = list(answers_df)
questions = list(questions_df)

# BLEU

In [None]:
# Import the corpus-level BLEU implementation.
from nltk.translate.bleu_score import corpus_bleu

# Hypotheses: the LLM answers, whitespace-tokenized. These are the texts under
# evaluation, which is also how the other metric cells in this notebook treat
# `llm_answers` (candidate / preds / predictions).
translations = [llm_answer.split() for llm_answer in llm_answers]

# References: corpus_bleu takes, for every hypothesis, a *list* of tokenized
# references. There is exactly one reference answer per question, so each
# tokenized answer is wrapped in a one-element list.
references_list = [[answer.split()] for answer in answers]

# Compute the BLEU score. Argument order is (list_of_references, hypotheses);
# BLEU is not symmetric, so swapping the two roles changes the result.
bleu_score_corpus = corpus_bleu(references_list, translations)
print("Corpus BLEU Score: ", bleu_score_corpus)

# ROUGE

In [None]:
from rouge_score import rouge_scorer

In [None]:
# Build a single scorer that reports ROUGE-1, ROUGE-2 and ROUGE-L, with stemming enabled.
# NOTE(review): the default tokenizer and stemmer of rouge_score appear to be
# English/ASCII-oriented — confirm they behave as intended if the answers are
# written in another language or script.
r_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

In [None]:
# Score every LLM answer against its reference answer with ROUGE.
# Rows are collected in a list and the DataFrame is built once at the end;
# concatenating a one-row frame per iteration copies the whole table each time
# (quadratic work) and starts from an empty frame.
rouge_columns = ['Question', 'Answer', 'LLM_Answer', 'rouge1', 'rouge2', 'rougeL']
rouge_rows = []
for e in range(len(questions)):
  candidate_summary = llm_answers[e]
  reference_summary = answers[e]
  # The reference is passed first and the candidate second.
  r_scores = r_scorer.score(reference_summary, candidate_summary)
  rouge_rows.append({
    'Question': questions[e],
    'Answer': reference_summary,
    'LLM_Answer': candidate_summary,
    # Each metric cell holds the scorer's result object for that metric,
    # exactly as returned by the scorer (not reduced to a single number).
    'rouge1': r_scores['rouge1'],
    'rouge2': r_scores['rouge2'],
    'rougeL': r_scores['rougeL'],
  })

# Passing `columns` keeps the expected header even when there are no rows.
df = pd.DataFrame(rouge_rows, columns=rouge_columns)

In [None]:
# Display the per-question ROUGE table built in the previous cell.
df

# Семантическое сходство

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the pretrained tokenizer and encoder from one and the same checkpoint,
# so the token IDs produced by the tokenizer match the model's vocabulary.
EMBEDDING_CHECKPOINT = "deepvk/USER-bge-m3"
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_CHECKPOINT)
model = AutoModel.from_pretrained(EMBEDDING_CHECKPOINT)

In [None]:
# Split every reference answer and every LLM answer on whitespace, giving one
# list of words per text.
tokens_answer = [reference_text.split() for reference_text in answers]

tokens_message = [generated_text.split() for generated_text in llm_answers]

In [None]:
# Encode every text into model input IDs with the model's own tokenizer.
#
# The texts are tokenized by calling the tokenizer on the raw string rather
# than by looking up whitespace-split words with convert_tokens_to_ids: that
# method only maps already-segmented vocabulary pieces to IDs, so whole words
# that are not vocabulary entries collapse to the unknown-token ID, and no
# special tokens are added. The later cell reads the hidden state at position 0
# as the sentence embedding, which is only meaningful when the sequence
# actually starts with the tokenizer's leading special token.
def _encode_to_input_ids(text):
  """Return the input-ID tensor (batch size 1) for ``text``.

  Special tokens are added by the tokenizer, and the sequence is truncated to
  the tokenizer's configured maximum length.
  """
  return tokenizer(text, return_tensors="pt", truncation=True)["input_ids"]


input_ids_messages = []
input_ids_answers = []

for e in range(len(answers)):
  input_ids_messages.append(_encode_to_input_ids(llm_answers[e]))  # Batch size 1
  input_ids_answers.append(_encode_to_input_ids(answers[e]))  # Batch size 1

In [None]:
# Run the encoder on every pair of ID tensors with gradient tracking disabled.
# For each text the raw model output is kept, together with the hidden state at
# sequence position 0, which is used as the text's embedding (intended as the
# [CLS]-style token).
outputs_messages = []
outputs_answers = []
embeddings_messages = []
embeddings_answers = []
with torch.no_grad():
  for message_ids, answer_ids in zip(input_ids_messages, input_ids_answers):
    message_output = model(message_ids)
    outputs_messages.append(message_output)
    answer_output = model(answer_ids)
    outputs_answers.append(answer_output)
    embeddings_messages.append(message_output.last_hidden_state[:, 0, :])
    embeddings_answers.append(answer_output.last_hidden_state[:, 0, :])

In [None]:
# Cosine similarity between each LLM-answer embedding and the embedding of its
# reference answer.
#
# cosine_similarity returns a 2-D matrix; with one embedding on each side it is
# 1x1, so the single entry is extracted as a plain float. Keeping the matrix
# itself would put nested arrays into the list and the DataFrame column and
# make the printed average look like [[x]].
similarity_score = [
  float(cosine_similarity(embeddings_messages[e], embeddings_answers[e])[0, 0])
  for e in range(len(tokens_answer))
]

i = len(similarity_score)  # number of scored pairs
# Mean over all pairs; NaN (instead of a ZeroDivisionError) when there are none.
average_similarity_score = sum(similarity_score) / i if i else float("nan")

# Build the table in one go rather than concatenating one row per iteration.
df_sim_score = pd.DataFrame(
  {
    'Answer': answers[:i],
    'LLM_Answer': llm_answers[:i],
    'Similarity Score': similarity_score,
  },
  columns=['Answer', 'LLM_Answer', 'Similarity Score'],
)
print("Average Similarity Score = ", average_similarity_score)
df_sim_score

# BERTScore

In [None]:
from torchmetrics.functional.text.bert import bert_score

# BERTScore of the LLM answers (predictions) against the reference answers.
preds = llm_answers
target = answers
b_score = bert_score(preds=preds , target=target, model_name_or_path="deepvk/USER-bge-m3")

# Convert the per-pair scores to plain Python floats so the table holds numbers
# rather than whatever element type the metric returns, then build the table
# once instead of concatenating one row per iteration.
df_BERTScore = pd.DataFrame(
  {
    'Answer': answers,
    'LLM_Answer': llm_answers,
    'Precision': [float(value) for value in b_score['precision']],
    'Recall': [float(value) for value in b_score['recall']],
    'F1': [float(value) for value in b_score['f1']],
  },
  columns=['Answer', 'LLM_Answer', 'Precision', 'Recall', 'F1'],
)
df_BERTScore

# METEOR

In [None]:
import evaluate

In [None]:
# Load the METEOR metric through the `evaluate` library's loader.
meteor = evaluate.load("meteor")

In [None]:
# METEOR for every (LLM answer, reference answer) pair, plus the mean score.

# The metric is computed one pair at a time, so each text is wrapped in its
# own single-item list.
predictions = [[pred] for pred in llm_answers]
references = [[ref] for ref in answers]

# One result dict per pair, as returned by the metric.
meteor_res = [
  meteor.compute(predictions=predictions[e], references=references[e])
  for e in range(len(answers))
]

# Pull the numeric score out of each result dict so the table column holds
# numbers rather than raw dicts.
meteor_scores = [res['meteor'] for res in meteor_res]

i = len(meteor_scores)  # number of scored pairs
# Mean over all pairs; NaN (instead of a ZeroDivisionError) when there are none.
meteor_res_avg = sum(meteor_scores) / i if i else float("nan")

# Build the table in one go rather than concatenating one row per iteration.
df_meteor = pd.DataFrame(
  {
    'Answer': answers,
    'LLM_Answer': llm_answers[:i],
    'meteor_results': meteor_scores,
  },
  columns=['Answer', 'LLM_Answer', 'meteor_results'],
)
print(meteor_res_avg)
df_meteor

# TER

In [None]:
import sacrebleu

In [None]:
# Load the TER metric through the `evaluate` library's loader.
ter = evaluate.load("ter")

Downloading builder script:   0%|          | 0.00/9.99k [00:00<?, ?B/s]

In [None]:
# Wrap each reference answer in its own single-item list (one reference per
# prediction), then compute case-sensitive TER for the LLM answers.
ref = [[reference_text] for reference_text in answers]
results = ter.compute(
  predictions=llm_answers,
  references=ref,
  case_sensitive=True,
)
print(results)

# chrF, chrF++

In [None]:
import evaluate

In [None]:
# Wrap each reference answer in its own single-item list (one reference per
# prediction).
ref = [[reference_text] for reference_text in answers]
chrf = evaluate.load("chrf")

# chrF: the metric with its default settings (no word n-grams).
results = chrf.compute(predictions=llm_answers, references=ref)
print(results)

# chrF++: the same metric with word n-grams up to order 2 added
# (word_order=2), so both variants named in this section are reported.
results_chrf_plus_plus = chrf.compute(predictions=llm_answers, references=ref, word_order=2)
print(results_chrf_plus_plus)