In [1]:
# !pip install mxnet
# !pip install gluonnlp pandas tqdm
# !pip install sentencepiece
# !pip install transformers
# !pip install torch

In [None]:
!pip install sentence-transformers

In [2]:
from transformers import BertForQuestionAnswering, BertTokenizer
import torch

In [28]:
# Single source of truth for the checkpoint name, so the model weights and the
# tokenizer vocabulary are always loaded from the same checkpoint.
QA_CHECKPOINT = 'bert-large-uncased-whole-word-masking-finetuned-squad'

model = BertForQuestionAnswering.from_pretrained(QA_CHECKPOINT)
tokenizer = BertTokenizer.from_pretrained(QA_CHECKPOINT)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE: the dataset paths used later in this notebook are relative
# ("drive/MyDrive/..."), so they only resolve while the working directory
# is /content.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
import json
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
def f1_score(ground_truth, prediction):
    """Token-level F1 between a reference answer and a predicted answer.

    Both strings are split on whitespace and compared as bags of words:
    a word counts as shared at most as many times as it occurs in both
    strings. No case folding or punctuation stripping happens here, so
    callers must normalise the strings beforehand if they need it.

    Args:
        ground_truth: Reference answer text.
        prediction: Predicted answer text.

    Returns:
        The harmonic mean of precision and recall as a float, or 0.0 when
        the two strings share no token (including when either is empty).
    """
    gold_words = ground_truth.split()
    pred_words = prediction.split()

    # Multiset intersection keeps, per word, the smaller of the two counts.
    overlap = sum((Counter(gold_words) & Counter(pred_words)).values())

    # Guard clause: it also shields the divisions below from empty inputs.
    if overlap == 0:
        return 0.0

    precision = overlap / len(pred_words)  # share of predicted words that are correct
    recall = overlap / len(gold_words)     # share of reference words that were found

    return 2 * (precision * recall) / (precision + recall)

# Sentence-embedding model: its encodings of the reference answer and of each
# predicted answer are compared with cosine similarity in the evaluation loop.
st_model = SentenceTransformer('all-MiniLM-L6-v2')

In [33]:
conv_qa_path = "drive/MyDrive/24_1_DL/Final_Project/conversation_qa.json"
doc_qa_path = "drive/MyDrive/24_1_DL/Final_Project/document_qa.json"


def _predict_answer(question, paragraph):
  """Extract an answer span for `question` from `paragraph` with the QA model.

  The input is assembled as `[CLS] question [SEP] paragraph [SEP]`, with
  segment id 0 for the question part and 1 for the paragraph part. The
  predicted span runs from the arg-max of the start logits to the arg-max of
  the end logits (inclusive).

  Args:
    question: Question text.
    paragraph: Context text to search for the answer.

  Returns:
    The decoded, lower-cased text of the predicted span. The string is empty
    when the predicted end index precedes the predicted start index.
  """
  question_tokens = tokenizer.tokenize('[CLS]' + question + '[SEP]')
  paragraph_tokens = tokenizer.tokenize(paragraph + '[SEP]')
  tokens = question_tokens + paragraph_tokens

  # NOTE(review): no truncation is applied here; inputs longer than the
  # model's maximum sequence length will presumably fail -- confirm the
  # dataset never exceeds it.
  input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
  segment_ids = torch.tensor([[0] * len(question_tokens) + [1] * len(paragraph_tokens)])

  # Inference only: skip building the autograd graph.
  with torch.no_grad():
    output = model(input_ids, token_type_ids=segment_ids)

  start_index = int(torch.argmax(output['start_logits']))
  end_index = int(torch.argmax(output['end_logits']))

  predicted_tokens = tokens[start_index:end_index + 1]
  return tokenizer.decode(tokenizer.convert_tokens_to_ids(predicted_tokens)).lower()


def _score_prediction(answer, predicted_answer):
  """Return `(f1, similarity)` for one predicted answer.

  `f1` is the token-level F1 from `f1_score`; `similarity` is the cosine
  similarity between the `st_model` embeddings of the two strings.
  """
  f1 = f1_score(answer, predicted_answer)

  embedding_answer = st_model.encode(answer)
  embedding_pred_answer = st_model.encode(predicted_answer)
  similarity = float(cosine_similarity([embedding_answer], [embedding_pred_answer])[0][0])

  return f1, similarity


# Read both datasets, then release the file handles before the slow inference loop.
with open(conv_qa_path, "r", encoding="utf-8") as conv_qa, open(doc_qa_path, "r", encoding="utf-8") as doc_qa:
  data_conv = json.load(conv_qa)
  data_doc = json.load(doc_qa)

conv_items, doc_items = data_conv['items'], data_doc['items']

# The two files must describe the same questions, item by item.
if len(conv_items) != len(doc_items):
  raise ValueError(
      f"Item count mismatch: {len(conv_items)} conversation items vs {len(doc_items)} document items"
  )

conv_f1_scores, doc_f1_scores = [], []
conv_similarities, doc_similarities = [], []

for idx, (conv_item, doc_item) in enumerate(zip(conv_items, doc_items)):
  if conv_item['question'] != doc_item['question'] or conv_item['answer'] != doc_item['answer']:
    raise ValueError(f"Question/answer mismatch between the two datasets at item {idx}")

  # Question and answer are identical in both items (checked above).
  question = conv_item['question']
  answer = conv_item['answer'].lower()

  # conversation-based
  conv_predicted_answer = _predict_answer(question, conv_item['conv'])
  conv_f1, conv_similarity = _score_prediction(answer, conv_predicted_answer)
  conv_f1_scores.append(conv_f1)
  conv_similarities.append(conv_similarity)

  # document-based
  doc_predicted_answer = _predict_answer(question, doc_item['doc'])
  doc_f1, doc_similarity = _score_prediction(answer, doc_predicted_answer)
  doc_f1_scores.append(doc_f1)
  doc_similarities.append(doc_similarity)

  # Comparing each results
  print(f'\n------- Example {idx} -------')
  print(f'Question {idx}: {question}')
  print(f'Original answer: {answer}')
  print(f'Predicted answer(conv): {conv_predicted_answer}')
  print(f'Predicted answer(doc): {doc_predicted_answer}')
  print(f'F1 score(conv): {round(conv_f1, 2)}')
  print(f'F1 score(doc): {round(doc_f1, 2)}')
  print(f'Similarity(conv): {round(conv_similarity, 2)}')
  print(f'Similarity(doc): {round(doc_similarity, 2)}')

print(f'\n\nAverage F1 score(conv): {sum(conv_f1_scores) / len(conv_f1_scores)}')
print(f'Average F1 score(doc): {sum(doc_f1_scores) / len(doc_f1_scores)}')
print(f'Average Similarity(conv): {sum(conv_similarities) / len(conv_similarities)}')
print(f'Average Similarity(doc): {sum(doc_similarities) / len(doc_similarities)}')



------- Example 0 -------
Question 0: What was the most number of people Tom has met during a holiday?
Original answer: 25
Predicted answer(conv): 25
Predicted answer(doc): 25
F1 score(conv): 1.0
F1 score(doc): 1.0
Similarity(conv): 1.0
Similarity(doc): 1.0

------- Example 1 -------
Question 1: Where does Tom plan to go for a hike?
Original answer: diablo
Predicted answer(conv): diablo
Predicted answer(doc): diablo
F1 score(conv): 1.0
F1 score(doc): 1.0
Similarity(conv): 1.0
Similarity(doc): 1.0

------- Example 2 -------
Question 2: When was the last time Jerry saw his old friend?
Original answer: june
Predicted answer(conv): june
Predicted answer(doc): june
F1 score(conv): 1.0
F1 score(doc): 1.0
Similarity(conv): 1.0
Similarity(doc): 1.0

------- Example 3 -------
Question 3: What does Tom think about rain?
Original answer: gloomy
Predicted answer(conv): 
Predicted answer(doc): surprising that the rain started early this year
F1 score(conv): 0.0
F1 score(doc): 0.0
Similarity(conv):