Skip to content
This repository has been archived by the owner on Mar 24, 2023. It is now read-only.

Commit

Permalink
add Answer and AnswerDetector; relates #18
Browse files Browse the repository at this point in the history
  • Loading branch information
mageirakos committed Aug 21, 2020
1 parent b0e1852 commit f781b47
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 123 deletions.
50 changes: 50 additions & 0 deletions lib/bot/detector/answer/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# general python
from uuid import uuid4
import datetime
import hashlib
import re


class Answer:
def __init__(
self,
question,
answer,
model,
start,
end,
confidence,
extended_answer,
extended_start,
extended_end,
metadata,
):
# Set unique ID
self.id = str(uuid4().hex)
self.user_question = question
# Since multiple answers can be created for the same user_question
# Let's create an id for the user_question
clean_question = str(question).lower()
# disregard all trailing question marks and spaces from the hashing
if clean_question[-1] == "?":
clean_question = re.sub("[ ?]*$", "", clean_question)
self.user_question_id = hashlib.md5(clean_question.encode("utf-8")).hexdigest()[:10]
self.answer = answer
self.start = start
self.end = end
self.confidence = confidence
self.extended_answer = extended_answer
self.extended_start = extended_start
self.extended_end = extended_end
self.model = model
# TODO add FAQ option as an origin
if "doc_id" in metadata:
self.origin = "documentation"
else:
self.origin = "questions"
# +00:00 since its utcnow() + same format as other dates saved in data_storage
self.created_at = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S+00:00")
self.metadata = metadata

def __str__(self):
return f"answer: {self.extended_answer}... , confidence: {self.confidence}''"
166 changes: 43 additions & 123 deletions lib/bot/detector/answer/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@

# bot modules
import bot.config as config
from bot.detector.answer.base import Answer

# general python
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from tqdm import tqdm
from uuid import uuid4
import pandas as pd
import sys

Expand All @@ -26,12 +26,13 @@ def __init__(
self,
model="distilbert-base-cased-distilled-squad",
extended_answer_size=30,
handle_impossible_answer=True, # changed
max_answer_len=25, # changed
handle_impossible_answer=True,
max_answer_len=25,
max_question_len=64,
max_seq_len=256, # int,
num_answers_to_predict=5,
max_seq_len=256,
num_answers_to_predict=3,
doc_stride=128,
device=0,
):
"""
<!> Default values from source code for transformers.pipelines:
Expand All @@ -42,28 +43,36 @@ def __init__(
("max_question_len", 64)
("handle_impossible_answer", False)
:param model : name of the transformer model for QA (default is distilbert-base-uncased-distilled-squad)
:param model : name of the transformer model for QA (default is distilbert-base-cased-distilled-squad)
:param extended_answer_size : Number of character before and after the answer detected by our
model that are returned to give more context for the user. (default is 50)
:param handle_impossible_answer :
:param max_answer_len :
:param max_question_len :
:param max_seq_len :
:param num_answers_to_predict : num of answers that are predicted per context (default is 3)
:param doc_stride :
model that are returned to give more context for the user. (default is 30)
:param handle_impossible_answer : True if we wish to return impossible/empty answers, False otherwise (default is True)
:param max_answer_len : maximum length of an answer (default is 25)
:param max_question_len : maximum length of a question (default is 64)
:param max_seq_len : maximum length of one input sequence (default 256)
:param num_answers_to_predict : num of answers that are predicted per document (default is 3)
:param doc_stride : length of the split in the sliding window documents longer than max_sq_len.
:param device : if < 0 -> use cpu
if >=0 -> use gpu
"""
# load model

self.model_name = model
try:
qa_model = AutoModelForQuestionAnswering.from_pretrained(
config.MODELS_DIR + model
config.MODELS_DIR + self.model_name
)
qa_tokenizer = AutoTokenizer.from_pretrained(
config.MODELS_DIR + self.model_name
)
qa_tokenizer = AutoTokenizer.from_pretrained(config.MODELS_DIR + model)
except Exception as _e:
print(_e)
sys.exit(f"Make sure that the model exists under {config.MODELS_DIR}")
# assign attributes
self.model = pipeline(
"question-answering", model=qa_model, tokenizer=qa_tokenizer, framework="pt"
"question-answering",
model=qa_model,
tokenizer=qa_tokenizer,
framework="pt",
device=device,
)
self.extended_answer_size = extended_answer_size
self.num_answers_to_predict = num_answers_to_predict
Expand All @@ -73,16 +82,17 @@ def __init__(
self.max_seq_len = max_seq_len
self.doc_stride = doc_stride

def predict(self, question, documents, top_k=None):
def predict(self, question, documents, top_k=1):
"""
Use this method to predict answer(s) based on input
question and contexts
:param question : question string
:type question : str
:param documents : pd.DataFrame that contains 'context' and other metadata
:type contexts : pandas DataFrame object
:param topk : number of answers to predict for each context (default is 3)
Use this method to return top_k answer(s) based on input
question and documents.
:param question : question string
:type question : str
:param documents : pd.DataFrame that contains 'context' and other data
:type documents : pandas DataFrame
:param topk : number of answers to return for each document (default is 1)
:returns top_k_answers : list of top_k number of Answer objects
"""

answers = []
Expand All @@ -104,6 +114,7 @@ def predict(self, question, documents, top_k=None):
max_seq_len=self.max_seq_len,
doc_stride=self.doc_stride,
)
# reason for KeyError: https://github.com/huggingface/transformers/issues/5910
except KeyError as _e:
continue
except Exception as _other_e:
Expand All @@ -119,34 +130,30 @@ def predict(self, question, documents, top_k=None):
if pred["answer"]:
if pred["score"] > best_score:
best_score = pred["score"]
answer = self._create_answer_object(pred, doc)
answer = self._create_answer_object(question, pred, doc)
answers.append(answer)
else:
print("No answer was predicted for this document!")
# print(pred)
# no_ans_score = pred["score"]

if best_score > best_overall_score:
best_overall_score = best_score

# sort answers by their `confidence` and select top-k
sorted_answers = sorted(answers, key=lambda k: k.confidence, reverse=True)
# print("SORTED ANSWERS")
# sorted_results = {"question": question,
# "answers": [answer.__dict__ for answer in sorted_answers] }
# print(sorted_results)

top_k_answers = sorted_answers[:top_k]

return top_k_answers

def _create_answer_object(self, pred, doc):
def _create_answer_object(self, question, pred, doc):
extended_start = max(0, pred["start"] - self.extended_answer_size)
extended_end = min(len(doc.context), pred["end"] + self.extended_answer_size)
# drop extra metadata columns
# errors ignored for when its Question metadata and 'body' column doesn't exist
# errors ignored for when we have Question metadata and the 'body' column doesn't exist
metadata = doc.drop(["context", "body", "query"], errors="ignore").to_dict()
answer = Answer(
question=question,
model=self.model_name,
answer=pred["answer"],
start=pred["start"],
end=pred["end"],
Expand All @@ -157,90 +164,3 @@ def _create_answer_object(self, pred, doc):
metadata=metadata,
)
return answer


class Answer:
def __init__(
self,
answer,
start,
end,
confidence,
extended_answer,
extended_start,
extended_end,
metadata,
):
# Set unique ID
self.id = str(uuid4().hex)
self.answer = answer
self.start = start
self.end = end
self.confidence = confidence
self.extended_answer = extended_answer
self.extended_start = extended_start
self.extended_end = extended_end
self.metadata = metadata

def __str__(self):
return f"answer: {self.extended_answer}... , confidence: {self.confidence}''"


##############

if __name__ == "__main__":
from bot.searcher.base import SearchEngine
from bot.database.sqlite import Database

USER_QUESTION = "How are rucio account authenticated?"
db = Database("data_storage.db", "docs")
answer_detector = AnswerDetector(
"distilbert-base-cased-distilled-squad", num_answers_to_predict=1
)
docs_se = SearchEngine()
docs_se.load_index(db=db)
db.close_connection()
results_from_docs = docs_se.search(USER_QUESTION, top_n=5)
print("RETRIEVED RESULTS:")
print(results_from_docs)
# answers = list of answer objects
answers = answer_detector.predict(USER_QUESTION, results_from_docs, top_k=3)
print("ANSWER OBJECTS:")
results = {
"question": USER_QUESTION,
"answers": [answer.__dict__ for answer in answers],
}
print(results)
print()
print("type")
print(type(answers))
for i, answer in enumerate(answers):
print(i)
print(f"{USER_QUESTION}")
print(answer)
url = answer.metadata["url"]
print(f"for more info check {url}")

# from bot.searcher.question import QuestionSearchEngine

# question_se = QuestionSearchEngine()
# question_se.load_index(db=db)
# db.close_connection()
# results_from_questions = question_se.search(USER_QUESTION, top_n = 10)
# print('RETRIEVED RESULTS:')
# print(results_from_questions)
# # answers = list of answer objects
# answers = answer_detector.predict(USER_QUESTION, results_from_questions, top_k=3)
# print("ANSWER OBJECTS:")
# results = {"question": USER_QUESTION,
# "answers": [answer.__dict__ for answer in answers] }
# print(results)
# print()
# print('type')
# print(type(answers))
# for i, answer in enumerate(answers):
# print(i)
# print(f"{USER_QUESTION}")
# print(answer)
# most_similar_question = answer.metadata["question"]
# print(f'The most similar question was: {most_similar_question}')

0 comments on commit f781b47

Please sign in to comment.