Commit
Merge pull request #29 from rucio/develop
expand to use FAQs, add AnswerDetection module, add build_donkeybot; closes #28 #27 #26 #18
mageirakos committed Aug 22, 2020
2 parents 3e269f8 + 13e981f commit f792947
Showing 33 changed files with 1,389 additions and 73 deletions.
10 changes: 9 additions & 1 deletion .gitignore
@@ -1,15 +1,23 @@
# folders
/some-folder
/.vscode
/data
/virt
/models

# build info
/dist
.eggs/
/DonkeyBot.egg-info
/lib/DonkeyBot.egg-info

# data (created locally on your machine after running build_donkeybot.py)
/data/old
/data/conversation_dict.pickle
/data/data_storage.db
/data/docs_input_data.db
/data/emails_input_data.db
/data/issues_input_data.db

# files
__pycache__
.pytest_cache
2 changes: 2 additions & 0 deletions AUTHORS.md
@@ -0,0 +1,2 @@
## Individual contributors to the source code
- Vasilis Mageirakos <b.mageirakos@gmail.com>, 2020
41 changes: 41 additions & 0 deletions data/faq.json
@@ -0,0 +1,41 @@
[{
"answer": "You can find the contact information for all authors and contributors to Rucio on:\nhttps://github.com/rucio/rucio/blob/master/AUTHORS.rst",
"author": "Vasilis",
"created_at": "2020-08-22 13:18:10+00:00",
"faq_id": "faq_847d5843a5e040c6ae272f01110d7728",
"keywords": "authors,contributors",
"question": "Who are the Rucio authors?"
},
{
"answer": "To register a data set you can use the python replica client:\nhttps://github.com/rucio/rucio/blob/de7ad68cabe8bcd39a2e7301ffab2642e2b70256/lib/rucio/client/replicaclient.py#L195",
"author": "Martin",
"created_at": "2020-08-22 13:24:18+00:00",
"faq_id": "faq_a754aa4a921b4239abbc7e578bf4a73f",
"keywords": "dataset,register",
"question": "What is the command to use to register a data set?"
},
{
"answer": "If you define the RSE as non-deterministic, you can rely on your own directory structure and just have to register the PFN where the file is located as well.\nIn case of a deterministic RSE, you would have to place the files based on the deterministic function being used (Most likely the hashing).\nIn that case you can do the registration as well, but Rucio would require that the file is based in a directory based on the has function.",
"author": "Martin",
"created_at": "2020-08-22 13:25:55+00:00",
"faq_id": "faq_74551d3125cb4925b0352092b2732167",
"keywords": "dataset,directory,structure",
"question": "What is the logic behind the dataset directory structuring?"
},
{
"answer": "Not easily, this would require manual changes in the database. \nFor example, you would need to update all rules that use the RSE name in their RSE expression.",
"author": "Dimitrios",
"created_at": "2020-08-22 13:27:01+00:00",
"faq_id": "faq_d34159f4297c4eb6894820dbe42587d5",
"keywords": "rse,rename",
"question": "Is it possible to rename an RSE?"
},
{
"answer": "The full schema in SQL Alchemy description is at https://github.com/rucio/rucio/blob/master/lib/rucio/db/sqla/models.py\nThis is the one being used if you initiate the database with alembic.\nIn parallel we also maintain a schema file for oracle, this is equivalent to the models file from sqlalchemy, but has some oracle specific optimizations.\nhttps://github.com/rucio/rucio/blob/master/etc/sql/oracle/schema.sql",
"author": "Martin",
"created_at": "2020-08-22 13:27:49+00:00",
"faq_id": "faq_98135db779aa4ead9544b816fadbb654",
"keywords": "database,schema",
"question": "Where can I see the database schema of the Rucio DB?"
}
]
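
Each FAQ entry keeps its keywords as a comma-separated string. A minimal sketch (illustrative, not part of this commit) of loading the file and indexing the questions by keyword:

import json

# load the FAQ entries added in this commit
with open("data/faq.json") as f:
    faqs = json.load(f)

# build a keyword -> questions lookup from the comma-separated keyword field
by_keyword = {}
for faq in faqs:
    for keyword in faq["keywords"].split(","):
        by_keyword.setdefault(keyword, []).append(faq["question"])

print(by_keyword["rse"])  # ['Is it possible to rename an RSE?']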
File renamed without changes.
55 changes: 55 additions & 0 deletions lib/bot/answer/base.py
@@ -0,0 +1,55 @@
# general python
from uuid import uuid4
import datetime
import hashlib
import re


class Answer:
def __init__(
self,
question,
answer,
model,
start,
end,
confidence,
extended_answer,
extended_start,
extended_end,
metadata,
):
# Set unique ID
self.id = str(uuid4().hex)
self.user_question = question
# Multiple answers can be created for the same user_question,
# so give the question its own id as well
clean_question = str(question).lower()
# strip all trailing question marks and spaces before hashing,
# so equivalent phrasings hash to the same id
clean_question = re.sub(r"[ ?]*$", "", clean_question)
self.user_question_id = hashlib.md5(clean_question.encode("utf-8")).hexdigest()[
:10
]
self.answer = answer
self.start = start
self.end = end
self.confidence = confidence
self.extended_answer = extended_answer
self.extended_start = extended_start
self.extended_end = extended_end
self.model = model
# TODO add FAQ option as an origin
if "doc_id" in metadata:
self.origin = "documentation"
elif "faq_id" in metadata:
self.origin = "faq"
else:
self.origin = "questions"
# +00:00 since it's utcnow(), same format as other dates saved in data_storage
self.created_at = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S+00:00")
self.label = None
self.metadata = metadata

def __str__(self):
return f"answer: {self.extended_answer}... , confidence: {self.confidence}''"
170 changes: 170 additions & 0 deletions lib/bot/answer/detector.py
@@ -0,0 +1,170 @@
# Donkeybot's AnswerDetector utilizes Hugging Face's Transformers
# Useful links:
# 1) https://huggingface.co/transformers/task_summary.html#extractive-question-answering (example)
# 2) https://huggingface.co/transformers/model_doc/bert.html (bert)
# 3) https://huggingface.co/transformers/main_classes/tokenizer.html (tokenizer)
# 4) https://stackoverflow.com/questions/59701981/bert-tokenizer-model-download (tokenizer)
# 5) https://huggingface.co/transformers/pretrained_models.html (models)
# 6) https://huggingface.co/transformers/_modules/transformers/pipelines.html (pipelines)

# bot modules
import bot.config as config
from bot.answer.base import Answer

# general python
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from tqdm import tqdm
import pandas as pd
import sys


class AnswerDetector:
"""Answer Detector"""

def __init__(
self,
model="distilbert-base-cased-distilled-squad",
extended_answer_size=30,
handle_impossible_answer=True,
max_answer_len=25,
max_question_len=64,
max_seq_len=256,
num_answers_to_predict=3,
doc_stride=128,
device=0,
):
"""
<!> Default values from source code for transformers.pipelines:
("topk", 1)
("doc_stride", 128)
("max_answer_len", 15)
("max_seq_len", 384)
("max_question_len", 64)
("handle_impossible_answer", False)
:param model : name of the transformer model for QA (default is distilbert-base-cased-distilled-squad)
:param extended_answer_size : number of characters before and after the answer detected by our
model that are returned to give the user more context (default is 30)
:param handle_impossible_answer : True if we wish to return impossible/empty answers, False otherwise (default is True)
:param max_answer_len : maximum length of an answer (default is 25)
:param max_question_len : maximum length of a question (default is 64)
:param max_seq_len : maximum length of one input sequence (default 256)
:param num_answers_to_predict : number of answers predicted per document (default is 3)
:param doc_stride : stride of the sliding window used to split documents longer than max_seq_len (default is 128)
:param device : if < 0 -> use cpu
if >=0 -> use gpu
"""

self.model_name = model
try:
qa_model = AutoModelForQuestionAnswering.from_pretrained(
config.MODELS_DIR + self.model_name
)
qa_tokenizer = AutoTokenizer.from_pretrained(
config.MODELS_DIR + self.model_name
)
except Exception as _e:
print(_e)
sys.exit(f"Make sure that the model exists under {config.MODELS_DIR}")
self.model = pipeline(
"question-answering",
model=qa_model,
tokenizer=qa_tokenizer,
framework="pt",
device=device,
)
self.extended_answer_size = extended_answer_size
self.num_answers_to_predict = num_answers_to_predict
self.handle_impossible_answer = handle_impossible_answer
self.max_answer_len = max_answer_len
self.max_question_len = max_question_len
self.max_seq_len = max_seq_len
self.doc_stride = doc_stride

def predict(self, question, documents, top_k=1):
"""
Use this method to return top_k answer(s) based on input
question and documents.
:param question : question string
:type question : str
:param documents : pd.DataFrame that contains 'context' and other data
:type documents : pandas DataFrame
:param top_k : total number of answers to return, across all documents (default is 1)
:returns top_k_answers : list of the top_k highest-confidence Answer objects
"""

answers = []
best_overall_score = 0

assert type(documents) == pd.DataFrame
assert "context" in documents.columns

print(f"Predicting answers from {documents.shape[0]} document(s)...")
for index, doc in tqdm(documents.iterrows(), total=documents.shape[0]):
try:
predictions = self.model(
question=question,
context=doc["context"],
topk=self.num_answers_to_predict,
handle_impossible_answer=self.handle_impossible_answer,
max_answer_len=self.max_answer_len,
max_question_len=self.max_question_len,
max_seq_len=self.max_seq_len,
doc_stride=self.doc_stride,
)
# reason for KeyError: https://github.com/huggingface/transformers/issues/5910
except KeyError as _e:
continue
except Exception as _other_e:
print(_other_e)
continue

# If only 1 answer is requested (self.num_answers_to_predict == 1), transformers returns a dict instead of a list
if type(predictions) == dict:
predictions = [predictions]

best_score = 0
for pred in predictions:
if pred["answer"]:
if pred["score"] > best_score:
best_score = pred["score"]
answer = self._create_answer_object(question, pred, doc)
answers.append(answer)
else:
print("No answer was predicted for this document!")

if best_score > best_overall_score:
best_overall_score = best_score

# sort answers by their `confidence` and select top-k
sorted_answers = sorted(answers, key=lambda k: k.confidence, reverse=True)

top_k_answers = sorted_answers[:top_k]

return top_k_answers

def _create_answer_object(self, question, pred, doc):
extended_start = max(0, pred["start"] - self.extended_answer_size)
extended_end = min(len(doc.context), pred["end"] + self.extended_answer_size)
# drop extra metadata columns
# errors ignored for when we have Question metadata and the 'body' column doesn't exist
metadata = (
doc.drop(["context", "body", "query"], errors="ignore")
.rename({"question": "most_similar_question"}, axis=1)
.to_dict()
)
answer = Answer(
question=question,
model=self.model_name,
answer=pred["answer"],
start=pred["start"],
end=pred["end"],
confidence=pred["score"],
extended_answer=doc.context[extended_start:extended_end],
extended_start=extended_start,
extended_end=extended_end,
metadata=metadata,
)
return answer
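
A usage sketch for the detector (illustrative, not part of this commit). It assumes the QA model has already been downloaded under config.MODELS_DIR, e.g. by build_donkeybot; predict() expects a pandas DataFrame with a 'context' column and returns the top_k highest-confidence Answer objects:

import pandas as pd
from bot.answer.detector import AnswerDetector

detector = AnswerDetector(device=-1)  # < 0 -> CPU, >= 0 -> GPU id

# any DataFrame with a 'context' column works; extra columns become metadata
documents = pd.DataFrame(
    {
        "context": [
            "The full schema in SQLAlchemy description is at "
            "https://github.com/rucio/rucio/blob/master/lib/rucio/db/sqla/models.py"
        ],
        "doc_id": ["doc_1"],
    }
)

answers = detector.predict(
    question="Where can I see the database schema of the Rucio DB?",
    documents=documents,
    top_k=1,
)
for answer in answers:
    print(answer)  # "answer: <extended answer>..., confidence: <score>"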