Merge pull request #29 from rucio/develop
Showing 33 changed files with 1,389 additions and 73 deletions.
@@ -1,15 +1,23 @@
# folders
/some-folder
/.vscode
/data
/virt
/models

# build info
/dist
.eggs/
/DonkeyBot.egg-info
/lib/DonkeyBot.egg-info

# data (created locally on your machine after running build_donkeybot.py)
/data/old
/data/conversation_dict.pickle
/data/data_storage.db
/data/docs_input_data.db
/data/emails_input_data.db
/data/issues_input_data.db

# files
__pycache__
.pytest_cache
@@ -0,0 +1,2 @@
## Individual contributors to the source code
- Vasilis Mageirakos <b.mageirakos@gmail.com>, 202
@@ -0,0 +1,41 @@
[{
    "answer": "You can find the contact information for all authors and contributors to Rucio on:\nhttps://github.com/rucio/rucio/blob/master/AUTHORS.rst",
    "author": "Vasilis",
    "created_at": "2020-08-22 13:18:10+00:00",
    "faq_id": "faq_847d5843a5e040c6ae272f01110d7728",
    "keywords": "authors,contributors",
    "question": "Who are the Rucio authors?"
},
{
    "answer": "To register a data set you can use the python replica client:\nhttps://github.com/rucio/rucio/blob/de7ad68cabe8bcd39a2e7301ffab2642e2b70256/lib/rucio/client/replicaclient.py#L195",
    "author": "Martin",
    "created_at": "2020-08-22 13:24:18+00:00",
    "faq_id": "faq_a754aa4a921b4239abbc7e578bf4a73f",
    "keywords": "dataset,register",
    "question": "What is the command to use to register a data set?"
},
{
    "answer": "If you define the RSE as non-deterministic, you can rely on your own directory structure and just have to register the PFN where the file is located as well.\nIn case of a deterministic RSE, you would have to place the files based on the deterministic function being used (most likely the hashing).\nIn that case you can do the registration as well, but Rucio would require that the file is placed in a directory based on the hash function.",
    "author": "Martin",
    "created_at": "2020-08-22 13:25:55+00:00",
    "faq_id": "faq_74551d3125cb4925b0352092b2732167",
    "keywords": "dataset,directory,structure",
    "question": "What is the logic behind the dataset directory structuring?"
},
{
    "answer": "Not easily, this would require manual changes in the database.\nFor example, you would need to update all rules that use the RSE name in their RSE expression.",
    "author": "Dimitrios",
    "created_at": "2020-08-22 13:27:01+00:00",
    "faq_id": "faq_d34159f4297c4eb6894820dbe42587d5",
    "keywords": "rse,rename",
    "question": "Is it possible to rename an RSE?"
},
{
    "answer": "The full schema in SQLAlchemy description is at https://github.com/rucio/rucio/blob/master/lib/rucio/db/sqla/models.py\nThis is the one being used if you initialize the database with alembic.\nIn parallel we also maintain a schema file for Oracle; this is equivalent to the models file from SQLAlchemy, but has some Oracle-specific optimizations.\nhttps://github.com/rucio/rucio/blob/master/etc/sql/oracle/schema.sql",
    "author": "Martin",
    "created_at": "2020-08-22 13:27:49+00:00",
    "faq_id": "faq_98135db779aa4ead9544b816fadbb654",
    "keywords": "database,schema",
    "question": "Where can I see the database schema of the Rucio DB?"
}
]
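
For orientation, a short sketch of how FAQ entries like these could be loaded and filtered by keyword. The file name faq.json is an assumption for illustration; the diff does not show the path.

# Sketch: load FAQ entries like the ones above and filter them by keyword.
import json

with open("faq.json", encoding="utf-8") as f:  # assumed file name
    faqs = json.load(f)  # list of dicts: answer, author, created_at, faq_id, keywords, question

# "keywords" is a comma-separated string, e.g. "rse,rename"
matches = [faq for faq in faqs if "rse" in faq["keywords"].split(",")]
for faq in matches:
    print(f"{faq['question']} -> {faq['answer']}")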
@@ -0,0 +1,55 @@
# general python
from uuid import uuid4
import datetime
import hashlib
import re


class Answer:
    def __init__(
        self,
        question,
        answer,
        model,
        start,
        end,
        confidence,
        extended_answer,
        extended_start,
        extended_end,
        metadata,
    ):
        # Set unique ID
        self.id = str(uuid4().hex)
        self.user_question = question
        # Multiple answers can be created for the same user_question,
        # so create a deterministic id for the user_question itself.
        clean_question = str(question).lower()
        # disregard all trailing question marks and spaces when hashing
        if clean_question[-1] == "?":
            clean_question = re.sub("[ ?]*$", "", clean_question)
        self.user_question_id = hashlib.md5(clean_question.encode("utf-8")).hexdigest()[
            :10
        ]
        self.answer = answer
        self.start = start
        self.end = end
        self.confidence = confidence
        self.extended_answer = extended_answer
        self.extended_start = extended_start
        self.extended_end = extended_end
        self.model = model
        # TODO add FAQ option as an origin
        if "doc_id" in metadata:
            self.origin = "documentation"
        elif "faq_id" in metadata:
            self.origin = "faq"
        else:
            self.origin = "questions"
        # +00:00 since it's utcnow(); same format as other dates saved in data_storage
        self.created_at = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S+00:00")
        self.label = None
        self.metadata = metadata

    def __str__(self):
        return f"answer: {self.extended_answer}... , confidence: {self.confidence}"
@@ -0,0 +1,170 @@
# Donkeybot's AnswerDetector utilizes Hugging Face's Transformers
# Useful links:
# 1) https://huggingface.co/transformers/task_summary.html#extractive-question-answering (example)
# 2) https://huggingface.co/transformers/model_doc/bert.html (bert)
# 3) https://huggingface.co/transformers/main_classes/tokenizer.html (tokenizer)
# 4) https://stackoverflow.com/questions/59701981/bert-tokenizer-model-download (tokenizer)
# 5) https://huggingface.co/transformers/pretrained_models.html (models)
# 6) https://huggingface.co/transformers/_modules/transformers/pipelines.html (pipelines)

# bot modules
import bot.config as config
from bot.answer.base import Answer

# general python
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from tqdm import tqdm
import pandas as pd
import sys


class AnswerDetector:
    """Answer Detector"""

    def __init__(
        self,
        model="distilbert-base-cased-distilled-squad",
        extended_answer_size=30,
        handle_impossible_answer=True,
        max_answer_len=25,
        max_question_len=64,
        max_seq_len=256,
        num_answers_to_predict=3,
        doc_stride=128,
        device=0,
    ):
        """
        <!> Default values from source code for transformers.pipelines:
            ("topk", 1)
            ("doc_stride", 128)
            ("max_answer_len", 15)
            ("max_seq_len", 384)
            ("max_question_len", 64)
            ("handle_impossible_answer", False)

        :param model : name of the transformer model for QA (default is distilbert-base-cased-distilled-squad)
        :param extended_answer_size : number of characters before and after the answer detected by our
                                      model that are returned to give more context to the user (default is 30)
        :param handle_impossible_answer : True if we wish to return impossible/empty answers, False otherwise (default is True)
        :param max_answer_len : maximum length of an answer (default is 25)
        :param max_question_len : maximum length of a question (default is 64)
        :param max_seq_len : maximum length of one input sequence (default is 256)
        :param num_answers_to_predict : number of answers that are predicted per document (default is 3)
        :param doc_stride : stride of the sliding window used to split documents longer than max_seq_len (default is 128)
        :param device : if < 0 use the cpu, if >= 0 use the gpu with that ordinal
        """

        self.model_name = model
        try:
            qa_model = AutoModelForQuestionAnswering.from_pretrained(
                config.MODELS_DIR + self.model_name
            )
            qa_tokenizer = AutoTokenizer.from_pretrained(
                config.MODELS_DIR + self.model_name
            )
        except Exception as _e:
            print(_e)
            sys.exit(f"Make sure that the model exists under {config.MODELS_DIR}")
        self.model = pipeline(
            "question-answering",
            model=qa_model,
            tokenizer=qa_tokenizer,
            framework="pt",
            device=device,
        )
        self.extended_answer_size = extended_answer_size
        self.num_answers_to_predict = num_answers_to_predict
        self.handle_impossible_answer = handle_impossible_answer
        self.max_answer_len = max_answer_len
        self.max_question_len = max_question_len
        self.max_seq_len = max_seq_len
        self.doc_stride = doc_stride

    def predict(self, question, documents, top_k=1):
        """
        Return the top_k answer(s) for the input question, based on the
        given documents.

        :param question : question string
        :type question : str
        :param documents : pd.DataFrame that contains a 'context' column and other data
        :type documents : pandas DataFrame
        :param top_k : number of answers to return overall, after sorting by confidence (default is 1)
        :returns top_k_answers : list of the top_k Answer objects
        """

        answers = []
        best_overall_score = 0

        assert isinstance(documents, pd.DataFrame)
        assert "context" in documents.columns

        print(f"Predicting answers from {documents.shape[0]} document(s)...")
        for index, doc in tqdm(documents.iterrows(), total=documents.shape[0]):
            try:
                predictions = self.model(
                    question=question,
                    context=doc["context"],
                    topk=self.num_answers_to_predict,
                    handle_impossible_answer=self.handle_impossible_answer,
                    max_answer_len=self.max_answer_len,
                    max_question_len=self.max_question_len,
                    max_seq_len=self.max_seq_len,
                    doc_stride=self.doc_stride,
                )
            # reason for KeyError: https://github.com/huggingface/transformers/issues/5910
            except KeyError as _e:
                continue
            except Exception as _other_e:
                print(_other_e)
                continue

            # If only 1 answer is requested (num_answers_to_predict == 1),
            # transformers returns a dict instead of a list of dicts.
            if isinstance(predictions, dict):
                predictions = [predictions]

            best_score = 0
            for pred in predictions:
                if pred["answer"]:
                    if pred["score"] > best_score:
                        best_score = pred["score"]
                    answer = self._create_answer_object(question, pred, doc)
                    answers.append(answer)
                else:
                    print("No answer was predicted for this document!")

            if best_score > best_overall_score:
                best_overall_score = best_score

        # sort answers by their `confidence` and select the top_k
        sorted_answers = sorted(answers, key=lambda k: k.confidence, reverse=True)

        top_k_answers = sorted_answers[:top_k]

        return top_k_answers

    def _create_answer_object(self, question, pred, doc):
        extended_start = max(0, pred["start"] - self.extended_answer_size)
        extended_end = min(len(doc.context), pred["end"] + self.extended_answer_size)
        # drop extra metadata columns;
        # errors are ignored for when we have Question metadata and the 'body' column doesn't exist
        metadata = (
            doc.drop(["context", "body", "query"], errors="ignore")
            .rename({"question": "most_similar_question"})
            .to_dict()
        )
        answer = Answer(
            question=question,
            model=self.model_name,
            answer=pred["answer"],
            start=pred["start"],
            end=pred["end"],
            confidence=pred["score"],
            extended_answer=doc.context[extended_start:extended_end],
            extended_start=extended_start,
            extended_end=extended_end,
            metadata=metadata,
        )
        return answer
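
Finally, a minimal usage sketch of the detector above. The module path bot.answer.detector and the toy documents DataFrame are assumptions for illustration; the model files must already exist under config.MODELS_DIR (e.g. after running build_donkeybot.py).

# Hypothetical usage sketch for AnswerDetector; the DataFrame contents are illustrative only.
import pandas as pd

from bot.answer.detector import AnswerDetector  # assumed module path

detector = AnswerDetector(device=-1)  # device=-1 runs the pipeline on the CPU

# one "document" with the context the model reads, plus metadata that
# ends up in Answer.metadata (here a faq_id, so Answer.origin becomes "faq")
documents = pd.DataFrame(
    {
        "context": [
            "You can find the contact information for all authors and "
            "contributors to Rucio in the AUTHORS.rst file of the rucio/rucio repository."
        ],
        "faq_id": ["faq_847d5843a5e040c6ae272f01110d7728"],
    }
)

top_answers = detector.predict("Who are the Rucio authors?", documents, top_k=1)
for answer in top_answers:
    print(answer)  # prints the extended answer and its confidence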