Commit
Merge pull request #29 from rucio/develop
expand to use FAQs, add AnswerDetection module, add build_donkeybot; closes #28 #27 #26 #18
mageirakos committed Aug 22, 2020
2 parents 3e269f8 + 13e981f commit f792947
Showing 33 changed files with 1,389 additions and 73 deletions.
10 changes: 9 additions & 1 deletion .gitignore
@@ -1,15 +1,23 @@
# folders
/some-folder
/.vscode
/data
/virt
/models

# build info
/dist
.eggs/
/DonkeyBot.egg-info
/lib/DonkeyBot.egg-info

# data (created locally on your machine after running build_donkeybot.py)
/data/old
/data/conversation_dict.pickle
/data/data_storage.db
/data/docs_input_data.db
/data/emails_input_data.db
/data/issues_input_data.db

# files
__pycache__
.pytest_cache
2 changes: 2 additions & 0 deletions AUTHORS.md
@@ -0,0 +1,2 @@
## Individual contributors to the source code
- Vasilis Mageirakos <b.mageirakos@gmail.com>, 2020
41 changes: 41 additions & 0 deletions data/faq.json
@@ -0,0 +1,41 @@
[{
"answer": "You can find the contact information for all authors and contributors to Rucio on:\nhttps://github.com/rucio/rucio/blob/master/AUTHORS.rst",
"author": "Vasilis",
"created_at": "2020-08-22 13:18:10+00:00",
"faq_id": "faq_847d5843a5e040c6ae272f01110d7728",
"keywords": "authors,contributors",
"question": "Who are the Rucio authors?"
},
{
"answer": "To register a data set you can use the python replica client:\nhttps://github.com/rucio/rucio/blob/de7ad68cabe8bcd39a2e7301ffab2642e2b70256/lib/rucio/client/replicaclient.py#L195",
"author": "Martin",
"created_at": "2020-08-22 13:24:18+00:00",
"faq_id": "faq_a754aa4a921b4239abbc7e578bf4a73f",
"keywords": "dataset,register",
"question": "What is the command to use to register a data set?"
},
{
"answer": "If you define the RSE as non-deterministic, you can rely on your own directory structure and just have to register the PFN where the file is located as well.\nIn case of a deterministic RSE, you would have to place the files based on the deterministic function being used (Most likely the hashing).\nIn that case you can do the registration as well, but Rucio would require that the file is based in a directory based on the has function.",
"author": "Martin",
"created_at": "2020-08-22 13:25:55+00:00",
"faq_id": "faq_74551d3125cb4925b0352092b2732167",
"keywords": "dataset,directory,structure",
"question": "What is the logic behind the dataset directory structuring?"
},
{
"answer": "Not easily, this would require manual changes in the database. \nFor example, you would need to update all rules that use the RSE name in their RSE expression.",
"author": "Dimitrios",
"created_at": "2020-08-22 13:27:01+00:00",
"faq_id": "faq_d34159f4297c4eb6894820dbe42587d5",
"keywords": "rse,rename",
"question": "Is it possible to rename an RSE?"
},
{
"answer": "The full schema in SQL Alchemy description is at https://github.com/rucio/rucio/blob/master/lib/rucio/db/sqla/models.py\nThis is the one being used if you initiate the database with alembic.\nIn parallel we also maintain a schema file for oracle, this is equivalent to the models file from sqlalchemy, but has some oracle specific optimizations.\nhttps://github.com/rucio/rucio/blob/master/etc/sql/oracle/schema.sql",
"author": "Martin",
"created_at": "2020-08-22 13:27:49+00:00",
"faq_id": "faq_98135db779aa4ead9544b816fadbb654",
"keywords": "database,schema",
"question": "Where can I see the database schema of the Rucio DB?"
}
]
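
Each FAQ entry keeps its keywords as a comma-separated string. A minimal sketch (illustrative, not part of this commit) of loading the file and indexing the questions by keyword:

import json

# load the FAQ entries added in this commit
with open("data/faq.json") as f:
    faqs = json.load(f)

# build a keyword -> questions lookup from the comma-separated keyword field
by_keyword = {}
for faq in faqs:
    for keyword in faq["keywords"].split(","):
        by_keyword.setdefault(keyword, []).append(faq["question"])

print(by_keyword["rse"])  # ['Is it possible to rename an RSE?']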
File renamed without changes.
55 changes: 55 additions & 0 deletions lib/bot/answer/base.py
@@ -0,0 +1,55 @@
# general python
from uuid import uuid4
import datetime
import hashlib
import re


class Answer:
def __init__(
self,
question,
answer,
model,
start,
end,
confidence,
extended_answer,
extended_start,
extended_end,
metadata,
):
# Set unique ID
self.id = str(uuid4().hex)
self.user_question = question
# Multiple answers can be created for the same user_question,
# so give the question its own id as well
clean_question = str(question).lower()
# strip all trailing question marks and spaces before hashing,
# so equivalent phrasings hash to the same id
clean_question = re.sub(r"[ ?]*$", "", clean_question)
self.user_question_id = hashlib.md5(clean_question.encode("utf-8")).hexdigest()[
:10
]
self.answer = answer
self.start = start
self.end = end
self.confidence = confidence
self.extended_answer = extended_answer
self.extended_start = extended_start
self.extended_end = extended_end
self.model = model
# TODO add FAQ option as an origin
if "doc_id" in metadata:
self.origin = "documentation"
elif "faq_id" in metadata:
self.origin = "faq"
else:
self.origin = "questions"
# +00:00 since it's utcnow(), same format as other dates saved in data_storage
self.created_at = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S+00:00")
self.label = None
self.metadata = metadata

def __str__(self):
return f"answer: {self.extended_answer}... , confidence: {self.confidence}''"
170 changes: 170 additions & 0 deletions lib/bot/answer/detector.py
@@ -0,0 +1,170 @@
# Donkeybot's AnswerDetector utilizes Hugging Face's Transformers
# Useful links:
# 1) https://huggingface.co/transformers/task_summary.html#extractive-question-answering (example)
# 2) https://huggingface.co/transformers/model_doc/bert.html (bert)
# 3) https://huggingface.co/transformers/main_classes/tokenizer.html (tokenizer)
# 4) https://stackoverflow.com/questions/59701981/bert-tokenizer-model-download (tokenizer)
# 5) https://huggingface.co/transformers/pretrained_models.html (models)
# 6) https://huggingface.co/transformers/_modules/transformers/pipelines.html (pipelines)

# bot modules
import bot.config as config
from bot.answer.base import Answer

# general python
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from tqdm import tqdm
import pandas as pd
import sys


class AnswerDetector:
"""Answer Detector"""

def __init__(
self,
model="distilbert-base-cased-distilled-squad",
extended_answer_size=30,
handle_impossible_answer=True,
max_answer_len=25,
max_question_len=64,
max_seq_len=256,
num_answers_to_predict=3,
doc_stride=128,
device=0,
):
"""
<!> Default values from source code for transformers.pipelines:
("topk", 1)
("doc_stride", 128)
("max_answer_len", 15)
("max_seq_len", 384)
("max_question_len", 64)
("handle_impossible_answer", False)
:param model : name of the transformer model for QA (default is distilbert-base-cased-distilled-squad)
:param extended_answer_size : number of characters before and after the answer detected by our
model that are returned to give the user more context (default is 30)
:param handle_impossible_answer : True if we wish to return impossible/empty answers, False otherwise (default is True)
:param max_answer_len : maximum length of an answer (default is 25)
:param max_question_len : maximum length of a question (default is 64)
:param max_seq_len : maximum length of one input sequence (default 256)
:param num_answers_to_predict : number of answers predicted per document (default is 3)
:param doc_stride : stride of the sliding window used to split documents longer than max_seq_len (default is 128)
:param device : if < 0 -> use cpu
if >=0 -> use gpu
"""

self.model_name = model
try:
qa_model = AutoModelForQuestionAnswering.from_pretrained(
config.MODELS_DIR + self.model_name
)
qa_tokenizer = AutoTokenizer.from_pretrained(
config.MODELS_DIR + self.model_name
)
except Exception as _e:
print(_e)
sys.exit(f"Make sure that the model exists under {config.MODELS_DIR}")
self.model = pipeline(
"question-answering",
model=qa_model,
tokenizer=qa_tokenizer,
framework="pt",
device=device,
)
self.extended_answer_size = extended_answer_size
self.num_answers_to_predict = num_answers_to_predict
self.handle_impossible_answer = handle_impossible_answer
self.max_answer_len = max_answer_len
self.max_question_len = max_question_len
self.max_seq_len = max_seq_len
self.doc_stride = doc_stride

def predict(self, question, documents, top_k=1):
"""
Use this method to return top_k answer(s) based on input
question and documents.
:param question : question string
:type question : str
:param documents : pd.DataFrame that contains 'context' and other data
:type documents : pandas DataFrame
:param top_k : total number of answers to return, across all documents (default is 1)
:returns top_k_answers : list of the top_k highest-confidence Answer objects
"""

answers = []
best_overall_score = 0

assert type(documents) == pd.DataFrame
assert "context" in documents.columns

print(f"Predicting answers from {documents.shape[0]} document(s)...")
for index, doc in tqdm(documents.iterrows(), total=documents.shape[0]):
try:
predictions = self.model(
question=question,
context=doc["context"],
topk=self.num_answers_to_predict,
handle_impossible_answer=self.handle_impossible_answer,
max_answer_len=self.max_answer_len,
max_question_len=self.max_question_len,
max_seq_len=self.max_seq_len,
doc_stride=self.doc_stride,
)
# reason for KeyError: https://github.com/huggingface/transformers/issues/5910
except KeyError as _e:
continue
except Exception as _other_e:
print(_other_e)
continue

# If only 1 answer is requested (self.num_answers_to_predict == 1), transformers returns a dict instead of a list
if type(predictions) == dict:
predictions = [predictions]

best_score = 0
for pred in predictions:
if pred["answer"]:
if pred["score"] > best_score:
best_score = pred["score"]
answer = self._create_answer_object(question, pred, doc)
answers.append(answer)
else:
print("No answer was predicted for this document!")

if best_score > best_overall_score:
best_overall_score = best_score

# sort answers by their `confidence` and select top-k
sorted_answers = sorted(answers, key=lambda k: k.confidence, reverse=True)

top_k_answers = sorted_answers[:top_k]

return top_k_answers

def _create_answer_object(self, question, pred, doc):
extended_start = max(0, pred["start"] - self.extended_answer_size)
extended_end = min(len(doc.context), pred["end"] + self.extended_answer_size)
# drop extra metadata columns
# errors ignored for when we have Question metadata and the 'body' column doesn't exist
metadata = (
doc.drop(["context", "body", "query"], errors="ignore")
.rename({"question": "most_similar_question"}, axis=1)
.to_dict()
)
answer = Answer(
question=question,
model=self.model_name,
answer=pred["answer"],
start=pred["start"],
end=pred["end"],
confidence=pred["score"],
extended_answer=doc.context[extended_start:extended_end],
extended_start=extended_start,
extended_end=extended_end,
metadata=metadata,
)
return answer
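
A usage sketch for the detector (illustrative, not part of this commit). It assumes the QA model has already been downloaded under config.MODELS_DIR, e.g. by build_donkeybot; predict() expects a pandas DataFrame with a 'context' column and returns the top_k highest-confidence Answer objects:

import pandas as pd
from bot.answer.detector import AnswerDetector

detector = AnswerDetector(device=-1)  # < 0 -> CPU, >= 0 -> GPU id

# any DataFrame with a 'context' column works; extra columns become metadata
documents = pd.DataFrame(
    {
        "context": [
            "The full schema in SQLAlchemy description is at "
            "https://github.com/rucio/rucio/blob/master/lib/rucio/db/sqla/models.py"
        ],
        "doc_id": ["doc_1"],
    }
)

answers = detector.predict(
    question="Where can I see the database schema of the Rucio DB?",
    documents=documents,
    top_k=1,
)
for answer in answers:
    print(answer)  # "answer: <extended answer>..., confidence: <score>"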