## Initialisation

In [None]:
# Check for running GPU
# Shows the NVIDIA driver/GPU status; the DPR retriever and the FARM reader
# further down are both constructed with use_gpu=True.
!nvidia-smi

In [None]:
# Run once for initial installs
# NOTE(review): haystack is installed twice — first the PyPI release, then the
# unpinned default branch from GitHub, which overrides it. The version actually
# used therefore depends on the day this cell is run; consider pinning a single
# release or tag so the notebook stays reproducible.
! pip install farm-haystack
! pip install git+https://github.com/deepset-ai/haystack.git
# Pinned versions; the "+cu101" wheels target CUDA 10.1 — presumably chosen to
# match the Colab runtime at the time (confirm before upgrading).
! pip install urllib3==1.25.4
! pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
# Optional: run this only if you want a graphical representation of the pipeline
# (presumably required by the commented-out pipeline.draw(...) call further down).
! apt install libgraphviz-dev graphviz
! pip install pygraphviz

In [1]:
# Download and unpack Elasticsearch 7.6.2 (the server process itself is launched in the next cell)
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz
# Hand the unpacked tree to the daemon user — presumably because the server is
# later started under a non-root uid (os.setuid(1)).
! chown -R daemon:daemon elasticsearch-7.6.2
# NOTE(review): this pause runs BEFORE the server is launched, so it does not
# give Elasticsearch any time to come up; the wait belongs after the Popen call.
! sleep 30

In [3]:
# Launch Elasticsearch, wait until it is reachable, then connect to it
import os
import time
import urllib.error
import urllib.request
from subprocess import Popen, PIPE, STDOUT

# Start the unpacked server as a background process under uid 1 (presumably the
# `daemon` user that owns the extracted files).
# NOTE(review): stdout is a PIPE that nothing reads; a very chatty server could
# eventually block on a full pipe buffer.
es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)
                  )

# Elasticsearch needs a while to boot. Poll its HTTP port instead of connecting
# immediately, otherwise a fresh "Restart & Run All" races the server start-up
# and the document store below fails with a connection error.
_es_deadline = time.monotonic() + 120
while True:
    try:
        urllib.request.urlopen("http://localhost:9200", timeout=2).close()
        break
    except urllib.error.HTTPError:
        # Any HTTP status code means the server is already listening.
        break
    except OSError:
        # Covers URLError, refused connections and socket timeouts.
        if es_server.poll() is not None:
            raise RuntimeError("Elasticsearch exited before it became reachable")
        if time.monotonic() > _es_deadline:
            raise TimeoutError("Elasticsearch did not become reachable within 120 seconds")
        time.sleep(1)

from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore(similarity="dot_product", host="localhost", username="", password="", index="document")

02/14/2021 02:53:20 - INFO - elasticsearch -   HEAD http://localhost:9200/document [status:200 request:0.127s]
02/14/2021 02:53:20 - INFO - elasticsearch -   GET http://localhost:9200/document [status:200 request:0.014s]
02/14/2021 02:53:20 - INFO - elasticsearch -   PUT http://localhost:9200/document/_mapping [status:200 request:0.071s]
02/14/2021 02:53:20 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.018s]


In [4]:
import pprint
# Shared pretty-printer (2-space indent) for inspecting nested results; only the
# commented-out debugging lines in the pipeline section refer to it.
pp = pprint.PrettyPrinter(indent=2)

## Document Preprocessing

In [5]:
import haystack
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader

# --- 1. Read the raw guidance text into a haystack document -----------------
converter = haystack.file_converter.txt.TextConverter(
    remove_numeric_tables=False,
    valid_languages=["en"],
)
as4 = converter.convert(file_path="/content/as4-winterBarley.txt")

# --- 2. Clean it and split it into one document per passage -----------------
processor = haystack.preprocessor.preprocessor.PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="passage",
    split_length=1,
    split_respect_sentence_boundary=False,
    split_overlap=0,
)
as4Docs = processor.process(as4)
# print(as4Docs)

# --- 3. (Re)connect to the local Elasticsearch index ------------------------
# Alternative way of getting a server when Docker is available:
# ! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2
document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="document",
)

# --- 4. Replace whatever is in the index with the freshly split passages ----
document_store.delete_all_documents(index='document')
document_store.write_documents(as4Docs)

# Read the documents back from the store (kept for manual inspection).
backagain = document_store.get_all_documents()

# --- 5. Show every passage with its position --------------------------------
for i, passage in enumerate(as4Docs):
    print(f"{i}:", passage)

[nltk_data] Downloading package punkt to /root/nltk_data...


02/14/2021 02:53:27 - INFO - elasticsearch -   HEAD http://localhost:9200/document [status:200 request:0.006s]
02/14/2021 02:53:27 - INFO - elasticsearch -   GET http://localhost:9200/document [status:200 request:0.003s]
02/14/2021 02:53:27 - INFO - elasticsearch -   PUT http://localhost:9200/document/_mapping [status:200 request:0.015s]
02/14/2021 02:53:27 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.004s]


[nltk_data]   Package punkt is already up-to-date!


02/14/2021 02:53:27 - INFO - elasticsearch -   POST http://localhost:9200/document/_delete_by_query [status:200 request:0.209s]
02/14/2021 02:53:30 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:0.978s]
02/14/2021 02:53:30 - INFO - elasticsearch -   POST http://localhost:9200/document/_search?scroll=1d&size=10000 [status:200 request:0.023s]
02/14/2021 02:53:30 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.005s]
02/14/2021 02:53:30 - INFO - elasticsearch -   DELETE http://localhost:9200/_search/scroll [status:200 request:0.004s]


0: {'text': 'Three-spray programmes are recommended for winter barley due to its response to T3 fungicides and the rise in severity of late season ramularia. As such T1 and T2 can be applied slightly earlier if required to better match up with PGR timings.', 'meta': {'_split_id': 0}}
1: {'text': 'The main fungicide used for winter barley disease control is Prothioconazole; however growers should look to protect this active by including alternative modes of action.', 'meta': {'_split_id': 1}}
2: {'text': 'For winter barley, timing for T1 (GS30-31) is important due to more GAI on lower than on upper leaves. The slightly earlier timing will allow better pairing with an early growth regulator and reduce the need for a T0 treatment.', 'meta': {'_split_id': 2}}
3: {'text': 'For winter barley, T1 (GS30-31) application options are:\nSiltra Xpro 0.4 -0.6 l/ha\nCebara 1.0 -1.5 l/ha + Proline 0.25 l/ha,\nComet 0.4 l/ha + Proline 0.3-0.4 l/ha\nFandango 0.75 -1.0 l/ha\nElatus Era 0.6 -0.8 l/ha\nCyp

## Building Individual Components

In [6]:
# Naive retriever based on tf * idf - default is BM25, can be customised
from haystack.retriever.sparse import ElasticsearchRetriever
es_retriever = ElasticsearchRetriever(document_store=document_store)

In [7]:
# Dense alternative: Dense Passage Retrieval, i.e. two BERT encoders — one
# embeds the question, the other embeds the documents.
from haystack.retriever.dense import DensePassageRetriever

dpr_retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=16,
    use_gpu=True,
    embed_title=True,
    use_fast_tokenizers=True,
)

In [8]:
# Dense alternative: a single BERT model embeds both the question and the
# documents; may work better when the documents are similar to each other (our case).
from haystack.retriever.dense import EmbeddingRetriever

embedding_retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="deepset/sentence_bert",
)

02/14/2021 02:54:22 - INFO - haystack.retriever.dense -   Init retriever using embeddings of model deepset/sentence_bert
02/14/2021 02:54:22 - INFO - farm.utils -   Using device: CUDA 
02/14/2021 02:54:22 - INFO - farm.utils -   Number of GPUs: 1
02/14/2021 02:54:22 - INFO - farm.utils -   Distributed Training: False
02/14/2021 02:54:22 - INFO - farm.utils -   Automatic Mixed Precision: None
02/14/2021 02:54:38 - INFO - farm.utils -   Using device: CUDA 
02/14/2021 02:54:38 - INFO - farm.utils -   Number of GPUs: 1
02/14/2021 02:54:38 - INFO - farm.utils -   Distributed Training: False
02/14/2021 02:54:38 - INFO - farm.utils -   Automatic Mixed Precision: None


In [9]:
# Customised retriever - hook for a future filter on special key words
from haystack.retriever.dense import BaseRetriever


class CustomRetriever(BaseRetriever):
  """Placeholder retriever that currently contributes no documents."""

  def retrieve(self, query, filters=None, top_k=10, index=None):
    """Delegate to the base implementation, then return an empty result list.

    The arguments are passed straight through to ``BaseRetriever.retrieve``;
    whatever that call returns is discarded.
    """
    super().retrieve(query, filters, top_k, index)
    # Nothing is retrieved yet: this node is a stub.
    no_documents = []
    return no_documents


custom_retriever = CustomRetriever()

In [10]:
# Reader to further scan the retrieved documents with Hugging Face models.
# Alternative reader (kept for reference):
# reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)
# NOTE(review): FARMReader is imported in the preprocessing cell, and this reader
# is not added as a node to the pipeline built below — only the commented-out
# ExtractiveQAPipeline refers to it.
reader = FARMReader(model_name_or_path="deepset/bert-large-uncased-whole-word-masking-squad2", use_gpu=True)

02/14/2021 02:54:38 - INFO - farm.utils -   Using device: CUDA 
02/14/2021 02:54:38 - INFO - farm.utils -   Number of GPUs: 1
02/14/2021 02:54:38 - INFO - farm.utils -   Distributed Training: False
02/14/2021 02:54:38 - INFO - farm.utils -   Automatic Mixed Precision: None
02/14/2021 02:56:11 - INFO - farm.utils -   Using device: CUDA 
02/14/2021 02:56:11 - INFO - farm.utils -   Number of GPUs: 1
02/14/2021 02:56:11 - INFO - farm.utils -   Distributed Training: False
02/14/2021 02:56:11 - INFO - farm.utils -   Automatic Mixed Precision: None
02/14/2021 02:56:11 - INFO - farm.infer -   Got ya 2 parallel workers to do inference ...
02/14/2021 02:56:11 - INFO - farm.infer -    0    0 
02/14/2021 02:56:11 - INFO - farm.infer -   /w\  /w\
02/14/2021 02:56:11 - INFO - farm.infer -   /'\  / \
02/14/2021 02:56:11 - INFO - farm.infer -     


In [11]:
# Have the document store (re)compute the embeddings of its documents using the DPR retriever.
# NOTE(review): embedding_retriever is bound to the same document store but uses a
# different model (deepset/sentence_bert) — confirm that its queries are meant to be
# scored against embeddings produced by DPR.
document_store.update_embeddings(dpr_retriever) # TODO: possibly train the DPR model

02/14/2021 02:56:11 - INFO - elasticsearch -   POST http://localhost:9200/document/_count [status:200 request:0.074s]
02/14/2021 02:56:11 - INFO - haystack.document_store.elasticsearch -   Updating embeddings for all 14 docs ...
02/14/2021 02:56:11 - INFO - elasticsearch -   POST http://localhost:9200/document/_search?scroll=1d&size=10000 [status:200 request:0.011s]
02/14/2021 02:56:11 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.005s]
02/14/2021 02:56:11 - INFO - elasticsearch -   DELETE http://localhost:9200/_search/scroll [status:200 request:0.004s]
02/14/2021 02:56:12 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:0.517s]


In [18]:
# Keyword categories used to tell retrieved passages apart.
# For winter barley only timing is necessary, to be completed for the full document.
# Keywords must be lower-case: they are matched against lower-cased text.
filters_dictionary = { "timings":["t0","t1","t2","t3"],
                       "fungus":["ramularia", "yellow rust"],
                       "areas":["east", "north", "southeast", "west", "south", "southwest"]}
# Follow-up question asked for each category. The keys MUST be identical to the
# keys of filters_dictionary, because the category name is used to look up the
# question (a mismatch raises KeyError at question time).
questions = { "timings":"Is there a specific timing that you would like to ask about? (E.g. T0, T1, etc)",
              "fungus":"Is there a fungi type that you would want to know about specifically?",
              "areas":"Which area are you in? (E.g. east, north, etc)"}

class FurtherQuestionGenerator:
  """Pipeline node that re-ranks retrieved documents by asking follow-up questions.

  Each document is tagged with the keywords from ``filters_dictionary`` that
  occur in its text. While the documents disagree on a category that the user
  has not specified yet, the user is asked the matching question from
  ``questions`` and documents that agree with the answer are moved up.
  """
  outgoing_edges = 1

  def individualFiltersGenerator(self, text):
    """Return ``{category: [keywords found in text]}`` for one piece of text.

    Matching is a case-insensitive substring test, so e.g. "southeast" also
    matches the keywords "east" and "south". Categories without any match are
    omitted from the result.
    """
    lowered = text.lower()  # lower-case once instead of once per keyword
    current_filters = {}
    for category, keywords in filters_dictionary.items():
      found = [keyword for keyword in keywords if keyword in lowered]
      if found:
        current_filters[category] = found
    return current_filters

  def topDocsFilterGenerator(self, docs):
    """Return the keyword dictionary of every document (read from ``doc.text``), in order."""
    return [self.individualFiltersGenerator(doc.text) for doc in docs]

  def filters_difference(self, filters_list, specified=None):
    """Return the first category on which the documents disagree, else ``None``.

    Args:
      filters_list: one keyword dictionary per document.
      specified: categories to ignore (already known); ``None`` means none.

    A document that lacks a category entirely does not count as disagreeing.
    """
    ignored = () if specified is None else specified
    first_seen = {}
    for doc_filters in filters_list:
      for category, keywords in doc_filters.items():
        if category in ignored:
          continue
        if category not in first_seen:
          first_seen[category] = keywords
        elif keywords != first_seen[category]:
          return category
    return None

  def furtherQuestions(self, docs, specified=None):
    """Ask follow-up questions on stdin and return ``docs`` re-ordered by agreement.

    Args:
      docs: retrieved documents (objects with a ``text`` attribute).
      specified: categories already fixed by the query; never mutated.

    Returns:
      A new list with the same documents, best match first. Documents with the
      same score keep their original relative order.
    """
    # Work on a private copy: the original signature used a mutable default that
    # was appended to, so answers leaked from one call into the next.
    specified = list(specified) if specified is not None else []
    filters_list = self.topDocsFilterGenerator(docs)
    match = [0] * len(docs)
    keyword = self.filters_difference(filters_list, specified)
    while keyword is not None:
      # Fall back to a generic prompt instead of raising KeyError when a
      # category has no dedicated question.
      prompt = questions.get(keyword, "Could you be more specific about the " + keyword + "?")
      answer = input(prompt + " ").strip().lower()
      for position, doc_filters in enumerate(filters_list):
        found = doc_filters.get(keyword, [])
        if answer in found:
          match[position] += 1      # document mentions the answer
          if found == [answer]:
            match[position] += 1    # ...and mentions nothing else in this category
      specified.append(keyword)
      keyword = self.filters_difference(filters_list, specified)
    ranked = sorted(zip(match, docs), key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in ranked]

  def run(self, **kwargs):
    """Pipeline entry point: expects ``query`` and ``documents`` keyword arguments.

    Categories already mentioned in the query are not asked about again.
    Returns ``(re-ranked documents, "output_1")``.
    """
    specified = list(self.individualFiltersGenerator(kwargs["query"]).keys())
    return (self.furtherQuestions(kwargs["documents"], specified), "output_1")

question_generator = FurtherQuestionGenerator()

## Assembling into Pipeline

In [12]:
#from haystack.pipeline import ExtractiveQAPipeline

# Original Finder deprecated, pipeline allows more flexibility
# prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)

# top_k_retriever -> the more documents the retriever returns, the more the Reader has to scan: slower, but higher hit rate
#extractive_pipeline = ExtractiveQAPipeline(reader=reader, retriever=dpr_retriever) # Other options: Document Search, Generative, FAQ

In [14]:
# Naive approach - can fall back upon this if anything goes wrong
# question = 'what are the T1 appkication option for winter barley?'
# prediction = extractive_pipeline.run(query=question, top_k_retriever=2, top_k_reader=2)
# print_answers(prediction, details="all") #details: all, medium, minimal
# data format: {query,'answers':[{'answer','score','probability','context','document_id','offset','meta'}]}
# pp.pprint(prediction['answers'])
# pp.pprint(dpr_retriever.retrieve(query=question,top_k=13))

In [22]:
# Current approach
from haystack import Pipeline
from haystack.pipeline import JoinDocuments

# Build a pipeline that fans the query out to every retriever, merges their
# results and hands the merged list to the follow-up question generator.
pipeline = Pipeline()
pipeline.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
pipeline.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"])
pipeline.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])
pipeline.add_node(component=custom_retriever, name="CustomRetriever", inputs=["Query"])
pipeline.add_node(component=JoinDocuments(join_mode="merge"), name="JoinResults", inputs=["ESRetriever", "DPRRetriever", "EmbeddingRetriever", "CustomRetriever"])
pipeline.add_node(component=question_generator, name="QnGenerator", inputs=["JoinResults"])
# pipeline.draw(path="custom_pipe.png")

# Question input goes here
question = input("What would you like to ask about?")
responses = pipeline.run(query=question, top_k_retriever=5)

# Final answer is here: the best-ranked document after the follow-up questions.
# Guard against an empty result so that a query with no hits does not end in
# an IndexError.
if responses:
    print(responses[0].text)
else:
    print("No matching passage was found for this question.")

What would you like to ask about?what should I spray for winter barley?


02/14/2021 03:02:40 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.009s]
02/14/2021 03:02:40 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.011s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 17.40 Batches/s]
02/14/2021 03:02:40 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.012s]


Is there a specific timing that you would like to ask about? (E.g. T0, T1, etc) T0
A three-spray programme for winter barley allows chlorothalonil (CTL) to be used with the latter two sprays, (so no requirement at T1) but if employing a two-spray programme (or three-sprays at T0, T1 and T2) then it would be wise to include chlorothalonil (CTL) at T1.
