In [1]:
import logging

# Root logger: warnings and above, rendered as "LEVEL - logger.name -  message".
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)

# The "haystack" logger is set one level chattier (INFO) than the root logger.
haystack_logger = logging.getLogger("haystack")
haystack_logger.setLevel(logging.INFO)


In [3]:
import re
from haystack import Document
# tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Read the raw corpus text. The context manager closes the file handle as soon
# as the text is in memory (a bare open() would leave it open).
with open("./data/ACI_all.txt", encoding="utf8") as fp:
    data = fp.read()

# Collapse runs of newlines so consecutive paragraphs are separated by one "\n".
data = re.sub("\n+", "\n", data)

# One {"content": ...} dict per paragraph. Blank / whitespace-only lines (for
# example the empty first line produced when the file starts with a newline)
# are skipped so that no empty documents end up in the document store.
docs = [
    {"content": paragraph}
    for paragraph in data.splitlines()
    if paragraph.strip()
]


In [1]:
from haystack.document_stores import InMemoryDocumentStore

# In-memory store settings, gathered in one place so they are easy to tweak.
# NOTE(review): a later cell rebinds `document_store` to a FAISSDocumentStore,
# so only one of the two store cells is effectively in use -- confirm which.
in_memory_store_config = {
    "index": 'haystack-lfqa',
    "similarity": "cosine",
    "embedding_dim": 768,  # presumably the retriever model's vector width -- confirm
}
document_store = InMemoryDocumentStore(**in_memory_store_config)

# Clear the index, then load the paragraph dicts held in `docs`.
document_store.delete_documents()
document_store.write_documents(docs)

In [1]:
from haystack.document_stores import FAISSDocumentStore
# FAISS-backed store built with the "Flat" index factory string.
# To reuse a previously saved index instead, swap the constructor call for:
#     document_store = FAISSDocumentStore.load("./documents_updated.faiss")
# NOTE(review): the in-memory store cell passes similarity="cosine", while this
# one relies on the library default -- confirm the two are meant to differ.
faiss_store_config = {"faiss_index_factory_str": "Flat"}
document_store = FAISSDocumentStore(**faiss_store_config)

# Clear whatever the store currently holds, then index the paragraphs in `docs`.
document_store.delete_documents()
document_store.write_documents(docs)

In [5]:
import torch
# Ask PyTorch whether a CUDA device is usable. Being the last expression of the
# cell, the boolean is shown as the cell output (True when a GPU is available).
# Optional extra diagnostic, kept disabled
# (NOTE(review): torch.version.cuda looks like an attribute, not a callable -- confirm):
# torch.version.cuda
torch.cuda.is_available()
# Optional extra diagnostic, kept disabled:
# torch.__version__

True

In [5]:
from haystack.nodes import EmbeddingRetriever

# The embedding model is read from a local checkout. To obtain it:
#     git lfs install
#     git clone https://huggingface.co/flax-sentence-embeddings/all_datasets_v3_mpnet-base
retriever_config = dict(
    document_store=document_store,
    embedding_model="./all_datasets_v3_mpnet-base",
    model_format="sentence_transformers",
)
retriever = EmbeddingRetriever(**retriever_config)

In [6]:
# Have the store (re)compute document embeddings through the retriever,
# calling update_embeddings with batch_size=128.
embedding_batch_size = 128
document_store.update_embeddings(retriever, batch_size=embedding_batch_size)

Updating Embedding:   0%|          | 0/422 [00:00<?, ? docs/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

AttributeError: 'InMemoryDocumentStore' object has no attribute 'save'

In [8]:
from haystack.pipelines import DocumentSearchPipeline
from haystack.utils import print_documents

# Retrieval-only pipeline built around the embedding retriever.
search_pipe = DocumentSearchPipeline(retriever)

# Run one sample question, asking the "Retriever" node for top_k=2.
search_params = {"Retriever": {"top_k": 2}}
result = search_pipe.run(
    query="What is the largest business of ACI?", params=search_params
)

# Pretty-print the pipeline result with Haystack's helper.
print_documents(result)



Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: What is the largest business of ACI?

{   'content': 'Advanced Chemical Industries (ACI) Limited, being one of the '
               'largest conglomerates[4] in Bangladesh with a multinational '
               'heritage operates across the country through its four '
               'diversified strategic business units.',
    'name': None}

{   'content': 'Advanced Chemical Industries Ltd. Or ACI Ltd. (ACI Limited) is '
               'one of the largest conglomerates in Bangladesh. ACI sells a '
               'total of 39 categories of products in the four sectors of '
               'Healthcare, Consumer Goods and Electronics, Agriculture, and '
               'Retail. With a market value of Tk 1500 crore and a revenue of '
               'Tk 6300 crore, the conglomerate has 14 subsidiaries, 4 joint '
               'venture and associate companies, and 12 manufacturing plants. '
               'Despite being involved in so many businesses, ACI’s profit '
               'gene

In [10]:
from haystack.nodes import Seq2SeqGenerator
from transformers import AutoTokenizer
from collections.abc import Callable
import pickle

# Tokenizer loaded from the local "./bart_lfqa" directory.
# NOTE(review): no visible cell uses `tokenizer`, and the generator below is
# loaded from the hub id "vblagoje/bart_lfqa" rather than this local path --
# confirm whether both are meant to point at the same model.
local_tokenizer_dir = "./bart_lfqa"
tokenizer = AutoTokenizer.from_pretrained(local_tokenizer_dir)

query = "Summarise ACI Limited."

# An abandoned, commented-out draft of a custom `input_converter` callable
# (tokenizer, query, documents, top_k) used to sit here; the generator is
# created without one.
generator_model_id = "vblagoje/bart_lfqa"
generator = Seq2SeqGenerator(model_name_or_path=generator_model_id)



In [13]:
import pickle
# Serialise the generator object to 'generator_win.pkl'.
# The context manager makes sure the file is flushed and closed once the dump
# finishes; handing a bare open(...) to pickle.dump leaves the handle open.
# NOTE(review): whether this object unpickles reliably in another environment
# is not shown here -- confirm before relying on the cached file.
with open('generator_win.pkl', 'wb') as generator_file:
    pickle.dump(generator, generator_file)

In [14]:
import pickle
# Restore the generator from "generator_win.pkl".
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load a pickle that was produced locally by a trusted run of this notebook.
# The context manager closes the file handle, which a bare open(...) would leak.
with open("generator_win.pkl", "rb") as generator_file:
    generator = pickle.load(generator_file)

In [15]:
from haystack.pipelines import GenerativeQAPipeline

# Combine the generator and the retriever into one generative QA pipeline.
pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)

In [16]:


# Per-node settings: top_k=3 for the "Retriever" node, top_k=1 for the "Generator" node.
qa_params = {
    "Retriever": {"top_k": 3},
    "Generator": {"top_k": 1},
}
result = pipe.run(query="Summarise ACI Limited.", params=qa_params)

# Show the text of the first answer as the cell output.
result["answers"][0].answer



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'ACI is one of the largest conglomerates in Bangladesh, with 14 subsidiaries, 4 joint venture and associate companies, and 12 manufacturing plants. It sells a total of 39 categories of products in the four sectors of Healthcare, Consumer Goods and Electronics, Agriculture, and Retail.'

In [None]:
from haystack.nodes import TransformersSummarizer

# Summarizer backed by the "google/pegasus-xsum" model.
summarizer = TransformersSummarizer(model_name_or_path="google/pegasus-xsum")

# Feed every document currently held by the store to the summarizer.
all_documents = document_store.get_all_documents()
summary = summarizer.predict(documents=all_documents)

INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


Downloading:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

In [31]:
# Print the text of each document in the result, followed by a blank line.
for context in result["documents"]:
    paragraph = context.content + "\n"
    print(paragraph)

ACI HealthCare Limited, a public limited company, is a subsidiary of Advanced Chemical Industries (ACI) Limited was incorporated in 18 February 2013 under the Companies Act 1994.

ACI introduced the concept of quality management system by being the first company in Bangladesh to achieve ISO 9001 certification in 1995 and follows the policy of continuous improvement in all its operations.

ACI Marine & Riverine Technologies (The Company) was incorporated in December 2019. The company is driven with a mission of 'Improving Quality of Life through responsible application of technology and services in Marine & Riverine operation'.



In [None]:
# Inspect which top-level keys the pipeline result contains (shown as cell output).
result.keys()

dict_keys(['query', 'answers', 'documents', 'root_node', 'params', 'node_id'])

In [None]:
# Display the documents stored under the result's "documents" key (shown as cell output).
result["documents"]

[<Document: {'content': 'With almost three decades of partnering life and engendering hope, ACI is one of the top pharmaceutical companies in Bangladesh, employing more than 5,000 people all over the country. As a progressive and forward-thinking company, ACI Pharma is dedicated to improve the health of people of Bangladesh through introduction of innovative and reliable pharmaceutical products.', 'content_type': 'text', 'score': 0.5016781436987151, 'meta': {}, 'embedding': None, 'id': '88f16502772e9c62fb74182e0cd36423'}>,
 <Document: {'content': 'Advanced Chemical Industries (ACI) Limited is one of the leading conglomerates in Bangladesh, with a multinational heritage. We have a mission to achieve business excellence through quality by understanding, accepting, meeting and exceeding customer expectations. We follow International Standards on Quality Management System to ensure consistent quality of products and services to achieve customer satisfaction.', 'content_type': 'text', 'scor