# Importing `google` library to implement Google Search API

In [1]:
!pip install google

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Importing required dependencies and libraries
- `pandas`
- `re` - Regular Expressions
- `requests`
- `BeautifulSoup4`
- `os`
- `unicodedata`

In [2]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup as bs
import os
import unicodedata

## Initiating session headers to make the HTTP requests behave as humanly as possible.

In [3]:
session = requests.Session()

session.headers['User-Agent']

'python-requests/2.23.0'

In [4]:
my_headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 14685.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.4992.0 Safari/537.36",
              "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"}

## Implementing the google search API to get the relevant URLs list

In [5]:
urls = []

try:
    from googlesearch import search
except ImportError:
    print("No module named 'google' found")
 
# to search
query = "What is Blockchain?"
 
for j in search(query, tld="co.in", num=10, stop=10, pause=2):
  urls.append(j)

In [6]:
urls

['https://www.investopedia.com/terms/b/blockchain.asp',
 'https://www.investopedia.com/tech/blockchain-technologys-three-generations/',
 'https://www.investopedia.com/non-fungible-tokens-nft-5115211',
 'https://www.investopedia.com/terms/b/block-bitcoin-block.asp',
 'https://www.investopedia.com/terms/h/hash.asp',
 'https://www.ibm.com/topics/what-is-blockchain',
 'https://www.ibm.com/topics/what-is-blockchain#anchor-1833736260',
 'https://www.ibm.com/topics/what-is-blockchain#anchor-1286529273',
 'https://www.crunchbase.com/organization/blockchain-info',
 'https://en.wikipedia.org/wiki/Blockchain.com']

## Performing Webscraping the relevat URLs to get the text information

In [7]:
file_name = "text-data"

con = ""
for url in urls:
  result = session.get(url, headers = my_headers)
  doc = bs(result.content, "html.parser")
  contents = doc.find_all("p")
  for content in contents:
    con = con + content.text + "\n"

## Removing accented characters from the obtained doc

In [8]:
# def remove_accented_chars(text):
#     new_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
#     return new_text

# con = remove_accented_chars(con)

## Writing the obtained text data into a text file

In [9]:
with open('./{}.txt'.format(file_name), mode='wt', encoding='utf-8') as file:
    file.write(con)

## Checking to see if virtual GPU is running

In [10]:
# Make sure you have a GPU running
!nvidia-smi

Wed Jun  8 06:26:58 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Filter Warning messages 

In [11]:
import warnings
warnings.filterwarnings("ignore")

# Importing Haystack library from GitHub repo of **deepset.ai**

In [12]:
 # Install the latest release of Haystack in your own environment
#! pip install farm-haystack

# Install the latest master of Haystack
!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-22.1.2-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 8.8 MB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.1.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting farm-haystack[colab]
  Cloning https://github.com/deepset-ai/haystack.git to /tmp/pip-install-n39jkno_/farm-haystack_72403aeecc20474ea84d4eb945ac1fd3
  Running command git clone --filter=blob:none --quiet https://github.com/deepset-ai/haystack.git /tmp/pip-install-n39jkno_/farm-haystack_72403aeecc20474ea84d4eb945ac1fd3
  Resolved https://github.com/deepset-ai/haystack.git to commit c17969e001360ee4309211a70de37fa582b6c593
  Installing build de

# Importing the `TextConverter` library
We use the `TextConverter` library to convert the text file contents into a format that is recognised by the Haystack preprocessor. 

In [13]:
from haystack.nodes import TextConverter

converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
data = converter.convert(file_path='/content/text-data.txt', meta=None)[0]

INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/
ERROR - root -  Failed to import 'magic' (from 'python-magic' and 'python-magic-bin' on Windows). FileTypeClassifier will not perform mimetype detection on extensionless files. Please make sure the necessary OS libraries are installed if you need this functionality.
INFO - haystack.telemetry -  Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by calling disable_telemetry() or by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page. More information at https://haystack.deepset.ai/guides/telemetry


# Importing the `PreProcessor` library
We import the `PreProcessor` library to perform necessary preprocessing tasks on the document such as splitting by words (can be sentences or paragraphs as well), cleaning whitespaces, setting split length and overlap args.

In [14]:
from haystack.nodes import PreProcessor

preprocessor = PreProcessor(split_by = 'word', 
                            clean_whitespace = True, 
                            split_length = 200, 
                            split_overlap = 10)

preprocessed = preprocessor.process(data)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Importing the `InMemoryDocumentStore`
We had first tried to implement the `ElasticSearchDocumentStore` but it was an unsuccessful attempt as Docker was not running up to standards for us. As a result, considering the relatively smaller size of our data, we have used the `InMemoryDocumentStore` instead.

In [15]:
from haystack.document_stores import InMemoryDocumentStore

document_store = InMemoryDocumentStore()
document_store.delete_documents()
document_store.write_documents(preprocessed)

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id '673c8bcec77aadb0e4eda718eda5c932' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'dc8050629e3031d3ba73af45567c7bfb' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id 'b48d3cd38b1307046df25a1369cad668' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id '2073b5b3949c19e1daadba0660662a99' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id '4637f34e2fb3814b71c4fa2f010ef284' already exists in index 'document'
INFO - haystack.document_stores.base -  Duplicate Documents: Document with id '755b667bb746c92d83561a7a9d033c8e' already exists in index 'document'
INFO

# Defining the Retriever
From Haystack documentations:
>The Retriever is a lightweight filter that can quickly go through the full document store and pass on a set of candidate documents that are relevant to the query. When used in combination with a Reader, it is a tool for sifting out irrelevant documents, saving the Reader from doing more work than it needs to and speeding up the querying process.

<br>

Here we have used the `DensePassageRetriever` with arguments for `query_embedding_model`, `passage_embedding_model`, boolean `True` for `use_gpu` and `embed_title`.

In [16]:
from haystack.nodes import DensePassageRetriever, EmbeddingRetriever

retriever = DensePassageRetriever(document_store=document_store, 
                                  query_embedding_model="facebook/dpr-question_encoder-single-nq-base", 
                                  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base", 
                                  use_gpu=True, 
                                  embed_title=True)

# retriever = EmbeddingRetriever(document_store=document_store,
#                                embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
#                                model_format="sentence_transformers",
#                                use_gpu=True)

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/493 [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-question_encoder-single-nq-base locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...


Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  Loaded facebook/dpr-question_encoder-single-nq-base


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/492 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-ctx_encoder-single-nq-base locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...


Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  Loaded facebook/dpr-ctx_encoder-single-nq-base


# Updating Retriever Embeddings
This is one simple line of code that basically does the text vectorization in our document store as well as the query. It is used to convert large amounts of text contents into vectors, in the context of semantic search systems. <br>
This process take time because each document in the document store is passed through the transformer model.

In [17]:
# vector embedding the documents - basically count vectorizing
document_store.update_embeddings(retriever)

INFO - haystack.document_stores.memory -  Updating embeddings for 75 docs ...
Updating Embedding:   0%|          | 0/75 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/80 [00:00<?, ? Docs/s]

Documents Processed: 10000 docs [00:01, 7574.20 docs/s]


# Defining the `Seq2SeqGenerator`
The `Seq2SeqGenerator` is a Long Form Question Answering Generator that provides full length answers to queries, some of which actually make sense.

In [18]:
# Instantiate Long Form Answering Generator (LFQA)
from haystack.nodes import Seq2SeqGenerator

generator = Seq2SeqGenerator(model_name_or_path="vblagoje/bart_lfqa")

INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1


Downloading:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

# Defining the `GenerativeQAPipeline` with generator and retriever.

In [19]:
from haystack.pipelines import GenerativeQAPipeline

pipeline = GenerativeQAPipeline(generator, retriever)

# Generating the answers to the questions with `top_k` for the Retriever set to 50 results.

In [20]:
prediction = pipeline.run(query=query, params={"Retriever": {"top_k": 50}})

# Results

In [21]:
print("Q:", prediction['query'])
print("A:", prediction['answers'][0].answer)
print("\n")

Q: What is Blockchain?
A: Blockchain is a way of storing information in a way that can't be altered or destroyed. For example, let's say you have a bank account, and you want to transfer money from one bank account to another. You want to make sure that when you send the money to the other bank, it goes to the correct bank. However, you don't want the bank to know that you sent it to the wrong bank. So, you make a list of all the bank accounts that you have, and then you put them all in a database. This database is called a blockchain. When a new bank account is created, it is added to the list of bank accounts in the database, and when a new transaction is made, it's added to that list, and so on.


