# Notebook [2]: Using the PDF converter



This notebook shows how to use the PDF converter to create an input dataframe for the cdQA pipeline from a directory of PDF files.


***Note:*** *To run this notebook you will need access to a GPU. If you are using Colab, install `cdQA` first by running `!pip install cdqa` in a cell.*

In [9]:
import os
import pandas as pd
from ast import literal_eval

In [5]:
!pip install tika

Collecting tika
  Downloading https://files.pythonhosted.org/packages/96/07/244fbb9c74c0de8a3745cc9f3f496077a29f6418c7cbd90d68fd799574cb/tika-1.24.tar.gz
Building wheels for collected packages: tika
  Building wheel for tika (setup.py) ... [?25l[?25hdone
  Created wheel for tika: filename=tika-1.24-cp36-none-any.whl size=32884 sha256=8c12b4ec5f7b7816dd43478e4be217af0a73b96224aac50e1ec61d7cac97f474
  Stored in directory: /root/.cache/pip/wheels/73/9c/f5/0b1b738442fc2a2862bef95b908b374f8e80215550fb2a8975
Successfully built tika
Installing collected packages: tika
Successfully installed tika-1.24


In [10]:
import json
import os
import re
import sys
from tqdm import tqdm
from tika import parser
import pandas as pd
import uuid
import markdown
from pathlib import Path
from html.parser import HTMLParser


def df2squad(df, squad_version="v1.1", output_dir=None, filename=None):
    """
    Convert a dataframe of documents into a SQuAD-formatted dictionary.

    Every row becomes one entry of ``json_data["data"]``; every paragraph of
    the row becomes a ``{"context": paragraph, "qas": []}`` item, i.e. the
    result carries no questions yet.

    Parameters
    ----------
    df : pandas.DataFrame
        a pandas dataframe with columns ['title', 'paragraphs'], where each
        'paragraphs' cell is an iterable of paragraphs
    squad_version : str, optional
        value stored under the "version" key (the default is 'v1.1')
    output_dir : str, optional
        directory in which the json file is written; nothing is written when
        it is None or empty (the default is None). The directory is created
        when it does not exist yet.
    filename : str, optional
        name of the exported file, without the ".json" extension. Required
        when `output_dir` is given.

    Returns
    -------
    json_data : dict
        A json object with SQuAD format

    Raises
    ------
    ValueError
        If `output_dir` is given without a `filename`.

    Examples
    --------
    >>> from ast import literal_eval
    >>> import pandas as pd
    >>> from cdqa.utils.converters import df2squad
    >>> from cdqa.utils.filters import filter_paragraphs
    >>> df = pd.read_csv('../data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv', converters={'paragraphs': literal_eval})
    >>> df = filter_paragraphs(df)
    >>> json_data = df2squad(df=df, squad_version='v1.1', output_dir='../data', filename='bnpp_newsroom-v1.1')
    """
    # Fail before doing any work: formatting a missing name would otherwise
    # silently produce a file called "None.json".
    if output_dir and not filename:
        raise ValueError("A filename is required when output_dir is provided.")

    json_data = {"version": squad_version, "data": []}

    # `total` lets the progress bar show completion, which a bare generator
    # such as iterrows() cannot provide on its own.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        json_data["data"].append(
            {
                "title": row["title"],
                "paragraphs": [
                    {"context": paragraph, "qas": []}
                    for paragraph in row["paragraphs"]
                ],
            }
        )

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, "{}.json".format(filename))
        with open(output_path, "w") as outfile:
            json.dump(json_data, outfile)

    return json_data


def generate_squad_examples(question, best_idx_scores, metadata, retrieve_by_doc):
    """
    Creates a SQuAD examples json object for a given question using outputs of retriever and document database.

    Parameters
    ----------
    question : str
        The question attached to every generated context.
    best_idx_scores : dict
        Mapping from an index label of `metadata` to the retriever score of
        that row. Only the rows listed here are turned into examples.
    metadata : pandas.DataFrame
        Document database. Needs a 'title' column, plus 'paragraphs' (an
        iterable of paragraphs per row) when `retrieve_by_doc` is true, or
        'content' (one context per row) otherwise.
    retrieve_by_doc : bool
        If true, each selected row yields one context per paragraph; if false,
        each selected row yields a single context taken from 'content'.

    Returns
    -------
    squad_examples: list
        One ``{"title": ..., "paragraphs": [...]}`` dict per selected row, in
        the order of the keys of `best_idx_scores`. Each paragraph holds a
        single question entry with an empty answer list, a fresh random id and
        the retriever score of its row.

    Examples
    --------
    >>> from cdqa.utils.converters import generate_squad_examples
    >>> squad_examples = generate_squad_examples(question='Since when does the the Excellence Program of BNP Paribas exist?',
                                         best_idx_scores={788: 1.2, 408: 0.4, 2419: 0.2},
                                         metadata=df,
                                         retrieve_by_doc=False)
    """

    def build_context(context, score):
        # Every context gets its own question entry so that ids stay unique.
        return {
            "context": context,
            "qas": [
                {
                    "answers": [],
                    "question": question,
                    "id": str(uuid.uuid4()),
                    "retriever_score": score,
                }
            ],
        }

    squad_examples = []

    # Hand .loc a plain list of labels (its documented list-of-labels form)
    # instead of a dict view.
    metadata_sliced = metadata.loc[list(best_idx_scores.keys())]

    for idx, row in metadata_sliced.iterrows():
        score = best_idx_scores[idx]
        contexts = row["paragraphs"] if retrieve_by_doc else [row["content"]]
        squad_examples.append(
            {
                "title": row["title"],
                "paragraphs": [build_context(context, score) for context in contexts],
            }
        )

    return squad_examples


def _split_paragraphs(text, min_length, include_line_breaks):
    """Split the extracted text of one document into its list of paragraphs."""
    # A paragraph break is a blank line followed by a capital letter, a digit,
    # a dash or a unicode line separator.
    chunks = re.split("\n\n(?=\u2028|[A-Z-0-9])", text)

    if not include_line_breaks:
        # Chunks are kept as they are, minus whitespace-only ones.
        return [chunk.replace("\n", "") for chunk in chunks if not chunk.isspace()]

    paragraphs = []
    pending_lines = []  # short chunks waiting to be merged into one paragraph

    def flush_pending():
        # Emit the merged short chunks once, then start a new buffer so the
        # same text can never be emitted twice.
        if pending_lines:
            paragraphs.append(" ".join(pending_lines))
            del pending_lines[:]

    for chunk in chunks:
        if chunk.isspace():
            flush_pending()
        elif len(chunk) >= min_length:
            flush_pending()
            paragraphs.append(chunk.replace("\n", ""))
        else:
            line = chunk.replace("\n", " ").strip()
            if line:
                pending_lines.append(line)

    # Short chunks at the end of the document have no following paragraph to
    # trigger a flush, so emit them here instead of dropping them.
    flush_pending()
    return paragraphs


def pdf_converter(directory_path, min_length=200, include_line_breaks=False):
    """
    Function to convert PDFs to Dataframe with columns as title & paragraphs.

    Parameters
    ----------
    directory_path : str
        Directory that is scanned (non-recursively) for files whose name ends
        with ".pdf", in any letter case.
    min_length : integer
        Minimum character length to be considered as a single paragraph.
        Only used when include_line_breaks is True.
    include_line_breaks: bool
        To concatenate paragraphs less than min_length to a single paragraph

    Returns
    -------------
    df : Dataframe
        One row per PDF file, ordered by file name. 'title' is the file name
        without its extension and 'paragraphs' is the list of paragraphs, or
        None when the file could not be processed.

    Description
    -----------------
    If include_line_breaks is set to True, paragraphs with character length
    less than min_length (minimum character length of a paragraph) will be
    considered as a line. Lines before or after each paragraph(length greater
    than or equal to min_length) will be concatenated to a single paragraph to
    form the list of paragraphs in Dataframe.
    Else paragraphs are appended directly to form the list.
    """
    # Sorted so that row order does not depend on the directory listing order.
    pdf_names = sorted(
        name for name in os.listdir(directory_path) if name.lower().endswith(".pdf")
    )

    records = []
    for pdf in pdf_names:
        paragraphs = None
        try:
            raw = parser.from_file(os.path.join(directory_path, pdf))
            paragraphs = _split_paragraphs(
                raw["content"].strip(), min_length, include_line_breaks
            )
        except Exception:
            # Best effort: report the failure and keep the file in the result
            # with no paragraphs. KeyboardInterrupt/SystemExit still propagate.
            print("Unexpected error:", sys.exc_info()[0])
            print("Unable to process file {}".format(pdf))
        records.append({"title": os.path.splitext(pdf)[0], "paragraphs": paragraphs})

    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(records, columns=["title", "paragraphs"])


class MLStripper(HTMLParser):
    """HTML parser that throws the markup away and keeps only the text."""

    def __init__(self):
        # Run the base initializer so the parser state is set up by the
        # standard library itself; it calls reset(), which creates self.fed.
        super().__init__(convert_charrefs=True)
        # Not read anywhere in this class; kept so the attribute still exists
        # for code that looks it up.
        self.strict = False

    def reset(self):
        """Reset the parser state and forget the text collected so far."""
        super().reset()
        self.fed = []

    def handle_data(self, d):
        """Collect one run of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all the text collected so far as a single string."""
        return "".join(self.fed)


def strip_tags(html):
    """Return the text content of `html` with every tag removed.

    Character references are converted to the characters they stand for.
    """
    s = MLStripper()
    s.feed(html)
    # feed() may hold back trailing text (e.g. after an unterminated '&')
    # while waiting for more input; close() forces it out.
    s.close()
    return s.get_data()


def md_converter(directory_path):
    """Get all md, convert them to html and create the pandas dataframe with columns ['title', 'paragraphs']

    Parameters
    ----------
    directory_path : str or pathlib.Path
        Directory searched recursively for "*.md" files.

    Returns
    -------
    df : pandas.DataFrame
        One row per markdown file that could be processed, ordered by path.
        'title' is the file name (extension included) and 'paragraphs' is the
        list of non-empty text chunks obtained by splitting the rendered html
        on "<p>" and stripping the remaining tags. Files that fail are
        reported on stdout and left out.
    """
    titles = []
    paragraph_lists = []

    # Sorted so that row order does not depend on the file system.
    for md_path in sorted(Path(directory_path).glob("**/*.md")):
        filename = md_path.name
        try:
            with open(str(md_path), "r") as f:
                md_text = f.read()
            html_text = markdown.markdown(md_text)
            chunks = (
                strip_tags(chunk).replace("\n", " ").strip()
                for chunk in html_text.split("<p>")
            )
            clean_text_list = [chunk for chunk in chunks if chunk]
        except Exception:
            print("Unexpected error:", sys.exc_info()[0])
            print("Unable to process file {}".format(filename))
            continue

        # Record title and paragraphs together, only after the file was fully
        # processed, so both columns always have the same length.
        titles.append(filename)
        paragraph_lists.append(clean_text_list)

    return pd.DataFrame.from_dict({"title": titles, "paragraphs": paragraph_lists})


In [12]:
#from cdqa.utils.filters import filter_paragraphs
  
import os
import pandas as pd
import numpy as np


def filter_paragraphs(
    articles,
    drop_empty=True,
    read_threshold=1000,
    public_data=True,
    min_length=50,
    max_length=300,
):
    """
    Cleans the paragraphs and filters them regarding their length

    The input dataframe is left untouched; a new dataframe is returned.

    Parameters
    ----------
    articles : pandas.DataFrame
        DataFrame of all the articles. Needs a 'paragraphs' column holding a
        list of paragraphs per row, and a 'number_of_read' column when
        `public_data` is False.
    drop_empty : bool, optional
        Drop the articles left without any paragraph and renumber the index
        (the default is True).
    read_threshold : int, optional
        Minimum 'number_of_read' an article needs to be kept. Only applied
        when `public_data` is False (the default is 1000).
    public_data : bool, optional
        When False, articles are additionally filtered on `read_threshold`
        (the default is True).
    min_length : int, optional
        Minimum number of whitespace-separated words of a kept paragraph
        (the default is 50).
    max_length : int, optional
        Maximum number of whitespace-separated words of a kept paragraph
        (the default is 300).

    Returns
    -------
    Cleaned and filtered dataframe. Kept paragraphs are stripped of leading
    and trailing whitespace; articles with no kept paragraph hold NaN in
    'paragraphs' unless `drop_empty` removed them.

    Examples
    --------
    >>> import pandas as pd
    >>> from cdqa.utils.filters import filter_paragraphs
    >>> df = pd.read_csv('data.csv')
    >>> df_cleaned = filter_paragraphs(df)
    """

    # Select paragraphs with the required size
    def filter_on_size(paragraphs):
        # A missing cell (None or NaN, e.g. a document that could not be
        # parsed) counts as an article without paragraphs.
        if paragraphs is None or isinstance(paragraphs, float):
            return np.nan
        kept = []
        for paragraph in paragraphs:
            word_count = len(paragraph.split())
            if min_length <= word_count <= max_length:
                kept.append(paragraph.strip())
        # NaN marks the article as empty so that dropna() can remove it.
        return kept if kept else np.nan

    # Cleaning and filtering. assign() builds a new frame, so the caller's
    # dataframe keeps its original paragraphs.
    articles = articles.assign(paragraphs=articles["paragraphs"].apply(filter_on_size))

    # Read threshold for private dataset
    if not public_data:
        articles = articles.loc[articles["number_of_read"] >= read_threshold]

    # Drop empty articles
    if drop_empty:
        articles = articles.dropna(subset=["paragraphs"]).reset_index(drop=True)

    return articles

In [23]:
!pip install cdqa


Collecting cdqa
[?25l  Downloading https://files.pythonhosted.org/packages/39/f5/af831b7ee653aa6bace99e39ec6b2754b1adb10bb60a1296f5e16f1f24ee/cdqa-1.3.9.tar.gz (45kB)
[K     |████████████████████████████████| 51kB 4.0MB/s 
[?25hCollecting Flask==1.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/9b/93/628509b8d5dc749656a9641f4caf13540e2cdec85276964ff8f43bbb1d3b/Flask-1.1.1-py2.py3-none-any.whl (94kB)
[K     |████████████████████████████████| 102kB 6.4MB/s 
[?25hCollecting flask_cors==3.0.8
  Downloading https://files.pythonhosted.org/packages/78/38/e68b11daa5d613e3a91e4bf3da76c94ac9ee0d9cd515af9c1ab80d36f709/Flask_Cors-3.0.8-py2.py3-none-any.whl
Collecting joblib==0.13.2
[?25l  Downloading https://files.pythonhosted.org/packages/cd/c1/50a758e8247561e58cb87305b1e90b171b8c767b15b12a1734001f41d356/joblib-0.13.2-py2.py3-none-any.whl (278kB)
[K     |████████████████████████████████| 286kB 8.5MB/s 
[?25hCollecting pandas==0.25.0
[?25l  Downloading https://files.python

In [13]:
import joblib
import warnings

import pandas as pd
import numpy as np
import torch

from sklearn.base import BaseEstimator

from cdqa.retriever import TfidfRetriever, BM25Retriever
from cdqa.utils.converters import generate_squad_examples
from cdqa.reader import BertProcessor, BertQA

RETRIEVERS = {"bm25": BM25Retriever, "tfidf": TfidfRetriever}


class QAPipeline(BaseEstimator):
    """
    A scikit-learn implementation of the whole cdQA pipeline

    Parameters
    ----------
    reader: str (path to .joblib) or .joblib object of an instance of BertQA (BERT model with sklearn wrapper), optional
        When omitted, a new BertQA is created from the matching kwargs.
    retriever: "bm25" or "tfidf"
        The type of retriever
    retrieve_by_doc: bool (default: False). If Retriever will rank by documents
        or by paragraphs.
    kwargs: kwargs for BertQA(), BertProcessor(), TfidfRetriever() and BM25Retriever()
        Please check documentation for these classes

    Examples
    --------
    >>> from cdqa.pipeline import QAPipeline
    >>> qa_pipeline = QAPipeline(reader='bert_qa_squad_vCPU-sklearn.joblib')
    >>> qa_pipeline.fit_retriever(df=df)
    >>> prediction = qa_pipeline.predict(query='When BNP Paribas was created?')

    >>> from cdqa.pipeline import QAPipeline
    >>> qa_pipeline = QAPipeline()
    >>> qa_pipeline.fit_reader('train-v1.1.json')
    >>> qa_pipeline.fit_retriever(df=df)
    >>> prediction = qa_pipeline.predict(query='When BNP Paribas was created?')
    """

    def __init__(self, reader=None, retriever="bm25", retrieve_by_doc=False, **kwargs):

        if retriever not in RETRIEVERS:
            raise ValueError(
                "You provided a type of retriever that is not supported. "
                + "Please provide a retriver in the following list: "
                + str(list(RETRIEVERS.keys()))
            )

        retriever_class = RETRIEVERS[retriever]

        # Separating kwargs: each component receives only the keyword
        # arguments whose name appears among the variable names of its
        # __init__.
        kwargs_bertqa = {
            key: value
            for key, value in kwargs.items()
            if key in BertQA.__init__.__code__.co_varnames
        }

        kwargs_processor = {
            key: value
            for key, value in kwargs.items()
            if key in BertProcessor.__init__.__code__.co_varnames
        }

        kwargs_retriever = {
            key: value
            for key, value in kwargs.items()
            if key in retriever_class.__init__.__code__.co_varnames
        }

        if not reader:
            self.reader = BertQA(**kwargs_bertqa)
        elif isinstance(reader, str):
            # NOTE: joblib.load unpickles the file, which can execute
            # arbitrary code; only load reader files from a trusted source.
            self.reader = joblib.load(reader)
        else:
            self.reader = reader

        self.processor_train = BertProcessor(is_training=True, **kwargs_processor)

        self.processor_predict = BertProcessor(is_training=False, **kwargs_processor)

        self.retriever = retriever_class(**kwargs_retriever)

        self.retrieve_by_doc = retrieve_by_doc

        if torch.cuda.is_available():
            self.cuda()

    def fit_retriever(self, df: pd.DataFrame = None):
        """ Fit the QAPipeline retriever to a list of documents in a dataframe.

        The dataframe passed in is not modified; the pipeline keeps its own
        frame in ``self.metadata``.

        Parameters
        ----------
        df: pandas.Dataframe
            Dataframe with the following columns: "title", "paragraphs"
        """

        if self.retrieve_by_doc:
            # One row per document: "content" is the document's paragraphs
            # joined with spaces. assign() leaves the caller's frame alone.
            self.metadata = df.assign(content=df["paragraphs"].apply(" ".join))
        else:
            # One row per paragraph.
            self.metadata = self._expand_paragraphs(df)

        self.retriever.fit(self.metadata)

        return self

    def fit_reader(self, data=None):
        """ Fit the QAPipeline reader to an annotated dataset.

        Parameters
        ----------
        data: dict str-path to json file
             Annotated dataset in squad-like for Reader training
        """

        train_examples, train_features = self.processor_train.fit_transform(data)
        self.reader.fit(X=(train_examples, train_features))

        return self

    def predict(
        self,
        query: str = None,
        n_predictions: int = None,
        retriever_score_weight: float = 0.35,
        return_all_preds: bool = False,
    ):
        """ Compute prediction of an answer to a question

        `fit_retriever` has to be called first, since the documents are read
        from ``self.metadata``.

        Parameters
        ----------
        query: str
            Sample (question) to perform a prediction on
        n_predictions: int or None (default: None).
            Number of returned predictions. If None, only one prediction is return
        retriever_score_weight: float (default: 0.35).
            The weight of retriever score in the final score used for prediction.
            Given retriever score and reader average of start and end logits, the final score used for ranking is:
            final_score = retriever_score_weight * retriever_score + (1 - retriever_score_weight) * (reader_avg_logit)
        return_all_preds: boolean (default: False)
            whether to return a list of all predictions done by the Reader or not

        Returns
        -------
        if return_all_preds is False:
        prediction: tuple (answer, title, paragraph, score/logit)
        if return_all_preds is True:
        List of dictionnaries with all metadada of all answers outputted by the Reader
        given the question.

        Raises
        ------
        TypeError
            If `query` is not a string, or if `n_predictions` is neither None
            nor an integer greater than or equal to 1.
        """

        if not isinstance(query, str):
            raise TypeError(
                "The input is not a string. Please provide a string as input."
            )
        # None means "single prediction"; anything else must be an int >= 1.
        # The isinstance test comes first so that the comparison only ever
        # runs on integers.
        if n_predictions is not None and (
            not isinstance(n_predictions, int) or n_predictions < 1
        ):
            raise TypeError("n_predictions should be a positive Integer or None")
        best_idx_scores = self.retriever.predict(query)
        squad_examples = generate_squad_examples(
            question=query,
            best_idx_scores=best_idx_scores,
            metadata=self.metadata,
            retrieve_by_doc=self.retrieve_by_doc,
        )
        examples, features = self.processor_predict.fit_transform(X=squad_examples)
        prediction = self.reader.predict(
            X=(examples, features),
            n_predictions=n_predictions,
            retriever_score_weight=retriever_score_weight,
            return_all_preds=return_all_preds,
        )
        return prediction

    def to(self, device):
        """ Send reader to CPU if device=='cpu' or to GPU if device=='cuda'
        """
        if device not in ("cpu", "cuda"):
            raise ValueError("Attribute device should be 'cpu' or 'cuda'.")

        self.reader.model.to(device)
        self.reader.device = torch.device(device)
        return self

    def cpu(self):
        """ Send reader to CPU
        """
        self.reader.model.cpu()
        self.reader.device = torch.device("cpu")
        return self

    def cuda(self):
        """ Send reader to GPU
        """
        self.reader.model.cuda()
        self.reader.device = torch.device("cuda")
        return self

    def dump_reader(self, filename):
        """ Dump reader model to a .joblib object

        The reader is moved to CPU before being written, then moved back to
        GPU when one is available.
        """
        self.cpu()
        joblib.dump(self.reader, filename)
        if torch.cuda.is_available():
            self.cuda()

    @staticmethod
    def _expand_paragraphs(df):
        """ Return a copy of df with one row per paragraph.

        The other columns are repeated for every paragraph of their row, and
        the paragraph text ends up in a "content" column that replaces
        "paragraphs".
        """
        # Snippet taken from: https://stackoverflow.com/a/48532692/11514226
        lst_col = "paragraphs"
        df = pd.DataFrame(
            {
                col: np.repeat(df[col].values, df[lst_col].str.len())
                for col in df.columns.drop(lst_col)
            }
        ).assign(**{lst_col: np.concatenate(df[lst_col].values)})[df.columns]
        df["content"] = df["paragraphs"]
        return df.drop("paragraphs", axis=1)

In [14]:
import os
import pandas as pd
from ast import literal_eval



from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_model

### Download pre-trained reader model and PDF files

In [15]:
# Download model
# The reader ends up in ./models; the pipeline cell further down loads it
# from ./models/bert_qa.joblib.
download_model(model='bert-squad_1.1', dir='./models')


Downloading trained model...
bert_qa.joblib already downloaded


In [16]:
# Download the sample PDF reports (State Bank of India) used in this notebook
def download_pdf():
    """Fetch the sample PDF files into ./data/pdf/, creating the folder when it is missing."""
    import os
    import wget

    target_dir = './data/pdf/'
    pdf_urls = (
        'https://sbi.co.in/documents/17826/35696/23062020_SBI+AR+2019-20+%28Time+16_3b11%29.pdf/a358b5ec-1d32-a093-d9ac-13071fda9ff6?t=1592911831224',
        'https://sbi.co.in/documents/17826/24027/2007201345-SBI+Sustainability+Report+V37+20_07_2020_Spread_layout.pdf/801cc0de-a47d-c860-f5c3-fc57efb58339?t=1595232977158',
    )

    print('\nDownloading PDF files...')

    # The folder is created only on the first run.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    for pdf_url in pdf_urls:
        wget.download(url=pdf_url, out=target_dir)

# Fetch the PDFs now so that ./data/pdf/ exists before the conversion cell runs.
download_pdf()


Downloading PDF files...


### Convert the PDF files into a DataFrame for cdQA pipeline

In [38]:
# Parse the PDF files of ./data/pdf/ into a dataframe with one row per file
# and the columns 'title' and 'paragraphs'.
df = pdf_converter(directory_path='./data/pdf/')
df.head()

Unnamed: 0,title,paragraphs
0,23062020_SBI AR 2019-20 (Time 16_3b11),"[ANNUAL REPORT 2019-20, STATE BANK OF INDIA, E..."


### Instantiate the cdQA pipeline from a pre-trained reader model

In [20]:
# Load the pre-trained reader downloaded above; max_df is an extra keyword
# argument that ends up on the retriever (see max_df=1.0 in the cell output).
cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)

# Fit Retriever to documents
cdqa_pipeline.fit_retriever(df=df)

100%|██████████| 231508/231508 [00:00<00:00, 1161595.60B/s]


QAPipeline(reader=BertQA(adam_epsilon=1e-08, bert_model='bert-base-uncased',
                         do_lower_case=True, fp16=False,
                         gradient_accumulation_steps=1, learning_rate=5e-05,
                         local_rank=-1, loss_scale=0, max_answer_length=30,
                         n_best_size=20, no_cuda=False,
                         null_score_diff_threshold=0.0, num_train_epochs=3.0,
                         output_dir=None, predict_batch_size=8, seed=42,
                         server_ip='', server_po..._size=8,
                         verbose_logging=False, version_2_with_negative=False,
                         warmup_proportion=0.1, warmup_steps=0),
           retrieve_by_doc=False,
           retriever=BM25Retriever(b=0.75, floor=None, k1=2.0, lowercase=True,
                                   max_df=1.0, min_df=2, ngram_range=(1, 2),
                                   preprocessor=None, stop_words='english',
                                   t

 ### Execute a query

In [43]:
# Ask one question; the result is indexed in the next cell as
# (answer, title, paragraph, ...).
query = 'The number of board meetings during the year 2019?'
prediction = cdqa_pipeline.predict(query)

### Explore predictions

In [44]:
# Show the question next to the first three fields of the prediction:
# the answer, the title of the source document and the source paragraph.
print('query: {}'.format(query))
print('answer: {}'.format(prediction[0]))
print('title: {}'.format(prediction[1]))
print('paragraph: {}'.format(prediction[2]))

query: The number of board meetings during the year 2019?
answer: sixteen
title: 23062020_SBI AR 2019-20 (Time 16_3b11)
paragraph: MEETINGS OF THE CENTRAL BOARDThe Bank’s Central Board has to meet a minimum of six times in a year. During the year 2019-20, sixteen Central Board Meetings were held. The dates of the meetings and attendance of the directors are as under:
