In [1]:
from haystack.utils import build_pipeline, add_example_data, print_answers
from haystack.document_stores import FAISSDocumentStore
from haystack import Pipeline
from haystack.nodes import TextConverter, PreProcessor, FileTypeClassifier, PDFToTextConverter, MarkdownConverter, DocxToTextConverter
from haystack.nodes import EmbeddingRetriever, PromptNode
from pathlib import Path
import os


In [125]:
from pydantic import BaseModel
from typing import Optional

provider = "openai"
API_KEY = os.environ.get("OPENAI_KEY")

class Settings(BaseModel):
    by: str = "word"
    length: int = 250
    language: str = "en"
    add_page: bool = True
    retrievertag: str
    top_k: int = 5
    docstore: Path
    embedding_dim: int = 768
    max_seq_length: int = 512

huggingface_model = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
ada_model = "text-embedding-ada-002"
settings = Settings(docstore=Path("docstore"), retrievertag=ada_model, embedding_dim=1536, max_seq_length=1536)
settings

Settings(by='word', length=500, language='en', add_page=True, retrievertag='text-embedding-ada-002', top_k=5, docstore=PosixPath('docstore'), embedding_dim=1536, max_seq_length=1536)

In [168]:
from haystack.pipelines import Pipeline
from haystack.nodes import TextConverter, FileTypeClassifier, PDFToTextConverter, MarkdownConverter, DocxToTextConverter, PreProcessor
from loguru import logger

class DocumentstoreBuilder:
    def __init__(self, settings: Settings):
        self.settings = settings

        file_type_classifier = FileTypeClassifier()
        text_converter = TextConverter()
        pdf_converter = PDFToTextConverter()
        md_converter = MarkdownConverter()
        docx_converter = DocxToTextConverter()
        preprocessor = PreProcessor(
            split_by=settings.by,
            split_length=settings.length,
            split_respect_sentence_boundary=True,
            language=settings.language,
            add_page_number=settings.add_page,
            )

        p = Pipeline()
        p.add_node(component=file_type_classifier, name="FileTypeClassifier", inputs=["File"])
        p.add_node(component=text_converter, name="TextConverter", inputs=["FileTypeClassifier.output_1"])
        p.add_node(component=pdf_converter, name="PdfConverter", inputs=["FileTypeClassifier.output_2"])
        p.add_node(component=md_converter, name="MarkdownConverter", inputs=["FileTypeClassifier.output_3"])
        p.add_node(component=docx_converter, name="DocxConverter", inputs=["FileTypeClassifier.output_4"])

        p.add_node(
            component=preprocessor,
            name="Preprocessor",
            inputs=["TextConverter", "PdfConverter", "MarkdownConverter", "DocxConverter"],
        )
        self.pipeline = p
        self.document_store = None
        self.retriever = None

    def run_preprocessor(self, datadir: Path):
        files = [*data_dir.glob("*")]
        logger.info(f"found {len(files)} files in {data_dir}.")
        # files = files[:2]
        # logger.info(f"reducing size to {len(files)}...")
        metadata = [{"filename" : f.name} for f in files]
        result = self.pipeline.run(file_paths=files, meta=metadata)
        logger.info(f"retrieved {len(result['documents'])} document snippets.")
        return result

    def add_files(self, datadir: Path, tag: str):
        result = self.run_preprocessor(datadir)

        if not self.document_store:
            self.get_docstore(tag)
        self.document_store.write_documents(documents=result["documents"])

        if not self.retriever:
            self.get_retriever(modeltag=self.settings.retrievertag, top_k=self.settings.top_k)
        logger.info("updating embeddings...")
        self.document_store.update_embeddings(retriever=self.retriever, update_existing_embeddings=False, batch_size=64)
        logger.info("saving docstore...")
        _, index_path, config_path = self._get_paths(tag)
        self.document_store.save(index_path=index_path, config_path=config_path)

    def _get_paths(self, tag: str):
        docstore = self.settings.docstore
        sql_url = f"sqlite:///{docstore.name}/{tag}.db"
        index_path = docstore / f"{tag}.faiss"
        config_path = docstore / f"{tag}.json"
        return sql_url, index_path, config_path


    def get_docstore(self, tag: str):
        docstore = self.settings.docstore
        if not docstore.exists():
            logger.info(f"creating docstorefolder at {docstore}")
            docstore.mkdir()

        sql_url, index_path, config_path = self._get_paths(tag)

        if not index_path.exists():
            logger.info(f"creating FAISS docstore {sql_url}")
            self.document_store = FAISSDocumentStore(
                sql_url=sql_url,
                faiss_index_factory_str="Flat",
                embedding_dim=self.settings.embedding_dim
                )
        else:
            logger.info(f"loading FAISS docstore with {index_path}")
            self.document_store = FAISSDocumentStore.load(
                index_path=index_path,
                config_path=config_path
                )

        logger.info(f"docstore has {self.document_store.get_document_count()} docs.")

    def get_retriever(self, modeltag: str, top_k: int):
        API_KEY = os.environ.get("OPENAI_KEY", None)

        self.retriever = EmbeddingRetriever(
            document_store=self.document_store,
            embedding_model=modeltag,
            batch_size=8,
            api_key=API_KEY,
            top_k=top_k,
            max_seq_len=self.settings.max_seq_length,
            )


In [169]:
builder = DocumentstoreBuilder(settings)

In [170]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [171]:
# data_dir = Path.home() / "code/arxiv/data/pdfs"
data_dir = Path.home() / "Downloads/research/research"

builder.add_files(data_dir, "scepa")

[32m2023-12-08 11:19:26.776[0m | [1mINFO    [0m | [36m__main__[0m:[36madd_files[0m:[36m40[0m - [1mfound 227 files in /Users/rgrouls/Downloads/research/research.[0m
Converting files: 100%|██████████| 227/227 [21:28<00:00,  5.68s/it]
Preprocessing:   0%|          | 0/227 [00:00<?, ?docs/s]We found one or more sentences whose word count is higher than the split length.
Preprocessing:  32%|███▏      | 72/227 [00:02<00:04, 34.49docs/s]Document 54e55cc6914848d84856a060838cf15f is 62437 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Document f8a5f1a8d4ae69bf1020ff40599616cf is 52437 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document wil

In [None]:
builder.retriever.document_store = builder.document_store

In [172]:
len(builder.document_store.get_all_documents())

6196

In [173]:
docs = builder.document_store.get_all_documents()
for d in docs[:5]:
    print(f"text: {d.content[:100]} \n meta: {d.meta} \n =====================")

text: In
addition, the M indicator is used to determine whether
a household is in a situation of energy vu 
 meta: {'vector_id': '0', 'filename': 'Gallego S\xa0nchez-2022-Energy solvency. A new co.pdf', '_split_id': 2, 'page': 2} 
text: The "current na-
tional planning guidance" (PPS 22) and Energy Crops Scheme required
feedstocks to b 
 meta: {'vector_id': '1', 'filename': '1-s2.0-S0301421518304853-main.pdf', '_split_id': 2, 'page': 2} 
text: These results indicate that, while it is possible to identify
high-performing policies for overcomin 
 meta: {'vector_id': '2', 'filename': 'Miu-2018-A Simple Assessment of Housing Retrof.pdf', '_split_id': 21, 'page': 16} 
text: 20
1227
Table 3: Standard Electricity Bills, 2009-1055

1
2
3
4
5
6
7
8
9
10 Mean
6488
10884
13844
 
 meta: {'vector_id': '3', 'filename': 'cwpe1256.pdf', '_split_id': 15, 'page': 22} 
text: A decent home was defined as one satisfying all four
criteria, i.e. meeting the fitness standard for 
 meta: {'vector_id': '4', 'f

In [235]:
questions = [
        "What are proven interventions to reduce / alleviate energy poverty?",
        "Which intervention to reduce / alleviate energy poverty are effective?",
        "In which contexts do interventions for energy poverty reduction work?",
        "Which contextual factors are relevant for the success of interventions for reducting energy poverty?",
        "For which specific target groups do interventions for energy poverty reduction work?",
        "On which mechanisms are interventions for energy poverty reduction based?",
        "Which indicators / (outcome) measures / outcomes can be used to measure the success of interventions for energy poverty reduction?"
        "Which specific groups are vulnerable to energy poverty?",
        "What are the characteristics of people at risk for energy poverty?",
        "How can people at risk for energy poverty be identified / located?",
        "How can target groups / hard-to-reach groups be engaged / involved in energy poverty programmes?",
        "Which communication strategies can be used to involve target groups / hard-to-reach groups in energy poverty reduction?",
        "What are the causes of energy poverty?",
        "What are effective strategies / programmes to combat / reduce energy poverty?",
    ]


In [202]:
for q in questions:
    context = builder.retriever.retrieve(document_store=builder.document_store, query=q, top_k=5)
    print(f"question: {q}")
    for doc in context:
        print(doc.meta["filename"])
    print("=====================================")


Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.96it/s]


question: What are proven interventions to reduce / alleviate energy poverty?
Chien-2022-Assessing the impact of green fisca.pdf
1-s2.0-S0301421523002690-main.pdf
Breukers-2021-Review of EU and national policy.pdf
1-s2.0-S0301421523002690-main.pdf
1-s2.0-S0360544221018053-main.pdf


Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.09it/s]


question: Which intervention to reduce / alleviate energy poverty are effective?
Chien-2022-Assessing the impact of green fisca.pdf
Breukers-2021-Review of EU and national policy.pdf
1-s2.0-S0301421517308650-main.pdf
Carrere-2022-Effectiveness of an Energy-Counse.pdf
1-s2.0-S0360544221018053-main.pdf


Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.45it/s]


question: In which contexts do interventions for energy poverty reduction work?
1-s2.0-S0301421523002690-main.pdf
1-s2.0-S0301421523002690-main.pdf
Chien-2022-Assessing the impact of green fisca.pdf
1-s2.0-S0301421523002690-main.pdf
1-s2.0-S0301421522002312-main.pdf


Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.46it/s]


question: Which contextual factors are relevant for the success of interventions for reducting energy poverty?
Halkos-2021-Coping with Energy Poverty_ Measur.pdf
1-s2.0-S0301421519300734-main.pdf
1-s2.0-S0301421519300734-main.pdf
1-s2.0-S0301421522002312-main.pdf
Breukers-2021-Review of EU and national policy.pdf


Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.49it/s]


question: For which specific target groups do interventions for energy poverty reduction work?
Energy poverty alleviation effective policies.pdf
JET_februar_2014-koncna.pdf
1-s2.0-S0360544221018399-main.pdf
Chien-2022-Assessing the impact of green fisca.pdf
Kanellou-2023-Lessons Learnt and Policy Implic.pdf


Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.71it/s]


question: On which mechanisms are interventions for energy poverty reduction based?
1-s2.0-S0301421523002690-main.pdf
1-s2.0-S0301421523002690-main.pdf
Natural Resources Forum - 2003 - Gururaja - En.pdf
ICS_AHorta_EuropeanEnergy_WG4-Case-study.pdf
Breukers-2021-Review of EU and national policy.pdf


Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.68it/s]


question: Which indicators / (outcome) measures / outcomes can be used to measure the success of interventions for energy poverty reduction?Which specific groups are vulnerable to energy poverty?
1-s2.0-S0360544221018053-main.pdf
1-s2.0-S0140988321006290-main.pdf
Bouzarovski-2021-Confronting Energy Poverty in.pdf
485163.pdf
1-s2.0-S0301421517308789-main.pdf


Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.76it/s]


question: What are the characteristics of people at risk for energy poverty?
2GENDERS_FinReport.pdf
20180523_These_AudreyBerry.pdf
2GENDERS_FinReport.pdf
2GENDERS_FinReport.pdf
1-s2.0-S0140988321003777-main.pdf


Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.75it/s]


question: How can people at risk for energy poverty be identified / located?
2GENDERS_FinReport.pdf
1-s2.0-S0301421517302227-main.pdf
JET_februar_2014-koncna.pdf
978-3-319-69299-9.pdf
1-s2.0-S0301421523001647-main.pdf


Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.77it/s]


question: How can target groups / hard-to-reach groups be engaged / involved in energy poverty programmes?
JET_februar_2014-koncna.pdf
Breukers-2021-Review of EU and national policy.pdf
Kanellou-2023-Lessons Learnt and Policy Implic.pdf
JET_februar_2014-koncna.pdf
Kanellou-2023-Lessons Learnt and Policy Implic.pdf


Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.47it/s]


question: Which communication strategies can be used to involve target groups / hard-to-reach groups in energy poverty reduction?
Kanellou-2023-Lessons Learnt and Policy Implic.pdf
Kanellou-2023-Lessons Learnt and Policy Implic.pdf
Kanellou-2023-Lessons Learnt and Policy Implic.pdf
Breukers-2021-Review of EU and national policy.pdf
Clodnitchi_2017_Energy_poverty_in_Romania_dri.pdf


Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.13it/s]


question: What are the causes of energy poverty?
2GENDERS_FinReport.pdf
Neacsa-2020-Energy Poverty in European Union_.pdf
1-s2.0-S0140988321003777-main.pdf
MPRA_paper_111061.pdf
Katsoulakos-2011-Combating Energy Poverty in M.pdf


Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.41it/s]

question: What are effective strategies / programmes to combat / reduce energy poverty?
Chien-2022-Assessing the impact of green fisca.pdf
1-s2.0-S0301421517308650-main.pdf
Neacsa-2020-Energy Poverty in European Union_.pdf
Breukers-2021-Review of EU and national policy.pdf
Longo-2020-Energy Poverty and Protection of Vu.pdf





In [None]:
temp3 = """Create a concise and informative answer (no more than 50 words) for a given question
based solely on the given documents. You must only use information from the given documents.
Use an unbiased and journalistic tone. Do not repeat text. Cite the documents using Document[number] notation.
If multiple documents contain the answer, cite those documents like ‘as stated in Document[number], Document[number], etc.’.
If the documents do not contain the answer to the question, say that ‘answering is not possible given the available information.’
{join(documents, delimiter=new_line, pattern=new_line+'Document[$idx]: $content', str_replace={new_line: ' ', '[': '(', ']': ')'})}
Question: {query}; Answer:
"""

In [None]:
s = """Given the context please answer the question. Context: {join(documents)};
Question: {query};
Answer:
"""

In [248]:
temp1 = "deepset/question-answering"
temp2 = "deepset/question-answering-with-references"
prompt_node = PromptNode(model_name_or_path = "gpt-3.5-turbo",
                         api_key = API_KEY,
                         max_length = 256,
                         default_prompt_template = temp2)

In [249]:
query_pipeline = Pipeline()
query_pipeline.add_node(component=builder.retriever, name="Retriever", inputs=["Query"])
query_pipeline.add_node(component=prompt_node, name="PromptNode", inputs=["Retriever"])


In [250]:
i = 4
questions[i]

'For which specific target groups do interventions for energy poverty reduction work?'

In [251]:
answer = query_pipeline.run(query = questions[i])

Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.30it/s]


In [244]:
len(answer["results"][0].split())

62

In [252]:
print(answer["results"][0].replace(".", ".\n").replace(",", ",\n"))

Interventions for energy poverty reduction target households that have difficulties affording basic energy needs.
 Other target groups include local authorities,
 owners of buildings with fuel poverty conditions,
 various local actors,
 and individuals with basic skills to give energy advice to low-income households.
 (Document[2])


In [246]:
answer["invocation_context"]["documents"]

[<Document: {'content': 'Households that suffer from multiple types of energy deprivation are likely\nto be in a worse situation than households affected by only one form of deprivation.\nPolicies for energy poverty alleviation should be specifically designed to target different subgroups\nof the population with respect to their needs and living conditions as shown by the respective\ncontributions. Multi-stakeholder platforms seem suitable for policy dialogue and inter-ministerial\ncollaboration to mainstream the integration of climate and social policy in strategies, programs, and\nbudgeting.\nAcknowledgments\nThe Editors would like to express their sincere thanks and gratitude to the authors, who submitted papers to\nthis Special Issue and, especially, the referees, who spent their valuable time on providing their detailed\nreviews. Without their help, it would be impossible to prepare this Special Issue in line with the high\nstandards set from the beginning.\nFinally, the Editors w

In [247]:
for doc in answer["invocation_context"]["documents"]:
    print(doc.meta)

{'filename': 'Energy poverty alleviation effective policies.pdf', '_split_id': 3, 'page': 4, 'vector_id': '5031'}
{'filename': 'JET_februar_2014-koncna.pdf', '_split_id': 47, 'page': 72, 'vector_id': '3447'}
{'filename': '1-s2.0-S0360544221018399-main.pdf', '_split_id': 19, 'page': 11, 'vector_id': '5402'}
{'filename': 'Chien-2022-Assessing the impact of green fisca.pdf', '_split_id': 15, 'page': 10, 'vector_id': '3019'}
{'filename': 'Kanellou-2023-Lessons Learnt and Policy Implic.pdf', '_split_id': 1, 'page': 1, 'vector_id': '5855'}
