In [2]:
# Packages
import os
import yfinance as yf
from haystack.nodes import LinkContentFetcher
from haystack.document_stores import InMemoryDocumentStore, FAISSDocumentStore
from haystack.nodes import  BM25Retriever, AnswerParser, DensePassageRetriever, EmbeddingRetriever
from haystack.nodes.prompt import PromptTemplate
from haystack.nodes import PromptNode
from haystack import Pipeline
from haystack.utils import print_answers

In [3]:
# Model configurations
hfAPIKey = os.getenv("HF_API_KEY")
hfModelName = os.getenv("HF_MODEL_NAME")

In [4]:
gold = yf.Ticker("GC=F")
news = gold.news

In [5]:
links = []

for new in news:
    links.append(new["link"])

In [6]:
lcf = LinkContentFetcher()
docs = []

for link in links:
    docs.append(lcf.fetch(link)[0])

In [10]:
documentStore = FAISSDocumentStore(faiss_index_factory_str="Flat", return_embedding=True)


documentStore.delete_documents()
documentStore.write_documents(docs)

retriever = DensePassageRetriever(
                document_store=documentStore,
                query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                use_gpu=True,
                embed_title=True,
    )


documentStore.update_embeddings(retriever)

documentStore.save("../../../data/faiss/my_faiss")


Writing Documents: 10000it [00:00, 147544.93it/s]       
Documents Processed: 10000 docs [00:01, 6646.13 docs/s]      


In [12]:
documentStore = FAISSDocumentStore.load(index_path="../../../data/faiss/my_faiss")
documents = documentStore.get_all_documents(return_embedding=True)
print(documents[0].embedding)

[ 6.63063377e-02  5.22257268e-01  7.02151179e-01 -5.03817022e-01
  3.49390984e-01 -2.78370548e-02  2.21440166e-01  1.75856397e-01
 -3.04133445e-01  1.99300021e-01 -2.51114815e-01  2.78826207e-01
  6.62059426e-01  8.13667893e-01  1.00574873e-01  6.16790771e-01
  2.81263620e-01  3.78716439e-01  4.76274937e-01 -4.84865159e-01
  2.90044606e-01 -1.73409820e-01  8.12173903e-01  1.87692359e-01
  3.45369786e-01  3.12105641e-02  4.11602467e-01 -7.72876218e-02
 -2.66397178e-01 -3.85615498e-01 -9.25543904e-02  1.98594823e-01
 -1.75717980e-01 -2.39126235e-01  3.70084882e-01 -3.62987936e-01
  8.63675177e-02 -4.77144808e-01 -3.22247922e-01 -5.29816687e-01
 -1.57454252e-01 -8.55210274e-02 -2.35240050e-02  3.27875048e-01
 -7.93446600e-01  9.00815576e-02 -8.69091392e-01 -7.58011863e-02
 -4.81734574e-01 -4.16152298e-01 -2.23581642e-01  1.50359452e-01
 -4.52488102e-03 -2.36205906e-01  2.40622520e-01  3.91765475e-01
 -5.30598044e-01 -3.11030328e-01  3.01765949e-01 -4.29286838e-01
  2.91264504e-01  3.39383

In [19]:
from haystack.pipelines import DocumentSearchPipeline
from haystack.utils import print_documents
relevantDocuments = DocumentSearchPipeline(retriever)
res = relevantDocuments.run(query="Tell me something Gold's market performance", params={"Retriever": {"top_k": 3}})
print_documents(res, max_text_len=512)


Query: Tell me something Gold's market performance

{   'content': 'Geopolitical Tensions Fuel a Rally in Gold\n'
               'Read full article\n'
               'October 16, 2023 at 10:29 AM\n'
               '·1 min read\n'
               'Geopolitical Tensions Fuel a Rally in Gold\n'
               'Gold is hot again. Weeks ago, [investors were shunning '
               'gold](https://www.wsj.com/livecoverage/stock-market-today-dow-jones-10-02-2023/card/gold-prices-on-pace-to-settle-at-the-lowest-level-since-march-AUS3byLdSvOrYEOrnYdD) '
               'because of worries that a strong economy could induce the '
               'Federal Reserve to further raise interest rates this year and '
               'keep them steady in 2024...',
    'name': None}

{   'content': 'MORNING BID ASIA-Stocks break higher, but for longer too?\n'
               'Read full article\n'
               'October 16, 2023 at 2:45 PM\n'
               '·3 min read\n'
               'By Jamie McGeever\n

In [14]:
promptTemplate = PromptTemplate(
    prompt="""
    Answer the question truthfully based solely on the given documents. If the documents do not contain the answer to the question, say that answering is not possible given the available information. Your answer should be no longer than 100 words.
    Question:{query}
    Documents :{join(documents)}
    Answer:
    """,
    output_parser=AnswerParser(),
)

# Prompt Node initialization
promptNode = PromptNode(hfModelName,api_key=hfAPIKey,default_prompt_template = promptTemplate)

In [15]:
# Pipeline
pipeline = Pipeline()
pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
pipeline.add_node(component=promptNode, name="promptNode", inputs=["retriever"])

In [18]:
output = pipeline.run(query="What is the news on King Charles on gold mining",documents=res)
print_answers(output, details="minimum")

Token indices sequence length is longer than the specified maximum sequence length for this model (11287 > 1024). Running this sequence through the model will result in indexing errors
The prompt has been truncated from 11287 tokens to 924 tokens so that the prompt length and answer length (100 tokens) fit within the max token limit (1024 tokens). Shorten the prompt to prevent it from being cut off.


'Query: What is the news on King Charles on gold mining'
'Answers:'
[   {   'answer': ' estate planning.\n'
                  '19h ago\n'
                  'The 10 Best Stocks to Buy in 2023\n'
                  'The stock market has been a roller coaster ride in 2022, '
                  'but there are still plenty of opportunities for investors.\n'
                  '19h ago\n'
                  'The 10 Best Stocks to Buy in 2023\n'
                  'The stock market has been a roller coaster ride in 2022, '
                  'but there are still plenty of opportunities'}]
