In [6]:
import pandas as pd
import os
import streamlit as st
import pickle
import time
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [7]:
# Folder where every intermediate artefact of this notebook (pickles, vectors) is stored.
local_root_directory = '/Users/stefanduprey/Documents/My_Data/My_RiskBotData/'

# Spreadsheet listing the risk reports; its 'blog url' column holds the pages to index.
risk_report_df = pd.read_excel('../riskReports.xlsx')
urls = list(risk_report_df['blog url'])


In [9]:
# Pickle holding the pre-split documents; later cells read this same file back.
preprocessed_urls_path = local_root_directory + 'preprocessed_urls.pkl'

# Scrape only when the cache is missing, so a fresh "Restart & Run All" still
# produces the pickle that the loading cell needs. Replace the expression with
# True to force a re-download of every URL.
process_url_clicked = not os.path.exists(preprocessed_urls_path)
if process_url_clicked:
    # Fetch the content of every blog page listed in `urls`.
    loader = UnstructuredURLLoader(urls=urls)
    data = loader.load()
    # Split the pages into chunks (chunk_size=1000), trying the separators in
    # the listed order: paragraph break, line break, sentence end, comma.
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    docs = text_splitter.split_documents(data)
    print('saving preprocessed URLs')
    # Cache the chunks so the slow download/split step is not repeated.
    with open(preprocessed_urls_path, 'wb') as handle:
        pickle.dump(docs, handle)



saving preprocessed URLs


In [10]:
# Read the cached document chunks back from disk.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# written by this notebook.
with open(local_root_directory + 'preprocessed_urls.pkl', 'rb') as pickle_file:
    docs = pickle.load(pickle_file)

In [14]:
# Sanity check: number of chunks, then the first chunk, one per line.
print(len(docs), docs[0], sep='\n')

1071
page_content='Back to blog\n\nADA Earn Strategy - Risk Report - Kiln\n\nMatteo Bonato\n\nSenior Investment Manager\n\nStaking via Kiln\n\nKey Takeaways\n\nStaking represents one of the safest options in DeFi to generate yield.\n\nKiln is considered a secure and trustable staking service.\n\nThe yield (APY) is derived from staking ADA tokens – contributing to the Proof-of-Stake consensus mechanism of the Cardano blockchain.\n\nThe Cardano blockchain has existed since 2017 and, prior to the Merge, was the largest cryptocurrency to use a proof-of-stake consensus mechanism. Cardano was founded by Charles Hoskinson, who was also one of the co-founders of the Ethereum network. He is the CEO of IOHK, the company that built Cardano’s blockchain.\n\nADA liquid staking represents a liquidity yield-generating investment. No lockup period is present for this strategy but a 25-day warmup is required before being able to start getting the staking rewards and/or unstake the deposited assets.' me

In [18]:
# One row per chunk: the chunk text and the URL it was scraped from.
df = pd.DataFrame({
    'text': [chunk.page_content for chunk in docs],
    'source': [chunk.metadata['source'] for chunk in docs],
})
# Persist the table next to the other cached artefacts.
df.to_pickle(local_root_directory + 'url_df.pkl')

In [20]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Collecting numpy (from sentence_transformers)
  Downloading numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl.metadata (5.6 kB)
Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[33mDEPRECATION: dropbox 11.27.0 has a non-standard dependency specifier stone>=2.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of dropbox or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/

In [21]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; the weights are downloaded on first use.
encoder = SentenceTransformer("all-mpnet-base-v2")

# Embed every chunk of text: one vector per row of `df`.
vectors = encoder.encode(df['text'])

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [35]:
import numpy as np

# Persist the embeddings so they can be reloaded without re-encoding.
# The array is written through an open file handle on purpose: when np.save is
# given a path string that does not end in '.npy' it appends that extension,
# which would leave a stray 'vectors.np.npy' next to the file the loading cell
# actually reads ('vectors.np').
with open(local_root_directory + 'vectors.np', 'wb') as f:
    np.save(f, vectors)


In [36]:
# Reload the cached embeddings and confirm their dimensions.
vectors = np.load(local_root_directory + 'vectors.np')
print(vectors.shape)

(1071, 768)


In [37]:
import faiss

# Embedding width (second axis of `vectors`) fixes the dimensionality of the index.
dim = vectors.shape[1]

# Flat index comparing vectors by L2 distance.
index = faiss.IndexFlatL2(dim)

In [38]:
# Empty the index first so that re-running this cell cannot store every
# embedding a second time (which would make searches return duplicate hits).
index.reset()
index.add(vectors)

In [40]:
# Question to look up in the indexed risk reports.
search_query = "Is ADA earn a risk free strategy"
# Alternative probes tried during exploration:
# search_query = "looking for places to visit during the holidays"
# search_query = "An apple a day keeps the doctor away"

# Embed the query with the same encoder used for the document chunks.
vec = encoder.encode(search_query)
vec.shape

(768,)

In [41]:
# Turn the query embedding into a single-row 2-D array before searching the index.
svec = np.reshape(np.array(vec), (1, -1))
svec.shape

(1, 768)

In [43]:
# Look up the two nearest chunks: `distances` holds their distances to the
# query, `I` their positions in the index.
distances, I = index.search(svec, 2)
distances

array([[0.72165227, 0.72965634]], dtype=float32)

In [44]:
# Hits for the first (and only) query row, as a plain Python list.
row_indices = I[0].tolist()
row_indices

[48, 40]

In [45]:
# Show the text and source URL of the chunks returned by the search.
df.loc[row_indices, :]

Unnamed: 0,text,source
48,"Last, ADA staking represents a very liquid yie...",https://swissborg.com/blog/ada-risk-report
40,Back to blog\n\nADA Earn Strategy - Risk Repor...,https://swissborg.com/blog/ada-risk-report
