# Test

In [1]:
from langchain.document_loaders import TextLoader

In [2]:
loader = TextLoader("test.txt")
data = loader.load()

In [3]:
data

[Document(page_content="This is just a test file\nto check how Langchain's TextLoader document loader works.\n\nI hope this file suffices for my practice.", metadata={'source': 'test.txt'})]

In [4]:
data[0]

Document(page_content="This is just a test file\nto check how Langchain's TextLoader document loader works.\n\nI hope this file suffices for my practice.", metadata={'source': 'test.txt'})

In [5]:
type(data[0])

langchain_core.documents.base.Document

In [6]:
data[0].metadata

{'source': 'test.txt'}

In [7]:
data[0].page_content

"This is just a test file\nto check how Langchain's TextLoader document loader works.\n\nI hope this file suffices for my practice."

In [8]:
from langchain.document_loaders import CSVLoader

In [9]:
loader = CSVLoader("test.csv")
data = loader.load()
data

[Document(page_content='id: 1\nname: KGF\nrating: 4.4\ndate: 01-06-2022', metadata={'source': 'test.csv', 'row': 0}),
 Document(page_content='id: 2\nname: Kung Fu Panda\nrating: 4.8\ndate: 28-03-2021', metadata={'source': 'test.csv', 'row': 1}),
 Document(page_content='id: 3\nname: Baadshaah\nrating: 3.9\ndate: 09-11-2023', metadata={'source': 'test.csv', 'row': 2})]

In [10]:
data[0].metadata

{'source': 'test.csv', 'row': 0}

In [11]:
loader = CSVLoader("test.csv", source_column = "name")
data = loader.load()
len(data)

3

In [12]:
data

[Document(page_content='id: 1\nname: KGF\nrating: 4.4\ndate: 01-06-2022', metadata={'source': 'KGF', 'row': 0}),
 Document(page_content='id: 2\nname: Kung Fu Panda\nrating: 4.8\ndate: 28-03-2021', metadata={'source': 'Kung Fu Panda', 'row': 1}),
 Document(page_content='id: 3\nname: Baadshaah\nrating: 3.9\ndate: 09-11-2023', metadata={'source': 'Baadshaah', 'row': 2})]

# Unstructured URL Loading

In [13]:
from langchain.document_loaders import UnstructuredURLLoader

In [14]:
loader = UnstructuredURLLoader(urls = [
    "https://www.moneycontrol.com/news/business/markets/myth-of-stock-market-eating-into-bank-deposits-foreign-investors-fomo-mgl-star-health-prestige-in-focus-bears-gun-for-ujjivan-small-fin-12755470.html",
    "https://www.moneycontrol.com/news/business/markets/clsa-ubs-bullish-on-zomato-stock-morgan-stanley-recommends-buying-on-dips-12755548.html"
])
data = loader.load()

In [15]:
data

[Document(page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nMoneycontrol Trending Stock\n\nInfosys\xa0INE009A01021, INFY, 500209\n\nState Bank of India\xa0INE062A01020, SBIN, 500112\n\nYes Bank\xa0INE528G01027, YESBANK, 532648\n\nBank Nifty\n\nNifty 500\n\nQuotes\n\nMutual Funds\n\nCommodities\n\nFutures & Options\n\nCurrency\n\nNews\n\nCryptocurrency\n\nForum\n\nNotices\n\nVideos\n\nGlossary\n\nAll\n\nHello, Login Hello, LoginLog-inor Sign-UpMy AccountMy Profile My PortfolioMy WatchlistFREE Credit Score₹100 Cash RewardMy AlertsMy MessagesPrice AlertsMy Profile My PROMy PortfolioMy WatchlistFREE Credit Score₹100 Cash RewardMy AlertsMy MessagesPrice AlertsLogoutChat with UsDownload AppFollow us on:\n\nGo Ad-Free\n\nMy Alerts\n\nBudget 2        24MarketsHOMEINDIAN INDICESSTOCK ACTIONAll StatsTop GainersTop LosersOnly BuyersOnly Sellers52 Week High52 Week LowPrice ShockersVolume ShockersMost Active StocksGLOBAL MARKETSUS MARKETSSEASONALITY ANALYSISSTOCK SCANNERECONOMIC INDICATORSE

# Text Splitting

In [16]:
from langchain.text_splitter import CharacterTextSplitter

In [17]:
test_text = """Full job description

Statfinity is looking for an Artificial Intelligence (AI) Engineer to join its dynamic team.

Required skills:

    Expertise in using Python (Functions, Loops, AI & ML Libraries) and advanced SQL for data manipulation.
    Hands-on experience in GenAI technologies such as OpenAI, Azure AI, Langchain, TensorFlow, PyTorch, Hugging Face, etc.
    Demonstrable ability to formulate prompt engineering.
    Experienced in developing end-to-end AI models.
    Effective communication and collaboration skills to work with cross-functional teams.
    Good to have experience with natural language processing (NLP), computer vision, chatbots, and image processing.

If you are someone who has the attitude of taking ownership of the problem and curiosity to learn/explore new technologies in the world of AI with minimal supervision in a remote environment, we want to talk to you.

Job Types: Full-time, Permanent

Pay: ₹25,000.00 - ₹45,000.00 per month

Benefits:

    Internet reimbursement
    Paid sick time
    Paid time off
    Work from home

Schedule:

    Day shift
    Monday to Friday

Supplemental pay types:

    Performance bonus
    Quarterly bonus
    Yearly bonus

Application Question(s):

    Tell us about your experience with OpenAI / Langchain.
    Describe a project you did using LLMs.

Education:

    Bachelor's (Preferred)

Experience:

    Generative AI: 1 year (Required)

Work Location: Remote"""

In [18]:
splitter = CharacterTextSplitter(separator = "\n", chunk_size = 200, chunk_overlap = 0)

In [19]:
chunks = splitter.split_text(test_text)
len(chunks)

Created a chunk of size 215, which is longer than the specified 200


9

In [20]:
for chunk in chunks:
    print(len(chunk))

130
103
176
137
112
215
187
183
147


In [21]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [22]:
splitter = RecursiveCharacterTextSplitter(
    separators = ["\n\n", "\n", " "],
    chunk_size = 200,
    chunk_overlap = 0
)
chunks = splitter.split_text(test_text)
len(chunks)

11

In [23]:
for chunk in chunks:
    print(len(chunk))

132
103
176
137
112
199
15
177
141
188
21


In [24]:
chunks[2]

'Hands-on experience in GenAI technologies such as OpenAI, Azure AI, Langchain, TensorFlow, PyTorch, Hugging Face, etc.\n    Demonstrable ability to formulate prompt engineering.'

# Embeddings

In [25]:
import pandas as pd
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [26]:
df = pd.read_csv("test.csv")
df

Unnamed: 0,id,name,rating,date
0,1,KGF,4.4,01-06-2022
1,2,Kung Fu Panda,4.8,28-03-2021
2,3,Baadshaah,3.9,09-11-2023


In [27]:
encoder = SentenceTransformer("all-mpnet-base-v2")
vectors = encoder.encode(df["name"])
vectors.shape



(3, 768)

In [28]:
dim = vectors.shape[1]

In [29]:
type(vectors)

numpy.ndarray

# FAISS (Facebook AI Similarity Search) Vector Database

In [33]:
import faiss
import numpy as np

In [34]:
index = faiss.IndexFlatL2(dim)
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000002185B33A910> >

In [35]:
index.add(vectors)
query = "Fighting animal related movie"
query_embed = encoder.encode([query])
query_embed.shape

(1, 768)

In [36]:
index.search(query_embed, 3)   # First item in the tuple represents the distances

(array([[0.9195317, 1.6586064, 1.7280314]], dtype=float32),
 array([[1, 0, 2]], dtype=int64))

In [37]:
df.iloc[index.search(query_embed, 3)[1][0]]

Unnamed: 0,id,name,rating,date
1,2,Kung Fu Panda,4.8,28-03-2021
0,1,KGF,4.4,01-06-2022
2,3,Baadshaah,3.9,09-11-2023


# Retrieval QA With Sources Chain

In [8]:
# from langchain import OpenAI
# from langchain.chains import RetrievalQAWithSourcesChain
# from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
# from langchain.embeddings import OpenAIEmbeddings
# from langchain.vectorstores import FAISS

from llama_index.llms.gemini import Gemini

In [2]:
import os
import pickle

In [1]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [5]:
import pandas as pd
from datetime import datetime

In [6]:
import faiss

In [7]:
# os.environ["GOOGLE_API_KEY"] = ""

In [8]:
df = pd.DataFrame(columns = ["text", "timestamp", "url"])

In [9]:
df.head()

Unnamed: 0,text,timestamp,url


In [13]:
print(UnstructuredURLLoader(urls = ["a"]).load())

Error fetching or processing a, exception: Invalid URL 'a': No scheme supplied. Perhaps you meant https://a?


[]


In [10]:
url_loader = UnstructuredURLLoader(urls = [
    "https://www.moneycontrol.com/news/business/markets/myth-of-stock-market-eating-into-bank-deposits-foreign-investors-fomo-mgl-star-health-prestige-in-focus-bears-gun-for-ujjivan-small-fin-12755470.html",
    "https://www.moneycontrol.com/news/business/markets/clsa-ubs-bullish-on-zomato-stock-morgan-stanley-recommends-buying-on-dips-12755548.html"
])
data = url_loader.load()
data

[Document(page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nMoneycontrol Trending Stock\n\nInfosys\xa0INE009A01021, INFY, 500209\n\nState Bank of India\xa0INE062A01020, SBIN, 500112\n\nYes Bank\xa0INE528G01027, YESBANK, 532648\n\nBank Nifty\n\nNifty 500\n\nQuotes\n\nMutual Funds\n\nCommodities\n\nFutures & Options\n\nCurrency\n\nNews\n\nCryptocurrency\n\nForum\n\nNotices\n\nVideos\n\nGlossary\n\nAll\n\nHello, Login Hello, LoginLog-inor Sign-UpMy AccountMy Profile My PortfolioMy WatchlistFREE Credit Score₹100 Cash RewardMy AlertsMy MessagesPrice AlertsMy Profile My PROMy PortfolioMy WatchlistFREE Credit Score₹100 Cash RewardMy AlertsMy MessagesPrice AlertsLogoutChat with UsDownload AppFollow us on:\n\nGo Ad-Free\n\nMy Alerts\n\nBudget 2        24MarketsHOMEINDIAN INDICESSTOCK ACTIONAll StatsTop GainersTop LosersOnly BuyersOnly Sellers52 Week High52 Week LowPrice ShockersVolume ShockersMost Active StocksGLOBAL MARKETSUS MARKETSSEASONALITY ANALYSISSTOCK SCANNERECONOMIC INDICATORSE

In [11]:
document_splitter = RecursiveCharacterTextSplitter(
    separators = ["\n\n", "\n", " "],
    chunk_size = 1000,
    chunk_overlap = 200
)
chunks = document_splitter.split_documents(data)
len(chunks)

39

In [12]:
df = pd.concat([df, pd.DataFrame([[d.page_content, datetime.now(), d.metadata["source"]] for d in chunks], columns = ["text", "timestamp", "url"])], ignore_index = True)
df.head()

  df = pd.concat([df, pd.DataFrame([[d.page_content, datetime.now(), d.metadata["source"]] for d in chunks], columns = ["text", "timestamp", "url"])], ignore_index = True)


Unnamed: 0,text,timestamp,url
0,English\n\nHindi\n\nGujarati\n\nSpecials\n\nMo...,2024-07-10 12:52:56.157645,https://www.moneycontrol.com/news/business/mar...
1,Budget 2 24MarketsHOMEINDIAN INDICESSTO...,2024-07-10 12:52:56.157645,https://www.moneycontrol.com/news/business/mar...
2,NewsOpinionExplainersMC BuzzMC FeaturesMC Lear...,2024-07-10 12:52:56.157645,https://www.moneycontrol.com/news/business/mar...
3,Loan CalculatorCredit Card Debit Payoff Calcul...,2024-07-10 12:52:56.157645,https://www.moneycontrol.com/news/business/mar...
4,chat with Manisha GuptaLet`s Talk JobsThe Tena...,2024-07-10 12:52:56.157645,https://www.moneycontrol.com/news/business/mar...


In [13]:
chunks[0]

Document(page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nMoneycontrol Trending Stock\n\nInfosys\xa0INE009A01021, INFY, 500209\n\nState Bank of India\xa0INE062A01020, SBIN, 500112\n\nYes Bank\xa0INE528G01027, YESBANK, 532648\n\nBank Nifty\n\nNifty 500\n\nQuotes\n\nMutual Funds\n\nCommodities\n\nFutures & Options\n\nCurrency\n\nNews\n\nCryptocurrency\n\nForum\n\nNotices\n\nVideos\n\nGlossary\n\nAll\n\nHello, Login Hello, LoginLog-inor Sign-UpMy AccountMy Profile My PortfolioMy WatchlistFREE Credit Score₹100 Cash RewardMy AlertsMy MessagesPrice AlertsMy Profile My PROMy PortfolioMy WatchlistFREE Credit Score₹100 Cash RewardMy AlertsMy MessagesPrice AlertsLogoutChat with UsDownload AppFollow us on:\n\nGo Ad-Free\n\nMy Alerts', metadata={'source': 'https://www.moneycontrol.com/news/business/markets/myth-of-stock-market-eating-into-bank-deposits-foreign-investors-fomo-mgl-star-health-prestige-in-focus-bears-gun-for-ujjivan-small-fin-12755470.html'})

In [14]:
gemini_model = Gemini(model = "models/gemini-pro")

In [15]:
resp = gemini_model.complete("Just reply with a short message confirming your ability to understand and interpret my messages.")
print(resp)

Understood. I can interpret and respond to your messages effectively.


In [32]:
str(resp)

'Understood. I can interpret and respond to your messages effectively.'

In [31]:
type(str(resp))

str

In [6]:
encoder = SentenceTransformer("all-mpnet-base-v2")



In [7]:
# with open("encoder.pkl", "wb") as f:
#     pickle.dump(encoder, f)

In [5]:
# os.listdir()

['app.py',
 'Application Architecture v1.png',
 'Application Architecture.excalidraw',
 'secrets.py',
 'test-df.csv',
 'test-vectordb.pkl',
 'test.csv',
 'test.ipynb',
 'test.txt',
 'tranformer.pkl',
 '__init__.py']

In [16]:
vectors = encoder.encode([chunk.page_content for chunk in chunks])
vectors.shape



(39, 768)

In [17]:
dim = vectors.shape[1]
faiss_vectordb = faiss.IndexFlatL2(dim)

In [18]:
faiss_vectordb.add(vectors)
faiss_vectordb

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000002B5554484B0> >

In [19]:
with open("test-vectordb.pkl", "wb") as f:
    pickle.dump(faiss_vectordb, f)

In [20]:
df.to_csv("test-df.csv")

In [23]:
check_df = pd.read_csv("test-df.csv", index_col = "Unnamed: 0")
check_df.head()

Unnamed: 0,text,timestamp,url
0,English\n\nHindi\n\nGujarati\n\nSpecials\n\nMo...,2024-07-10 12:52:56.157645,https://www.moneycontrol.com/news/business/mar...
1,Budget 2 24MarketsHOMEINDIAN INDICESSTO...,2024-07-10 12:52:56.157645,https://www.moneycontrol.com/news/business/mar...
2,NewsOpinionExplainersMC BuzzMC FeaturesMC Lear...,2024-07-10 12:52:56.157645,https://www.moneycontrol.com/news/business/mar...
3,Loan CalculatorCredit Card Debit Payoff Calcul...,2024-07-10 12:52:56.157645,https://www.moneycontrol.com/news/business/mar...
4,chat with Manisha GuptaLet`s Talk JobsThe Tena...,2024-07-10 12:52:56.157645,https://www.moneycontrol.com/news/business/mar...


In [22]:
check_df.shape

(39, 4)

In [24]:
df.loc[0, "url"]

'https://www.moneycontrol.com/news/business/markets/myth-of-stock-market-eating-into-bank-deposits-foreign-investors-fomo-mgl-star-health-prestige-in-focus-bears-gun-for-ujjivan-small-fin-12755470.html'

In [27]:
query = "What is the stock advice related to Prestige Estates?"
encoded_query = encoder.encode([query])
relevant_chunk_indices = faiss_vectordb.search(encoded_query, 3)[1][0]

In [28]:
relevant_chunk_indices

array([14, 38,  5], dtype=int64)

In [34]:
chunk_responses = []
for i in relevant_chunk_indices:
    relevant_chunk = df.loc[i, "text"]
    prompt = f"Answer the following question within double quotes using the text within backticks (`):\n\"{query}\"\n`{relevant_chunk}`"
    chunk_responses.append(str(gemini_model.complete(prompt)))
summarization_prompt = f"Summarize in detail: {" ".join(chunk_responses)}"
print(gemini_model.complete(summarization_prompt))

**Bull Case:**

* Pre-sales growth guidance of 25% YoY by FY25
* Dominant position in Bengaluru, a key real estate market
* Upbeat residential demand
* Cheaper valuations compared to peers

**Bear Case:**

* Muted residential demand
* Potential delays in project execution
* High interest rates
* Recent gains driven by expectations of hospitality arm's IPO, which could impact sentiment if delayed


In [35]:
chunk_responses

['"Bull case: Guidance of pre-sales growth of 25 percent YoY by FY25, supported by its dominant position in Bengaluru, upbeat residential demand, and cheaper valuations compared to peers.\n\nBear case: Muted residential demand, delay in execution of projects, and high interest rates potential risks. Recent gains partly driven by hopes of hospitality arm going public. Any delay there could hit sentiment."',
 '"The provided text does not contain any stock advice related to Prestige Estates."',
 '"I\'m sorry, but the provided text does not contain any stock advice related to Prestige Estates."']