In [1]:
import os
from dotenv import load_dotenv

from langchain_community.document_loaders import TextLoader, WebBaseLoader, PyPDFLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
import bs4

USER_AGENT environment variable not set, consider setting it to identify your requests.


### Data Ingestion

In [2]:
loader = TextLoader("speech.txt")
text_docs = loader.load()
text_docs

[Document(metadata={'source': 'speech.txt'}, page_content='Today, I want to talk about something that seems simple but defines the direction of our livesâ€”discipline. Discipline is not just about rules or restrictions; it is about self-control, consistency, and the ability to choose long-term growth over short-term comfort.\n\nWhen we practice discipline, we train ourselves to stay focused, to work hard even when no one is watching, and to keep going when things get difficult. It is discipline that turns talent into success and dreams into reality.\n\nIn life, we cannot control every outcome, but we can control our habits and our actions. And when discipline becomes a habit, success is no longer a question of if, but when.\n\nSo, letâ€™s remind ourselves daily: freedom and success are not found in avoiding disciplineâ€”they are achieved through it.\n\nThank you.')]

In [8]:
load_dotenv()

os.environ["XAI_API_KEY"] = os.getenv("XAI_API_KEY")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [4]:
## Web based loader
loader = WebBaseLoader(
    web_path="https://langchain-ai.github.io/langgraph/tutorials/rag/langgraph_agentic_rag/", 
    bs_kwargs=dict(parse_only = bs4.SoupStrainer(
        class_ = ("md-container", "md-header")
    )),
    )

text_docs = loader.load()

In [5]:
## PDF reader
loader = PyPDFLoader('lec_6.pdf')
text_docs = loader.load()

In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
docs = text_splitter.split_documents(text_docs)

In [10]:
## Vector Embeddings & store

db = Chroma.from_documents(docs[:20], OpenAIEmbeddings())


In [14]:
query = "RNN"
result = db.similarity_search(query, k=2)
result

[Document(metadata={'creationdate': '2024-03-19T21:25:11-05:00', 'producer': 'Microsoft: Print To PDF', 'creator': 'PyPDF', 'page_label': '3', 'page': 2, 'title': 'Microsoft PowerPoint - 3_Self_Attention_Transformer', 'total_pages': 56, 'moddate': '2024-03-19T21:25:11-05:00', 'source': 'lec_6.pdf', 'author': 'Shaurya Tripathi'}, page_content='3/19/2024\n3\nIssues with Recurrent Attention•Scalability issues•Performance degrades as the distance between words increases•Parallelization limitations•Recurrent processes lacks ability to be parallelized•Memory constraints•RNNs have limited memory and struggle with long-range dependencies•Diluted impact of earlier elements on output as sequence progresses•Potential solution: decouple attention from RNNs•How? Separate the attention mechanism into smaller, self-contained components5Slide Credit : https://cs.uwaterloo.ca/~wenhuche/teaching/cs886/\nSelf -ATTENTION5\n6'),
 Document(metadata={'creator': 'PyPDF', 'source': 'lec_6.pdf', 'creationdate':