In [2]:
pip install pandas langchain-text-splitters langchain-community langgraph langchain-openai langchain-chroma CSVLoader sentence-transformers faiss-cpu


Note: you may need to restart the kernel to use updated packages.


In [3]:
import getpass
import os
import getpass
import os
import bs4
from langchain.chat_models import init_chat_model
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict 
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import CSVLoader
from langchain_openai import ChatOpenAI
from langchain.schema import Document
import pandas as pd

USER_AGENT environment variable not set, consider setting it to identify your requests.


<H1>Initialising chatmodel and embeddings</H1>


In [4]:
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = "***REMOVED***"

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")
    
llm = init_chat_model("gpt-4o-mini", model_provider="openai")

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")



Enter API key for OpenAI:  Â·Â·Â·Â·Â·Â·Â·Â·


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


<H1>Chroma DB </H1>

In [5]:
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

<H1>Loader and RAG pipeline</H1>

In [6]:
df = pd.read_csv("/Users/hasheerama/SWMproject/techcrunch_articles.csv")

docs = []
for _, row in df.iterrows():
    docs.append(
        Document(
            page_content=row["content"],          # main text for embeddings
            metadata={
                "title": row["title"],
                "url": row["url"],
                "author": row["author"],
                "date": row["date"]
            }
        )
    )


# 2. Split into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)

# 3. Build embeddings & FAISS index
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = FAISS.from_documents(all_splits, embeddings)

# 4. Pull prebuilt RAG prompt
prompt = hub.pull("rlm/rag-prompt")

# 5. Define pipeline state
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str
    sources: List[str]

# 6. Retrieval step
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"], k=4)
    return {"context": retrieved_docs}

# 7. Generation step
llm = ChatOpenAI(model="gpt-4o")  # or gpt-4o, gpt-3.5-turbo

def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    
    # Collect sources (deduplicate)
    sources = list({doc.metadata.get("url", "Unknown source") for doc in state["context"]})
    sources_text = "\n".join(f"- {src}" for src in sources)

    # Inject both context + sources into prompt
    messages = prompt.invoke({
        "question": state["question"], 
        "context": docs_content,
        "sources": sources_text
    })

    response = llm.invoke(messages)

    return {
        "answer": response.content,
        "sources": sources
    }

# 8. Build the graph
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

# 9. Test a query
if __name__ == "__main__":
    result = graph.invoke({"question": "What companies did Sam altman invest in"})
    print("\nðŸ”Ž QUESTION: Whats the latest news about startups\n")
    print("ðŸ’¡ ANSWER:", result["answer"])
    
    if "sources" in result:
        print("\nðŸ“Œ SOURCES:")
        for src in result["sources"]:
            print("  â€¢", src)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



ðŸ”Ž QUESTION: Whats the latest news about startups

ðŸ’¡ ANSWER: Sam Altman has invested in several companies including Airbnb and Stripe.

ðŸ“Œ SOURCES:
  â€¢ https://techcrunch.com/2025/09/22/elad-gil-one-of-techs-sharpest-minds-on-early-bets-breakout-growth-and-whats-coming-next-at-techcrunch-disrupt-2025/
  â€¢ https://techcrunch.com/2025/09/18/numeral-raises-35m-to-automate-sales-tax-with-ai/
  â€¢ https://techcrunch.com/2025/09/25/steph-currys-vc-firm-just-backed-an-ai-startup-that-wants-to-fix-food-supply-chains/
  â€¢ https://techcrunch.com/2025/09/14/openai-board-chair-bret-taylor-says-were-in-an-ai-bubble-but-thats-ok/
