In [1]:
import torch

# Use the GPU when one is visible to PyTorch; otherwise fall back to the CPU so
# this sanity-check cell still runs instead of raising.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print("Running on:", torch.cuda.get_device_name(0) if use_cuda else "CPU")

# Big tensors to stress test GPU (a smaller size keeps the CPU fallback quick)
MATRIX_SIZE = 5000 if use_cuda else 1000
a = torch.rand(MATRIX_SIZE, MATRIX_SIZE, device=device)
b = torch.rand(MATRIX_SIZE, MATRIX_SIZE, device=device)

# Matrix multiplication on the selected device
c = torch.matmul(a, b)

# CUDA work is queued asynchronously; wait for it to finish so that the
# "Done" message below is only printed once the product actually exists.
if use_cuda:
    torch.cuda.synchronize()

print("Done, result on:", c.device)


Running on: NVIDIA GeForce RTX 2050
Done, result on: cuda:0


In [15]:
# ------------------------------
# Step 0: Imports
# ------------------------------
import os
import requests
from langchain_community.document_loaders import PyPDFLoader, TextLoader, WebBaseLoader
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama




Choose input type:
1. PDF file
2. TXT file
3. Webpage URL
4. Local File Path


Enter choice (1-4):  3
Enter webpage URL:  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/PWMJ9-Npq9FYNSWrrf99YQ/watsonx.txt


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given



✅ Sources Loaded: {'webpage': 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/PWMJ9-Npq9FYNSWrrf99YQ/watsonx.txt'}


In [21]:
# ------------------------------
# Step 1: Document Wrapper
# ------------------------------
class Document:
    """Minimal text container exposing ``metadata`` and ``page_content``.

    NOTE(review): defining this class rebinds the name ``Document`` that the
    imports cell brings in from ``langchain_core.documents``; every later
    cell therefore builds instances of this class instead.

    Args:
        metadata: Mapping describing where the text came from (the loader
            cells pass ``{"source": ...}``). ``None`` is treated as empty.
        page_content: The text itself.
    """

    def __init__(self, metadata, page_content):
        # Copy the mapping: several documents are built from the same dict
        # (one per chunk), and they must not alias each other's metadata.
        self.metadata = dict(metadata or {})
        self.page_content = page_content

    def __repr__(self):
        preview = str(self.page_content)[:60]
        return f"Document(metadata={self.metadata!r}, page_content={preview!r})"

# Dictionary to remember what sources were loaded
SOURCES_DICTIONARY = {}



In [None]:
# ------------------------------
# Step 2: Load Documents
# ------------------------------
# Seconds to wait on the remote server before giving up on a TXT download;
# without a timeout ``requests.get`` can block indefinitely.
REQUEST_TIMEOUT_SECONDS = 30


def _wrap_loaded_docs(loader, source):
    """Run ``loader.load()`` and re-wrap every result as a ``Document`` tagged with ``source``."""
    return [
        Document(metadata={"source": source}, page_content=loaded.page_content)
        for loaded in loader.load()
    ]


documents = []

print("\nChoose input type:")
print("1. PDF file")
print("2. TXT file")
print("3. Webpage URL")
print("4. Local File Path")
# Strip stray whitespace so an answer such as " 3 " is still recognised.
choice = input("Enter choice (1-4): ").strip()

if choice == "1":
    filepath = input("Enter PDF path: ").strip()
    documents.extend(_wrap_loaded_docs(PyPDFLoader(filepath), filepath))
    SOURCES_DICTIONARY["pdf"] = filepath

elif choice == "2":
    url = input("Enter TXT file URL: ").strip()
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # Connection errors, timeouts and malformed URLs end up here instead
        # of aborting the cell with a traceback.
        print("Failed to fetch TXT file.")
        print("Reason:", exc)
    else:
        if response.status_code == 200:
            documents.append(Document(metadata={"source": url}, page_content=response.text))
            SOURCES_DICTIONARY["txt_url"] = url
        else:
            print("Failed to fetch TXT file.")

elif choice == "3":
    url = input("Enter webpage URL: ").strip()
    documents.extend(_wrap_loaded_docs(WebBaseLoader(url), url))
    SOURCES_DICTIONARY["webpage"] = url

elif choice == "4":
    filepath = input("Enter local file path (.txt or .pdf): ").strip()
    # Compare case-insensitively so "REPORT.PDF" is not read as plain text.
    if filepath.lower().endswith(".pdf"):
        loader = PyPDFLoader(filepath)
    else:
        loader = TextLoader(filepath, encoding="utf-8")
    documents.extend(_wrap_loaded_docs(loader, filepath))
    SOURCES_DICTIONARY["local_file"] = filepath

else:
    print("Invalid choice.")

# Only claim success when this run actually produced documents.
if documents:
    print("\n✅ Sources Loaded:", SOURCES_DICTIONARY)
else:
    print("\n⚠️ No documents were loaded in this run.")




Choose input type:
1. PDF file
2. TXT file
3. Webpage URL
4. Local File Path


In [None]:
# ------------------------------
# Step 3: Split Text into Chunks (with metadata preserved)
# ------------------------------
# Splitter configuration: chunk size 1000 with an overlap of 200 between
# neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)

# One Document per chunk; each chunk is handed the metadata object of the
# document it was cut from.
split_docs = [
    Document(metadata=source_doc.metadata, page_content=piece)
    for source_doc in documents
    for piece in text_splitter.split_text(source_doc.page_content)
]



In [None]:
# ------------------------------
# Step 4: Create Vector Store with Ollama Embeddings
# ------------------------------
# Fail early with an actionable message: there is nothing to index when the
# loading cell was skipped, the choice was invalid, or the download failed.
if not split_docs:
    raise ValueError(
        "No document chunks to index - load a source in Step 2 and re-run Step 3 first."
    )

embeddings = OllamaEmbeddings(model="mistral")  # or "llama2", "nomic-embed-text", etc.

# NOTE(review): the store is written to ./chroma_store; presumably chunks from
# earlier runs remain there and are searched as well - confirm.
vectorstore = Chroma.from_documents(
    documents=split_docs,       # ✅ pass Document objects, not dicts
    embedding=embeddings,
    persist_directory="./chroma_store"
)


https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/PWMJ9-Npq9FYNSWrrf99YQ/watsonx.txt


In [17]:
# ------------------------------
# Step 5: Build QA Chain
# ------------------------------
from langchain_community.llms import Ollama

# Retriever over the vector store, configured with k=3 (presumably the number
# of chunks fetched per query - confirm against the retriever docs).
retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))

# Local model served through Ollama.
llm = Ollama(model="mistral")

# Retrieval QA chain that also hands back the documents it was given.
qa_chain = RetrievalQA.from_chain_type(retriever=retriever, llm=llm, return_source_documents=True)



In [20]:
# ------------------------------
# Step 6: Ask Questions
# ------------------------------
# Phrases that signal the model could not answer from the retrieved context.
# This is a heuristic: it relies on the chain's prompt asking the model to
# admit when it does not know (TODO confirm for the installed version).
_NO_ANSWER_MARKERS = (
    "don't know",
    "don’t know",
    "do not know",
    "not mentioned",
    "no information",
    "does not provide",
    "doesn't provide",
    "does not contain",
    "doesn't contain",
)


def _answered_from_context(result):
    """Return True when the chain result holds a usable, document-backed answer.

    The answer must be non-empty, at least one returned source document must
    carry text, and the answer must not contain a "could not answer" marker.
    """
    answer = (result.get("result") or "").strip().lower()
    if not answer:
        return False
    sources = result.get("source_documents") or []
    if not any(doc.page_content.strip() for doc in sources):
        return False
    return not any(marker in answer for marker in _NO_ANSWER_MARKERS)


def _print_general_answer(query):
    """Answer ``query`` with the bare LLM and report that no sources were used."""
    response = llm.invoke(query)
    print("\n📌 General Knowledge Answer:\n", response)
    print("🔗 Sources used: None (general knowledge)")


while True:
    query = input("\nAsk a question (or type 'exit' to quit): ").strip()
    if query.lower() == "exit":
        break
    if not query:
        # Nothing was typed; ask again instead of sending an empty prompt.
        continue

    # The chain retrieves on its own and returns the documents it used, so a
    # separate retriever call beforehand would only repeat the same search.
    result = qa_chain.invoke({"query": query})

    # The previous check (is the question text contained in the answer?) was
    # almost never true, so document answers were discarded nearly every time.
    if _answered_from_context(result):
        print("\nAnswer from documents:\n", result["result"])
        # De-duplicate while keeping order: several chunks often share a source.
        used_sources = list(dict.fromkeys(
            doc.metadata.get("source", "N/A") for doc in result["source_documents"]
        ))
        print("\n🔗 Sources used:", used_sources)
    else:
        # Context did not answer the question; fall back to Ollama alone.
        _print_general_answer(query)



Ask a question (or type 'exit' to quit):  what is moon



📌 General Knowledge Answer:
  The Moon is a natural satellite that orbits the Earth. It's the fifth largest moon in the solar system and the only celestial body, aside from Earth, to be inhabited by life. It's approximately one-quarter of the size of Earth and has no atmosphere. Its surface is covered with many different types of rock formations that scientists have studied extensively using telescopes on Earth, as well as rovers and landers sent by space agencies from around the world. The Moon plays a significant role in our daily lives, affecting tides due to its gravitational pull, and has been a source of inspiration for humans throughout history.
🔗 Sources used: None (general knowledge)



Ask a question (or type 'exit' to quit):  tell me about ibm



📌 General Knowledge Answer:
  IBM (International Business Machines Corporation) is an American multinational technology company headquartered in Armonk, New York. It was founded in 1911 and has a long history of innovation and contribution to the field of computing.

IBM is known for its hardware, software, and consulting services. Some of its notable inventions include the floppy disk, the hard disk drive, the magnetic stripe card reader, the relational database, and the UPC barcode scanner.

In terms of software, IBM has developed several significant products such as DB2 (a relational database management system), WebSphere (a Java EE-based application server), and Rational Rose (an integrated modeling tool for systems design).

IBM's services include IT consulting, cloud services, AI and quantum computing, blockchain, and cybersecurity solutions. The company is also known for its corporate social responsibility efforts, including initiatives in education, the environment, and commun


Ask a question (or type 'exit' to quit):  exit
