In [17]:
!pip install \
    langchain_community\
    langchain_text_splitters\
    langchain_openai\
    langchain_pinecone




In [18]:
!pip install pypdf



In [19]:
import hashlib
import os

import numpy as np
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

In [20]:
from urllib.request import urlretrieve

In [23]:
# Local folder that receives the downloaded PDFs; creating it is a no-op when it already exists.
os.makedirs("data", exist_ok=True)

# Source PDFs that make up the chatbot's knowledge base.
files = [
    "https://www.deeplearningbook.org/front_matter.pdf",
    "https://www.microsoft.com/en-us/research/uploads/prod/2006/01/Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf",
    "https://www.academia.dk/BiologiskAntropologi/Epidemiologi/DataMining/Artificial_Intelligence-A_Guide_to_Intelligent_Systems.pdf",
    "https://mrce.in/ebooks/AI%20Foundations%20of%20Computational%20Agents%203rd%20Ed.pdf",
]

In [24]:
# Download each PDF into data/, named after the last path segment of its URL.
# Files that are already present are skipped so re-running the notebook does not
# re-fetch them; delete a file from data/ to force a fresh download.
for url in files:
    file_path = os.path.join("data", url.rpartition("/")[2])
    if os.path.exists(file_path):
        continue

    # Fetch into a temporary ".part" file and rename only on success, so an
    # interrupted download never leaves a truncated PDF under the final name.
    partial_path = file_path + ".part"
    try:
        urlretrieve(url, partial_path)
        os.replace(partial_path, file_path)
    finally:
        # Only still present when the download or rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [25]:
# Point a directory loader at the folder the PDFs were downloaded into.
loader = PyPDFDirectoryLoader("./data/")

In [26]:
# Read every PDF in the folder into Document objects; these are kept unsplit and chunked in a later step.
docs_before_split=loader.load()

In [27]:
# Peek at the first loaded document to sanity-check the load (its text can be empty, as in the recorded output).
docs_before_split[0]

Document(metadata={'source': 'data/Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf', 'page': 0}, page_content='')

In [28]:
from google.colab import userdata
# Copy each API key from the Colab secret store into the environment variable of the same name.
for secret_name in ("OPENAI_API_KEY", "PINECONE_API_KEY"):
    os.environ[secret_name] = userdata.get(secret_name)

In [29]:
# Name of the Pinecone index used to store the document vectors.
index_name = "chatbot"

# OpenAI embedding model that turns text into vectors.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [30]:
# Break the loaded documents into chunks, using the splitter's default settings.
text_splitter=RecursiveCharacterTextSplitter()
split_docs=text_splitter.split_documents(docs_before_split)

In [31]:
# Inspect a sample chunk to see what the splitter produced.
split_docs[25]

Document(metadata={'source': 'data/Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf', 'page': 27}, page_content='8 1. INTRODUCTION\nFigure 1.5 Graphs of the root-mean-square\nerror, deﬁned by (1.3), evaluated\non the training set and on an inde-\npendent test set for various values\nof M.\nM\nERMS\n0 3 6 9\n0\n0.5\n1\nTraining\nTest\nFor M =9 , the training set error goes to zero, as we might expect because\nthis polynomial contains10 degrees of freedom corresponding to the10 coefﬁcients\nw0,...,w 9, and so can be tuned exactly to the10 data points in the training set.\nHowever, the test set error has become very large and, as we saw in Figure 1.4, the\ncorresponding function y(x,w⋆ ) exhibits wild oscillations.\nThis may seem paradoxical because a polynomial of given order contains all\nlower order polynomials as special cases. TheM =9 polynomial is therefore capa-\nble of generating results at least as good as theM =3 polynomial. Furthermore, we\nmight suppose that the best p

In [32]:
def _stable_chunk_ids(documents):
    """Build deterministic vector IDs for a list of document chunks.

    Each ID is the SHA-256 hex digest of the chunk's source file name, page
    number and text, so the same chunk always maps to the same ID.

    Args:
        documents: Chunks exposing ``metadata`` (a dict) and ``page_content``.

    Returns:
        A list of ID strings, one per chunk, in the same order as ``documents``.
    """
    occurrences = {}
    ids = []
    for doc in documents:
        # Keep only the file name, treating "\\" and "/" alike, so the same PDF
        # hashes identically whether it was loaded on Windows or on POSIX.
        source_name = str(doc.metadata.get("source", "")).replace("\\", "/").rpartition("/")[2]
        page = doc.metadata.get("page", "")
        digest = hashlib.sha256(
            f"{source_name}|{page}|{doc.page_content}".encode("utf-8")
        ).hexdigest()
        # Byte-identical chunks on the same page get a numeric suffix so IDs stay unique.
        repeat = occurrences.get(digest, 0)
        occurrences[digest] = repeat + 1
        ids.append(digest if repeat == 0 else f"{digest}-{repeat}")
    return ids


# Embed the chunks and upsert them into the Pinecone index.
# Explicit, content-derived IDs make this cell safe to re-run: the same chunk
# overwrites its existing vector instead of being inserted again under a new
# random ID (which is how duplicate search hits arise).
# NOTE(review): vectors written by earlier runs under other IDs are not removed by this.
vectorestore = PineconeVectorStore.from_documents(
    split_docs,
    embeddings,
    index_name=index_name,
    ids=_stable_chunk_ids(split_docs),
)

In [33]:
# Sample question, reused for both the raw similarity search and the QA chain.
query = "What is the neural network?"

In [34]:
# Retrieve the stored chunks that are most similar to the query.
similar_docs=vectorestore.similarity_search(query)

In [35]:
# Display the retrieved chunks to check that they are relevant to the query.
similar_docs

[Document(id='233191a0-330f-42f1-acdc-730a60c1afe3', metadata={'page': 184.0, 'source': 'data\\Artificial_Intelligence-A_Guide_to_Intelligent_Systems.pdf'}, page_content='What is a neural network?\nA neural network can be deﬁned as a model of reasoning based on the human\nbrain. The brain consists of a densely interconnected set of nerve cells, or basic\ninformation-processing units, calledneurons. The human brain incorporates\nnearly 10 billion neurons and 60 trillion connections, synapses, between them\n(Shepherd and Koch, 1990). By using multiple neurons simultaneously, the\nbrain can perform its functions much faster than the fastest computers in\nexistence today.\nAlthough each neuron has a very simple structure, an army of such elements\nconstitutes a tremendous processing power. A neuron consists of a cell body,'),
 Document(id='64c0dea0-6741-4d11-b7ca-c3e9fb9bb90b', metadata={'page': 184.0, 'source': 'data/Artificial_Intelligence-A_Guide_to_Intelligent_Systems.pdf'}, page_conte

In [36]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

# Chat model shared by the QA chain and the chat UI.
llm = ChatOpenAI(model="gpt-4o", temperature=0.3)

In [37]:
# Question-answering chain that takes its context from the vector store's retriever and answers with `llm`.
retriever = vectorestore.as_retriever()
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

In [38]:
# Run the sample question through the QA chain; the result is a dict with 'query' and 'result' keys.
qa.invoke(query)

{'query': 'What is the neural network?',
 'result': 'A neural network is a model of reasoning based on the human brain. It consists of a densely interconnected set of nerve cells, or basic information-processing units, called neurons. The human brain incorporates nearly 10 billion neurons and 60 trillion connections, synapses, between them. By using multiple neurons simultaneously, the brain can perform its functions much faster than the fastest computers in existence today. Neural networks exhibit plasticity, meaning they can change the strength of their connections and form new connections in response to stimulation patterns, which forms the basis for learning. This ability to learn through experience has led to attempts to emulate biological neural networks in computers.'}

In [39]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.12.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.5.4 (from gradio)
  Downloading gradio_client-1.5.4-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.9.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.meta

In [None]:
import gradio as gr
from langchain.chains import ConversationChain

# Conversation chain that backs the chat UI.
# NOTE(review): this chain is built from `llm` alone and is not connected to the `qa`
# retrieval chain or the vector store, so chat replies do not use the indexed PDFs — confirm this is intended.
conversation_chain = ConversationChain(llm=llm)

def chat_with_bot(user_message, history):
    """Handle one chat turn for the Gradio UI.

    Args:
        user_message: Text submitted from the input box.
        history: Chat transcript so far, as a list of (user, bot) pairs;
            ``None`` is treated as an empty transcript.

    Returns:
        A two-item tuple: an update that empties the input box, and the
        transcript (with the new exchange appended when a reply was produced).
    """
    if history is None:
        history = []

    if not (user_message or "").strip():
        # Show the problem as a pop-up instead of writing it into the input box,
        # where it would be sent to the model as the user's next message.
        gr.Warning("Message cannot be empty!")
        return gr.update(value=""), history

    # Get response from LangChain
    response = conversation_chain.predict(input=user_message)

    # Add to history for context
    history.append((user_message, response))

    return gr.update(value=""), history

# Chat UI: a transcript, a single-line input box and a button that resets the conversation.
with gr.Blocks() as app:
    gr.Markdown("# AI Learning Chatbot")

    chatbot = gr.Chatbot(label="Chat Interface")
    user_input = gr.Textbox(label="Your Message", placeholder="Type your question here...", lines=1)
    clear_button = gr.Button("Clear Chat")

    def clear_chat():
        """Reset the conversation: wipe the chain's memory and empty the transcript.

        Clearing only the transcript would leave earlier turns in the chain's
        memory, so the model would keep answering with the "cleared" context.
        """
        conversation_chain.memory.clear()
        return None

    # Pressing Enter sends the message; the handler empties the box and updates the transcript.
    user_input.submit(chat_with_bot, [user_input, chatbot], [user_input, chatbot], queue=True)
    clear_button.click(clear_chat, None, chatbot, queue=False)


# Start the Gradio app when this code is run directly.
# With debug=True the cell keeps running so errors and logs stay visible (see the recorded output).
if __name__ == "__main__":
    app.launch(debug=True)




Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://5c9a1c01fc9395c111.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
