In [None]:
# Question-answering system over a PDF
# The cells below are test code

In [None]:
#Install the libraries
!pip install -q langchain
!pip install -q langchain_community
!pip install -q langchain_openai
!pip install -q openai==1.57.0
!pip install -q langchain_core
!pip install -q pypdf
!pip install -q chromadb
!pip install Flask==2.3.2


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3070, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2863, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 447, in run
    conflicts = self._determine_conflicts(to_install)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 5

In [None]:
##Imports for Baseline QA Pipeline
from langchain.document_loaders import PyPDFLoader # for loading the pdf
from langchain_openai import OpenAIEmbeddings # for creating embeddings
from langchain.vectorstores import Chroma # for the vectorization part
from langchain.chains import RetrievalQA #For the retrieval QA chain part
from langchain_openai import ChatOpenAI #for getting an LLM for QA chain
#from langchain_core.output_parsers import StrOutputParser #Not used currently, leaving, as can be used for parsing output from LLM
#from langchain_core.runnables import RunnablePassthrough #Not used currently, leaving, as can be used for getting LLM output
from langchain.prompts import ChatPromptTemplate #for setting up prompts

In [None]:
# Set up the OpenAI key
import os
import openai
from getpass import getpass

# getpass keeps the key out of the notebook source and its saved output.
# Strip stray whitespace from a pasted key and refuse an empty one, so a bad
# entry fails here instead of as an authentication error inside the first
# OpenAI call.
OPENAI_API_KEY = getpass().strip()
if not OPENAI_API_KEY:
    raise ValueError("No OpenAI API key was entered.")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
#Download a sample pdf
# NOTE(review): the PDF is saved as Notes_Physics.pdf, but BaseQAPipeline loads
# "tutor_textbook.pdf"; nothing visible in this notebook reads Notes_Physics.pdf.
!curl https://www.mrbigler.com/downloads/Notes-Physics-1.pdf >Notes_Physics.pdf
# NOTE(review): index.html is written to the working directory, while the final
# cell notes that the template must live at templates/index.html -- presumably
# it has to be moved there by hand; confirm.
!curl https://app.onecompiler.com/42z9je4e9_42zkvjjnh/ > index.html

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 10.8M  100 10.8M    0     0  18.6M      0 --:--:-- --:--:-- --:--:-- 18.6M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    69    0    69    0     0    241      0 --:--:-- --:--:-- --:--:--   242


In [None]:
#Setup Base QA system pipeline
class BaseQAPipeline:
    """Retrieval-augmented question answering over a single PDF.

    Construction loads and splits the PDF, embeds the chunks into a Chroma
    collection and wires a ``RetrievalQA`` chain whose prompt receives the
    retrieved chunks as ``{context}`` and the user's question as ``{question}``.

    Args:
        doc: Path of the PDF to index.
        model_name: Chat model name passed to ``ChatOpenAI``.
        temperature: Sampling temperature passed to ``ChatOpenAI``.
        persist_directory: Directory in which Chroma stores the collection.
    """

    def __init__(self, doc="tutor_textbook.pdf", model_name='gpt-3.5-turbo',
                 temperature=0.6, persist_directory="."):
        self.doc = doc
        self.loader = PyPDFLoader(self.doc)

        # Load the document and store it in the 'data' variable
        self.data = self.loader.load_and_split()

        self.embeddings = OpenAIEmbeddings()

        # Chroma.from_documents *adds* to the collection stored in
        # persist_directory. A pipeline is built for every request, so without
        # a reset the chunks of every previously indexed PDF (and duplicates of
        # this one) would stay in the index and pollute retrieval.
        Chroma(embedding_function=self.embeddings,
               persist_directory=persist_directory).delete_collection()
        self.vectordb = Chroma.from_documents(self.data, embedding=self.embeddings,
                                 persist_directory=persist_directory)

        # Initialize a language model with ChatOpenAI
        self.llm = ChatOpenAI(model_name=model_name, temperature=temperature)

        #Setup a prompt template
        template = """\
        You are an assistant for question-answering tasks.

        Use the following pieces of retrieved context to answer the question.

        If you don't know the answer, just say that you don't know.

        Use three sentences maximum and keep the answer concise.

        Question: {question}

        Context: {context}

        Answer:

        """

        prompt = ChatPromptTemplate.from_template(template)

        chain_type_kwargs = {"prompt": prompt}

        # 1. Vectorstore-based retriever
        self.vectorstore_retriever = self.vectordb.as_retriever()

        # Initialize a RetrievalQA chain with the language model and vector database retriever
        self.qa_chain = RetrievalQA.from_chain_type(self.llm, retriever= self.vectorstore_retriever, chain_type_kwargs=chain_type_kwargs)

    def invoke(self, input_dict):
        """Answer one question from the indexed PDF.

        Args:
            input_dict: Mapping with a ``"question"`` entry. A ``"context"``
                entry is accepted for backward compatibility but ignored: the
                chain fills ``{context}`` itself from the retrieved chunks.

        Returns:
            Whatever the ``RetrievalQA`` chain returns for the query.

        Raises:
            ValueError: If ``input_dict`` has no ``"question"`` entry.
        """
        question = input_dict.get("question")
        if question is None:
            raise ValueError("input_dict must contain a 'question' entry")
        # The second positional argument of invoke() is the run *config*, not
        # prompt input, so the caller's context must not be passed there.
        return self.qa_chain.invoke({"query": question})



In [None]:
from flask import Flask, render_template, request, redirect, url_for
from werkzeug.utils import secure_filename
import os  # For file path management


# Where the textbook PDF is stored; index() saves the uploaded file here.
filepath = "./tutor_textbook.pdf"
# NOTE(review): not referenced by any route visible in this cell.
ALLOWED_EXTENSIONS = {'txt', 'pdf', 'docx', 'png', 'jpg', 'jpeg', 'gif'}
# The Flask application the routes below are registered on.
app = Flask(__name__)

@app.route("/", methods=["GET", "POST"])
def index():
    """Serve the question form and, on POST, answer the submitted prompt.

    A POST may carry an uploaded ``file`` and/or a ``url`` form field. Either is
    stored at ``filepath``; the URL download runs last, so it wins when both
    are given. The ``prompt`` field is then answered by a freshly built
    ``BaseQAPipeline`` and rendered into ``index.html`` as ``result``.

    Returns:
        The rendered ``index.html`` page, or a plain-text 400 response when the
        textbook URL is not http(s) or cannot be downloaded.
    """
    global url_data, prompt_data  # Access global variables

    if request.method == "POST":
        url_data = request.form.get("url")
        print("URL: ", url_data)

        # Bind `file` up front: it used to be assigned only inside the upload
        # branch, so a POST without a file part crashed at the print below.
        file = request.files.get('file')
        if file is None or not file.filename:
            # Browsers send an empty file part when nothing was chosen; saving
            # it would overwrite the stored textbook with an empty file.
            print('No file uploaded!')
        else:
            file.save(filepath)
            print("File saved:", filepath)

        # `if url_data` also skips a missing field (None), which previously
        # ran the download with the literal text "None".
        if url_data:
            # The URL is untrusted form input. Interpolating it into a shell
            # command allowed command injection, so download with the standard
            # library and accept only http(s) (urlopen would also open file://).
            # NOTE(review): internal hosts are still reachable (SSRF) -- restrict
            # if this is ever exposed beyond a personal notebook.
            import shutil
            from http.client import HTTPException
            from urllib.parse import urlparse
            from urllib.request import urlopen

            if urlparse(url_data).scheme not in ("http", "https"):
                return "The textbook URL must start with http:// or https://.", 400
            try:
                with urlopen(url_data, timeout=60) as response, open(filepath, "wb") as out:
                    shutil.copyfileobj(response, out)
            except (OSError, ValueError, HTTPException):
                # URLError/HTTPError are OSError subclasses. The message is
                # fixed so that no user input is echoed into the response.
                return "Could not download the textbook from the given URL.", 400
        print("File: ", file)
        prompt_data = request.form.get("prompt")
        base_qa_pipeline = BaseQAPipeline()
        result = base_qa_pipeline.invoke({'question' : prompt_data})
        print(result)
        return render_template("index.html", result=result)

    return render_template("index.html")

@app.route('/how-it-works', methods=['GET'])
def how_it_works():
    """Render the how-it-works.html template for GET /how-it-works."""
    page = render_template('how-it-works.html')
    return page

@app.route('/generate-plan', methods=['GET'])
def generate_plan():
    """Render the generate-plan.html template for GET /generate-plan."""
    page = render_template('generate-plan.html')
    return page


from google.colab.output import eval_js
# Print what Colab's kernel.proxyPort(5000) returns; the saved output of this
# cell shows a googleusercontent.com URL to open in the browser.
print(eval_js("google.colab.kernel.proxyPort(5000)"))


# Start the Flask development server. app.run() gets no arguments; the saved
# output shows it listening on http://127.0.0.1:5000. This call blocks the cell.
if __name__ == "__main__":
    app.run()

https://qg8uz4v9w7d-496ff2e9c6d22116-5000-colab.googleusercontent.com/
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [04/Jan/2025 20:17:13] "GET /?authuser=0 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [04/Jan/2025 20:17:14] "[33mGET /favicon.ico?authuser=0 HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [04/Jan/2025 20:17:20] "GET /?authuser=0 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [04/Jan/2025 20:17:20] "[33mGET /favicon.ico?authuser=0 HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [04/Jan/2025 20:17:29] "GET /?authuser=0 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [04/Jan/2025 20:17:29] "[33mGET /favicon.ico?authuser=0 HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [04/Jan/2025 20:17:35] "GET /?authuser=0 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [04/Jan/2025 20:29:04] "[33mGET /how-it-works?authuser=0 HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [04/Jan/2025 20:29:06] "GET /?authuser=0 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [04/Jan/2025 20:29:07] "[33mGET /favicon.ico?

In [None]:
# Smoke test: build the pipeline from tutor_textbook.pdf and ask one question.
# NOTE(review): the saved output of this cell is
# "AttributeError: module 'openai' has no attribute 'OpenAI'" -- presumably an
# older openai package was still loaded in the kernel; confirm after a restart.
base_qa_pipeline = BaseQAPipeline()
result = base_qa_pipeline.invoke({'question' : 'What is momentum?'})
print(result)

AttributeError: module 'openai' has no attribute 'OpenAI'

In [None]:
# Final code with everything attached.
#Question Answering system using a pdf
#Install the libraries
!pip install -q langchain
!pip install -q langchain_community
!pip install -q langchain_openai
!pip install -q openai
!pip install -q langchain_core
!pip install -q pypdf
!pip install -q chromadb
!pip install Flask==2.3.2
!pip install --force-reinstall -v openai==1.57.0
!python3 -m pip install --upgrade httpx
##Imports for Baseline QA Pipeline
from langchain.document_loaders import PyPDFLoader # for loading the pdf
from langchain_openai import OpenAIEmbeddings # for creating embeddings
from langchain.vectorstores import Chroma # for the vectorization part
from langchain.chains import RetrievalQA #For the retrieval QA chain part
from langchain_openai import ChatOpenAI #for getting an LLM for QA chain
#from langchain_core.output_parsers import StrOutputParser #Not used currently, leaving, as can be used for parsing output from LLM
#from langchain_core.runnables import RunnablePassthrough #Not used currently, leaving, as can be used for getting LLM output
from langchain.prompts import ChatPromptTemplate #for setting up prompts
#Setup openai key
import os
import openai
from getpass import getpass
print("Please enter Open AI KEY")
# getpass keeps the key out of the notebook source and its saved output.
# Strip stray whitespace from a pasted key and refuse an empty one, so a bad
# entry fails here instead of as an authentication error inside the first
# OpenAI call.
OPENAI_API_KEY = getpass().strip()
if not OPENAI_API_KEY:
    raise ValueError("No OpenAI API key was entered.")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
#Setup Base QA system pipeline
class BaseQAPipeline:
    """Retrieval-augmented question answering over a single PDF, with chat history.

    Construction loads and splits the PDF, embeds the chunks into a Chroma
    collection and prepares the prompt, retriever and chat model. ``invoke``
    retrieves chunks for the current question, prepends the earlier
    question/answer turns and asks the chat model with that combined context.

    Args:
        doc: Path of the PDF to index.
        model_name: Chat model name passed to ``ChatOpenAI``.
        temperature: Sampling temperature passed to ``ChatOpenAI``.
        persist_directory: Directory in which Chroma stores the collection.
    """

    def __init__(self, doc="tutor_textbook.pdf", model_name='gpt-3.5-turbo',
                 temperature=0.6, persist_directory="."):
        self.doc = doc
        self.loader = PyPDFLoader(self.doc)

        # Load the document and store it in the 'data' variable
        self.data = self.loader.load_and_split()

        self.embeddings = OpenAIEmbeddings()

        # Chroma.from_documents *adds* to the collection stored in
        # persist_directory. A pipeline is built for every request, so without
        # a reset the chunks of every previously indexed PDF (and duplicates of
        # this one) would stay in the index and pollute retrieval.
        Chroma(embedding_function=self.embeddings,
               persist_directory=persist_directory).delete_collection()
        self.vectordb = Chroma.from_documents(self.data, embedding=self.embeddings,
                                 persist_directory=persist_directory)

        # Initialize a language model with ChatOpenAI
        self.llm = ChatOpenAI(model_name=model_name, temperature=temperature)

        #Setup a prompt template
        template = """\
        You are an assistant for question-answering tasks.

        Use the following pieces of retrieved context to answer the question.

        If either the PDF or the question are not related to each other or not
        related to any educational standard, state the following: This content is
        not related to any educational purposes.

        For example, if topics are not the same, like a java textbook is given,
        however, the user asks about a physics question, state the following: This
        content is not related to the inputted textbook, please select another textbook
        and try again.

        If you don't know the answer, just say that you don't know.

        Use three sentences maximum and keep the answer concise.

        Question: {question}

        Context: {context}

        Answer:

        """

        # Kept on the instance because invoke() formats it directly.
        self.prompt = ChatPromptTemplate.from_template(template)

        chain_type_kwargs = {"prompt": self.prompt}

        # 1. Vectorstore-based retriever
        self.vectorstore_retriever = self.vectordb.as_retriever()

        # Initialize a RetrievalQA chain with the language model and vector database retriever.
        # Still exposed as an attribute for existing callers; invoke() no longer
        # routes through it because the chain only reads "query" and discards
        # any extra "context" input, which silently dropped the chat history.
        self.qa_chain = RetrievalQA.from_chain_type(self.llm, retriever= self.vectorstore_retriever, chain_type_kwargs=chain_type_kwargs)
        self.chat_history = []  # Initialize chat history

    def update_chat_history(self, question, answer):
        """Append one question/answer turn to ``self.chat_history``."""
        self.chat_history.append({"question": question, "answer": answer})

    def build_combined_context(self, question=None):
        """Combine chat history and document context.

        Args:
            question: The question to retrieve document chunks for. When
                omitted, the most recent question in the history is used, or an
                empty string if there is none (the previous behaviour).

        Returns:
            A string holding the earlier turns followed by the retrieved
            chunks' ``page_content``.
        """
        # Combine all previous chat history
        chat_context = "\n".join(
            f"Q: {entry['question']}\nA: {entry['answer']}" for entry in self.chat_history
        )

        if question is None:
            question = self.chat_history[-1]['question'] if self.chat_history else ""

        # Retrieve for the question being asked now. invoke() replaces
        # get_relevant_documents(), which emits the deprecation warning seen in
        # this cell's saved output.
        context_from_db = self.vectorstore_retriever.invoke(question)

        # Convert the list of context documents into a string
        context_str = "\n".join(doc.page_content for doc in context_from_db)

        # Combine both chat history and the document context
        return f"Chat history:\n{chat_context}\n\nContext from the document:\n{context_str}"

    def invoke(self, input_dict):
        """Answer one question using the earlier turns and the retrieved chunks.

        Args:
            input_dict: Mapping with a ``"question"`` entry.

        Returns:
            A dict with ``"query"`` (the question), ``"context"`` (the combined
            context given to the model) and ``"result"`` (the model's answer).

        Raises:
            ValueError: If ``input_dict`` has no ``"question"`` entry.
        """
        question = input_dict.get("question")
        if question is None:
            raise ValueError("input_dict must contain a 'question' entry")

        combined_context = self.build_combined_context(question)

        # Format the prompt ourselves so that {context} really is the combined
        # history + document context.
        messages = self.prompt.format_messages(question=question, context=combined_context)
        answer = self.llm.invoke(messages).content

        self.update_chat_history(question, answer)
        return {"query": question, "context": combined_context, "result": answer}
#Setup GenerateStudyPlan pipeline
class GenerateStudyPlan:
    """Study-plan generation over a single PDF, with chat history.

    Construction loads and splits the PDF, embeds the chunks into a Chroma
    collection and prepares the prompt, retriever and chat model. ``invoke``
    retrieves chunks for the current request, prepends the earlier
    question/answer turns and asks the chat model with that combined context.

    Args:
        doc: Path of the PDF to index.
        model_name: Chat model name passed to ``ChatOpenAI``.
        temperature: Sampling temperature passed to ``ChatOpenAI``.
        persist_directory: Directory in which Chroma stores the collection.
    """

    def __init__(self, doc="tutor_textbook.pdf", model_name='gpt-3.5-turbo',
                 temperature=0.6, persist_directory="."):
        self.doc = doc
        self.loader = PyPDFLoader(self.doc)

        # Load the document and store it in the 'data' variable
        self.data = self.loader.load_and_split()

        self.embeddings = OpenAIEmbeddings()

        # Chroma.from_documents *adds* to the collection stored in
        # persist_directory. A pipeline is built for every request, so without
        # a reset the chunks of every previously indexed PDF (and duplicates of
        # this one) would stay in the index and pollute retrieval.
        Chroma(embedding_function=self.embeddings,
               persist_directory=persist_directory).delete_collection()
        self.vectordb = Chroma.from_documents(self.data, embedding=self.embeddings,
                                 persist_directory=persist_directory)

        # Initialize a language model with ChatOpenAI
        self.llm = ChatOpenAI(model_name=model_name, temperature=temperature)

        #Setup a prompt template
        template = """\
            You are an assistant for generating study plans on a singular subject.

        Use the following pieces of retrieved context to answer the question.

        If the user has given a topic to study or topics that they need focus on,
        make the plan more focused on those topics.

        For the study plan, give 10 different questions. It must all be related to
        the topic. The questions should be made with the textbook content.

        Question: {question}

        Context: {context}

        Answer:

        """

        # Kept on the instance because invoke() formats it directly.
        self.prompt = ChatPromptTemplate.from_template(template)

        chain_type_kwargs = {"prompt": self.prompt}

        # 1. Vectorstore-based retriever
        self.vectorstore_retriever = self.vectordb.as_retriever()

        # Initialize a RetrievalQA chain with the language model and vector database retriever.
        # Still exposed as an attribute for existing callers; invoke() no longer
        # routes through it because the chain only reads "query" and discards
        # any extra "context" input, which silently dropped the chat history.
        self.qa_chain = RetrievalQA.from_chain_type(self.llm, retriever= self.vectorstore_retriever, chain_type_kwargs=chain_type_kwargs)
        self.chat_history = []  # Initialize chat history

    def update_chat_history(self, question, answer):
        """Append one question/answer turn to ``self.chat_history``."""
        self.chat_history.append({"question": question, "answer": answer})

    def build_combined_context(self, question=None):
        """Combine chat history and document context.

        Args:
            question: The request to retrieve document chunks for. When
                omitted, the most recent question in the history is used, or an
                empty string if there is none (the previous behaviour).

        Returns:
            A string holding the earlier turns followed by the retrieved
            chunks' ``page_content``.
        """
        # Combine all previous chat history
        chat_context = "\n".join(
            f"Q: {entry['question']}\nA: {entry['answer']}" for entry in self.chat_history
        )

        if question is None:
            question = self.chat_history[-1]['question'] if self.chat_history else ""

        # Retrieve for the request being made now. invoke() replaces
        # get_relevant_documents(), which emits the deprecation warning seen in
        # this cell's saved output.
        context_from_db = self.vectorstore_retriever.invoke(question)

        # Convert the list of context documents into a string
        context_str = "\n".join(doc.page_content for doc in context_from_db)

        # Combine both chat history and the document context
        return f"Chat history:\n{chat_context}\n\nContext from the document:\n{context_str}"

    def invoke(self, input_dict):
        """Generate a study plan using the earlier turns and the retrieved chunks.

        Args:
            input_dict: Mapping with a ``"question"`` entry.

        Returns:
            A dict with ``"query"`` (the request), ``"context"`` (the combined
            context given to the model) and ``"result"`` (the model's answer).

        Raises:
            ValueError: If ``input_dict`` has no ``"question"`` entry.
        """
        question = input_dict.get("question")
        if question is None:
            raise ValueError("input_dict must contain a 'question' entry")

        combined_context = self.build_combined_context(question)

        # Format the prompt ourselves so that {context} really is the combined
        # history + document context.
        messages = self.prompt.format_messages(question=question, context=combined_context)
        answer = self.llm.invoke(messages).content

        self.update_chat_history(question, answer)
        return {"query": question, "context": combined_context, "result": answer}

from flask import Flask, render_template, request, redirect, url_for
import markdown

# Where the textbook PDF is stored; the routes below save the uploaded file here.
filepath = "./tutor_textbook.pdf"
# NOTE(review): not referenced by any route visible in this cell.
ALLOWED_EXTENSIONS = {'txt', 'pdf', 'docx', 'png', 'jpg', 'jpeg', 'gif'}
# The Flask application the routes below are registered on.
app = Flask(__name__)

@app.route("/", methods=["GET", "POST"])
def index():
    """Serve the question form and, on POST, answer the submitted prompt.

    A POST may carry an uploaded ``file`` and/or a ``url`` form field. Either is
    stored at ``filepath``; the URL download runs last, so it wins when both
    are given. The ``prompt`` field is then answered by a freshly built
    ``BaseQAPipeline`` and rendered into ``index.html`` as ``result``.

    Returns:
        The rendered ``index.html`` page, or a plain-text 400 response when the
        textbook URL is not http(s) or cannot be downloaded.
    """
    global url_data, prompt_data  # Access global variables

    if request.method == "POST":
        url_data = request.form.get("url")
        print("URL: ", url_data)

        # Bind `file` up front: it used to be assigned only inside the upload
        # branch, so a POST without a file part crashed at the print below.
        file = request.files.get('file')
        if file is None or not file.filename:
            # Browsers send an empty file part when nothing was chosen; saving
            # it would overwrite the stored textbook with an empty file.
            print('No file uploaded!')
        else:
            file.save(filepath)
            print("File saved:", filepath)

        # `if url_data` also skips a missing field (None), which previously
        # ran the download with the literal text "None".
        if url_data:
            # The URL is untrusted form input. Interpolating it into a shell
            # command allowed command injection, so download with the standard
            # library and accept only http(s) (urlopen would also open file://).
            # NOTE(review): internal hosts are still reachable (SSRF) -- restrict
            # if this is ever exposed beyond a personal notebook.
            import shutil
            from http.client import HTTPException
            from urllib.parse import urlparse
            from urllib.request import urlopen

            if urlparse(url_data).scheme not in ("http", "https"):
                return "The textbook URL must start with http:// or https://.", 400
            try:
                with urlopen(url_data, timeout=60) as response, open(filepath, "wb") as out:
                    shutil.copyfileobj(response, out)
            except (OSError, ValueError, HTTPException):
                # URLError/HTTPError are OSError subclasses. The message is
                # fixed so that no user input is echoed into the response.
                return "Could not download the textbook from the given URL.", 400
        print("File: ", file)
        prompt_data = request.form.get("prompt")
        base_qa_pipeline = BaseQAPipeline()
        result = base_qa_pipeline.invoke({'question' : prompt_data})
        print(result)
        return render_template("index.html", result=result)

    return render_template("index.html")

@app.route('/how-it-works', methods=['GET'])
def how_it_works():
    """Render the how-it-works.html template for GET /how-it-works."""
    page = render_template('how-it-works.html')
    return page

@app.route('/generate-plan', methods=['GET', "POST"])
def generate_plan():
    """Serve the study-plan form and, on POST, generate a plan for the prompt.

    A POST may carry an uploaded ``file``, which is stored at ``filepath``. The
    ``prompt`` field is answered by a freshly built ``GenerateStudyPlan``; its
    ``result`` text is converted from Markdown to HTML before being rendered
    into ``generate-plan.html`` as ``result``.
    """
    if request.method == "POST":
        # Bind `file` up front: it used to be assigned only inside the upload
        # branch, so a POST without a file part crashed at the print below.
        file = request.files.get('file')
        if file is None or not file.filename:
            # Browsers send an empty file part when nothing was chosen; saving
            # it would overwrite the stored textbook with an empty file.
            print('No file uploaded!')
        else:
            file.save(filepath)
            print("File saved:", filepath)
        print("File: ", file)
        prompt_data = request.form.get("prompt")
        # Named differently from the view function so the local does not
        # shadow `generate_plan` inside its own body.
        study_plan_pipeline = GenerateStudyPlan()
        result = study_plan_pipeline.invoke({'question' : prompt_data})
        result['result'] = markdown.markdown(result['result'])
        print(result)
        return render_template("generate-plan.html", result=result)

    return render_template("generate-plan.html")


from google.colab.output import eval_js
# Print what Colab's kernel.proxyPort(5000) returns; the saved output of this
# cell shows a googleusercontent.com URL to open in the browser.
print(eval_js("google.colab.kernel.proxyPort(5000)"))

# Start the Flask development server. app.run() gets no arguments; the saved
# output shows it listening on http://127.0.0.1:5000. This call blocks the cell.
if __name__ == "__main__":
    app.run()
# Requires index.html template to be placed into templates/index.html to work


# Url: the URL of the textbook, which must point to a PDF file. Example URL: https://www.mrbigler.com/downloads/Notes-Physics-1.pdf
# Prompt: the question to ask, entered as plain text. Example prompt: What is momentum?

# For now the response is given as a JSON response, with the question being your prompt and the
# response being the tutor's answer.

# TO USE, CLICK THE LINK SHOWN IN THE OUTPUT BELOW.
# If Colab asks whether to restart the runtime to use the new packages, press CANCEL.


Using pip 24.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
Collecting openai==1.57.0
  Obtaining dependency information for openai==1.57.0 from https://files.pythonhosted.org/packages/ab/2d/eb8539a2d5809eb78508633a8faa8df7745960e99af0388310c43b2c0be1/openai-1.57.0-py3-none-any.whl.metadata
  Using cached openai-1.57.0-py3-none-any.whl.metadata (24 kB)
Collecting anyio<5,>=3.5.0 (from openai==1.57.0)
  Obtaining dependency information for anyio<5,>=3.5.0 from https://files.pythonhosted.org/packages/a0/7a/4daaf3b6c08ad7ceffea4634ec206faeff697526421c20f07628c7372156/anyio-4.7.0-py3-none-any.whl.metadata
  Using cached anyio-4.7.0-py3-none-any.whl.metadata (4.7 kB)
Collecting distro<2,>=1.7.0 (from openai==1.57.0)
  Obtaining dependency information for distro<2,>=1.7.0 from https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl.metadata
  Using cached distro-1.9.0-py3-none-any.whl.metadat

Please enter Open AI KEY
··········
https://e24y6siheu9-496ff2e9c6d22116-5000-colab.googleusercontent.com/
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [05/Jan/2025 03:36:01] "GET /?authuser=0 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [05/Jan/2025 03:36:01] "[33mGET /favicon.ico?authuser=0 HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [05/Jan/2025 03:41:55] "GET /how-it-works?authuser=0 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [05/Jan/2025 03:41:56] "[33mGET /favicon.ico?authuser=0 HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [05/Jan/2025 03:42:14] "GET /?authuser=0 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [05/Jan/2025 03:42:15] "[33mGET /favicon.ico?authuser=0 HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [05/Jan/2025 03:42:31] "GET /?authuser=0 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [05/Jan/2025 03:42:37] "[33mGET /favicon.ico?authuser=0 HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [05/Jan/2025 03:43:36] "GET /generate-plan?authuser=0 HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [05/Jan/2025 03:43:37]

File saved: ./tutor_textbook.pdf
File:  <FileStorage: 'Tipler_Llewellyn.pdf' ('application/pdf')>


  context_from_db = self.vectorstore_retriever.get_relevant_documents("")
INFO:werkzeug:127.0.0.1 - - [05/Jan/2025 03:46:37] "POST /generate-plan?authuser=0 HTTP/1.1" 200 -


{'query': 'I need help on Kinematics 1d', 'context': 'Chat history:\n\n\nContext from the document:\nThis page intentionally left blank\nThis page intentionally left blank\nThis page intentionally left blank\nProblems 141\nNotes\n1. Democritus (about 470 B.C. to about 380 B.C.). Among his\nother modern-sounding ideas were the suggestions that the\nMilky Way is a vast conglomeration of stars and that the\nMoon, like Earth, has mountains and valleys.\n2. G. J. Stoney (1826–1911). An Irish physicist who first\ncalled the fundamental unit of charge the electron. After\nThomson discovered the particle that carried the charge, the\nname was transferred from the quantity of charge to the parti-\ncle itself by Lorentz.\n3. Joseph J. Thomson (1856–1940). English physicist and\ndirector for more than 30 years of the Cavendish Laboratory,\nthe first laboratory in the world established expressly for re-\nsearch in physics. He was awarded the Nobel Prize in 1906 for\nhis work on the electron. Seven

INFO:werkzeug:127.0.0.1 - - [05/Jan/2025 03:46:38] "[33mGET /favicon.ico?authuser=0 HTTP/1.1[0m" 404 -


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')