
Commit

langchain models and 2 get endpoints for the title and answer
andizy committed Oct 29, 2023
1 parent 30f6f3f commit f0181ed
Showing 19 changed files with 236 additions and 17 deletions.
Binary file modified llm_api/app/documents/__pycache__/urls.cpython-311.pyc
Binary file not shown.
Binary file modified llm_api/app/documents/__pycache__/views.cpython-311.pyc
Binary file not shown.
Binary file added llm_api/app/documents/docs/example.pdf
Binary file not shown.
Binary file added llm_api/app/documents/docs/example1.pdf
Binary file not shown.
56 changes: 56 additions & 0 deletions llm_api/app/documents/docs/views.py
@@ -0,0 +1,56 @@
from django.shortcuts import render
from rest_framework.response import Response
from rest_framework.decorators import api_view
from neomodel import db
import requests
import time

# Create your views here.
@api_view(['GET'])
def getData(request):
    # user = db.cypher_query(
    #     '''
    #     MATCH (n:User)
    #     RETURN n
    #     '''
    # )[0]
    # print(user)
    # return Response()

    # Fatjon's logic: fetch data from the API for each category and store it in the graph
    categories = ["SAP", "ServiceNow", "General", "MDE", "Guideline"]
    auth = ("BaselHack2023", "Tc13cspLs!eAve")

    for search_query in categories:
        # Define the API URL with the search query as a parameter
        api_url = f'https://bfgtest.service-now.com/api/besag/search_knowledge_baselhack2023/searchkb/{search_query}'
        # print(api_url)

        # Make a GET request to the API
        response = requests.get(api_url, auth=auth)
        # print(response)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            data = response.json()
            # print(data["result"][0])
            # Store the data in the db: upsert one Document node per result,
            # keyed by its ServiceNow number, and copy the result fields onto it
            for result in data["result"]:
                db.cypher_query(
                    '''
                    MERGE (d:Document {id: $id})
                    SET d += $props, d.context = 'tbd', d.tags = 'tbd'
                    ''',
                    {"id": result["number"], "props": result},
                )
        else:
            # Handle the case where the API request fails
            print("API call failed!")

        # Sleep for the specified duration between requests
        time.sleep(7)

    # A DRF view must return a Response
    return Response()

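For reference, the MERGE/SET pattern above upserts one Document node per ServiceNow record. A minimal sketch of reading those nodes back with neomodel (the label and properties match the ingestion code above; the snippet itself is illustrative, not part of the commit):

from neomodel import db

# List the ingested documents together with the placeholder fields set during ingestion.
rows, meta = db.cypher_query(
    '''
    MATCH (d:Document)
    RETURN d.id, d.context, d.tags
    '''
)
for doc_id, context, tags in rows:
    print(doc_id, context, tags)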
(Three more binary files changed and one empty file added; filenames not shown.)
Binary file added llm_api/app/documents/mydata/chroma.sqlite3
Binary file not shown.
2 changes: 1 addition & 1 deletion llm_api/app/documents/urls.py
@@ -2,6 +2,6 @@
from . import views

urlpatterns = [
-    path('', views.getData),
+    path('', views.load_data),
    # path('post/', views.postData),
]
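With this route in place, the renamed view is triggered by a plain GET. A hedged example, assuming the documents app is mounted at /documents/ on a local dev server (the project-level URLconf is not part of this diff):

import requests

# Kick off PDF loading and vectorization via the renamed endpoint.
resp = requests.get("http://localhost:8000/documents/")
print(resp.status_code)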
143 changes: 134 additions & 9 deletions llm_api/app/documents/views.py
@@ -2,16 +2,141 @@
from rest_framework.response import Response
from rest_framework.decorators import api_view
from neomodel import db
from langchain.document_loaders import PyPDFLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores import Chroma

from bs4 import BeautifulSoup

from django.conf import settings
import os
import requests

# Create your views here.
@api_view(['GET'])
-def getData(request):
-    user = db.cypher_query(
-        '''
-        MATCH (n:User)
-        RETURN n
-        '''
-    )[0]
-    print(user)
-    return Response()
+def load_data(request):
+    vector_db = vectorize_the_context()
+    # get_answer(vector_db)
+    # answer_promt()
+    return Response()



def get_context_tag(text):
    from langchain.text_splitter import CharacterTextSplitter
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=150,
        length_function=len
    )

    # split_documents expects a list of Documents rather than a raw string
    docs = text_splitter.split_documents(text)
    return docs

def vectorize_the_context():
    openai_key = "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"
    from langchain.document_loaders import PyPDFLoader
    directory = os.path.join(settings.BASE_DIR, "documents/docs/")
    # print([os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.pdf')])
    loaders = [PyPDFLoader(os.path.join(directory, f)) for f in os.listdir(directory) if f.endswith('.pdf')]

    # Load the PDFs
    docs = []
    for loader in loaders:
        docs.extend(loader.load())

    # Define the text splitter
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=150
    )

    # Split the documents using the text splitter
    splits = text_splitter.split_documents(docs)

    from langchain.vectorstores import Chroma
    from langchain.embeddings.openai import OpenAIEmbeddings

    embedding = OpenAIEmbeddings(openai_api_key=openai_key)

    persist_directory = os.path.join(settings.BASE_DIR, "mydata")

    # Create the vector store
    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=embedding,
        persist_directory=persist_directory
    )
    return vectordb


def get_answer(vectordb):
    def pretty_print_docs(docs):
        print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))

    openai_key = "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"

    question = "is there an email i can ask for help"

    docs = vectordb.similarity_search(question, k=5)

    # Inspect the retrieved documents
    print(docs)

    # Pretty-print the retrieved chunks
    pretty_print_docs(docs)

    # Persist the database to use it later
    vectordb.persist()

    from langchain.llms import OpenAI
    from langchain.retrievers.self_query.base import SelfQueryRetriever
    from langchain.chains.query_constructor.base import AttributeInfo

    # NOTE: these metadata examples appear to be carried over from the LangChain
    # self-query tutorial and do not match the PDFs in documents/docs/
    metadata_field_info = [
        AttributeInfo(
            name="source",
            description="The lecture the chunk is from, should be one of `docs/cs229_lectures/MachineLearning-Lecture01.pdf`, `docs/cs229_lectures/MachineLearning-Lecture02.pdf`, or `docs/cs229_lectures/MachineLearning-Lecture03.pdf`",
            type="string",
        ),
        AttributeInfo(
            name="page",
            description="The page from the lecture",
            type="integer",
        ),
    ]
    document_content_description = "Lecture notes"
    llm = OpenAI(temperature=0, openai_api_key=openai_key)
    retriever = SelfQueryRetriever.from_llm(
        llm,
        vectordb,
        document_content_description,
        metadata_field_info,
        verbose=True
    )
    question = "what did they say about regression in the third lecture?"
    docs = retriever.get_relevant_documents(question)
    pretty_print_docs(docs)


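Because vectorize_the_context() sets persist_directory, the embeddings survive process restarts and the store can be reopened without re-embedding. A minimal sketch under the same assumptions as above (same key and directory; the count check goes through Chroma's underlying collection):

import os
from django.conf import settings
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

persist_directory = os.path.join(settings.BASE_DIR, "mydata")
embedding = OpenAIEmbeddings(openai_api_key=openai_key)  # openai_key as defined above

# Reopen the persisted store and sanity-check how many chunks it holds.
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
print(vectordb._collection.count())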
Binary file added llm_api/app/mydata/chroma.sqlite3
Binary file not shown.
(Five more binary files changed; filenames not shown.)
52 changes: 45 additions & 7 deletions llm_api/app/querys/views.py
@@ -4,20 +4,58 @@
from neomodel import db


from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from django.conf import settings
from langchain.document_loaders import PyPDFLoader
import os


@api_view(['GET'])
def get_title_(request):
-    # openai_key = settings.openai_key
-    openai_key = ""
-    print(openai_key)
-    from langchain.llms import OpenAI
+    openai_key = "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"

    llm = OpenAI(openai_api_key=openai_key)
-    # question = request.query_params.get('question')
-    question = "Can you find the account details for the user with the name 'John Smith'?"
+    question = request.query_params.get('question')
+
+    # question = "Can you find the account details for the user with the name 'John Smith'?"
    title = llm.predict(f"can you write 10 titles for this question: '{question}'")

    print(title)
    return Response({'title': title})

@api_view(['GET'])
def answer_promt(request):
    openai_key = "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"

    # Load the vector database that was persisted earlier
    from langchain.vectorstores import Chroma
    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.chains import RetrievalQA
    from langchain.prompts import PromptTemplate
    from langchain.chat_models import ChatOpenAI

    persist_directory = os.path.join(settings.BASE_DIR, "mydata")
    embedding = OpenAIEmbeddings(openai_api_key=openai_key)
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
    llm = ChatOpenAI(temperature=0, openai_api_key=openai_key)

    template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that "you don't know, can you provide us more hints"; don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "Thanks" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
    QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

    # Run the chain
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=vectordb.as_retriever(),
        return_source_documents=True,
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
    )
    question = request.query_params.get('question')
    result = qa_chain({"query": question})
    # Check the result of the query
    # print(result["result"])
    # Check the source documents the answer came from
    # print(result["source_documents"][0])

    return Response({'results': result})

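Taken together, the commit exposes two GET endpoints: one that generates candidate titles for a question, and one that answers it against the persisted Chroma store. A hedged usage sketch, assuming get_title_ and answer_promt are routed at /querys/title/ and /querys/answer/ (the querys/urls.py mapping is not shown in this diff):

import requests

params = {"question": "Is there an email I can ask for help?"}
base = "http://localhost:8000/querys"  # hypothetical mount point

print(requests.get(f"{base}/title/", params=params).json())
print(requests.get(f"{base}/answer/", params=params).json())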