<a href="https://colab.research.google.com/github/ghvijayakumar/devops-workshop/blob/main/RAG_book.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install Pillow
%pip install PyPDF2
%pip install openai
%pip install langchain
%pip install faiss-cpu
%pip install tiktoken

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting openai
  Downloading openai-1.3.0-py3-none-any.whl (220 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m220.3/220.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.25.1-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.2-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore->httpx<1,>=0.23.0->openai)
  Downloading h11-0

In [None]:
import os

import PyPDF2
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.azure_openai import AzureOpenAIEmbeddings
from langchain.vectorstores import FAISS
from openai import AzureOpenAI
from langchain.chat_models import AzureChatOpenAI
from langchain.chains.retrieval_qa.base import RetrievalQA

In [None]:
# Azure OpenAI connection settings shared by the cells below.
#
# The API key is read from the environment instead of being committed with the
# notebook. Set AZURE_OPENAI_API_KEY (shell export or your platform's secret
# store) before running this cell; api_key is None when the variable is unset.
# NOTE(review): a key was previously hardcoded on this line -- treat it as
# compromised and rotate it.
api_key = os.environ.get("AZURE_OPENAI_API_KEY")
# The endpoint is not a secret; AZURE_OPENAI_ENDPOINT overrides the original default.
api_base = os.environ.get("AZURE_OPENAI_ENDPOINT", "https://openapiazconf.openai.azure.com/")
api_type = 'azure'
api_version = '2023-10-01-preview'
# Embedding model name and the names of the two deployments used by later cells.
embedding_model_name = 'text-embedding-ada-002'
embedding_deployment_name = 'embedding'
deployment_name = 'azconfmodel'

In [None]:
# Plain Azure OpenAI SDK client built from the connection settings defined earlier.
# NOTE(review): none of the later cells visible in this notebook reference `client`.
client = AzureOpenAI(
    azure_endpoint=api_base,
    api_version=api_version,
    api_key=api_key,
)

In [None]:
def extract_data_from_scanned_pdf(file_path: str) -> str:
    """Return the extractable text of every page of a PDF as a single string.

    Text is obtained by calling ``extract_text()`` on each page of a
    ``PyPDF2.PdfReader``; no OCR step is performed here, so despite the
    function name an image-only scan will presumably yield little or no text.

    Args:
        file_path: Path of the PDF, passed straight to ``PyPDF2.PdfReader``.

    Returns:
        The per-page texts joined with a newline so that the last line of one
        page does not run into the first line of the next. Pages that yield no
        text are skipped; the result is ``""`` when no page yields text.
    """
    reader = PyPDF2.PdfReader(file_path)
    # `or ""` guards against a page returning None instead of a string.
    page_texts = (page.extract_text() or "" for page in reader.pages)
    return "\n".join(text for text in page_texts if text)

In [None]:
def get_data_chunks(data: str, chunk_size: int, chunk_overlap: int = 0, separator: str = "\n") -> list:
    """Split ``data`` into text chunks with a ``CharacterTextSplitter``.

    Args:
        data: The full text to split.
        chunk_size: Target chunk size; measured in characters because the
            splitter is configured with ``length_function=len``.
        chunk_overlap: Overlap passed to the splitter. Defaults to 0, the
            value that was previously hard-coded.
        separator: String the splitter splits on. Defaults to ``"\\n"``, the
            value that was previously hard-coded.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``data``
        (a list of chunk strings).
    """
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separator=separator,
        length_function=len,
    )
    return text_splitter.split_text(data)

In [None]:
# PDF to index. Set the RAG_PDF_PATH environment variable to point at another
# file; when it is unset the original location is used.
file_path = os.environ.get("RAG_PDF_PATH", "/sample.pdf")

In [None]:
print(file_path)
raw_data = extract_data_from_scanned_pdf(file_path)
# Fail fast with a clear message: without any extracted text the chunking,
# embedding and indexing cells below have nothing to work on.
if not raw_data.strip():
    raise ValueError(f"No extractable text found in {file_path!r}")

/sample.pdf


In [None]:
# chunk_size is a character budget (get_data_chunks measures length with len).
# 16 is very small: the chunks printed further down are essentially single lines.
chunk_size = 16
chunks = get_data_chunks(data=raw_data, chunk_size=chunk_size)



In [None]:
# Embedding client for the FAISS index.
# - `model` carries the model name and `azure_deployment` the deployment name;
#   previously the deployment name was passed as `model` and
#   embedding_model_name was never used.
# - `azure_endpoint` replaces `openai_api_base`, matching how the AzureOpenAI
#   client is configured earlier in the notebook.
# - The API version is passed explicitly so this cell does not depend on a
#   library default or an environment variable.
embeddings = AzureOpenAIEmbeddings(
    model=embedding_model_name,
    azure_deployment=embedding_deployment_name,
    azure_endpoint=api_base,
    api_key=api_key,
    openai_api_version=api_version,
    openai_api_type=api_type,
)




In [None]:
# Index only a 10-chunk window (positions 25..34) of the document rather than
# every chunk -- presumably to keep the number of embedding calls small for the
# demo. An empty list results if the document produced 25 chunks or fewer.
test_chunks = chunks[25:35]
# Show the selected chunks so the retrieval results can be checked by eye.
print(test_chunks)

['21. WHERE TO SEND MATERIALS NOT SUBMITTED ELECTRONICALLY', '22. TECHNICAL PROBLEMS WITH THE APPLICATION', '1. APPLICATION DEADLINE', 'Your application and all supporting documents must be received by the Harvard Law School Graduate Program office no later', 'than 11:59 p.m. U.S. Eastern time on December 1 (and preferably by November 15).', 'Please note that the application deadline of December 1 will be strictly observed. It is your responsibility to make', 'certain that all supporting materials (e.g., un official transcript, recommendations, TOEFL report) reach the Graduate', 'Program office by the deadline.', 'Please do not call or email us about the possibility of submitting materials after the deadline. If you have completed and', 'submitted your online application by the deadline, we will not disqualify your application if some of your supporting documents (e.g., unofficial transcript, recommendations , TOEFL report) have not been received by the deadline. However, we will begin

In [None]:
# Embed the selected chunks and load them into a FAISS vector store.
knowledge_hub = FAISS.from_texts(texts=test_chunks, embedding=embeddings)



In [None]:
# Retriever over the vector store: similarity search returning the top 2 chunks.
retriever = knowledge_hub.as_retriever(
    search_kwargs=dict(k=2),
    search_type="similarity",
)

In [None]:
# Chat model used to answer questions over the retrieved chunks.
# `azure_endpoint` replaces `openai_api_base` so the endpoint is supplied the
# same way as for the AzureOpenAI client earlier in the notebook.
llm = AzureChatOpenAI(
    deployment_name=deployment_name,
    azure_endpoint=api_base,
    openai_api_key=api_key,
    openai_api_type=api_type,
    openai_api_version=api_version,
    temperature=0.3,  # low temperature
)



In [None]:
# Retrieval-QA chain: the retriever supplies chunks and the chat model answers.
# return_source_documents=True keeps the retrieved chunks in the result.
chain_type = 'stuff'
chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type=chain_type,
    return_source_documents=True,
)

In [None]:

# Run one query through the chain. The printed dict holds the query, the
# generated answer under 'result', and the retrieved 'source_documents'.
question = 'all supporting materials'
result = chain(dict(query=question))
print(result)



{'query': 'all supporting materials', 'result': 'Based on the given context, it is not clear where to send all supporting materials that were not submitted electronically. It is recommended to check the specific instructions provided by the Graduate program or to contact them directly for clarification.', 'source_documents': [Document(page_content='certain that all supporting materials (e.g., un official transcript, recommendations, TOEFL report) reach the Graduate'), Document(page_content='21. WHERE TO SEND MATERIALS NOT SUBMITTED ELECTRONICALLY')]}
