# Data Wrangling

In [None]:
# Using SDK targeting 2024-02-29-preview or 2023-10-31-preview, make sure your resource is in one of these regions: East US, West US2, West Europe
!pip install azure-ai-documentintelligence==1.0.0b1
!pip install langchain langchain-community azure-ai-documentintelligence

In [None]:
from azure.ai.documentintelligence import DocumentIntelligenceClient
from google.colab import userdata
import os

# Azure AI Document Intelligence resource; the API key is read from Colab's secret store.
endpoint = "https://yozu-doc-intel.cognitiveservices.azure.com/"
key = str(userdata.get("DOC_INT_API_KEY"))

from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

# Split the document into chunks based on markdown headers.
# The splitter does not depend on the file being processed, so it is built once here.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Flat list of every header-delimited chunk from every PDF in pdf_folder.
chunked_data = []

pdf_folder = "/content/splitpdf"
# os.listdir returns entries in arbitrary order; sort so the chunk order is the same on every run.
filelist = sorted(os.listdir(pdf_folder))
for filename in filelist:
  print("Filename:", filename)
  if not filename.endswith(".pdf"):  # Skip anything that is not a PDF (e.g. .ipynb_checkpoints)
    continue
  pdf_path = os.path.join(pdf_folder, filename)
  # Initiate Azure AI Document Intelligence to load the document. You can either specify file_path or url_path to load the document.
  loader = AzureAIDocumentIntelligenceLoader(file_path=pdf_path, api_key = key, api_endpoint = endpoint, api_model="prebuilt-layout")
  docs = loader.load()
  if not docs:
    # Nothing was extracted from this file; indexing docs[0] below would raise IndexError.
    continue

  # Only docs[0] is read here, as in the original cell.
  # NOTE(review): assumes the loader returns the whole PDF as a single document - confirm.
  docs_string = docs[0].page_content
  splits = text_splitter.split_text(docs_string)
  # Keep every chunk of the file. Appending only splits[0] silently dropped all
  # sections after the first header. extend() is a no-op when splits is empty.
  chunked_data.extend(splits)

chunked_data

Filename: wren_and_martin_simplified_textbook-19.pdf
Filename: wren_and_martin_simplified_textbook-21.pdf
Filename: wren_and_martin_simplified_textbook-20.pdf
Filename: wren_and_martin_simplified_textbook-6.pdf
Filename: wren_and_martin_simplified_textbook-39.pdf
Filename: wren_and_martin_simplified_textbook-13.pdf
Filename: wren_and_martin_simplified_textbook-4.pdf
Filename: wren_and_martin_simplified_textbook-33.pdf
Filename: wren_and_martin_simplified_textbook-18.pdf
Filename: wren_and_martin_simplified_textbook-2.pdf
Filename: wren_and_martin_simplified_textbook-28.pdf
Filename: wren_and_martin_simplified_textbook-9.pdf
Filename: wren_and_martin_simplified_textbook-48.pdf
Filename: wren_and_martin_simplified_textbook-50.pdf
Filename: wren_and_martin_simplified_textbook-7.pdf
Filename: wren_and_martin_simplified_textbook-37.pdf
Filename: wren_and_martin_simplified_textbook-45.pdf
Filename: .ipynb_checkpoints
Filename: wren_and_martin_simplified_textbook-31.pdf
Filename: wren_and_mar

[Document(page_content='caption, rascal, patriot, glutton. Page 8  \nCHAPTER 6\n===  \nTHE NOUN: GENDER  \n25\\. You know that living beings are of either the male or the female sex. Now compare the words in the following pairs: Boy (Lion, Hero, Cock-sparrow) Girl (Lioness, Heroine, Hen-sparrow)  \nWhat do you notice? The first word of each pair is the name of a male animal. The second word of each pair is the name of a female animal. A noun that denotes a male animal is said to be of the Masculine Gender. [Gender comes from Latin genus, kind or sort.] A noun that denotes a female animal is said to be of the Feminine Gender.  \n26\\. A noun that denotes either a male or a female is said to be of the Common Gender; as Parent, child, friend, pupil, servant, thief, relation, enemy, cousin, person, orphan, student, baby, monarch, neighbour, infant.  \n27\\. A noun that denotes a thing that is neither male nor female (i.e., thing without life) is said to be of the Neuter Gender; as, Book, p

In [None]:
# Disabled code: flattens chunked_data into a single `chunks` list and reports its length.
# NOTE(review): the nested loop treats every element of chunked_data as an iterable of
# chunks, i.e. it expects one list of splits per PDF - confirm before re-enabling.
# len(chunked_data)
# chunks = []
# for c in chunked_data:
#   for sc in c:
#     chunks.append(sc)
# len(chunks)

150

# LangChain

In [None]:
# Packages for the retrieval chain: LangChain, its OpenAI bindings, the CPU build of FAISS and tiktoken.
%pip install --upgrade --quiet  langchain langchain-openai faiss-cpu tiktoken
# azure-storage-blob is only referenced by the commented-out blob download cell further down.
%pip install azure-storage-blob

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m80.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.1/289.1 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.7/113.7 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.8/292.8 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━

In [None]:
from google.colab import userdata
from operator import itemgetter
import os, pickle
from pprint import pprint
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnableLambda , RunnablePassthrough
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.llms import Ollama
# Copy the Colab secret named "test_API_KEY" into the OPENAI_API_KEY environment variable
# (presumably where ChatOpenAI / OpenAIEmbeddings look for the key - confirm).
os.environ["OPENAI_API_KEY"] = str(userdata.get("test_API_KEY"))
# Conversation history buffer, starts empty.
# NOTE(review): no visible cell appends to this list - confirm it is still needed.
chat_history = []

In [None]:
# Embed the chunks with OpenAI embeddings and build a FAISS index over them.
vectorstore = FAISS.from_documents(documents=chunked_data, embedding=OpenAIEmbeddings())
# Persist the index to the "dp_pbl_vectorstore" folder; a later cell reloads it from there.
vectorstore.save_local(folder_path="dp_pbl_vectorstore")

In [None]:
# Disabled code: lists the blobs in the "ncert-extraction-storage" container and downloads
# everything under the "NCERT_IX_C1/vectorstore/" prefix into a local "vectorstore" folder.
# NOTE(review): the container and prefix names do not match the "dp_pbl_vectorstore" folder
# used by the other cells - confirm whether this cell belongs in this notebook.
# from azure.storage.blob import BlobServiceClient

# CONN_STR = str(userdata.get("AZURE_STORAGE_CONNECTION_STRING"))

# blob_service_client = BlobServiceClient.from_connection_string(conn_str=CONN_STR)
# container_client = blob_service_client.get_container_client("ncert-extraction-storage")
# print("\nListing blobs...")

# # List the blobs in the container
# blob_list = container_client.list_blobs()
# for blob in blob_list:
#     print("\t" + blob.name)

# try:
#     # Create the local vectorstore folder (if it doesn't exist)
#     os.makedirs("vectorstore", exist_ok=True)

#     for blob in container_client.list_blobs(prefix="NCERT_IX_C1/vectorstore/"):
#         # Extract the filename from the blob name (assuming simple structure)
#         filename = blob.name.split("/")[-1]
#         local_path = os.path.join("vectorstore", filename)  # Path within "vectorstore" folder

#         blob_client = container_client.get_blob_client(blob.name)
#         with open(local_path, "wb") as download_file:
#             download_file.write(blob_client.download_blob().readall())
# except Exception as ex:
#   # Handle download exceptions
#   print(f"Error downloading vectorstore files: {ex}")

In [None]:
# NOTE(review): allow_dangerous_deserialization=True lets FAISS.load_local read pickled data
# from the folder. Only load a "dp_pbl_vectorstore" folder that this notebook wrote itself.
vectorstore = FAISS.load_local("dp_pbl_vectorstore", OpenAIEmbeddings(), allow_dangerous_deserialization=True)

retriever = vectorstore.as_retriever()

# The human turn carries the retrieved context followed by the student's question.
human_template = """Context: {context}
Question: {question}
"""
system_prompt = """
As a helpful and cheerful assistant dedicated to teaching grammar to students, your role is to strictly answer grammar-related queries. If a student makes a grammatical error in their query, you will correct it and provide an explanation of what is incorrect.
"""

system_message_prompt = SystemMessagePromptTemplate.from_template(system_prompt)
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt = ChatPromptTemplate.from_messages(
   [system_message_prompt, human_message_prompt]
)

pprint(chat_prompt)

model = ChatOpenAI()


def _format_docs(docs):
    """Join the page_content of the retrieved documents into one plain-text context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# Chain: {"question": ...} -> retrieve context -> fill prompt -> chat model -> plain string.
chain = (
   {
       # The retriever returns Document objects. Format them to text so the prompt gets
       # the passages themselves rather than the repr of a list of Document objects.
       "context": itemgetter("question") | retriever | RunnableLambda(_format_docs),
       "question": itemgetter("question"),
   }
   | chat_prompt
   | model
   | StrOutputParser()
)

ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='\nAs a helpful and cheerful assistant dedicated to teaching grammar to students, your role is to strictly answer grammar-related queries. If a student makes a grammatical error in their query, you will correct it and provide an explanation of what is incorrect.\n')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='Context: {context}\nQuestion: {question}\n'))])


In [None]:
# Retrieval sanity check: query the FAISS index directly, bypassing the prompt and chat model,
# and display the documents it returns for this question.
vectorstore.similarity_search("Explain 2nd question in fill the blank spaces with nearest or next")

[Document(page_content="(c) Fill the blank spaces with 'oldest' and 'eldest' :-  \n1\\. Rustam is the --- of my uncle's five sons.  \n2\\. He is the --- member of the School Committee.  \n3\\. That is Antonio, the duke's --- son.  \n4\\. The --- mosque in the town is near the railway station.  \n5\\. Mr. Smith is the --- teacher in the school.  \n(d) Fill the blank spaces with 'farther' or 'further' :-  \n1\\. I can't walk any ---.  \n2\\. No --- reasons were given.  \n3\\. He walked off without --- ceremony.  \n4\\. Until --- orders Mr. K.S. Dave will act as Headmaster of Nira High School. 5. To let, a bungalow at Ridge Road. For --- particulars apply to Box. No. 65. Page 29  \n(e) Fill the blank spaces with 'latest' or 'last' :-  \n1\\. The --- news from China is very disquieting.  \n2\\. The --- time I saw him, he was in high spirits.  \n3\\. To-day is the --- day for receiving lenders.  \n4\\. We expect to get the --- news in a few hours.  \n5\\. The --- Moghul Emperor came to an i

In [None]:
# The question to ask; the trailing "@param" comment is a Colab form annotation and must stay on this line.
query = "Explain 2nd question in fill the blank spaces with nearest or next and give the answer" # @param {type:"string"}
# The chain expects a dict with a "question" key and returns the model's reply as a string.
output = chain.invoke({"question": str(query)})
print(output)

The correction in your question is: "Explain the 2nd question in 'Fill the blank spaces with 'nearest' or 'next' and provide the answer."

Explanation: The ordinal number "2nd" should be written as "2nd" with the letters "nd" following the number "2" to indicate its position in a sequence.

Answer: The 2nd question in the 'Fill the blank spaces with 'nearest' or 'next' is "The pillar-box is --- to my house." The correct word to fill in the blank is "nearest."


In [None]:
# Pretty-print the conversation history list.
# NOTE(review): chat_history is initialised to [] and no visible cell appends to it, so the
# recorded output under this cell presumably comes from a different session - confirm.
pprint(chat_history)

[HumanMessage(content='Can you explain the topic states of matter ?'),
 AIMessage(content="Of course! Let's talk about states of matter. Matter can exist in three main states: solid, liquid, and gas. \n\nSolid: In a solid, the particles are packed tightly together and vibrate in place. Think of a solid like a group of people standing shoulder to shoulder in a line and barely moving.\n\nLiquid: In a liquid, the particles are close together but can move around each other. Imagine a liquid like a group of people mingling and moving around at a party.\n\nGas: In a gas, the particles are far apart and move freely. Picture a gas like a group of people spreading out and freely moving around in a large open space.\n\nDo you have any examples of solids, liquids, and gases that you can think of in your daily life? Let's discuss further to see if you can identify more examples!")]


# Ollama

In [None]:
!curl -fsSL https://ollama.com/install.sh | sh
!pip install ollama

>>> Downloading ollama...
############################################################################################# 100.0%
>>> Installing ollama to /usr/local/bin...
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
Collecting ollama
  Downloading ollama-0.1.8-py3-none-any.whl (9.4 kB)
Installing collected packages: ollama
Successfully installed ollama-0.1.8


ConnectError: [Errno 111] Connection refused

In [None]:
!ollama serve &

time=2024-04-11T17:29:55.146Z level=INFO source=images.go:804 msg="total blobs: 0"
time=2024-04-11T17:29:55.147Z level=INFO source=images.go:811 msg="total unused blobs removed: 0"
time=2024-04-11T17:29:55.147Z level=INFO source=routes.go:1118 msg="Listening on 127.0.0.1:11434 (version 0.1.31)"
time=2024-04-11T17:29:55.148Z level=INFO source=payload_common.go:113 msg="Extracting dynamic libraries to /tmp/ollama2035018923/runners ..."
time=2024-04-11T17:30:01.531Z level=INFO source=payload_common.go:140 msg="Dynamic LLM libraries [cpu_avx2 rocm_v60000 cuda_v11 cpu cpu_avx]"
time=2024-04-11T17:30:01.531Z level=INFO source=gpu.go:115 msg="Detecting GPU type"
time=2024-04-11T17:30:01.531Z level=INFO source=gpu.go:265 msg="Searching for GPU management library libcudart.so*"
time=2024-04-11T17:30:01.536Z level=INFO source=gpu.go:311 msg="Discovered GPU libraries: [/tmp/ollama2035018923/runners/cuda_v11/libcudart.so.11.0 /usr/local/cuda/lib64/libcudart.so.12.2.140]"
time=2024-04-11T17:30:01.5

In [None]:
import socket
import time

import ollama


def _wait_for_ollama(host="127.0.0.1", port=11434, timeout_s=60.0, poll_s=0.5):
  """Block until the Ollama server accepts TCP connections.

  The server is started in the background by another cell, so it may not be listening yet
  when this cell runs; pulling too early fails with "Connection refused".

  Args:
    host: Address the server listens on (the install log reports 127.0.0.1).
    port: Port the server listens on (the install log reports 11434).
    timeout_s: Maximum number of seconds to wait overall.
    poll_s: Seconds to wait between connection attempts.

  Raises:
    TimeoutError: If no connection could be made within timeout_s seconds.
  """
  deadline = time.monotonic() + timeout_s
  while True:
    try:
      with socket.create_connection((host, port), timeout=poll_s):
        return
    except OSError:
      # Connection refused or timed out: the server is not up yet.
      if time.monotonic() >= deadline:
        raise TimeoutError(
            f"Ollama server not reachable at {host}:{port} after {timeout_s} seconds"
        ) from None
      time.sleep(poll_s)


# Make sure the server is reachable, then download the "phi" model.
_wait_for_ollama()
ollama.pull("phi")