In [1]:
# Install every dependency of this notebook up front so that a fresh runtime
# can execute all cells from top to bottom.
# %pip (instead of !pip) installs into the environment of the running kernel.
# langchain is pinned to the release the notebook was executed with: the import
# paths used below (langchain.document_loaders, langchain.llms, ...) belong to
# that API generation.
%pip install langchain==0.0.348
%pip install huggingface_hub

# Embeddings, vector store and PDF parsing
%pip install sentence_transformers
%pip install faiss-cpu
%pip install "unstructured[all-docs]"

# Required by the load_in_8bit=True model load further down
%pip install accelerate bitsandbytes

Collecting langchain
  Downloading langchain-0.0.348-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.3-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-core<0.1,>=0.0.12 (from langchain)
  Downloading langchain_core-0.0.12-py3-none-any.whl (181 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.5/181.5 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langsmith<0.1.0,>=0.0.63 (from langchain)
  Downloading langsmith-0.0.69-py3-none-any.whl (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain)
  Downloading m

In [1]:
import os
from langchain.document_loaders import UnstructuredPDFLoader


In [2]:
def loadPDFFromLocal(pdf_file_path="/content/Building+Continuous+Learning+Infrastructure.pdf"):
    """Read a PDF from the local filesystem with ``UnstructuredPDFLoader``.

    Args:
        pdf_file_path: Location of the PDF to read. The default points at the
            sample document under ``/content``.

    Returns:
        Whatever ``UnstructuredPDFLoader(...).load()`` produces for the file
        (presumably a list of LangChain documents -- confirm against the
        loader's documentation).
    """
    return UnstructuredPDFLoader(pdf_file_path).load()

In [3]:
# Split doc into chunks
from langchain.text_splitter import CharacterTextSplitter
def splitDocument(loaded_docs, chunk_size=512, chunk_overlap=50):
    """Split loaded documents into smaller, overlapping chunks.

    Args:
        loaded_docs: Documents to split, as accepted by
            ``CharacterTextSplitter.split_documents``.
        chunk_size: Target chunk size handed to ``CharacterTextSplitter``.
            Defaults to 512, the value this notebook was run with.
        chunk_overlap: Overlap between consecutive chunks handed to
            ``CharacterTextSplitter``. Defaults to 50.

    Returns:
        The chunked documents returned by ``split_documents``.
    """
    # Sizes are parameters so that callers can shrink the chunks when the
    # retrieved context no longer fits into the LLM's input window.
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(loaded_docs)

In [4]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [5]:
def createEmbeddings(chunked_docs, embedder=None):
    """Embed document chunks and index them in a FAISS vector store.

    Args:
        chunked_docs: Non-empty collection of document chunks to index.
        embedder: Optional embedding model to use. When omitted, a default
            ``HuggingFaceEmbeddings()`` instance is created, which is what
            this function always did before the parameter existed.

    Returns:
        The vector store built by ``FAISS.from_documents``.

    Raises:
        ValueError: If ``chunked_docs`` is empty or ``None``.
    """
    # Fail early with a clear message: an empty collection would otherwise
    # only blow up somewhere inside the vector-store construction.
    if not chunked_docs:
        raise ValueError(
            "createEmbeddings() needs at least one document chunk, got an empty collection"
        )
    if embedder is None:
        embedder = HuggingFaceEmbeddings()
    return FAISS.from_documents(chunked_docs, embedder)

In [15]:
from langchain.llms import HuggingFacePipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM

# Checkpoint on the Hugging Face Hub that serves as the local question-answering LLM.
model_id = 'declare-lab/flan-alpaca-large'# go for a smaller model if you don't have the VRAM
tokenizer = AutoTokenizer.from_pretrained(model_id)
# load_in_8bit=True requests 8-bit weights.
# NOTE(review): this presumably relies on the `accelerate` and `bitsandbytes`
# packages, which are only installed in cells further down the notebook --
# confirm they are installed before this cell runs on a fresh runtime.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True)

# Wrap model + tokenizer in a transformers "text2text-generation" pipeline.
# max_length=512 is forwarded to the pipeline (presumably capping the
# generated sequence length -- confirm).
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512
)

# LangChain-facing wrapper around the pipeline; loadLLMModel() builds its QA chain on it.
local_llm = HuggingFacePipeline(pipeline=pipe)


tokenizer_config.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [16]:
# Smoke test: send a plain question straight to the local LLM, without any retrieval step.
print(local_llm('What is the capital of France? '))


The capital of France is Paris.


In [17]:
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
def loadLLMModel(llm=None, chain_type="stuff"):
    """Build the LangChain question-answering chain.

    Args:
        llm: LLM the chain should call. When omitted, the notebook-level
            ``local_llm`` (the wrapped Hugging Face pipeline) is used, which
            is what this function always did before the parameter existed.
            Any other LangChain LLM, e.g. a hosted ``HuggingFaceHub`` model,
            can be passed instead.
        chain_type: Chain type forwarded to ``load_qa_chain``. Defaults to
            ``"stuff"``.

    Returns:
        The chain object created by ``load_qa_chain``.
    """
    if llm is None:
        # Resolved at call time so the model-loading cell only has to run
        # before this function is called, not before it is defined.
        llm = local_llm
    return load_qa_chain(llm, chain_type=chain_type)

def askQuestions(vector_store, chain, question, k=4):
    """Answer a question from the documents indexed in ``vector_store``.

    Args:
        vector_store: Vector store exposing ``similarity_search(query, k=...)``.
        chain: QA chain exposing ``run(input_documents=..., question=...)``.
        question: Natural-language question to answer.
        k: Number of chunks to retrieve and hand to the chain. The default of
            4 is the default LangChain vector stores use for
            ``similarity_search``, so existing calls behave as before. Lower
            it when the combined prompt exceeds the model's input window.

    Returns:
        The chain's response for the question.
    """
    # Retrieve the k chunks most similar to the question ...
    similar_docs = vector_store.similarity_search(question, k=k)
    # ... and let the chain answer using only those chunks as context.
    return chain.run(input_documents=similar_docs, question=question)

In [18]:
# Build the question-answering chain once; the question cells below all reuse it.
chain = loadLLMModel()

In [19]:
# Load the PDF from loadPDFFromLocal's default path.
PDF_loaded_docs = loadPDFFromLocal()

In [20]:
# Split the loaded documents into chunks that can be embedded individually.
PDF_chunked_docs = splitDocument(PDF_loaded_docs)



In [21]:
# Embed the chunks and index them in a FAISS vector store for similarity search.
PDF_vector_store = createEmbeddings(PDF_chunked_docs)

In [None]:
# Testing: ask sample questions against the indexed PDF

In [22]:
# Sample question: which pipelines belong to the ml_pipeline category?
# NOTE(review): the query text contains typos ("pipelunes", "two ml_category");
# it is kept verbatim so that it still matches the recorded answer.
PDF_response = askQuestions(
    vector_store=PDF_vector_store,
    chain=chain,
    question="what are the two pipelunes in two ml_category?",
)
print(PDF_response)

The model building pipeline and inference pipeline are the two pipelines present in the ml_pipeline category.


In [23]:
# Sample question: what is Streamlit used for?
PDF_response = askQuestions(
    vector_store=PDF_vector_store,
    chain=chain,
    question="what is the purpose of streamlit?",
)
print(PDF_response)

The purpose of streamlit is to build web apps for Machine Learning and Data Science tasks.


In [24]:
# Sample question: which port should the MLflow server use?
# NOTE(review): in the recorded run this query produced a tokenizer warning
# (552 > 512 tokens), i.e. the retrieved context plus prompt exceeded the
# model's nominal input length. The query typo ("servermshould") is kept
# verbatim so that it still matches the recorded answer.
PDF_response = askQuestions(
    vector_store=PDF_vector_store,
    chain=chain,
    question="on what port mlflow servermshould be run?",
)
print(PDF_response)

Token indices sequence length is longer than the specified maximum sequence length for this model (552 > 512). Running this sequence through the model will result in indexing errors


The mlflow server should be run on the following port: 6008.


In [27]:
# Sample question: behaviour for a drift level between 10 and 20.
# NOTE(review): the recorded answer talks about the 20-30 range instead of the
# 10-20 range that was asked about -- verify the retrieved chunks before
# trusting this answer.
PDF_response = askQuestions(
    vector_store=PDF_vector_store,
    chain=chain,
    question="what happens if drift level between 10 and 20 ?",
)
print(PDF_response)

If the level of the drift is between 20 and 30 (secondary and tertiary thresholds), the model's performance will worsen further. However, it is still not advisable to throw away the deployed model. Rather than training a new model, you can tune the model on the new data to learn the data distribution better.


In [20]:
# Explicit %pip magic instead of relying on automagic.
# NOTE(review): the recorded run resolved pikepdf 8.9.0, replaced Pillow 9.4.0
# with 10.1.0 and reported a dependency conflict.
%pip install pikepdf

Collecting pikepdf
  Downloading pikepdf-8.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Pillow>=10.0.1 (from pikepdf)
  Downloading Pillow-10.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Deprecated (from pikepdf)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Installing collected packages: Pillow, Deprecated, pikepdf
  Attempting uninstall: Pillow
    Found existing installation: Pillow 9.4.0
    Uninstalling Pillow-9.4.0:
      Successfully uninstalled Pillow-9.4.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
imageio 2.31.6 requires pillo

In [11]:
# Explicit %pip magic instead of relying on automagic (recorded run installed PyPDF2 3.0.1).
%pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [17]:
# %pip installs into the environment of the running kernel, which a shell-level
# !pip is not guaranteed to do (recorded run installed pypdf 3.17.2).
%pip install pypdf



Collecting pypdf
  Downloading pypdf-3.17.2-py3-none-any.whl (277 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.9/277.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-3.17.2


In [19]:
# Explicit %pip magic instead of relying on automagic (recorded run built pdfminer 20191125).
# NOTE(review): `pdfminer.six` is installed in a later cell as well; both
# distributions appear to provide a `pdfminer` package -- confirm whether this
# legacy one is needed at all.
%pip install pdfminer

Collecting pdfminer
  Downloading pdfminer-20191125.tar.gz (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycryptodome (from pdfminer)
  Downloading pycryptodome-3.19.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pdfminer
  Building wheel for pdfminer (setup.py) ... [?25l[?25hdone
  Created wheel for pdfminer: filename=pdfminer-20191125-py3-none-any.whl size=6140061 sha256=042ce4a8ebf0dcba519c991f66c3e0e65f9079791cc485675b87a6687b8d799b
  Stored in directory: /root/.cache/pip/wheels/4e/c1/68/f7bd0a8f514661f76b5cbe3b5f76e0033d79f1296012cbbf72
Successfully built pdfminer
Installing collected packages: pycryptodome, pdfminer
Successfully installed pdfmine

In [21]:
# Explicit %pip magic instead of relying on automagic (recorded run installed pdfminer.six 20221105).
%pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pdfminer.six
Successfully installed pdfminer.six-20221105


In [17]:
import locale

# Force the preferred encoding reported by `locale` to UTF-8 (presumably a
# workaround for shell/pip commands failing on a non-UTF-8 locale -- confirm).
# The replacement keeps the optional `do_setlocale` argument of the real
# function so that callers using `locale.getpreferredencoding(False)` keep working.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [18]:
# Explicit %pip magic instead of relying on automagic (recorded run installed pdf2image 1.16.3).
%pip install pdf2image

Collecting pdf2image
  Downloading pdf2image-1.16.3-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.16.3


In [10]:
# Explicit %pip magic instead of relying on automagic (recorded run installed accelerate 0.25.0).
# NOTE(review): this cell sits below the model-loading cell that uses
# load_in_8bit=True, so on a fresh top-to-bottom run it executes too late.
%pip install accelerate

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [7]:
# Explicit %pip magic instead of relying on automagic (recorded run installed bitsandbytes 0.41.3.post1).
# NOTE(review): this cell sits below the model-loading cell that uses
# load_in_8bit=True, so on a fresh top-to-bottom run it executes too late.
%pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.41.3.post1-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.41.3.post1


In [11]:
# Explicit %pip magic instead of relying on automagic.
# NOTE(review): `unstructured` is already installed by the first cell of the notebook.
%pip install unstructured




In [13]:
# %pip installs into the environment of the running kernel, which a shell-level
# !pip is not guaranteed to do. The all-docs extra pulls in the optional
# document-parsing dependencies of `unstructured`.
%pip install "unstructured[all-docs]"

Collecting python-docx>=1.1.0 (from unstructured[all-docs])
  Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.6/239.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting onnx (from unstructured[all-docs])
  Downloading onnx-1.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
Collecting msg-parser (from unstructured[all-docs])
  Downloading msg_parser-1.2.0-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.8/101.8 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Collecting unstructured-inference==0.7.15 (from unstructured[all-docs])
  Downloading unstructured_inference-0.7.15-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
C