In [6]:
pip install -r ./requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


In [7]:
pip show langchain

Name: langchain
Version: 0.3.19
Summary: Building applications with LLMs through composability
Home-page: 
Author: 
Author-email: 
License: MIT
Location: C:\Users\sivak\AppData\Local\Programs\Python\Python313\Lib\site-packages
Requires: aiohttp, langchain-core, langchain-text-splitters, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: jupyter_ai_magics, langchain-community
Note: you may need to restart the kernel to use updated packages.


### Python.env

In [1]:
import os
from dotenv import load_dotenv, find_dotenv

# loading the API Keys from .env
load_dotenv(find_dotenv(), override=True)

os.environ.get('OPENAI_API_KEY')

'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'

In [9]:
# loading PDF, DOCX and TXT files as LangChain Documents
def load_document(file):
    import os
    name, extension = os.path.splitext(file)

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        print(f'Loading {file}')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        print(f'Loading {file}')
        loader = Docx2txtLoader(file)
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader
        loader = TextLoader(file)
    else:
        print('Document format is not supported!')
        return None

    data = loader.load()
    return data

In [19]:
# wikipedia
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    from langchain.document_loaders import WikipediaLoader
    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    data = loader.load()
    return data

In [13]:
data = load_document('files/us_constitution.pdf')
#print(data[1].page_content)
# print(data[10].metadata)

print(f'You have {len(data)} pages in your data')
print(f'There are {len(data[20].page_content)} characters in the page')

Loading files/us_constitution.pdf
You have 41 pages in your data
There are 1173 characters in the page


In [26]:
#data = load_document('files/the_great_gatsby.docx')
#print(data[0].page_content)

In [21]:
pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (pyproject.toml): started
  Building wheel for wikipedia (pyproject.toml): finished with status 'done'
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11785 sha256=da4d21508b85b78ce31cee51ca50d43ee2909c52b696f4a1bc95b918fea40fb8
  Stored in directory: c:\users\sivak\appdata\local\pip\cache\wheels\79\1d\c8\b64e19423cc5a2a339450ea5d145e7c8eb3d4aa2b150cde33b
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0
Note: you may need to rest

In [23]:
#data = load_from_wikipedia('GPT-4', 'de')
#print(data[0].page_content)

OpenAI, Inc. ist ein US-amerikanisches nicht-börsennotiertes Softwareunternehmen, das sich seit Ende 2015 mit der Erforschung von künstlicher Intelligenz (KI, englisch Artificial Intelligence, AI) beschäftigt. Anfänglich war das Ziel von OpenAI, künstliche Intelligenz auf Open-Source-Basis zu entwickeln. Das Unternehmen wurde vorerst als Non-Profit geführt. 2019 wurde die gewinnorientierte Tochtergesellschaft OpenAI Global, LLC gegründet, in der Microsoft größter Investor ist. OpenAI ist vor allem bekannt für die Entwicklung der generativen vortrainierten Transformer (GPT) – auch generative künstliche Intelligenz, kurz GenAI, bezeichnet – und der daraus abgeleiteten Softwareprodukte wie ChatGPT oder DALL-E.


== Geschichte ==


=== Gründungsphase und Mission ===
Der Gründung von OpenAI im Jahr 2015 ging bereits eine lange Debatte um die Risiken von KI voraus. Die Wissenschaftler Stephen Hawking und Stuart Jonathan Russell etwa hatten Befürchtungen geäußert, wenn künstliche Intelligenz 

### Chunking Data

In [14]:
def chunk_data(data, chunk_size=256):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    chunks = text_splitter.split_documents(data)
    return chunks

In [15]:
def print_embedding_cost(texts):
    import tiktoken
    enc = tiktoken.encoding_for_model('text-embedding-3-small')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    # check prices here: https://openai.com/pricing
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.00002:.6f}')

In [16]:
chunks = chunk_data(data)
print(len(chunks))
print(chunks[10].page_content)

224
Maryland six, V irginia ten, North Carolina five, South Carolina five, and 
 Georgia three. 
 When vacancies happen in the Representation from any State, the 
 Executive Authority thereof shall issue W rits of Election to fill such 
 V acancies.


In [17]:
print_embedding_cost(chunks)

Total Tokens: 9842
Embedding Cost in USD: 0.000197


### Embedding and Uploading to a Vector Database (Pinecone)

In [35]:
pip install transformers torch

Note: you may need to restart the kernel to use updated packages.


In [55]:
def insert_or_fetch_embeddings(index_name, chunks):
    # importing the necessary libraries and initializing the Pinecone client
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import ServerlessSpec
    from langchain_community.embeddings import HuggingFaceEmbeddings

  

    pc = pinecone.Pinecone()

   
    #embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)  # 512 works as well

    embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
        )


    
    # loading from existing index
    if index_name in pc.list_indexes().names():
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        # creating the index and embedding the chunks into the index 
        print(f'Creating index {index_name} and embeddings ...', end='')

        # creating a new index
        pc.create_index(
            name=index_name,
            dimension=384,
            metric='cosine',
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
        ) 
        )

        # processing the input documents, generating embeddings using the provided `OpenAIEmbeddings` instance,
        # inserting the embeddings into the index and returning a new Pinecone vector store object. 
        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')
        
    return vector_store

In [56]:
def delete_pinecone_index(index_name='all'):
    import pinecone
    pc = pinecone.Pinecone()
    
    if index_name == 'all':
        indexes = pc.list_indexes().names()
        print('Deleting all indexes ... ')
        for index in indexes:
            pc.delete_index(index)
        print('Ok')
    else:
        print(f'Deleting index {index_name} ...', end='')
        pc.delete_index(index_name)
        print('Ok')

In [57]:
delete_pinecone_index()

Deleting all indexes ... 
Ok


In [58]:
pip install -U langchain-huggingface

Note: you may need to restart the kernel to use updated packages.


In [59]:
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name=index_name, chunks=chunks)

Creating index askadocument and embeddings ...Ok


### Asking and Getting Answers

In [60]:
def ask_and_get_answer(vector_store, q, k=3):
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})

    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
    
    answer = chain.invoke(q)
    return answer

In [61]:
q = 'What about The House of Representatives.translate in japanees'
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'What about The House of Representatives.translate in japanees', 'result': 'アメリカ合衆国の下院は、各州の人々によって毎2年選ばれる議員で構成され、各州の選挙人は所定の資格を持たなければならない。'}


### While Loop for Asking Questions

In [58]:
import time
i = 1
print('Write Quit or Exit to quit.')
while True:
    q = input(f'Question #{i}: ')
    i = i + 1
    if q.lower() in ['quit', 'exit']:
        print('Quitting ... bye bye!')
        time.sleep(2)
        break
    
    answer = ask_and_get_answer(vector_store, q)
    print(f'\nAnswer: {answer}')
    print(f'\n {"-" * 50} \n')


Write Quit or Exit to quit.


Question #1:  what is bill of rights



Answer: {'query': 'what is bill of rights', 'result': 'The Bill of Rights refers to the first ten amendments to the United States Constitution. These amendments were added to the Constitution in 1791 to guarantee specific rights and freedoms to the American people. The Bill of Rights includes protections such as freedom of speech, religion, and the right to bear arms.'}

 -------------------------------------------------- 



Question #2:  quit


Quitting ... bye bye!


In [1]:
pip install sentence-transformers PyMuPDF

Collecting sentence-transformers
  Using cached sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.26.1-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.7.1-cp313-cp313-win_amd64.whl.metadata (28 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.7.0-cp313-cp313-win_amd64.whl.metadata (14 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.15.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata

In [2]:
from sentence_transformers import SentenceTransformer

# Load a pretrained model (runs locally)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Your input text
text = "This is a sample string for embedding generation."

# Generate embedding
embedding = model.encode(text)

print(embedding)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[-4.71430458e-02 -5.52545255e-03 -3.50545384e-02  1.34919854e-02
 -2.02754978e-02  4.12690230e-02  5.93573460e-03  8.99987947e-03
  2.70032119e-02 -8.49726573e-02  3.37921567e-02 -3.81181948e-02
  6.67548999e-02 -3.14207785e-02 -5.71431667e-02  1.09699033e-01
  2.12655198e-02 -5.17467689e-03 -2.01261714e-02 -4.84689474e-02
  3.71999070e-02  9.74084958e-02  3.36972326e-02 -4.92565669e-02
  7.60503113e-02 -6.17578346e-03  2.84094433e-03  7.67582655e-02
  1.26439214e-01  1.71363894e-02  9.30175111e-02 -3.12801194e-03
  2.73978692e-02  5.27505614e-02  3.52037437e-02  1.17751203e-01
 -5.30984141e-02  4.36090268e-02  3.19945626e-02  3.59284095e-02
  3.78550999e-02 -2.32399325e-03  1.60563160e-02  5.77172339e-02
  5.11511648e-03 -2.31456235e-02 -9.88405496e-02 -3.86239700e-02
 -4.54520881e-02  2.64439620e-02 -1.20270336e-02 -5.66426665e-02
 -1.04753889e-01 -3.74758318e-02  3.76589522e-02 -4.65062505e-04
 -6.97305251e-04 -3.97367738e-02  4.59246896e-03 -3.15984571e-03
  4.92155459e-03 -2.36496

In [3]:
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer

# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load and read PDF
doc = fitz.open("files/us_constitution.pdf")
text = ""
for page in doc:
    text += page.get_text()

# Generate embedding for full document or chunk it if it's large
embedding = model.encode(text)

print(embedding)

[-3.14292274e-02 -2.26832740e-03  3.27335261e-02 -2.54597273e-02
 -7.75039420e-02  5.42805605e-02  4.02637273e-02 -2.09278613e-02
 -8.66073519e-02  7.82385468e-03 -6.56946898e-02  5.08351214e-02
  6.88601434e-02 -5.46957068e-02 -1.64370090e-02  6.85545579e-02
 -1.04315672e-02  3.70913595e-02 -5.30714691e-02  7.31654689e-02
  1.49925455e-01 -9.83878225e-03 -5.14669642e-02  2.88925953e-02
 -2.07863245e-02 -3.26149650e-02 -2.63008289e-02 -4.53738086e-02
  2.16202829e-02 -4.33639176e-02  1.81762986e-02 -9.68580544e-02
  2.52090264e-02  7.61997253e-02 -3.96816730e-02 -6.36394247e-02
  1.33098572e-01 -4.23294902e-02  6.40848046e-03 -3.17990384e-03
 -3.47406231e-02 -3.92987952e-03 -3.68944518e-02  9.05972868e-02
 -7.75156245e-02  8.19743127e-02  1.34076290e-02  2.47113612e-02
 -3.93300354e-02  1.98954323e-04 -3.64696160e-02  9.05646533e-02
 -2.27361526e-02  7.51272216e-02  4.87683713e-02 -1.74987204e-02
 -5.83200119e-02 -1.08447194e-01 -3.90982889e-02 -3.82477194e-02
 -4.15455550e-02 -5.48755