## Thursday, April 11, 2024

mamba activate langchain3

In [1]:
# Reuse the stock OpenAI client against an OpenAI-compatible server on localhost.
from openai import OpenAI

# base_url targets port 1234 on this machine; api_key is the literal "lm-studio".
client = OpenAI(
    api_key="lm-studio",
    base_url="http://localhost:1234/v1",
)

# One system instruction plus one user turn, sampled at temperature 0.7.
completion = client.chat.completions.create(
    temperature=0.7,
    messages=[
        {"role": "system", "content": "Always answer in rhymes."},
        {"role": "user", "content": "Introduce yourself."},
    ],
    model="TheBloke/Nous-Hermes-2-SOLAR-10.7B-GGUF/nous-hermes-2-solar-10.7b.Q8_0.gguf",
)

# Show the message object of the first returned choice.
print(completion.choices[0].message)

ChatCompletionMessage(content="Hello there, how do you fare?\nMy name is Rhyme, I'm here to share\nSome fun and laughter, and a smile so rare,\nSo let's chat and have some good air!", role='assistant', function_call=None, tool_calls=None)


Document Loaders

In [1]:
from langchain.document_loaders import TextLoader

In [2]:
# Load the vocabulary text file through TextLoader.
loader = TextLoader(file_path="../data/vocab.txt")
document = loader.load()

In [5]:
from langchain.document_loaders.csv_loader import CSVLoader

# The loader is configured for a pipe-delimited file with four named columns:
#   id|custom_title|stubhub_title|vividseats_title
# NOTE(review): if the file's first row is that header line, passing
# `fieldnames` would presumably make the header row load as data - confirm
# against the file.
loader = CSVLoader(
    csv_args={
        'fieldnames': ['id', 'custom_title', 'stubhub_title', 'vividseats_title'],
        'quotechar': '"',
        'delimiter': '|',
    },
    file_path='../data/titledata.csv',
)
document = loader.load()

First time running the below we get the error ...

* ImportError: pypdf package not found, please install it with `pip install pypdf`

In [None]:
# Shell command for installing pypdf (kept commented out; run it in a terminal):
# mamba install conda-forge::pypdf

In [7]:
from langchain.document_loaders import PyPDFLoader

# Load the PDF guide and split it in one call; the result is kept in `pages`.
loader = PyPDFLoader(file_path="../data/Llama Getting Started Guide.pdf")
pages = loader.load_and_split()

First time running the next cell we got the error ...

* ValueError: Did not find mathpix_api_key, please add an environment variable `MATHPIX_API_KEY` which contains it, or pass `mathpix_api_key` as a named parameter.

Signed up, but then realized there is no free version of this, so yeah, bye bye MathPix ... !

In [8]:
from langchain.document_loaders import MathpixPDFLoader

# No mathpix_api_key is passed here; the recorded run of this cell ended in the
# ValueError about the missing MATHPIX_API_KEY.
loader = MathpixPDFLoader(file_path="../data/Llama Getting Started Guide.pdf")
data = loader.load()

ValueError: Did not find mathpix_api_key, please add an environment variable `MATHPIX_API_KEY` which contains it, or pass `mathpix_api_key` as a named parameter.

First time running the next cell produces the error ...

* ImportError: `pdfminer` package not found, please install it with `pip install pdfminer.six`

In [None]:
# Shell commands for the pdfminer dependency (kept commented out; run them in a
# terminal). The ImportError above names the `pdfminer.six` distribution.
# mamba install conda-forge::pdfminer
# mamba install conda-forge::pdfminer.six

In [11]:
from langchain.document_loaders import PDFMinerLoader

# Load the same PDF guide through the pdfminer-backed loader.
loader = PDFMinerLoader(file_path="../data/Llama Getting Started Guide.pdf")
data = loader.load()

In [12]:
from langchain.document_loaders import PDFMinerPDFasHTMLLoader

# Load the same PDF guide with the PDF-as-HTML variant of the pdfminer loader.
loader = PDFMinerPDFasHTMLLoader(file_path="../data/Llama Getting Started Guide.pdf")
data = loader.load()

Document Transformers

In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the whole speech into memory; this is the long document split up below.
# The encoding is given explicitly because the text contains non-ASCII
# characters (such as curly apostrophes), which would otherwise be decoded with
# whatever the platform's default encoding happens to be.
with open('../data/state_of_the_union.txt', encoding='utf-8') as f:
    state_of_the_union = f.read()

# Number of characters in the document.
len(state_of_the_union)

38540

In [17]:
# Recursive splitter with chunk_size=100 and chunk_overlap=20, measured with
# len(); add_start_index=True puts a start_index entry in each chunk's metadata.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    length_function=len,
    chunk_overlap=20,
    chunk_size=100,
)

texts = text_splitter.create_documents([state_of_the_union])
# Print the first two chunks, one per line.
print(texts[0], texts[1], sep="\n")

page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and' metadata={'start_index': 0}
page_content='of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.' metadata={'start_index': 82}


In [18]:
from langchain.text_splitter import CharacterTextSplitter

# Split on the literal separator "\n\n" (not treated as a regex), with
# chunk_size=1000 and chunk_overlap=200, measured with len().
text_splitter = CharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=200,
    chunk_size=1000,
    separator="\n\n",
)

texts = text_splitter.create_documents([state_of_the_union])
# Show only the first chunk.
print(texts[0])

page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.'


In [20]:
# Read the saved HTML page into a string for the header-based splitter.
# The encoding is given explicitly so decoding does not depend on the
# platform's default encoding.
# NOTE(review): assumes the saved page is UTF-8 - confirm against the file.
with open('../data/index.html', encoding='utf-8') as f:
    html_string = f.read()

First time running the next cell produced the error ...

* ImportError: Unable to import lxml, please install with `pip install lxml`.

In [None]:
# Shell command for installing lxml (kept commented out; run it in a terminal):
# pip install lxml

In [22]:
from langchain.text_splitter import HTMLHeaderTextSplitter

# html_string = "Your HTML content here..."

# Pairs of (HTML tag to split on, label passed alongside it).
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
html_header_splits = html_splitter.split_text(html_string)
# Show the first split produced from the HTML string.
print(html_header_splits[0])

page_content='API Core Experimental Python Docs  \nToggle Menu  \nPrev Up Next  \nLangChain 0.0.339rc1  \nAll modules for which code is available'


In [27]:
from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter

# Stage 1: fetch the page at `url` and split it on its h1/h2 headers.
url = "https://example.com"
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
]
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
html_header_splits = html_splitter.split_text_from_url(url)

# Stage 2: run the header-level documents through a recursive splitter
# configured with chunk_size=500.
chunk_size = 500
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
splits = text_splitter.split_documents(html_header_splits)


In [28]:
# Show the first element of `splits`.
print(splits[0])

page_content='Example Domain'


In [30]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language

# Python sample used as splitter input; the string starts and ends with a newline.
python_code = "\n".join(
    [
        "",
        "def hello_world():",
        '    print("Hello, World!")',
        "hello_world()",
        "",
    ]
)

In [35]:
# Earlier variant with a smaller chunk size, kept for reference:
# python_splitter = RecursiveCharacterTextSplitter.from_language(
#     language=Language.PYTHON, chunk_size=50
# )

# Python-aware recursive splitter configured with chunk_size=200.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    chunk_size=200,
    language=Language.PYTHON,
)

In [36]:
# Split the Python sample and show the first resulting document.
python_docs = python_splitter.create_documents(texts=[python_code])
print(python_docs[0])

page_content='def hello_world():\n    print("Hello, World!")\nhello_world()'


In [37]:
# JavaScript sample used as splitter input; the string starts and ends with a newline.
js_code = (
    "\n"
    "function helloWorld() {\n"
    '  console.log("Hello, World!");\n'
    "}\n"
    "helloWorld();\n"
)

In [38]:
# Earlier variant with a smaller chunk size, kept for reference:
# js_splitter = RecursiveCharacterTextSplitter.from_language(
#     language=Language.JS, chunk_size=60
# )

# JavaScript-aware recursive splitter configured with chunk_size=200.
js_splitter = RecursiveCharacterTextSplitter.from_language(
    chunk_size=200,
    language=Language.JS,
)

In [39]:
# Split the JavaScript sample and show the first resulting document.
js_docs = js_splitter.create_documents(texts=[js_code])
print(js_docs[0])

page_content='function helloWorld() {\n  console.log("Hello, World!");\n}\nhelloWorld();'


Running this next cell for the first time generates the error ...

* ImportError: Could not import tiktoken python package. This is needed in order to for TokenTextSplitter. Please install it with `pip install tiktoken`.

In [None]:
# Shell command for installing tiktoken (kept commented out; run it in a terminal):
# mamba install conda-forge::tiktoken

In [2]:

from langchain.text_splitter import TokenTextSplitter

# text_splitter = TokenTextSplitter(chunk_size=10)

# chunk_overlap is set explicitly and kept below chunk_size. Left at the
# library default (200) it equals chunk_size=200, so the token window never
# advances and split_text() keeps appending chunks until memory is exhausted.
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)

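With the splitter created as `TokenTextSplitter(chunk_size=200)`, running this next cell consumed all 32 GB of RAM and then ate into the swap space until that maxed out too; I had to kill the kernel at around the 4-minute mark. The likely cause is that the default `chunk_overlap` (200) equals `chunk_size`, so the splitter never advances through the tokens and keeps producing chunks forever.

It also hovered around 100% usage on a single CPU core ...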

In [3]:
# Split the speech text with the current `text_splitter`; the result is stored in `texts`.
texts = text_splitter.split_text(state_of_the_union)

In [None]:
# Show the first chunk in `texts`.
print(texts[0])

Text Embedding Models

We are going to work with [HuggingFace Embeddings](https://python.langchain.com/docs/integrations/text_embedding/huggingfacehub/)

In [1]:
from langchain_community.embeddings import HuggingFaceEmbeddings

The next cell threw the error ...

* ImportError: Could not import sentence_transformers python package. Please install it with `pip install sentence-transformers`.

In [None]:
# Shell command for installing sentence-transformers (kept commented out; run it in a terminal):
# mamba install conda-forge::sentence-transformers

In [9]:
# The all-mpnet-base-v2 model provides the best quality, while all-MiniLM-L6-v2 is 5 times faster and still offers good quality.
# Bare expression: the object is constructed and its repr is displayed, but it is not assigned to a name.
HuggingFaceEmbeddings(model_name='all-mpnet-base-v2')

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-mpnet-base-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [10]:
# Build the embeddings wrapper with its default settings (no model_name is passed).
embeddings = HuggingFaceEmbeddings()

In [11]:
# Sample text to embed.
text = "This is a test document."

In [12]:
# Embed the sample text as a query; the result is stored in `query_result`.
query_result = embeddings.embed_query(text)

In [13]:
# Peek at the first three values of the embedding.
query_result[:3]

[-0.04895174130797386, -0.03986193612217903, -0.021562788635492325]

Sentence Transformers

https://sbert.net/docs/pretrained_models.html

I added this to show how to use sentence-transformers outside of HuggingFaceEmbeddings.

The following models have been specifically trained for Semantic Search: Given a question / search query, these models are able to find relevant text passages. For more details, see Usage - Semantic Search.

[all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)

In [16]:
from sentence_transformers import SentenceTransformer

# Two sample sentences to encode directly with sentence-transformers.
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# Stored under its own name so the `embeddings` object created earlier with
# HuggingFaceEmbeddings() is not overwritten by this array of vectors.
sentence_embeddings = model.encode(sentences)
print(sentence_embeddings)


[[ 0.02250259 -0.07829171 -0.02303074 ... -0.00827929  0.02652689
  -0.00201898]
 [ 0.04170233  0.00109744 -0.0155342  ... -0.02181628 -0.0635936
  -0.00875288]]


In [15]:
# https://sbert.net/docs/pretrained_models.html

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# Encode the query and the two candidate passages with the same model.
query_embedding = model.encode("How big is London")
passage_embedding = model.encode(
    [
        "London has 9,787,426 inhabitants at the 2011 census",
        "London is known for its finacial district",
    ]
)

# Dot-product score of the query against each passage.
print(
    "Similarity:",
    util.dot_score(query_embedding, passage_embedding),
)

# Timing notes from the original run:
#   0.5s ...... once the model has already been downloaded
#   1m 195s ... first run, including the download (figure recorded as written)

Similarity: tensor([[0.5472, 0.6330]])


Vector Stores