# Processing data

Populating a Redis index for vector search.

## from a pdf

If starting from a pdf document that you want to make searchable you can use langchain pointing to your pdf file to break into chunks

In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader

doc = "bob/bob_1.pdf" # path to pdf or other type of file to load

# set up the file loader/extractor and text splitter to create chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=0
)

loader = UnstructuredFileLoader(
    doc, mode="single", strategy="fast"
)

# extract, load, and make chunks
chunks = loader.load_and_split(text_splitter)

print("Done preprocessing. Created", len(chunks), "chunks of the original pdf", doc)

  loader = UnstructuredFileLoader(
[Errno 2] No such file or directory: 'bob/bob_1.pdf'
PDF text extraction failed, skip text extraction...


Done preprocessing. Created 0 chunks of the original pdf bob/bob_1.pdf


## Save the chunks to a file

In [None]:
import json

output_file = "raw_chunks.json"

with open(output_file, "w") as f:
    json_chunks = [
        {
            "text": chunk.page_content,
            "item_id": i
        } for i, chunk in enumerate(chunks)
    ]

    json.dump(json_chunks, f)

# Example from already created chunks

In [1]:
import json
with open("data/2008-mazda3-chunks.json", "r") as f:
    chunks = json.load(f)

In [2]:
import os
import warnings

warnings.filterwarnings("ignore")

from redisvl.utils.vectorize import HFTextVectorizer

hf = HFTextVectorizer("sentence-transformers/all-MiniLM-L6-v2")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Embed each chunk content
embeddings = hf.embed_many([chunk for chunk in chunks])

# Check to make sure we've created enough embeddings, 1 per document chunk
len(embeddings) == len(chunks)

True

In [None]:
from redis import Redis
from redisvl.index import SearchIndex

REDIS_URL = "redis://localhost:6379/0"

# connect to redis
client = Redis.from_url(REDIS_URL)

# path to the schema file
path_to_yaml = "schema/index_schema.yaml"

# create an index from schema and the client
index = SearchIndex.from_yaml(path_to_yaml)
index.set_client(client)
index.create(overwrite=True, drop=True)

In [127]:
from redisvl.redis.utils import array_to_buffer

data = [
    {
        'chunk_id': i,
        'content': chunk,
        # For HASH -- must convert embeddings to bytes
        'text_embedding': array_to_buffer(embeddings[i], dtype="float32")
    } for i, chunk in enumerate(chunks)
]

# RedisVL handles batching automatically
keys = index.load(data, id_field="chunk_id")

In [128]:
index.info()["num_docs"]

251

In [104]:
import redisvl
redisvl.__version__

'0.3.5'

In [121]:
path_to_yaml = "schema/index_json_schema.yaml"
# create an index from schema and the client
jindex = SearchIndex.from_yaml(path_to_yaml)
jindex.set_client(client)
jindex.create(overwrite=True, drop=True)

11:07:34 redisvl.index.index INFO   Index already exists, overwriting.


In [122]:
jdata = [
    {
        'chunk_id': str(i),
        'content': chunk,
        # For HASH -- must convert embeddings to bytes
        'text_embedding': embeddings[i]
    } for i, chunk in enumerate(chunks)
]

# RedisVL handles batching automatically
keys = jindex.load(jdata, id_field="chunk_id")

In [123]:
jindex.info()['Index Errors']

['indexing failures',
 251,
 'last indexing error',
 'Invalid JSON type: Numeric type can represent only NUMERIC field',
 'last indexing error key',
 'jmazda:106']

In [125]:
jindex.info()["num_docs"]

251

In [108]:
jindex.client.json().get("jmazda:0055927de2b94980ba6d9c35586f7842")

{'chunk_id': 159,
 'content': 'chunk',
 'text_embedding': [-0.03486854210495949,
  -0.005751671735197306,
  -0.04224590212106705,
  -0.03011305443942547,
  -0.09495680779218674,
  0.10199162364006042,
  0.014732243493199348,
  -0.035657722502946854,
  0.06949464976787567,
  0.0605301633477211,
  0.10524240881204604,
  0.0394754484295845,
  -0.0013509726850315928,
  0.048372816294431686,
  -0.09504526108503342,
  0.04534045234322548,
  -0.09370941668748856,
  -0.03166469186544418,
  -0.01622246764600277,
  0.01808370277285576,
  0.053429923951625824,
  -0.05712372064590454,
  0.020160024985671043,
  -0.025900688022375107,
  -0.010816718451678751,
  0.011669243685901163,
  0.03133203461766243,
  0.032539594918489456,
  0.013878904283046722,
  -0.03133326768875122,
  -0.07313910871744156,
  -0.001292429747991264,
  0.038690611720085144,
  -0.05618341639637947,
  -0.03540928289294243,
  -0.09604860097169876,
  0.013300709426403046,
  -0.026457302272319794,
  -0.048852987587451935,
  -0.019

In [95]:
jindex.search("*")

Result{0 total, docs: []}

In [24]:
from redisvl.query import BaseQuery

query = BaseQuery("*")
res = index.query(query)

In [67]:
ID_FIELD_NAME="chunk_id"
CHUNK_FIELD_NAME="content"

def parse_index_items(items, storage_type="hash"):
    if storage_type == "json":
        return [
            {
                "item_id": item[ID_FIELD_NAME],
                "text": item[CHUNK_FIELD_NAME]
            } for item in items]
    else:
        return [
            {
                "item_id": item[ID_FIELD_NAME.encode()].decode(), 
                "text": item[CHUNK_FIELD_NAME.encode()].decode()
            } for item in items.values()]


def get_items_by_pattern(client, pattern, storage_type="hash"):
    cursor = '0'
    items = []

    while cursor != 0:
        cursor, keys = client.scan(cursor=cursor, match=pattern)
        for key in keys:
            if storage_type == "json":
                items.append(client.json().get(key))
            else:
                items.append(client.hgetall(key))



    return parse_index_items(items, storage_type)

In [68]:
# Define the pattern to match keys (e.g., 'index:*' for keys starting with 'index:')
pattern = 'jmazda:*'

# Get all items matching the pattern
matching_items = get_items_by_pattern(index.client, pattern, storage_type="json")

In [65]:
len(matching_items)

251

In [69]:
matching_items[0]

{'item_id': 143,
 'text': '3. [SR + (bank number) + CH + (preset number) + (channel number)] are displayed. (Programming with “text” display (e.g. channel name))\n\n1. Press the display button to select the desired text display mode (e.g. channel name). Select the desired channel to be programmed. At this point, the following is displayed: [SR + (bank number) + (selected text (e.g. channel name))]\n\n2. Keep pressing the channel preset button for 1.5 seconds or more. The programming process is complete after the text (e.g. channel name) flashes. Then, [SR + (bank number)] is displayed and you will hear beep sound at the same time.\n\n3. SR + (bank number) + CH + (preset number) + (channel number)] are displayed.\n\nMazda3_8Y64-EA-08A_Edition1 Page221 Tuesday, November 27 2007 9:1 AM\n\n4. Three seconds later, it returns to normal display. [SR + (bank number) + (text (e.g. channel name))] NOTE Six stations can be stored in each bank, SR1, SR2, and SR3 for convenient access to your favor

In [49]:
for item in matching_items.values():
    print(item['content'.encode('utf-8')])
    # print(item[1][b'chunk_id'])
    break

b'To flash the headlights, pull the lever fully toward you. The headlight switch does not need to be on, and the lever will return to the normal position when released.\n\n5-52\n\n\xc3\xadSome models.\n\nForm No.8Y64-EA-08A\n\nBlack plate (174,1)\n\nqHeadlight Leveling\xc3\xad The number of passengers and weight of cargo in the luggage compartment change the angle of the headlights.\n\nThe headlight leveling switch adjusts this angle.\n\nSelect the proper setting from the following chart. Without turbocharger\n\nFront seat Driver Passenger\n\n\xc3\x97 \xc3\x97 \xc3\x97 \xc3\x97 \xc3\x97\n\n\xe2\x80\x95\n\n\xc3\x97 \xc3\x97 \xc3\x97 \xe2\x80\x95\n\nRear seat \xe2\x80\x95 \xe2\x80\x95\n\n\xc3\x97 \xc3\x97 \xe2\x80\x95\n\nLoad\n\n\xe2\x80\x95 \xe2\x80\x95 \xe2\x80\x95\n\n\xc3\x97 \xc3\x97\n\nSwitch Position\n\n0 0 1 2 3\n\n\xc3\x97: Yes \xe2\x80\x95: No With turbocharger\n\nFront seat Driver Passenger\n\n\xc3\x97 \xc3\x97 \xc3\x97 \xc3\x97 \xc3\x97\n\n\xe2\x80\x95\n\n\xc3\x97 \xc3\x97 \xc

In [57]:
parsed_raw[0]

{'item_id': '115',
 'text': 'To flash the headlights, pull the lever fully toward you. The headlight switch does not need to be on, and the lever will return to the normal position when released.\n\n5-52\n\níSome models.\n\nForm No.8Y64-EA-08A\n\nBlack plate (174,1)\n\nqHeadlight Levelingí The number of passengers and weight of cargo in the luggage compartment change the angle of the headlights.\n\nThe headlight leveling switch adjusts this angle.\n\nSelect the proper setting from the following chart. Without turbocharger\n\nFront seat Driver Passenger\n\n× × × × ×\n\n―\n\n× × × ―\n\nRear seat ― ―\n\n× × ―\n\nLoad\n\n― ― ―\n\n× ×\n\nSwitch Position\n\n0 0 1 2 3\n\n×: Yes ―: No With turbocharger\n\nFront seat Driver Passenger\n\n× × × × ×\n\n―\n\n× × × ―\n\nRear seat ― ―\n\n× × ―\n\nLoad\n\n― ― ―\n\n× ×\n\nSwitch Position\n\n0 0 1 1 2\n\n×: Yes ―: No\n\nMazda3_8Y64-EA-08A_Edition1 Page175 Tuesday, November 27 2007 9:1 AM\n\nqDaytime Running Lights (Canada)\n\nIn Canada, vehicles must be