In [None]:
import os
import dotenv
import json
import time
from pprint import pprint

import langsmith
from langchain_community.document_loaders.json_loader import JSONLoader
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
# from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings

import redis
from langchain_redis import RedisConfig, RedisVectorStore

In [None]:
# Load the variables defined in the local .env file into the process environment.
dotenv.load_dotenv()

# Parse the .env file once and reuse the mapping for every lookup below,
# instead of re-reading the file for each key.
env_values = dotenv.dotenv_values()

# eval
# The LangSmith settings are assigned explicitly so that the .env values win
# over anything already exported in the shell (load_dotenv does not override
# existing variables by default). A key missing from .env raises KeyError.
for key in ("LANGSMITH_API_KEY", "LANGSMITH_TRACING", "LANGSMITH_PROJECT"):
    os.environ[key] = env_values[key]

# input data
# Path handed to the JSON loader further down.
file_path = env_values["wikiHow_10_articles"]

In [None]:
def metadata_func(record: dict, metadata: dict) -> dict:
    """Add the record's title to the metadata and shorten the source path.

    Args:
        record: One JSON record. Its "title" value is copied into the
            metadata; a missing title is stored as None.
        metadata: Metadata dict to update in place. If it has a "source"
            entry, that "/"-separated path is cut so it starts at its first
            "data" component.

    Returns:
        The same ``metadata`` dict, after the in-place update.
    """
    metadata["title"] = record.get("title")

    if "source" in metadata:
        parts = metadata["source"].split("/")
        # Store the path relative to the "data" directory. A source without
        # a "data" component is kept as-is rather than raising ValueError
        # from list.index and aborting the whole load.
        if "data" in parts:
            metadata["source"] = "/".join(parts[parts.index("data"):])

    return metadata

# JSON loader for the input file: the jq expression ".[]" selects the records,
# the "text" key of each record supplies the document content, and
# metadata_func post-processes each document's metadata.
loader = JSONLoader(
    metadata_func=metadata_func,
    content_key="text",
    jq_schema=".[]",
    file_path=file_path,
)

In [None]:
# Run the loader and pretty-print everything it returned for inspection.
docs = loader.load()
pprint(docs)

## jq schema reference

1. 
JSON        -> [{"text": ...}, {"text": ...}, {"text": ...}]

jq_schema   -> ".[].text"

2. 

JSON        -> {"key": [{"text": ...}, {"text": ...}, {"text": ...}]}

jq_schema   -> ".key[].text"

3. 

JSON        -> ["...", "...", "..."]

jq_schema   -> ".[]"

In [None]:
# Splitter settings, passed to RecursiveCharacterTextSplitter as
# chunk_size and chunk_overlap respectively.
CHUNK_SIZE, CHUNK_OVERLAP = 500, 300

In [None]:
# Splitter configured from the notebook-level chunking constants.
# add_start_index=True asks the splitter to record each chunk's start index
# in its metadata.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# Split the loaded documents into chunks and print all of them for inspection.
chunked_docs = text_splitter.split_documents(docs)
pprint(chunked_docs)

## Initialize vector db

In [None]:
# Redis connection URL, read from the .env file.
REDIS_URL = dotenv.dotenv_values()["redis_semantic_cache"]

# Connect and ping the server; as the cell's last expression, the ping result
# is displayed as the cell output.
client = redis.from_url(REDIS_URL)
client.ping()

In [None]:
# Name of the model handed to HuggingFaceEmbeddings (17.4M params).
# Alternative kept for reference: "msmarco-distilbert-base-v4" (66.4M params).
EMBEDDING_MODEL = "sentence-transformers/paraphrase-MiniLM-L3-v2"

In [None]:
# Embedding object built from the model named by EMBEDDING_MODEL; it is passed
# to the Redis vector store below.
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

In [None]:
# Vector store settings: the index is named "article" and lives on the Redis
# instance addressed by REDIS_URL.
config = RedisConfig(redis_url=REDIS_URL, index_name="article")

In [None]:
# Vector store combining the embedding object with the Redis index config.
vector_store = RedisVectorStore(embeddings=embedding_model, config=config)

In [None]:
# Add the chunked documents to the Redis vector store. As the cell's last
# expression, the call's return value is displayed as the cell output.
# NOTE(review): re-running this cell presumably inserts the same chunks again,
# producing duplicates in the index -- confirm before re-executing.
vector_store.add_documents(documents=chunked_docs)

In [None]:
def format_docs(docs):
    """Concatenate multiple retrieved docs into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values joined in order, separated by one blank
        line; an empty string when ``docs`` is empty.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)