In [3]:
!pip install unstructured > /dev/null 

In [34]:
!pip install -Uq markdown langchain openai ken supabase python-dotenv simsimd

In [32]:
from langchain.document_loaders import UnstructuredMarkdownLoader
import glob
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import MarkdownHeaderTextSplitter

# Path to the 'docs/' directory, assuming 'docs/' is in the current working directory
docs_path = './docs/**/*.mdx'

# Use glob.glob to match all .mdx files in the directory and subdirectories
mdx_files = glob.glob(docs_path, recursive=True)

docs = []

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

for file_path in mdx_files:
    print(file_path)
    # loader = UnstructuredMarkdownLoader(file_path=mdx_file)
    # doc = loader.load_and_split(text_splitter=text_splitter)
    # docs = [*docs, *doc]

    with open(file_path, 'r', encoding='utf-8') as file:
        mdx_content = file.read()
        headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]

        markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
        md_header_splits = markdown_splitter.split_text(mdx_content)

        # Split
        splits = text_splitter.split_documents(md_header_splits)
        docs = [*docs, *splits]

len(docs)

./docs/stroke-width.mdx
./docs/columns.mdx
./docs/transition-duration.mdx
./docs/grid-column.mdx
./docs/backdrop-brightness.mdx
./docs/scroll-snap-type.mdx
./docs/configuration.mdx
./docs/pointer-events.mdx
./docs/padding.mdx
./docs/box-sizing.mdx
./docs/translate.mdx
./docs/divide-style.mdx
./docs/drop-shadow.mdx
./docs/content.mdx
./docs/outline-width.mdx
./docs/transition-delay.mdx
./docs/plugins.mdx
./docs/ring-offset-color.mdx
./docs/user-select.mdx
./docs/flex-basis.mdx
./docs/hue-rotate.mdx
./docs/theme.mdx
./docs/caret-color.mdx
./docs/grid-row.mdx
./docs/transform-origin.mdx
./docs/background-image.mdx
./docs/justify-content.mdx
./docs/transition-timing-function.mdx
./docs/animation.mdx
./docs/grayscale.mdx
./docs/grid-auto-rows.mdx
./docs/box-decoration-break.mdx
./docs/visibility.mdx
./docs/max-width.mdx
./docs/rotate.mdx
./docs/grid-template-columns.mdx
./docs/blur.mdx
./docs/max-height.mdx
./docs/upgrade-guide.mdx
./docs/scroll-snap-stop.mdx
./docs/ring-width.mdx
./docs/to

1960

In [5]:
from supabase.client import Client, create_client
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.supabase import SupabaseVectorStore
import os
from dotenv import load_dotenv

load_dotenv()

supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

embeddings = OpenAIEmbeddings()

docs[10]

NameError: name 'docs' is not defined

In [24]:
# vector_store = SupabaseVectorStore.from_documents(
#     documents=docs[600:],
#     embedding=embeddings,
#     client=supabase,
#     table_name="tailwind_documents",
#     query_name="match_tailwind_documents",
#     # table_name="documents",
#     # query_name="match_documents",
#     chunk_size=100,
#     # show_progress=True
# )

vector_store = SupabaseVectorStore(
    embedding=embeddings,
    client=supabase,
    chunk_size=100,
    table_name="tailwind_documents",
    query_name="match_tailwind_documents",
)

# from typing import List

# chunk_size = 500  # You can modify this value to a smaller number
# id_list: List[str] = []
# for i in range(0, len(docs), chunk_size):
#     chunk = docs[i : i + chunk_size]

#     result = supabase.from_("tailwind_documents").upsert(chunk).execute()  # type: ignore

#     if len(result.data) == 0:
#         raise Exception("Error inserting: No rows added")

#     # VectorStore.add_vectors returns ids as strings
#     ids = [str(i.get("id")) for i in result.data if i.get("id")]

#     id_list.extend(ids)

## DocumentCompressorPipeline

Since a lot of context could be retreived per query we want to chain a few compression and reordering steps as necessary to optimize what gets fed to the final context.

In [14]:
# Helper function for printing docs

def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))

In [25]:
docs = vector_store.as_retriever().get_relevant_documents("What is the correct way to apply a gradient to a div?")
pretty_print_docs(docs)

2023-11-06 20:14:44,306:INFO - HTTP Request: POST https://qublpyarwoevdeqqicbz.supabase.co/rest/v1/rpc/match_tailwind_documents?limit=4 "HTTP/1.1 200 OK"


Document 1:

<div class="relative">
<div class="absolute inset-6 backdrop-brightness-200 h-20 w-20 bg-white/30"></div>
<img class="w-32 h-32 object-cover rounded-lg shadow-xl" src="https://images.unsplash.com/photo-1554629947-334ff61d85dc?ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&ixlib=rb-1.2.1&auto=format&fit=crop&w=320&h=320&q=80" />
<div class="absolute inset-0 ring-1 ring-inset ring-black/10 rounded-lg"></div>
</div>
</div>
</div>
</div>
```  
```html
<div class="**backdrop-brightness-50** bg-white/30 ...">
<!-- ... -->
</div>
<div class="**backdrop-brightness-125** bg-white/30 ...">
<!-- ... -->
</div>
<div class="**backdrop-brightness-200** bg-white/30 ...">
<!-- ... -->
</div>
```
----------------------------------------------------------------------------------------------------
Document 2:

/* Output */
.bg-brand-gradient {
background-image: linear-gradient(#3490dc, #6574cd) !important;
}
```  
#### Selector strategy  
Setting `important` to `true` can introduce some i

### First Try: LLMChainFilter
Now let's wrap our base retriever with a ContextualCompressionRetriever. We'll add an LLMChainExtractor, which will iterate over the initially returned documents and extract from each only the content that is relevant to the query.

In [31]:
from langchain.chat_models import ChatOpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainFilter

llm = ChatOpenAI(temperature=0)
_filter = LLMChainFilter.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(base_compressor=_filter, base_retriever=vector_store.as_retriever())

compressed_docs = compression_retriever.get_relevant_documents("What is the correct way to apply a gradient to a div?")
pretty_print_docs(compressed_docs)

2023-11-06 20:37:04,983:INFO - HTTP Request: POST https://qublpyarwoevdeqqicbz.supabase.co/rest/v1/rpc/match_tailwind_documents?limit=4 "HTTP/1.1 200 OK"


Document 1:

<div class="relative">
<div class="absolute inset-6 backdrop-brightness-200 h-20 w-20 bg-white/30"></div>
<img class="w-32 h-32 object-cover rounded-lg shadow-xl" src="https://images.unsplash.com/photo-1554629947-334ff61d85dc?ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&ixlib=rb-1.2.1&auto=format&fit=crop&w=320&h=320&q=80" />
<div class="absolute inset-0 ring-1 ring-inset ring-black/10 rounded-lg"></div>
</div>
</div>
</div>
</div>
```  
```html
<div class="**backdrop-brightness-50** bg-white/30 ...">
<!-- ... -->
</div>
<div class="**backdrop-brightness-125** bg-white/30 ...">
<!-- ... -->
</div>
<div class="**backdrop-brightness-200** bg-white/30 ...">
<!-- ... -->
</div>
```


### EmbeddingsFilter

Making an extra LLM call over each retrieved document is expensive and slow. The EmbeddingsFilter provides a cheaper and faster option by embedding the documents and query and only returning those documents which have sufficiently similar embeddings to the query.

In [32]:
from langchain.retrievers.document_compressors import EmbeddingsFilter

embeddings = OpenAIEmbeddings()
embeddings_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)
compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=vector_store.as_retriever())

compressed_docs = compression_retriever.get_relevant_documents("What is the correct way to apply a gradient to a div?")
pretty_print_docs(compressed_docs)

2023-11-06 20:38:06,222:INFO - HTTP Request: POST https://qublpyarwoevdeqqicbz.supabase.co/rest/v1/rpc/match_tailwind_documents?limit=4 "HTTP/1.1 200 OK"
2023-11-06 20:38:08,035:INFO - Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.


Document 1:

<div class="relative">
<div class="absolute inset-6 backdrop-brightness-200 h-20 w-20 bg-white/30"></div>
<img class="w-32 h-32 object-cover rounded-lg shadow-xl" src="https://images.unsplash.com/photo-1554629947-334ff61d85dc?ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&ixlib=rb-1.2.1&auto=format&fit=crop&w=320&h=320&q=80" />
<div class="absolute inset-0 ring-1 ring-inset ring-black/10 rounded-lg"></div>
</div>
</div>
</div>
</div>
```  
```html
<div class="**backdrop-brightness-50** bg-white/30 ...">
<!-- ... -->
</div>
<div class="**backdrop-brightness-125** bg-white/30 ...">
<!-- ... -->
</div>
<div class="**backdrop-brightness-200** bg-white/30 ...">
<!-- ... -->
</div>
```
----------------------------------------------------------------------------------------------------
Document 2:

/* Output */
.bg-brand-gradient {
background-image: linear-gradient(#3490dc, #6574cd) !important;
}
```  
#### Selector strategy  
Setting `important` to `true` can introduce some i

### Stringing compressors and document transformers together

Using the `DocumentCompressorPipeline` we can also easily combine multiple compressors in sequence. Along with compressors we can add BaseDocumentTransformers to our pipeline, which don't perform any contextual compression but simply perform some transformation on a set of documents.

For example `TextSplitters` can be used as document transformers to split documents into smaller pieces, and the `EmbeddingsRedundantFilter` can be used to filter out redundant documents based on embedding similarity between documents.

Below we create a compressor pipeline by first splitting our docs into smaller chunks, then removing redundant documents, and then filtering based on relevance to the query.

In [38]:
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_transformers import EmbeddingsRedundantFilter
from langchain.document_transformers import LongContextReorder

reordering = LongContextReorder()
splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=0, separator=". ")
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)

pipeline_compressor = DocumentCompressorPipeline(
    transformers=[
        # splitter,
        # redundant_filter,
        relevant_filter,
        reordering
    ]
)

compression_retriever = ContextualCompressionRetriever(base_compressor=pipeline_compressor, base_retriever=vector_store.as_retriever())

compressed_docs = compression_retriever.get_relevant_documents("What is the correct way to apply a gradient to a div?")
pretty_print_docs(compressed_docs)

2023-11-06 20:42:58,946:INFO - HTTP Request: POST https://qublpyarwoevdeqqicbz.supabase.co/rest/v1/rpc/match_tailwind_documents?limit=4 "HTTP/1.1 200 OK"


Document 1:

<div class="relative">
<div class="absolute inset-6 backdrop-brightness-200 h-20 w-20 bg-white/30"></div>
<img class="w-32 h-32 object-cover rounded-lg shadow-xl" src="https://images.unsplash.com/photo-1554629947-334ff61d85dc?ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&ixlib=rb-1.2.1&auto=format&fit=crop&w=320&h=320&q=80" />
<div class="absolute inset-0 ring-1 ring-inset ring-black/10 rounded-lg"></div>
</div>
</div>
</div>
</div>
```  
```html
<div class="**backdrop-brightness-50** bg-white/30 ...">
<!-- ... -->
</div>
<div class="**backdrop-brightness-125** bg-white/30 ...">
<!-- ... -->
</div>
<div class="**backdrop-brightness-200** bg-white/30 ...">
<!-- ... -->
</div>
```
----------------------------------------------------------------------------------------------------
Document 2:

```  
```html
<div class="grid grid-cols-3 gap-4">
<div class="...">01</div>
<div class="...">02</div>
<div class="...">03</div>
<div class="**col-span-2** ...">04</div>
<div class=

# Compression Filter Docs
https://python.langchain.com/docs/modules/data_connection/retrievers/contextual_compression/

In [39]:
from langchain.chat_models import ChatOpenAI
# from langchain.chat_models import OpenAI
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.retrievers import RePhraseQueryRetriever
from langchain.memory import ConversationBufferMemory


# Instantiate ConversationBufferMemory
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key='answer')

DEFAULT_TEMPLATE = """You are an assistant tasked with taking a natural language \
query from a user and converting it into a query for a vectorstore. \
In this process, you strip out information that is not relevant for \
the retrieval task. Here is the user query: {question}"""

llm = ChatOpenAI(temperature=0)
retriever_from_llm = RePhraseQueryRetriever.from_llm(
    retriever=vector_store.as_retriever(), llm=llm
)
# docs = retriever_from_llm.get_relevant_documents("How do I load documents from Hacker News?")

qa = ConversationalRetrievalChain.from_llm(llm=llm, retriever=compression_retriever, memory=memory, return_source_documents=True)

# qa = RetrievalQA.from_llm(llm=llm, retriever=retriever_from_llm, memory=memory, return_source_documents=True, document_transformer=reorder)
# qa = RetrievalQA(document_transformer=reorder)

In [40]:
qa({ "question": "How do I make a full page gradient background?" })

2023-11-06 20:44:39,140:INFO - HTTP Request: POST https://qublpyarwoevdeqqicbz.supabase.co/rest/v1/rpc/match_tailwind_documents?limit=4 "HTTP/1.1 200 OK"


{'question': 'How do I make a full page gradient background?',
 'chat_history': [HumanMessage(content='How do I make a full page gradient background?'),
  AIMessage(content='To create a full-page gradient background, you can use the `bg-gradient-to` utility class along with the `from` and `to` values to define the gradient direction. Here\'s an example:\n\n```html\n<div class="bg-gradient-to-r from-blue-500 to-purple-500 h-screen">\n  <!-- Your content here -->\n</div>\n```\n\nIn this example, the `bg-gradient-to-r` class creates a gradient that transitions from the color defined by the `from-blue-500` class to the color defined by the `to-purple-500` class. The `h-screen` class ensures that the gradient covers the entire height of the screen.\n\nYou can customize the colors and gradient direction by using different color classes and gradient utility classes. For example, you can use `bg-gradient-to-l` for a left-to-right gradient or `bg-gradient-to-b` for a top-to-bottom gradient.\n\n