## Installing dependencies

In [1]:
!pip install pandas pyarrow

Collecting pandas
  Downloading pandas-2.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.7/12.7 MB 96.8 MB/s eta 0:00:00
Collecting pyarrow
  Downloading pyarrow-13.0.0-cp39-cp39-manylinux_2_28_x86_64.whl (40.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.1/40.1 MB 53.8 MB/s eta 0:00:00
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 341.8/341.8 kB 51.8 MB/s eta 0:00:00
Installing collected packages: tzdata, pyarrow, pandas
Successfully installed pandas-2.1.0 pyarrow-13.0.0 tzdata-2023.3


## Reading data

In [2]:
import pandas as pd
import numpy as np
import os

## Seed for the 25% sample, so that a fresh Restart & Run All selects (and
## later upserts) the same articles every time.
SAMPLE_SEED = 42

## Load the articles, keep only rows with non-empty text, then draw the sample.
df = pd.read_parquet("short_articles.parquet")
df = df[df["text"].str.len() > 0].sample(frac=0.25, random_state=SAMPLE_SEED)
## Renumber the sampled rows 0..n-1 (the previous index is kept as a column).
df = df.reset_index()

## Formatting data

In [3]:
## columns copied into the property dict attached to each vector
PROPERTY_COLUMNS = ("url", "title", "title_len", "text", "text_len")


def build_properties(row):
    """Return the property dict for a single article row."""
    return {column: row[column] for column in PROPERTY_COLUMNS}


## one id per article, in row order
ids = df["article_id"].tolist()
## one property dict per article, in the same row order as `ids`
properties = df.apply(build_properties, axis=1).tolist()


## Fetching or Creating Collection

In [4]:
from qwak.exceptions import QwakException
from qwak.vector_store import VectorStoreClient

def fetch_or_create_collection(vector_client, name):
    """Return the collection called `name`, creating it if the lookup raises.

    Any QwakException raised by the lookup is treated as "collection does not
    exist yet" and triggers creation with the demo settings below.
    """
    try:
        return vector_client.get_collection_by_name(name)
    except QwakException:
        return vector_client.create_collection(
            name=name,
            description="Indexing Wikipedia articles ",
            dimension=384,  # presumably the vectorizer's embedding size - confirm
            metric="cosine",
            # The name of a deployed realtime model on Qwak
            vectorizer="sentence_transformer",
        )


## Vector store client used for all collection operations
client = VectorStoreClient()

## Reuse the demo collection when it already exists, otherwise create it
collection_name = "wikipedia-vectorizer-demo"
collection = fetch_or_create_collection(client, collection_name)

## Inserting Data into the Collection

In [5]:
## Number of leading rows to upload. None uploads everything; set an int to
## upload only the first N rows.
## (A value of -1 would slice as [:-1] and silently skip the last article.)
data_len = None
collection.upsert(
    ## List of the article ids
    ids=ids[:data_len],
    ## Natural inputs: the raw article text for each id
    natural_inputs=df['text'][:data_len].tolist(),
    ## List of dict of the article properties
    properties=properties[:data_len]
)

## Search for Similar Results

In [6]:
from qwak.vector_store import VectorStoreClient

## Free-text query and the properties to return with each hit
query_text = "Ducks"
returned_properties = ["title", "title_len", "url"]

## Search the vector store with the natural-language query; ask for the
## distance of each hit but not its vector
search_results = collection.search(
    natural_input=query_text,
    top_results=3,
    output_properties=returned_properties,
    include_distance=True,
    include_vector=False,
)

## Displaying the Search Results

In [8]:
## Print each hit's properties and distance. A plain loop is used instead of a
## list comprehension so the cell does not also echo a list of None values.
for result in search_results:
    print(result.properties, result.distance)

{'title': 'Duck', 'title_len': 4.0, 'url': 'https://simple.wikipedia.org/wiki/Duck'} 0.4547128
{'title': 'Mallard', 'title_len': 7.0, 'url': 'https://simple.wikipedia.org/wiki/Mallard'} 0.4711383
{'title': 'Donald Duck', 'title_len': 11.0, 'url': 'https://simple.wikipedia.org/wiki/Donald%20Duck'} 0.5375511


[None, None, None]