In [18]:
import pandas as pd
# Path of the queries CSV used as the raw input of this notebook.
QUERIES_CSV_FILE = "/workspace/datasets/train.csv"
# Read only the first 1000 rows for a quick look at the data.
df_queries = pd.read_csv(QUERIES_CSV_FILE, nrows = 1000)
# Last expression of the cell: displays the first five rows.
df_queries.head()

Unnamed: 0,user,sku,category,query,click_time,query_time
0,000000df17cd56a5df4a94074e133c9d4739fae3,2125233,abcat0101001,Televisiones Panasonic 50 pulgadas,2011-09-01 23:44:52.533,2011-09-01 23:43:59.752
1,000001928162247ffaf63185cd8b2a244c78e7c6,2009324,abcat0101001,Sharp,2011-09-05 12:25:37.42,2011-09-05 12:25:01.187
2,000017f79c2b5da56721f22f9fdd726b13daf8e8,1517163,pcmcat193100050014,nook,2011-08-24 12:56:58.91,2011-08-24 12:55:13.012
3,000017f79c2b5da56721f22f9fdd726b13daf8e8,2877125,abcat0101001,rca,2011-10-25 07:18:14.722,2011-10-25 07:16:51.759
4,000017f79c2b5da56721f22f9fdd726b13daf8e8,2877134,abcat0101005,rca,2011-10-25 07:19:51.697,2011-10-25 07:16:51.759


In [56]:
# Build a small sample of the queries file (at most 6000 rows) with a
# sequential doc_id column, and save it for the indexing cells.
df_queries_small = (
    pd.read_csv(QUERIES_CSV_FILE, nrows = 6000)
      # Number the rows by how many were actually read, so a CSV with fewer
      # than 6000 rows does not fail with a length mismatch.
      .assign(doc_id = lambda d: list(range(len(d))))
)
# index=False keeps the pandas row index out of the written file.
df_queries_small.to_csv("/tmp/queries.csv", index=False)

In [161]:
from opensearchpy.helpers import bulk
def prepare_batch_queries(batch):
    """Return a copy of ``batch`` with its timestamp columns parsed.

    The ``click_time`` and ``query_time`` columns are converted with
    ``pd.to_datetime(..., format='ISO8601')``; every other column is
    carried over unchanged. The input frame itself is not modified.
    """
    def as_timestamp(series):
        # Shared parser so both columns are converted the same way.
        return pd.to_datetime(series, format='ISO8601')

    return batch.assign(
        click_time=lambda frame: as_timestamp(frame.click_time),
        query_time=lambda frame: as_timestamp(frame.query_time),
    )

def index_batch(batch, index_name:str):
    """Bulk-index every row of ``batch`` into ``index_name``.

    Each row becomes one document whose ``_source`` is the row as a dict
    and whose ``_id`` is a freshly generated random UUID hex string, so
    indexing the same batch twice creates new documents rather than
    overwriting the earlier ones.

    Relies on the notebook-level ``client`` and the ``bulk`` helper being
    defined before this function is called.

    Args:
        batch: DataFrame of rows to index.
        index_name: Name of the target index.

    Returns:
        The number of rows in ``batch`` (the value returned by ``bulk`` is
        not inspected).
    """
    # Imported locally: no visible cell imports uuid, so on a fresh kernel
    # the function would otherwise fail with a NameError.
    import uuid

    records = batch.to_dict(orient = "records")
    docs = [
        {"_id": uuid.uuid4().hex, "_index": index_name, "_source": record}
        for record in records
    ]
    bulk(client, docs, request_timeout=60)
    return len(batch)

In [131]:
from tqdm import tqdm
from functools import partial
from concurrent.futures import ProcessPoolExecutor
from itertools import repeat
def index_batches(batches, index_name:str):
    """Index all ``batches`` into ``index_name`` using a pool of 8 worker processes.

    Every batch is passed to ``index_batch`` together with ``index_name``;
    a tqdm progress bar advances as results arrive, and the summed
    per-batch counts are printed at the end. Nothing is returned.
    """
    with ProcessPoolExecutor(max_workers=8) as pool:
        # repeat(index_name) supplies the same index name for every batch.
        per_batch_counts = pool.map(index_batch, batches, repeat(index_name))
        progress = tqdm(per_batch_counts, total=len(batches))
        results = list(progress)
    total_indexed = sum(results)
    print(f"Indexed {total_indexed} records")
    

In [132]:
from opensearchpy import OpenSearch
from IPython.display import JSON
import json

def print_json(x):
    """Print ``x`` to stdout as JSON, indented by two spaces."""
    rendered = json.dumps(x, indent=2)
    print(rendered)
    
import os

# Connection settings come from the environment when set, and fall back to
# the local development defaults used previously (localhost:9200, admin/admin).
# This keeps real credentials out of the notebook source.
client = OpenSearch(
    hosts = [{
        "host": os.environ.get("OPENSEARCH_HOST", "localhost"),
        "port": int(os.environ.get("OPENSEARCH_PORT", "9200")),
    }],
    http_auth = (
        os.environ.get("OPENSEARCH_USER", "admin"),
        os.environ.get("OPENSEARCH_PASSWORD", "admin"),
    ),
    use_ssl = True,
    # NOTE(review): certificate and hostname checks are switched off here;
    # confirm this client is only ever pointed at a local test cluster.
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False,
)
# Print the cluster info returned by the server.
print_json(client.info())

{
  "name": "fc81333c71df",
  "cluster_name": "docker-cluster",
  "cluster_uuid": "_Xsvjrs0TJOF3p6QlsWnJA",
  "version": {
    "distribution": "opensearch",
    "number": "2.9.0",
    "build_type": "tar",
    "build_hash": "1164221ee2b8ba3560f0ff492309867beea28433",
    "build_date": "2023-07-18T21:23:29.367080729Z",
    "build_snapshot": false,
    "lucene_version": "9.7.0",
    "minimum_wire_compatibility_version": "7.10.0",
    "minimum_index_compatibility_version": "7.0.0"
  },
  "tagline": "The OpenSearch Project: https://opensearch.org/"
}


In [162]:
# Install PyYAML into the kernel's environment; pip's stdout is sent to /dev/null.
%pip install pyyaml > /dev/null
import yaml
# Index settings written as YAML and parsed into a dict. The only setting
# given is index.query.default_field = "body".
body = yaml.safe_load("""
settings:
  index:
    query:
      default_field: body

""")
# Create the scratch index with those settings and print the server's reply.
response = client.indices.create("test-index-to-delete-1", body = body)
print_json(response)

You should consider upgrading via the '/home/gitpod/.pyenv/versions/3.9.7/envs/search_fundamentals/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
{
  "acknowledged": true,
  "shards_acknowledged": true,
  "index": "test-index-to-delete-1"
}


In [163]:
# Read the sampled CSV back in chunks of 3000 rows; each chunk is one batch.
batches = list(pd.read_csv("/tmp/queries.csv", chunksize=3000))
# Number of batches; this expression is the value the cell displays.
len(batches)
# Optional step, currently disabled: parse the timestamp columns of every batch.
# batches = [prepare_batch_queries(batch) for batch in batches]

2

In [164]:
# Index all batches into the scratch index; prints the total number of records.
index_batches(batches, "test-index-to-delete-1")

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:00<00:00,  3.08it/s]

Indexed 6000 records





In [165]:
# Refresh the index before counting, so recent writes are included.
client.indices.refresh(index = "test-index-to-delete-1")
# Document count for the index, requested in JSON format.
client.cat.count(index = "test-index-to-delete-1", format="json")

[{'epoch': '1695789379', 'timestamp': '04:36:19', 'count': '6000'}]

In [173]:
def display_search_response(response):
  """Flatten a search response into a DataFrame with one row per hit.

  The hit-level fields (every key of a hit except ``_source``) come first,
  followed by one column per key found in the hits' ``_source`` dicts.

  Args:
      response: Search response dict containing ``response['hits']['hits']``,
          a list of hit dicts that each carry a ``_source`` dict.

  Returns:
      A DataFrame with one row per hit, or an empty DataFrame when the
      response contains no hits.
  """
  hits = response['hits']['hits']
  if not hits:
    # With no hits there is no "_source" column to drop, and the drop
    # below would raise KeyError; return an empty frame instead.
    return pd.DataFrame()
  sources = [hit["_source"] for hit in hits]
  hit_metadata = pd.DataFrame(hits).drop(["_source"], axis=1)
  return pd.concat([hit_metadata, pd.DataFrame(sources)], axis=1)

In [174]:
# Search Documents
# Request body written as YAML: match every document and ask for up to
# 10000 hits in a single response.
body_yaml = """
query:
  match_all: {}
size: 10000
"""
response = client.search(
  index = "test-index-to-delete-1",
  body = yaml.safe_load(body_yaml)
)
# Flatten the hits into a DataFrame for inspection in the next cell.
out = display_search_response(response)

In [175]:
# Show the first five flattened hits.
out.head()

Unnamed: 0,_index,_id,_score,user,sku,category,query,click_time,query_time,doc_id
0,test-index-to-delete-1,0d544c47be1f4feeacfe40fad2defe59,1.0,0065873bd2a2a4700e9f60a850afcaaed9361184,9937316,abcat0301014,Truck gps,2011-09-29 00:47:53.004,2011-09-29 00:45:33.186,3018
1,test-index-to-delete-1,6733fe0976fb4b50bdb3be9ef281c80f,1.0,00007557d9b11f2f1a99792317963d30174171ba,3168067,cat02713,Watch The Throne,2011-09-04 10:55:20.427,2011-09-04 10:55:10.874,12
2,test-index-to-delete-1,fa3200908ba541bd91c6511939570a2e,1.0,0000776d7bf35b984ca8e3671327a7ac1d07a86c,7997055,pcmcat224000050003,Remote control extender,2011-10-28 16:26:29.203,2011-10-28 16:26:20.358,13
3,test-index-to-delete-1,d68d2583952847f6b9a1ba3e92381042,1.0,00659ce53484a6e186752394df313f835f289f70,2658129,pcmcat212600050008,Mac mini,2011-09-07 20:09:22.633,2011-09-07 20:08:08.387,3022
4,test-index-to-delete-1,8b1c88ba1c634d728204ffa7d0281a3c,1.0,0000870469b85f38ceba4b1add61419eb8da9dc5,1988047,abcat0707001,3ds,2011-09-23 22:14:08.965,2011-09-23 22:13:37.43,15


In [160]:
# Clean up: delete the scratch index used by the cells above.
client.indices.delete('test-index-to-delete-1')

{'acknowledged': True}