In [1]:
import pandas as pd
import transformers
from embedding_model import Model

In [2]:
# Load pretrained Model
# `Model` is the project's wrapper from embedding_model; make_pretrained() is
# given the checkpoint name and reports "Model Compiled: <name>" when done.
# NOTE(review): presumably this fetches/loads sentence-embedding weights --
# confirm in embedding_model.
model = Model()
model.make_pretrained('stsb-distilbert-base')

Model Compiled: stsb-distilbert-base


In [3]:
# Loading dataset
# Big JSON file: read it through a context manager so the file handle is
# closed as soon as parsing finishes instead of lingering in the kernel.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the load
# does not depend on the machine's locale.
import json
with open('data/arxivData.json', encoding='utf-8') as arxiv_file:
    arxiv = json.load(arxiv_file)

# Get it in line
def join_ds(x, useful_keys):
    """Join the selected fields of one record into a single string.

    Args:
        x: Mapping for one document; every name in ``useful_keys`` is looked
            up in it, so a missing field raises ``KeyError``.
        useful_keys: Ordered field names whose (string) values are joined.

    Returns:
        The selected values in ``useful_keys`` order, separated by ``'. '``.
    """
    # Join every requested field instead of hard-coding positions 0 and 1,
    # so the helper works for one key or many; two keys behave as before.
    return '. '.join(x[key] for key in useful_keys)

# Keep only the title and summary of every paper, joined into one string
useful_keys = ['title', 'summary']
arxiv_data = [join_ds(paper, useful_keys) for paper in arxiv]

# Numeric labels: each document is identified by its position in arxiv_data
arxiv_data_labels = list(range(len(arxiv_data)))

# Label -> document text mapping for pretty outputs
label_mapping = dict(enumerate(arxiv_data))


In [4]:
# Turn a small window of documents into vectors, one 768-dim row per document.
# NOTE: `sample_len` is used as the exclusive END index of the slice, not as a
# count, so documents i .. sample_len-1 (sample_len - i of them) are selected.
i = 21
sample_len = 50
doc_window = slice(i, sample_len)

# Labels and vectors are cut with the same window so they stay aligned.
data_labels = arxiv_data_labels[doc_window]
data_vec = model.encode_sentences(arxiv_data[doc_window])
print("Data to be added to the index: ", len(data_labels), data_vec.shape)

Data to be added to the index:  29 (29, 768)


In [5]:
# Make Index from docs
import os
import shutil
import re
import hnswlib
import numpy as np
from utils import *

# Settings handed to the HNSW Index wrapper when it is constructed.
# NOTE(review): M / ef_construction are presumably forwarded to hnswlib as its
# graph-construction parameters -- confirm in search_index.
HNSW_PARAMS = dict(
    save_file='hnsw_index.bin',       # file name the index is saved under
    M=16,
    ef_construction=200,
    num_threads=MAX_SEARCH_THREADS,
    num_elements=50,
    label_mapping=label_mapping,      # label -> document text lookup
)

from search_index import Index

# Initiate the Search Index wrapper class with the HNSW_PARAMS settings.
# The index itself is created later by define_index(); right after
# construction the wrapper reports index_loaded: False.
hnsw_index = Index(HNSW_PARAMS)


In [6]:
# Show the wrapper's current settings; its string form lists the save
# location, the HNSW parameters and whether an index is loaded.
print(hnsw_index)


        HNSW Index Params: 
            SAVE_DIR: /mnt/z/Project/semantic_search/qualia/model/v1/hnsw_index/,
            SAVE_FILE: hnsw_index.bin,
            CURR_IDX_SIZE: 49,
            M: 16,
            ef_construction: 200,
            item_batch_size: 10,
            num_threads: -1,
            num_elements: 50,
            index_loaded: False
        


In [7]:
# Create a new index with an initial size of 10 and save it to the configured
# file (the log reports "Save index size: 10").
hnsw_index.define_index(idx_size=10)
# NOTE(review): init_search() presumably loads the saved index so it can be
# queried -- the log prints "Loading saved index"; confirm in search_index.
hnsw_index.init_search()

First time?
Saving Fresh New index: /mnt/z/Project/semantic_search/qualia/model/v1/hnsw_index/hnsw_index.bin
Save index size: 10
Loading saved index


In [8]:
# Add the encoded documents and their labels to the index.
# Per the log, the saved index is reloaded with a larger max_elements, the
# items are added in batches, and the index is saved again.
hnsw_index.update_index(data_vec, data_labels)


            Loading Previously saved index:
                Loading File:     hnsw_index.bin
                New max_elements: 39
        
Adding n=10 batches of items from the data
29 29
Adding keys to update list
Adding keys to update list
Adding keys to update list
Saving new index of size 29
Save index size: 29


In [9]:
# Number of neighbours to request for the query
MAX_NEAREST_NBRS = 5

# Embed the query text first, then look up its nearest neighbours in the index.
# NOTE(review): the query is passed as a bare string, whereas the documents
# were encoded from a list -- confirm encode_sentences accepts both forms.
query_vec = model.encode_sentences("reinforcement")
matches = hnsw_index.search(query_vec, max_nearest_nbrs=MAX_NEAREST_NBRS)

In [10]:
# Number of items currently stored in the index (29 after the update above)
hnsw_index.get_current_count()

29

In [11]:
# Number of hits returned under the 'matches' key; 5 here, matching MAX_NEAREST_NBRS
len(matches['matches'])

5