<a href="https://colab.research.google.com/github/sehsanm/openalexindex/blob/main/Open_Alex_Indexing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download one gzipped part (part_000.gz) of the OpenAlex "works" data,
# partition updated_date=2023-05-28, from the public S3 bucket.
# A later cell reads it back via the relative path 'part_000.gz'.
!wget https://openalex.s3.amazonaws.com/data/works/updated_date%3D2023-05-28/part_000.gz

In [None]:
!pip install jsonlines sentence-transformers chromadb 
!pip install InstructorEmbedding
!pip install qdrant-client

In [None]:
import jsonlines
import gzip
def convert_inverted_index(inverted_index_map):
  """Rebuild plain text from an inverted index of the form {token: [positions]}.

  Positions are treated as zero-based word offsets (the convention used by
  OpenAlex's `abstract_inverted_index`), so the token at position 0 is the
  first word of the result.

  Args:
    inverted_index_map: dict mapping each token to the list of positions at
      which it occurs, or None.

  Returns:
    The tokens joined by single spaces in position order, or None when the
    map is None, empty, or contains no positions. Positions that no token
    claims are left as empty strings, so gaps show up as repeated spaces.
  """
  if not inverted_index_map:
    return None
  positions = [pos for locs in inverted_index_map.values() for pos in locs]
  if not positions:
    return None
  # Size the list to hold the highest zero-based position (hence the +1).
  words = [''] * (max(positions) + 1)
  for token, locs in inverted_index_map.items():
    for pos in locs:
      words[pos] = token
  return ' '.join(words)

# Replace this with the path to your gzipped jsonlines file
gzipped_jsonlines_file_path = 'part_000.gz'

# Collect a slimmed-down record (id, title, reconstructed abstract) per work.
objects = list()
# Decode explicitly as UTF-8 instead of relying on the platform's default
# text encoding, which gzip.open() would otherwise use in 'rt' mode.
with gzip.open(gzipped_jsonlines_file_path, 'rt', encoding='utf-8') as file:
    # jsonlines.Reader yields one parsed JSON object per line of the file.
    with jsonlines.Reader(file) as reader:
        for obj in reader:
            objects.append({
                'id': obj['id'],
                'title': obj['title'],
                'abstract': convert_inverted_index(obj['abstract_inverted_index']),
            })
            # Lightweight progress report every 50,000 records.
            if len(objects) % 50000 == 0:
                print(f"{len(objects):10,} records processed")

In [None]:
import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings

# Embedding function configured to run the Instructor model on the GPU.
ef = embedding_functions.InstructorEmbeddingFunction(model_name="hkunlp/instructor-base", device="cuda")

# Persistent data store so that indexing can be continued in a later run.
chroma_client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
                                    persist_directory="/content"
                                ))
# get_or_create (not create) so re-running this cell reuses the existing
# collection instead of failing because "openalex" already exists.
collection = chroma_client.get_or_create_collection(name="openalex", embedding_function=ef)
batch_size = 2000
batch_index = 0
docs = list()
doc_ids = list()
for index, doc in enumerate(objects):
  if len(collection.get(ids=doc['id'])['ids']) > 0:
    # Already indexed: we want to be able to run this multiple times.
    continue
  # Join only the parts that are present, so a missing title/abstract is not
  # embedded as the literal text "None".
  text = "\n".join(part for part in (doc['title'], doc['abstract']) if part)
  if not text:
    # Nothing to embed for this record.
    continue
  docs.append(text)
  doc_ids.append(doc['id'])
  if len(docs) >= batch_size:
    batch_index = batch_index + 1
    collection.add(
        documents=docs,
        ids=doc_ids)

    docs.clear()
    doc_ids.clear()
    print(f"Next batch processed. index:{index+1:,}")
    # Every 10 batches, give the user a chance to stop early.
    if batch_index % 10 == 0:
      x = input('Do you want to stop processing further?')
      if x == 'y':
        break

# Flush the final, partially filled batch. The in-loop check only fires on
# full batches; after an early stop `docs` is already empty, so this is a no-op.
if docs:
  collection.add(
      documents=docs,
      ids=doc_ids)
  docs.clear()
  doc_ids.clear()
  print(f"Final batch processed. index:{len(objects):,}")

# In a notebook the duckdb+parquet store is only written to persist_directory
# when persist() is called explicitly.
chroma_client.persist()



In [None]:
# Interactive search loop: an empty query ends the session.
while True:
  q = input("Enter the query:")
  if len(q) == 0:
    break
  results = collection.query(
      query_texts=[q],
      n_results=10)
  # `results` is a dict of parallel lists with one entry per query text;
  # index 0 is our single query. Iterate over the hits themselves rather than
  # over the dict's keys, so every returned hit is printed exactly once.
  for doc_id, document in zip(results['ids'][0], results['documents'][0]):
    print(doc_id)
    print(document)


In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams


# PointStruct is used below but is not among this cell's imports.
from qdrant_client.models import PointStruct

# One name shared by creation and upsert, so points go into the collection
# that was actually created.
qdrant_collection_name = "openalex"

# Local, file-backed Qdrant instance stored under /content/qdrant.
client = QdrantClient(path="/content/qdrant")

# Drops any existing collection of this name and creates a fresh one.
client.recreate_collection(
    collection_name=qdrant_collection_name,
    # NOTE(review): size must equal the dimensionality of the rows in `vectors` — confirm 100 is correct.
    vectors_config=VectorParams(size=100, distance=Distance.COSINE),
)

# NOTE(review): `vectors` is not defined in any visible cell; it is presumably a
# sequence of numpy arrays (each row is converted with .tolist()) — define it
# before running this cell. The payload below looks like placeholder data.
client.upsert(
    collection_name=qdrant_collection_name,
    points=[
        PointStruct(
            id=idx,
            vector=vector.tolist(),
            payload={"color": "red", "rand_number": idx % 10}
        )
        for idx, vector in enumerate(vectors)
    ]
)