In [None]:
! pip install elasticsearch
! pip install transformers
! pip install flask
! pip install flask-cors

In [93]:
import json
import os

import numpy as np
import pandas as pd
import torch
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from flask import Flask
from flask import request
from flask_cors import CORS, cross_origin
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel

tqdm.pandas()

In [4]:
# Load the pre-computed paper table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
ARXIV_PICKLE_PATH = 'arxiv_embedded_sbert_full.pkl'
df = pd.read_pickle(ARXIV_PICKLE_PATH)

In [23]:
# Connection settings for the Elasticsearch node.
# Credentials are read from the environment so that no secret is stored in the notebook.
# NOTE(review): a password used to be hardcoded in this cell - treat it as leaked and rotate it.
username = os.environ.get("ES_USERNAME", "elastic")
# Empty when ES_PASSWORD is unset; authentication against the cluster will then fail.
password = os.environ.get("ES_PASSWORD", "")
host = {"scheme": "https", "host": "host.docker.internal", "port": 9200}

In [None]:
# Build the client from the connection settings defined earlier in the notebook.
# NOTE(review): verify_certs=False disables TLS certificate validation - tolerable for a
# local node with a self-signed certificate, unsafe against anything else.
es = Elasticsearch(hosts=[host], basic_auth=(username, password), verify_certs=False)

# Connectivity check; the result is displayed as the cell output.
es.ping()

In [49]:
# Mapping for the paper index: a numeric id, a set of `text` fields, and one
# HNSW-indexed 768-dimensional dense vector compared with cosine similarity.
# NOTE(review): get_search_results() runs its kNN query against "search_text_vector",
# which is not declared here - confirm which vector field is intended.
index_mapping = {
    "properties": {
        "id": {"type": "long"},
        **{
            text_field: {"type": "text"}
            for text_field in (
                "title",
                "abstract",
                "authors",
                "doi",
                "update_date",
                "search_text",
            )
        },
        "abstract_vector": {
            "type": "dense_vector",
            "dims": 768,
            "index": True,
            "similarity": "cosine",
            "index_options": {"type": "hnsw", "ef_construction": 128, "m": 24},
        },
    }
}

In [None]:
# Create the index only when it is missing, so the cell can be re-run
# (Restart & Run All) without failing on an already-existing index.
if not es.indices.exists(index="all_papers_sbert"):
    es.indices.create(index="all_papers_sbert", mappings=index_mapping)

In [57]:
# One dict per DataFrame row, keyed by column name - the shape used for indexing below.
papers_list = df.to_dict(orient="records")

In [None]:
# Index every paper through the bulk API instead of issuing one HTTP request per
# document. Indexing stays best effort: a failed document is reported and skipped,
# it does not abort the run.
index_actions = (
    {"_index": "all_papers_sbert", "_id": paper['id'], "_source": paper}
    for paper in papers_list
)

failed_count = 0
bulk_results = helpers.streaming_bulk(
    es,
    index_actions,
    raise_on_error=False,      # per-document errors are yielded, not raised
    raise_on_exception=False,  # transport failures are yielded as errors as well
)
for ok, item in tqdm(bulk_results, total=len(papers_list), desc="indexing"):
    if not ok:
        failed_count += 1
        print(item)

print(f"indexed {len(papers_list) - failed_count} of {len(papers_list)} papers")

In [None]:
# Sanity check after indexing: ask Elasticsearch for the document count of the index
# (the response is displayed as the cell output).
es.count(index="all_papers_sbert")

In [15]:
def get_bert_sentence_vectors(model, tokenizer, documents):
    """Embed text with a transformer encoder using attention-masked mean pooling.

    Padding positions are excluded from the average, so a document's vector does
    not depend on what else is in the batch. For a single document (no padding)
    this is the plain mean over its tokens.

    Args:
        model: Encoder called as ``model(**encoded)``; its result must expose
            ``last_hidden_state`` with shape (batch, tokens, hidden).
        tokenizer: Tokenizer called with ``return_tensors='pt'``; expected to
            return a mapping of tensors, normally including ``attention_mask``.
        documents: A string or a list of strings to embed.

    Returns:
        numpy.ndarray: Pooled vectors with singleton axes squeezed away, i.e.
        ``(hidden,)`` for one document and ``(batch, hidden)`` for several.
    """
    encoded = tokenizer(documents, return_tensors='pt', padding=True, truncation=True)

    # Inference only: do not build an autograd graph for the forward pass.
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state

    attention_mask = encoded.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to the unweighted mean over all positions.
        pooled = hidden.mean(dim=1)
    else:
        weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
        # clamp() guards against division by zero for an all-padding row.
        pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)

    return np.squeeze(pooled.detach().cpu().numpy())

In [None]:
# Load the tokenizer and the encoder weights from the same pretrained checkpoint.
SBERT_CHECKPOINT = 'sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(SBERT_CHECKPOINT)
model = AutoModel.from_pretrained(SBERT_CHECKPOINT)

In [122]:
# Search for Keyword...
def get_search_results(input_keyword, k=100, num_candidates=1000):
    """Run one query both as a kNN vector search and as a full-text search.

    Args:
        input_keyword: Query text entered by the user.
        k: Number of nearest neighbours returned by the vector search.
        num_candidates: Number of candidates the vector search considers
            before keeping the best ``k``.

    Returns:
        tuple[list, list]: ``(vector_hits, text_hits)`` - the ``hits.hits``
        lists of the two Elasticsearch responses.
    """
    index_name = "all_papers_sbert"
    # NOTE(review): 'neighbors' is not requested here, so get_pagerank_results()
    # finds no links in these hits - confirm whether it should be included.
    source_fields = ['title', 'abstract', 'doi', 'update_date', 'authors']

    input_vec = get_bert_sentence_vectors(model, tokenizer, input_keyword)

    # NOTE(review): the index mapping in this notebook declares "abstract_vector" as its
    # dense_vector field; confirm that "search_text_vector" exists and is indexed for kNN.
    query_vector = {
        "field": "search_text_vector",
        # Plain list: the request must not depend on the client's optional NumPy serialisation.
        "query_vector": np.asarray(input_vec).tolist(),
        "k": k,
        "num_candidates": num_candidates,
    }

    text_query = {
        "multi_match": {
            "query": input_keyword,
            "fields": ["authors^1.0", "title^6.0", "abstract^4.0"],
            "operator": "and"
        }
    }

    # knn_search is deprecated in recent 8.x clients; kept to preserve the request
    # this notebook was written against.
    search_results_vector = es.knn_search(
        index=index_name, knn=query_vector, source=source_fields
    )["hits"]["hits"]

    # `query=` replaces the deprecated `body=` argument, which should not be combined
    # with per-field keyword arguments such as `source=`.
    search_results_text = es.search(
        index=index_name, query=text_query, source=source_fields
    )["hits"]["hits"]

    return search_results_vector, search_results_text
    
def get_pagerank_results(search_results_vector, damping_factor=0.85, num_iterations=100):
    """Re-rank search hits by Elasticsearch score plus a PageRank score.

    A link graph is built over the hits themselves: each hit may list linked
    document ids under ``_source['neighbors']``. Links that point outside the
    result set are ignored. PageRank is computed on that graph by power
    iteration and added to every hit's ``_score``.

    Args:
        search_results_vector: List of Elasticsearch hit dicts with ``_id`` and
            ``_score`` (and optionally ``_source['neighbors']``). Each dict
            gains a ``_combined_score`` key in place.
        damping_factor: PageRank damping factor.
        num_iterations: Number of power-iteration steps.

    Returns:
        list: The same hit dicts in a new list, sorted by ``_combined_score``
        in descending order. An empty input yields an empty list.
    """
    num_nodes = len(search_results_vector)
    if num_nodes == 0:
        return []

    def node_key(raw_id):
        # Ids can arrive as 12, "12" or "12.0"; normalise so both ends of a link match.
        try:
            return int(float(raw_id))
        except (TypeError, ValueError, OverflowError):
            return str(raw_id)

    # Rows/columns of the matrix are positions in the hit list. Document ids are
    # arbitrary numbers and must not be used as matrix indices directly.
    position_of = {
        node_key(hit['_id']): position
        for position, hit in enumerate(search_results_vector)
    }

    adjacency_matrix = np.zeros((num_nodes, num_nodes))
    for position, hit in enumerate(search_results_vector):
        neighbors = (hit.get('_source') or {}).get('neighbors') or []
        for neighbor_id in neighbors:
            neighbor_position = position_of.get(node_key(neighbor_id))
            if neighbor_position is not None:
                adjacency_matrix[position, neighbor_position] = 1

    # Row-stochastic transition matrix; hits without outgoing links spread their
    # rank uniformly over all hits.
    out_degree = adjacency_matrix.sum(axis=1, keepdims=True)
    has_links = out_degree > 0
    transition = np.where(
        has_links,
        adjacency_matrix / np.where(has_links, out_degree, 1.0),
        1.0 / num_nodes,
    )

    pagerank_scores = np.full(num_nodes, 1.0 / num_nodes)
    teleport = (1.0 - damping_factor) / num_nodes
    for _ in range(num_iterations):
        pagerank_scores = teleport + damping_factor * (transition.T @ pagerank_scores)

    for result, pagerank_score in zip(search_results_vector, pagerank_scores):
        # float() keeps the value a plain Python number for JSON serialisation.
        result['_combined_score'] = result['_score'] + float(pagerank_score)

    return sorted(search_results_vector, key=lambda hit: hit['_combined_score'], reverse=True)

In [None]:
# Flask application for the search endpoint, wrapped by the flask-cors extension.
# The CORS_HEADERS setting is applied after the extension is attached.
app = Flask(__name__)
cors = CORS(app)
app.config.update(CORS_HEADERS='Content-Type')

@app.route('/search')
@cross_origin()
def get_results():
    """Handle ``GET /search?search_key=<text>``.

    Runs the text and vector searches for the given key, re-ranks each result
    list with get_pagerank_results(), and returns the text hits followed by the
    vector hits as one JSON-encoded list.

    Returns:
        The JSON string on success, or a ``400`` response with a JSON error
        object when ``search_key`` is missing or blank.
    """
    search_key = request.args.get('search_key')
    if search_key is None or not search_key.strip():
        # Without this guard the missing value reaches the tokenizer and surfaces as a 500.
        error_body = json.dumps({"error": "missing required query parameter 'search_key'"})
        return error_body, 400, {"Content-Type": "application/json"}

    search_results_vector, search_results_text = get_search_results(search_key)
    search_results = get_pagerank_results(search_results_text) + get_pagerank_results(search_results_vector)
    # The success response is left as a bare JSON string so existing clients see no change.
    return json.dumps(search_results)

# Start Flask's built-in server when this code is run directly.
# host='0.0.0.0' listens on every network interface, port 8080; the call blocks
# until the server is stopped.
if __name__ == "__main__":
    app.run(host='0.0.0.0', port=8080)