In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer

In [4]:
dataset = pd.read_csv('../../data/paragraph_clean_data.csv')

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  79 non-null     int64 
 1   paragraph   79 non-null     object
 2   url         79 non-null     object
dtypes: int64(1), object(2)
memory usage: 2.0+ KB


In [6]:
encoder_model = SentenceTransformer('all-MiniLM-L6-v2')
# Utility function for generating sentence embedding from the text
def get_embeddinngs(text):
    return encoder_model.encode(text)


In [7]:

# Generating sentence embedding from the text
dataset['embeddings'] = dataset['paragraph'].apply(get_embeddinngs)

In [8]:
import requests
substring = "You Know, for Search".encode()
response = requests.get("http://localhost:9200")
print(response)
if substring in response.content:
   print("Elasticsearch is up and running!")
else:
   print("Something went wrong, ensure the cluster is up!")

<Response [200]>
Elasticsearch is up and running!


In [9]:
dataset['embeddings'][0].shape

(384,)

In [10]:
settings= {
    "number_of_shards": 1,
  }
mappings = {
    "properties": {
         "embeddings": { 
                        "type": "dense_vector",
                        "dims": 384,
                        "index": True,
                        "similarity": "cosine" },
          "paragraph":{ "type":"text"},
          "url":{ "type":"text"}
    }
  }

In [11]:
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
es.indices.delete(index='articles')
es.indices.create(index='articles', settings=settings, mappings=mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'articles'})

In [12]:
rows = 0
elk_data = dataset.to_dict("records") 

for article_row in elk_data:
  es.index(index="articles",document=article_row)
  rows += 1
print("Total articles inserted: {}".format(rows))

Total articles inserted: 79


In [19]:
# Inference Code

def search(query, limit=3):
  token_vector = get_embeddinngs(query)
  es_query ={
   "size":limit,
   "knn": {
    "field": "embeddings",
    "query_vector": token_vector,
    "k": 10,
    "num_candidates": 100
  },
  }
  results = pd.DataFrame(columns = ["similarity_score", "article_url",'article_text'])
  for result in es.search(index="articles", body=es_query)["hits"]["hits"]:
    #print(result['_score'])
    #print(result['_source']['paragraph'])
    new_row = pd.Series({'similarity_score': result['_score'], 'article_url':result['_source']['url'],'article_text':result['_source']['paragraph']})
    results = pd.concat([results, new_row.to_frame().T], ignore_index=True)
  return results

In [20]:
query = "China and Tesla"

results = search(query, 5)
print(results)

  similarity_score                                        article_url  \
0         0.812898  https://www.reuters.com/video/watch/idOV554115...   
1         0.781714  https://www.reuters.com/business/autos-transpo...   
2         0.762682  https://www.reuters.com/business/autos-transpo...   
3         0.759719  https://www.reuters.com/business/autos-transpo...   
4         0.756305  https://www.reuters.com/business/autos-transpo...   

                                        article_text  
0  Business Tesla weighs China sales reset Posted...  
1  SHANGHAI Sept 15 Reuters Tesla is reevaluating...  
2  Register now for FREE unlimited access to Reut...  
3  Register now for FREE unlimited access to Reut...  
4  Tesla Inc CEO Elon Musk and Shanghai is Mayor ...  


  for result in es.search(index="articles", body=es_query)["hits"]["hits"]:
