In [None]:
from elasticsearch import Elasticsearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
import boto3
from pprint import pprint

# Endpoint of the Amazon Elasticsearch domain (or of a proxy in front of it).
# It may carry an optional path prefix, e.g. foo.execute-api.us-east-1.amazonaws.com/prod
proxy_endpoint = 'search-company-sim-4k7ovt44tpdnhmpudn73oyf5cu.eu-west-2.es.amazonaws.com/'

# Split once on the first '/' so that
#   * an endpoint without any '/' gives an empty prefix instead of an IndexError, and
#   * a multi-segment prefix ("prod/v1") is kept whole instead of being cut at "prod".
host, _, url_prefix = proxy_endpoint.partition('/')
url_prefix = url_prefix.strip('/')
endpoint_parts = [host, url_prefix]  # kept so any cell that still reads it keeps working

region = 'eu-west-2'  # e.g. us-east-1
service = 'es'

# Resolve credentials through the default boto3 chain (env vars, shared config, role, ...).
credentials = boto3.Session().get_credentials()
if credentials is None:
    raise RuntimeError(
        "No AWS credentials found; configure them before creating the Elasticsearch client."
    )

# Pass the session token as well: temporary credentials (IAM roles, SSO, STS) cannot
# sign requests without it. For long-lived keys `credentials.token` is simply None.
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,
    service,
    session_token=credentials.token,
)

# HTTPS client whose requests are SigV4-signed via `awsauth`.
es = Elasticsearch(
    hosts=[{'host': host, 'url_prefix': url_prefix, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

In [None]:
# Store one sample document under id "5" in the "news" index, then read it back.
document = {"content": "Bob has been living in Seattle for more than ten years"}

es.index(index="news", doc_type="_doc", id="5", body=document)

fetched = es.get(index="news", doc_type="_doc", id="5")
print(fetched)

In [None]:
# INDEX CREATE
# Creates the demo k-NN index "my-index".
# In Elasticsearch an index is the collection of documents; here it is created with
# "index.knn" switched on and two "knn_vector" fields of dimension 2 and 4.
query = {
    "settings": {
        "index.knn": 'true'
    },
    "mappings": {
        "properties": {
            "my_vector1": {"type": "knn_vector", "dimension": 2},
            "my_vector2": {"type": "knn_vector", "dimension": 4},
        }
    },
}

# Creating an index that already exists raises an error, which made this cell fail on
# every re-run. Check first so the notebook can be re-executed top to bottom.
if es.indices.exists(index='my-index'):
    print("index 'my-index' already exists - skipping creation")
else:
    print(es.indices.create(index='my-index', body=query))

In [None]:
# POST
# Adds one record (a 2-d vector plus a price) to the k-NN index "my-index" under id '4'.
query = dict(my_vector1=[5.5, 6.5], price=1.2)
es.index(body=query, index='my-index', id='4')

In [None]:
# GET
# k-NN lookup on "my-index": request the 2 nearest neighbours (k=2, size=2)
# of the point [1.5, 2.5] on the "my_vector1" field.
query = {
    "size": 2,
    "query": {"knn": {"my_vector1": {"vector": [1.5, 2.5], "k": 2}}},
}

res = es.search(body=query, index='my-index')
hits = res['hits']['hits']
pprint(hits)

In [None]:
# ------------- bulk upload of real data -------------

# INDEX CREATE
# Creates the k-NN index "cmp-sim" with a single "knn_vector" field named "vectors"
# of dimension 1000.
query = {
    "settings": {
        "index.knn": 'true'
    },
    "mappings": {
        "properties": {
            "vectors": {"type": "knn_vector", "dimension": 1000},
        }
    },
}

# Creating an index that already exists raises an error, which made this cell fail on
# every re-run. Check first so the notebook can be re-executed top to bottom.
if es.indices.exists(index='cmp-sim'):
    print("index 'cmp-sim' already exists - skipping creation")
else:
    print(es.indices.create(index='cmp-sim', body=query))

In [None]:
# Read the records from a local JSON file into a pandas DataFrame and display it.
import pandas as pd

final_df = pd.read_json(path_or_buf='/home/priyank/Downloads/bulk_vec.json')
final_df

In [None]:
# Bulk upload
# Sends every row of final_df to the "cmp-sim" index in one helpers.bulk call.
# This worked for all 5K records once a high request timeout and a small chunk size were used.
from datetime import datetime
from elasticsearch import helpers

# One bulk action per row; the row number doubles as the document id.
# ("_type" is deliberately not set.)
actions = [
    dict(
        _index="cmp-sim",
        _id=j,
        _source=dict(
            reg=final_df['reg'][j],
            name=final_df['name'][j],
            link=final_df['link'][j],
            vectors=final_df['vectors'][j],
        ),
    )
    for j in range(len(final_df))
]

helpers.bulk(es, actions, chunk_size=100, request_timeout=200)

In [None]:
# Delete all data inside the "cmp-sim" index without deleting the index itself.
# This works.
match_everything = {"query": {"match_all": {}}}
es.delete_by_query(index="cmp-sim", body=match_everything, request_timeout=200)

In [None]:
# Delete one specific document, addressed by its id, from "cmp-sim".
# This works.
es.delete(id=3, index="cmp-sim")

In [None]:
# Running k-NN on the 5K bulk-loaded records
# GET - this works.
# The vector of row 4997 of final_df is sent as the query vector; the 2 nearest
# neighbours (k=2, size=2) on the "vectors" field are requested.
query_vector = final_df['vectors'][4997]
query = {
    "size": 2,
    "query": {"knn": {"vectors": {"vector": query_vector, "k": 2}}},
}

res = es.search(body=query, index='cmp-sim')
pprint(res['hits']['hits'])

In [None]:
# Show the company name carried by each hit of the last search.
for hit in res['hits']['hits']:
    source = hit['_source']
    pprint(source['name'])

In [None]:
# Collect name, link, registration number and match score of every hit of the
# last search into a list of dicts.
output = [
    {
        'name': hit['_source']['name'],
        'link': hit['_source']['link'],
        'reg': hit['_source']['reg'],
        'match_score': hit['_score'],
    }
    for hit in res['hits']['hits']
]

# Example output, kept as comments on purpose: as a trailing string literal it was
# the cell's last expression, so the notebook displayed it instead of `output`.
# [{'name': 'BEACH ENERGY (OTWAY) LIMITED',
#   'link': 'toyota-tsusho-uk.com',
#   'reg': '04370495',
#   'match_score': 1.0},
#  {'name': 'GET SMARTER ENERGY LIMITED',
#   'link': 'getsmarterenergy.com',
#   'reg': '08011426',
#   'match_score': 0.17426303}]

output

In [None]:
# Given a company's registration number, search "cmp-sim" for it and show the
# vector stored on the first matching document.
doc = {
    'size': 10000,
    'query': {'match': {'reg': '04370495'}},
}

res = es.search(body=doc, index='cmp-sim')
first_hit = res['hits']['hits'][0]
first_hit['_source']['vectors']

In [None]:
# example of a data table

# 	reg	name	link	vectors
# 0	SC217371	ROLES PRECISION CONTRACTORS LIMITED	rolesprecision.net	[0.2673638463, 0.4009197652, -0.3300765753, -0...
# 1	SC224987	ONE INTEGRATED SOLUTION LIMITED	onesolutiongroup.com	[0.295811981, 0.1737075597, -0.318918258, -0.2...
# 2	OC308164	THE LIVEMORE PARTNERSHIP LLP	livemore.co.uk	[0.056170642400000005, -0.054804641800000005, ...
# 3	NI051458	GLOBE PROPERTY DEVELOPMENTS LIMITED	globe-environmental.co.uk	[0.0292541608, -0.0890476704, -0.1428266913, -...
# 4	SC221598	GOOD MORNING PROJECT LIMITED	goodmorningservice.co.uk	[-0.0592145696, 0.1681548059, 0.0393520556, 0....
# ...	...	...	...	...
# 4995	11057887	THE HATTON MODEL RAILWAY COMPANY LIMITED	hattons.co.uk	[-0.1833461821, 0.0740591288, 0.071855627, 0.1...
# 4996	04292780	UNISON COLOUR LIMITED	unisoncolour.com	[0.23802788560000002, -0.085138686, 0.03595999...
# 4997	04370495	BEACH ENERGY (OTWAY) LIMITED	toyota-tsusho-uk.com	[-0.0250058435, -0.17591276760000002, -0.19892...
# 4998	11630856	M & O PROPERTIES LIMITED	mobuilders.co.uk	[-0.0794660076, 0.0644695535, -0.1396723688, -...
# 4999	10886195	HAYDOCK INVESTMENTS LTD	bwservices.uk.com	[-0.0967349187, -0.1150934696, -0.1456367821, ...
# 5000 rows × 4 columns