In [1]:
# Import every dependency in one place so a missing package is reported once,
# up front, instead of surfacing later as an unrelated NameError.
try:
    import json
    import os
    import uuid

    import pandas as pd
    import numpy as np

    import elasticsearch
    from elasticsearch import Elasticsearch
    from elasticsearch import helpers
    from sentence_transformers import SentenceTransformer, util
    from tqdm import tqdm
    from dotenv import load_dotenv
except ImportError as e:
    # Show the friendly hint, then re-raise: nothing below can run without
    # these modules, so continuing would only hide the real cause.
    print("Some Modules are Missing :{}".format(e))
    raise

# Load environment variables from secret.env (the search cell reads ENDPOINT
# through os.getenv). Kept outside the import guard so a failure here is not
# misreported as a missing module.
load_dotenv("secret.env")

# Connection settings for the local Elasticsearch node.
es_scheme = 'http'
es_host = "localhost"
es_port = 9200

# Single-node description passed (inside a list) to the client constructor.
node_config = dict(scheme=es_scheme, host=es_host, port=es_port)

es = Elasticsearch([node_config])


In [2]:
try:
    # Build the client from the shared node configuration.
    es = Elasticsearch([node_config])

    # Creating the client object does not by itself prove the node is
    # reachable, so ask the server before claiming success. ping() reports
    # reachability as a boolean.
    if es.ping():
        print("Connected to Elasticsearch")
    else:
        print("Error: Elasticsearch did not respond to ping")

    # Check Elasticsearch endpoint (for debugging)
    print(f"Elasticsearch Endpoint: {node_config['scheme']}://{node_config['host']}:{node_config['port']}")

except Exception as e:
    # Configuration or unexpected client errors end up here.
    print(f"Error: {e}")

Connected to Elasticsearch
Elasticsearch Endpoint: http://localhost:9200


In [3]:
class Reader(object):
    """Loads a CSV file into a DataFrame with missing values blanked out."""

    def __init__(self, file_name):
        """Remember which CSV file to load.

        Args:
            file_name: Path handed unchanged to ``pd.read_csv``.
        """
        self.file_name = file_name

    def run(self):
        """Read the CSV and return it with every NaN replaced by ``""``."""
        return pd.read_csv(self.file_name).fillna("")

In [4]:
class Tokenizer(object):
    """Turns a piece of text into a flat embedding vector."""

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the sentence-embedding model.

        Args:
            model_name: Name passed to ``SentenceTransformer``. Defaults to
                the model this notebook has always used.
        """
        self.model = SentenceTransformer(model_name)

    def get_token(self, documents):
        """Encode one text and return its embedding as a flat Python list.

        Args:
            documents: The text to embed; it is wrapped in a one-element
                list before being handed to the model.

        Returns:
            list: The encoder output flattened to one dimension and
            converted with ``tolist()``.
        """
        sentence_embeddings = self.model.encode([documents])
        # flatten() collapses the (1, dim) batch result to a single vector;
        # tolist() already yields plain Python values, so no intermediate
        # list/array round-trip is needed.
        return sentence_embeddings.flatten().tolist()

In [5]:
class ElasticSearchImports(object):
    """Indexes every row of a DataFrame into an Elasticsearch index."""

    def __init__(self, df, index_name='posting', es_client=None):
        """Store the data and the client used for indexing.

        Args:
            df: DataFrame whose rows are indexed one document per row.
            index_name: Target index name.
            es_client: Optional ready-made client exposing
                ``index(index=..., document=...)``. When omitted, a client
                for http://localhost:4028 is created, as before.
        """
        self.df = df
        self.index_name = index_name
        if es_client is None:
            # NOTE(review): port 4028 differs from the 9200 configured at the
            # top of the notebook -- confirm which node should receive the data.
            es_client = Elasticsearch([{'host': 'localhost', 'port': 4028, 'scheme': 'http'}])
        self.es = es_client

    def run(self):
        """Index all rows, continuing past individual failures.

        Returns:
            bool: True when every row was indexed, False when at least one
            row failed (a summary of the failures is printed).
        """
        elk_data = self.df.to_dict("records")
        failures = []
        for position, job in enumerate(elk_data):
            try:
                self.es.index(index=self.index_name, document=job)
            except Exception as e:
                # Stay best-effort (keep indexing the remaining rows) but
                # remember the failure instead of discarding it.
                failures.append((position, e))

        if failures:
            first_position, first_error = failures[0]
            print("Failed to index {} of {} documents into '{}'; first failure at row {}: {}".format(
                len(failures), len(elk_data), self.index_name, first_position, first_error))
        return not failures


In [6]:
# Load the medicine dataset; Reader.run() returns the CSV with NaN cells
# replaced by empty strings.
helper = Reader(
    file_name="C:\\Users\\putariza\\Documents\\Documents\\ML\\nlp lab\\pharma-talk\\medicine_dataset.csv"
)
df = helper.run()

In [None]:
# Build the embedder, then register tqdm's pandas integration so that
# Series.progress_apply is available.
helper_token = Tokenizer()
tqdm.pandas()

# One embedding per row, computed from the "Drugs" column.
df["vectors"] = df["Drugs"].progress_apply(helper_token.get_token)

In [None]:
# Push every row of df (including its "vectors" column) into Elasticsearch,
# using the importer's default index name.
helper_elk = ElasticSearchImports(df)
helper_elk.run()

In [None]:
# Ask for the drug name (or free text) to search for.
INPUT = input("Enter Query: ")

# Embed the query text with the same Tokenizer used for the "Drugs" column.
helper_token = Tokenizer()
token_vector = helper_token.get_token(INPUT)

# Nearest-neighbour query on the "vectors" field; only "Drugs" is returned
# for each hit, and at most 50 hits are requested.
# NOTE(review): this `knn` clause shape (field name as key, with `vector`/`k`)
# looks like the OpenSearch k-NN plugin syntax rather than Elasticsearch's own
# knn query -- confirm against the server that ENDPOINT points at.
query = {
    "size": 50,
    "_source": "Drugs",
    "query": {
        "bool": {
            "must": [
                {
                    "knn": {
                        "vectors": {
                            "vector": token_vector,
                            "k": 20
                        }
                    }
                }
            ]
        }
    }
}

# NOTE(review): the import step writes to localhost:4028 by default, while this
# client targets the ENDPOINT environment variable -- confirm both refer to the
# same cluster.
es = Elasticsearch(timeout=600, hosts=os.getenv("ENDPOINT"))

# "size" is already part of `query`; it is no longer repeated as a keyword
# argument, because a field given both in `body` and as a parameter is
# redundant and can be rejected by the client as a duplicate.
res = es.search(index='posting',
                body=query,
                request_timeout=55)

titles = [hit['_source']['Drugs'] for hit in res['hits']['hits']]

print("Drugs that were found based on your query:")
for title in titles:
    print(title)
