In [11]:
import json
from elasticsearch import Elasticsearch

# Connect to Elasticsearch
es = Elasticsearch(
    "https://localhost:9200",
    # ca_certs=".\certs\ca\ca.crt",
    basic_auth=("elastic", "hammasir"),
    verify_certs=False,
    ssl_show_warn=False,
)
es.info()

ObjectApiResponse({'name': 'es01', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'c2RGzljpSXu-l8gRmdFDqQ', 'version': {'number': '8.15.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '1a77947f34deddb41af25e6f0ddb8e830159c179', 'build_date': '2024-08-05T10:05:34.233336849Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [12]:
with open("../data/processed/base_dataset.json") as f:
    json_data = json.load(f)

In [13]:
import pandas as pd

data = pd.read_csv("../data/processed/base_dataset.csv")

In [14]:
import ast


def safe_literal_eval(val):
    try:
        if isinstance(val, str):
            return ast.literal_eval(val)
        else:
            return val  # If it's not a string, return the original value
    except (ValueError, SyntaxError):
        # Return None or a default value if evaluation fails
        return None

In [15]:
data["clinic"] = data["clinic"].apply(safe_literal_eval)
# data["insurances"] = data["insurances"].apply(safe_literal_eval)

In [16]:
import numpy as np

data = data.replace(np.nan, None)

In [17]:
mappings = {
    "properties": {
        "gender": {"type": "text"},
        "expertise": {"type": "text"},
        "title": {"type": "text"},
        "star": {"type": "float"},
        "rates_count": {"type": "integer"},
        "number_of_visits": {"type": "integer"},
        "view": {"type": "text"},
        "insurances": {"type": "text"},
        "experience": {"type": "integer"},
        "doctor_encounter": {"type": "float"},
        "explanation_of_issue": {"type": "float"},
        "quality_of_treatment": {"type": "float"},
        "comments_count": {"type": "integer"},
        "waiting_time": {"type": "float"},
        "clinic": {"type": "object"},
    }
}

In [18]:
es.indices.delete(index="doctors")

ObjectApiResponse({'acknowledged': True})

In [19]:
es.indices.create(
    index="doctors",
    mappings=mappings,
    settings={
        "analysis": {
            "char_filter": {
                "zero_width_spaces": {
                    "type": "mapping",
                    "mappings": ["\\u200C=>\\u0020"],
                }
            },
            "filter": {
                "persian_stop": {"type": "stop", "stopwords": "_persian_"},
                "persian_stemmer": {"type": "stemmer", "language": "persian"},
            },
            "analyzer": {
                "rebuilt_persian": {
                    "tokenizer": "standard",
                    "char_filter": ["zero_width_spaces"],
                    "filter": [
                        "lowercase",
                        "decimal_digit",
                        "arabic_normalization",
                        "persian_normalization",
                        "persian_stop",
                        "persian_stemmer",
                    ],
                }
            },
        }
    },
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'doctors'})

In [20]:
for i, row in data.iterrows():
    doc = {
        "expertise": row["display_expertise"],
        "gender": row["gender"],
        "experience": row["experience"],
        "title": row["title"],
        "star": row["star"],
        "rates_count": row["rates_count"],
        "number_of_visits": row["number_of_visits"],
        "view": row["view"],
        "insurances": row["insurances"],
        "doctor_encounter": row["doctor_encounter"],
        "explanation_of_issue": row["explanation_of_issue"],
        "quality_of_treatment": row["quality_of_treatment"],
        "comments_count": row["comments_count"],
        "waiting_time": row["waiting_time"],
        "clinic": row["clinic"],
    }

    es.index(index="doctors", id=i, document=doc)

In [21]:
es.indices.refresh(index="doctors")
es.cat.count(index="doctors", format="json")

ListApiResponse([{'epoch': '1725427632', 'timestamp': '05:27:12', 'count': '16296'}])

In [22]:
es.search(index="doctors", q="احمداباد")

ObjectApiResponse({'took': 27, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 209, 'relation': 'eq'}, 'max_score': 6.481121, 'hits': [{'_index': 'doctors', '_id': '10923', '_score': 6.481121, '_source': {'expertise': 'متخصص زنان و زایمان, دکترای حرفه\u200cای پزشکی', 'gender': 'F', 'experience': 43.0, 'title': 'نفیسه ثقفی', 'star': 4.25, 'rates_count': 122.0, 'number_of_visits': 17197.0, 'view': '17K', 'insurances': '[]', 'doctor_encounter': 4.40625, 'explanation_of_issue': 3.939393939393939, 'quality_of_treatment': 4.336633663366337, 'comments_count': 40.0, 'waiting_time': 0.280701754385964, 'clinic': [{'city': 'مشهد', 'number': '8406017, 8402767', 'address': 'محل کار: احمداباد-بیمارستان قایم | مطب: احمداباد-پرستار1 - ساختمان 4', 'province_name': 'خراسان رضوی'}]}}, {'_index': 'doctors', '_id': '12385', '_score': 6.3974333, '_source': {'expertise': 'متخصص بیماری های داخلی, دکترای حرفه ای پزشکی', 'gender': 'M', 'experi

In [23]:
def search_doctors(search_params, index_name="doctors"):
    query = {"bool": {"must": [], "should": [], "filter": []}}
    for field in ["problem", "expertise"]:
        if field in search_params:
            query["bool"]["should"].extend(
                [{"match": {"expertise": e}} for e in search_params[field]]
            )

    if "city" in search_params:
        query["bool"]["filter"].append(
            {"term": {"clinic.city": search_params["city"][0]}}
        )

    if "gender" in search_params:
        query["bool"]["filter"].append({"term": {"gender": search_params["gender"][0]}})
    print(query)

    response = es.search(index=index_name, body={"query": query})

    return response["hits"]["hits"]

In [24]:
# Example usage
search_params = {
    "city": ["مشهد"],
    "gender": ["f"],
    "expertise": ["قلب"],
}

results = search_doctors(search_params)

# Display the results
for result in results:
    print(result["_source"])

len(results)

{'bool': {'must': [], 'should': [{'match': {'expertise': 'قلب'}}], 'filter': [{'term': {'clinic.city': 'مشهد'}}, {'term': {'gender': 'f'}}]}}
{'expertise': 'متخصص قلب وعروق فوق تخصص قلب و عروق', 'gender': 'F', 'experience': None, 'title': 'علی اصغر بلوریان', 'star': 4.0, 'rates_count': 53.0, 'number_of_visits': 5165.0, 'view': '5K', 'insurances': '[]', 'doctor_encounter': 4.3, 'explanation_of_issue': 3.3, 'quality_of_treatment': 4.4, 'comments_count': 6.0, 'waiting_time': 0.0, 'clinic': [{'city': 'تهران', 'number': None, 'address': 'استان تهران، شهر تهران، منطقه 2، شهرک قدس (غرب ، چهارراه پونک باختری، بیمارستان آتیه ،', 'province_name': 'تهران'}, {'city': 'مشهد', 'number': None, 'address': 'استان خراسان رضوی، شهر مشهد، احمدآباد، خ\u200cعارف\u200c', 'province_name': 'خراسان رضوی'}]}
{'expertise': 'متخصص قلب وعروق', 'gender': 'F', 'experience': None, 'title': 'مهدی عمادزاده\u200c', 'star': 5.0, 'rates_count': 28.0, 'number_of_visits': 2614.0, 'view': '2K', 'insurances': '[]', 'doctor_enc

10