In [None]:
import numpy as np
import pandas as pd
import swifter  # noqa: F401
from swifter import set_defaults

# Global swifter defaults; they apply to every later `.swifter.apply(...)` call
# made in this notebook.
swifter_options = {
    "scheduler": "processes",
    "progress_bar": True,
    "allow_dask_on_strings": True,
    "force_parallel": True,
}
set_defaults(**swifter_options)

# Raw inputs: `names` is the reference table whose "Name" and "Gender" columns
# are used for gender lookup; `data` holds the doctor records being cleaned.
names = pd.read_csv("../data/raw/names.csv")
data = pd.read_json("../data/raw/doctors.json")

In [None]:
# Rows whose "name" is missing get it rebuilt from the first space-delimited
# token of "title".
# .copy() makes `no_name` an independent frame, so the column assignment below
# neither raises SettingWithCopyWarning nor depends on pandas' view-vs-copy
# behaviour for boolean-mask selections.
no_name = data[data["name"].isna()].copy()
s_name = no_name["title"].str.split(" ", n=1)
f_name = s_name.str[0]
no_name["name"] = f_name

In [None]:
# Copy the names recovered from the titles back into the main frame; the
# assignment is aligned on the row labels of the rows that lacked a name.
recovered_names = no_name["name"]
data.loc[recovered_names.index, "name"] = recovered_names

In [None]:
# Recode the two Persian gender labels of the reference table to "M" / "F"
# in a single pass; any other value is left untouched.
gender_codes = {"پسر": "M", "دختر": "F"}
names["Gender"] = names["Gender"].replace(gender_codes)
names.head()

In [None]:
import re


def find_gender(data_name):
    """Look up the gender recorded for a name in the global `names` table.

    A row of `names` matches when its "Name" value contains `data_name` as a
    literal substring; the "Gender" of the first matching row is returned.

    Args:
        data_name: The name to look up. Anything that is not a non-blank
            string (None, NaN, "", whitespace only) is treated as unknown.

    Returns:
        The "Gender" value of the first matching row, or None when there is
        no match or `data_name` is unusable.
    """
    # Missing values would fail inside the string search, and a blank pattern
    # is contained in every name (so it would return an arbitrary gender).
    if not isinstance(data_name, str) or not data_name.strip():
        return None

    # The search is literal (regex=False), so the name must NOT be passed
    # through re.escape(): the added backslashes would be matched verbatim and
    # names containing a space, "-", "." etc. could never match.
    # na=False keeps the mask strictly boolean when "Name" has missing values.
    hits = names["Name"].str.contains(data_name, regex=False, na=False)
    match = names.loc[hits, "Gender"]
    return None if match.empty else match.iloc[0]


# Derive a gender for every doctor by looking the "name" column up, one value
# at a time, with find_gender.
doctor_names = data["name"]
data["gender"] = doctor_names.apply(find_gender)

In [None]:
import requests
from bs4 import BeautifulSoup

# CSS selector that get_about() hands to BeautifulSoup to locate the element
# holding a profile page's "about" text (only the first match is read).
# NOTE(review): the class name looks build-generated, so it may change when the
# site is redeployed — confirm it still matches before re-scraping.
css_selector = ".PlasmicAbout_text__dzfix__IBxqf > div:nth-child(1)"


def get_about(url, timeout=10):
    """Fetch a profile page and return the text of its "about" section.

    Args:
        url: Address of the page to download. Anything that is not a
            non-empty string (e.g. NaN for a missing URL) yields None.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block its worker
            indefinitely.

    Returns:
        The texts of the direct children of the first element matching
        `css_selector`, joined with single spaces; None when the page cannot
        be fetched or contains no such element.
    """
    if not isinstance(url, str) or not url:
        return None

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best-effort scrape: one unreachable page must not abort the whole
        # column-wide apply, so it is reported the same way as "no about text".
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    about = soup.select(css_selector)
    if not about:
        return None
    return " ".join(item.text for item in about[0].contents)


# Download the "about" text of every profile URL through swifter, using the
# defaults configured at the top of the notebook. This issues one HTTP request
# per row; the base dataset assembled further down currently leaves the
# resulting "about" column out.
profile_urls = data["url"]
data["about"] = profile_urls.swifter.apply(get_about)

In [None]:
def _clinic_record(center):
    """Flatten one raw center mapping into the fields kept per clinic.

    Keys that are absent (including a missing or non-dict "map") become None
    instead of raising, mirroring how the rating fields are extracted.
    """
    location = center.get("map")
    if not isinstance(location, dict):
        location = {}
    return {
        "city": center.get("city_name"),
        "number": center.get("display_number"),
        "address": center.get("address"),
        "province_name": center.get("province_name"),
        "lat": location.get("lat"),
        "long": location.get("lon"),
    }


# Build, for every doctor, the list of simplified clinic records derived from
# the raw "centers" entries. A row whose "centers" value is not a list (e.g.
# NaN for a record without centers) gets an empty list, and non-dict entries
# inside a list are skipped, so one malformed record cannot abort the cell.
cent = data["centers"]
clinic = [
    [_clinic_record(center) for center in centers if isinstance(center, dict)]
    if isinstance(centers, (list, tuple))
    else []
    for centers in cent
]
data["clinic"] = clinic

In [None]:
# Spread the per-doctor "rate_info" mapping over one column per rating field.
# A field absent from the mapping, or a row whose "rate_info" is not a dict at
# all, yields None for that field.
rate_info = data["rate_info"]
rating_fields = (
    "doctor_encounter",
    "explanation_of_issue",
    "quality_of_treatment",
    "waiting_time",
    "comments_count",
)

# One list per field, each aligned with the rows of `data`.
ratings = {
    field: [
        entry.get(field, None) if isinstance(entry, dict) else None
        for entry in rate_info
    ]
    for field in rating_fields
}

# Keep the individual lists available under their field names as well.
(
    doctor_encounter,
    explanation_of_issue,
    quality_of_treatment,
    waiting_time,
    comments_count,
) = ratings.values()

for field, values in ratings.items():
    data[field] = values

# The nested source column is no longer needed once it has been flattened.
data = data.drop("rate_info", axis=1)

In [None]:
# Keep only the columns that make up the exported base dataset, in this order.
# The scraped "about" column is currently excluded (left commented out below).
base_dataset = data.loc[
    :,
    [
        # identity / profile
        "title",
        "display_expertise",
        "gender",
        # popularity and rating summary
        "star",
        "rates_count",
        "number_of_visits",
        "view",
        "insurances",
        "experience",
        # fields flattened out of rate_info
        "doctor_encounter",
        "explanation_of_issue",
        "quality_of_treatment",
        "waiting_time",
        "comments_count",
        # registration, locations and links
        "medical_code",
        "clinic",
        "image",
        "url",
        # "about",
    ],
]

In [None]:
# Two rows count as the same doctor when both the title and the medical code
# coincide; only the earliest such row is retained.
identity_columns = ["title", "medical_code"]
base_dataset = base_dataset.drop_duplicates(subset=identity_columns, keep="first")

In [None]:
# Swap every NaN for None so that missing values are plain Python nulls in the
# frame that gets exported.
# NOTE(review): older pandas releases treated an explicit `None` replacement
# value as "no value given" and pad-filled instead — confirm the behaviour
# against the installed pandas version.
base_dataset = base_dataset.replace(np.nan, None)

In [None]:
# Persist the cleaned table twice, as JSON and as CSV, under the processed-data
# directory. Both writers run with pandas' default options (so the CSV also
# carries the index column).
# NOTE(review): the target directory is assumed to exist already — nothing
# here creates it.
base_dataset.to_json("../data/processed/base_dataset.json")
base_dataset.to_csv("../data/processed/base_dataset.csv")