In [5]:
import numpy as np
import pandas as pd

# Input locations, relative to the notebook's working directory.
names_path = "../data/raw/names.csv"
doctors_path = "../data/raw/new_doctors.json"

# `names` is the first-name reference table (its "Name" and "Gender" columns are
# used further down); `data` holds the raw doctor records.
names = pd.read_csv(names_path)
data = pd.read_json(doctors_path)

In [6]:
# Rows sharing the same title and medical_code are treated as duplicates;
# only the first occurrence of each pair is kept.
dedup_keys = ["title", "medical_code"]
data = data.drop_duplicates(subset=dedup_keys, keep="first")

In [None]:
# Rows without a "name" value get a fallback: the first space-separated token
# of their title.
# .copy() makes `no_name` an independent frame, so adding a column to it is an
# explicit operation on a copy instead of an ambiguous write to a slice of
# `data` (which triggers pandas' SettingWithCopyWarning).
no_name = data[data["name"].isna()].copy()
s_name = no_name["title"].str.split(" ", n=1)  # [first token, rest of title]
f_name = s_name.str[0]
no_name["name"] = f_name

In [9]:
# Write the title-derived fallback names back into the rows of `data` that had
# no name; rows are matched by index label.
data.loc[no_name.index, "name"] = no_name["name"]

In [None]:
# Recode the two Persian gender labels of the reference table to "M" / "F".
gender_codes = {"پسر": "M", "دختر": "F"}
names["Gender"] = names["Gender"].replace(gender_codes)
names.head()

In [11]:
import re


def find_gender(data_name, lookup=None):
    """Return the gender label of the first reference name containing ``data_name``.

    The name is matched as a literal substring of the ``Name`` column and the
    ``Gender`` value of the first matching row is returned.

    Parameters
    ----------
    data_name : str
        Name to look up. Missing, non-string or empty values yield ``None``.
    lookup : pandas.DataFrame, optional
        Table with ``Name`` and ``Gender`` columns. Defaults to the
        notebook-level ``names`` frame.

    Returns
    -------
    The ``Gender`` value of the first matching row, or ``None`` when nothing
    matches.
    """
    # Nothing sensible to look up; an empty string would otherwise match every row.
    if not isinstance(data_name, str) or not data_name:
        return None

    table = names if lookup is None else lookup
    # regex=False already performs a literal match. Escaping the name first
    # would insert backslashes (e.g. before spaces) that never occur in the
    # table, so such names could never match. na=False keeps the mask strictly
    # boolean when the reference column has missing values.
    mask = table["Name"].str.contains(data_name, regex=False, na=False)
    hits = table.loc[mask, "Gender"]
    return hits.iloc[0] if not hits.empty else None


# Derive a "gender" column by running find_gender on every value of "name";
# names with no match in the reference table get None.
data["gender"] = data["name"].apply(find_gender)

In [12]:
def _clinic_summary(center):
    """Flatten one center record into the fields kept for a clinic entry."""
    return {
        "city": center["city_name"],
        "number": center["display_number"],
        "address": center["address"],
        "province_name": center["province_name"],
        "lat": center["map"]["lat"],
        "long": center["map"]["lon"],
    }


# Each row's "centers" value is a sequence of center records; build the matching
# list of flattened clinic dicts, one inner list per row.
cent = data["centers"]
clinic = [[_clinic_summary(center) for center in centers] for centers in cent]
data["clinic"] = clinic

In [13]:
import json

# Load the symptom lookup (a JSON object; its keys are matched against
# expertise entries further down).
symptomes_add = "../elastic/symptomes.json"
# Read explicitly as UTF-8: without an encoding argument open() falls back to
# the platform's locale encoding, which can fail on or garble non-ASCII text.
with open(symptomes_add, encoding="utf-8") as symptomes_file:
    file_contents = symptomes_file.read()

symptomes_data = json.loads(file_contents)

In [14]:
# One accumulator list per derived column; each receives exactly one entry per
# row of `data`.
doctor_encounter, explanation_of_issue, quality_of_treatment = [], [], []
waiting_time, comments_count = [], []
symptomes, online_status = [], []

# Fields copied out of each row's "rate_info" dict, keyed by target column name.
rating_columns = {
    "doctor_encounter": doctor_encounter,
    "explanation_of_issue": explanation_of_issue,
    "quality_of_treatment": quality_of_treatment,
    "waiting_time": waiting_time,
    "comments_count": comments_count,
}

for _, record in data.iterrows():
    # Ratings: a non-dict "rate_info" contributes None for every rating field.
    ratings = record["rate_info"] if isinstance(record["rate_info"], dict) else {}
    for field, values in rating_columns.items():
        values.append(ratings.get(field))

    # Symptoms: every symptomes_data key contained in an expertise entry
    # contributes its symptom list; the pooled result is de-duplicated.
    # Rows with no matching key get None rather than an empty list.
    matched_keys = [
        key
        for expertise in record["expertise"]
        for key in symptomes_data
        if key in expertise
    ]
    if matched_keys:
        pooled = [symptom for key in matched_keys for symptom in symptomes_data[key]]
        symptomes.append(list(set(pooled)))
    else:
        symptomes.append(None)

    # Online status: True when any action carries the online-booking title.
    online_status.append(
        any(action.get("title") == "نوبت دهی اینترنتی" for action in record["actions"])
    )

# Attach the derived columns, rating fields first, then symptoms and online status.
derived_columns = {
    **rating_columns,
    "symptomes": symptomes,
    "online_status": online_status,
}
for column, values in derived_columns.items():
    data[column] = values

# The rating fields now live in their own columns, so the source dict column
# is removed.
data = data.drop(columns="rate_info")


In [None]:
# Blank out waiting-time values of exactly 0 or 1000
# (presumably placeholder values - confirm against the data source).
# A single .loc call writes into `data` itself. The chained form
# data["waiting_time"][mask] = ... may assign to a temporary object: it raises
# SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
invalid_wait = (data["waiting_time"] == 0) | (data["waiting_time"] == 1000)
data.loc[invalid_wait, "waiting_time"] = None

In [16]:
# Columns carried into the exported dataset, in output order.
export_columns = [
    # identity and profile
    "title",
    "display_expertise",
    "gender",
    # popularity figures
    "star",
    "rates_count",
    "number_of_visits",
    "view",
    "insurances",
    "experience",
    # fields extracted from rate_info
    "doctor_encounter",
    "explanation_of_issue",
    "quality_of_treatment",
    "waiting_time",
    "comments_count",
    # references and locations
    "medical_code",
    "clinic",
    "image",
    "url",
    "presence_freeturn",
    # derived lookups
    "symptomes",
    "online_status",
]
base_dataset = data[export_columns]

In [17]:
# Turn NaN into None across the export frame.
# The mapping form is used on purpose: with replace(np.nan, None), pandas
# releases before 1.4 ignore the explicit None and fall back to forward-filling
# the missing cells from the previous row.
base_dataset = base_dataset.replace({np.nan: None})

In [19]:
# Persist the curated dataset as CSV; the JSON export below is kept disabled.
# base_dataset.to_json("../data/processed/base_dataset.json")
base_dataset.to_csv("../data/processed/new_dataset.csv")