In [7]:
import pandas as pd
import json

# Seed for the row shuffle below; same value the train/test split uses, so the
# whole pipeline yields the same files on every Restart & Run All.
SHUFFLE_SEED = 42

# Load the JSON data: a mapping of completion label -> list of example queries.
# JSON is read explicitly as UTF-8 instead of relying on the platform's locale default.
with open("../data/util/json/autocomplete_mappings.json", "r", encoding="utf-8") as file:
    search_results = json.load(file)

# Flatten the mapping into one (query, completion) pair per example query.
pairs = [
    (query, completion)
    for completion, query_list in search_results.items()
    for query in query_list
]
queries = [query for query, _ in pairs]
completions = [completion for _, completion in pairs]

# Create a DataFrame with one row per pair.
data = {"query": queries, "completion": completions}
df = pd.DataFrame(data)

# Shuffle the rows reproducibly and renumber the index 0..n-1.
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [8]:
# Preview the first rows of the shuffled (query, completion) pairs.
df.head()

Unnamed: 0,query,completion
0,specialty healthcare,Specialized clinics
1,pharmacology options,Clinical pharmacology
2,clinical pharmacology,Clinical pharmacology
3,pathology lab,Laboratory services
4,rehabilitation center,Physical medicine and rehabilitation


In [9]:
# Count how many example queries map to each completion label.
df["completion"].value_counts()

completion
Nuclear medicine                        28
Internal medicine care                  28
Infectious disease care                 27
Laboratory services                     26
Specialized clinics                     25
Surgical care                           25
Reproductive medicine                   24
Dental care                             21
General medical care                    20
Occupational and sports medicine        20
Clinical pharmacology                   20
Psychological and psychiatric care      20
Radiological diagnostics                20
Physical medicine and rehabilitation    20
Pediatric care                          20
Otorhinolaryngological care             20
Emergency medical care                  20
Ophthalmological care                   19
Women's health                          18
Oncological care                        18
Name: count, dtype: int64

In [10]:
# Total number of (query, completion) rows.
len(df)

439

In [5]:
#max_length = df["query"].str.len().max()

# Apply transformation to pad each string with zeros on the right side.
#df["padded_query"] = df["query"].apply(lambda x: x.ljust(max_length, "0"))

In [13]:
# Lowercase Latin letters; each one becomes a feature column, labelled by its
# character code (ord), in an initially empty frame.
alphabet = "abcdefghijklmnopqrstuvwxyz"
letter_codes = list(map(ord, alphabet))
one_hot_encoded = pd.DataFrame(columns=letter_codes)
one_hot_encoded.head()

Unnamed: 0,97,98,99,100,101,102,103,104,105,106,...,113,114,115,116,117,118,119,120,121,122


In [14]:
# Encode every query as a row of 0/1 flags: 1 if the letter occurs anywhere in
# the query, else 0 (letter presence, not counts or positions).
# All rows are collected first and the frame is built once; enlarging a
# DataFrame one row at a time via .loc re-allocates on every insert and leaves
# the columns as generic objects instead of integers.
presence_rows = [
    [1 if char in query else 0 for char in alphabet]
    for query in df["query"]
]
one_hot_encoded = pd.DataFrame(
    presence_rows,
    columns=[ord(char) for char in alphabet],  # same labels as the letter code columns
)

In [16]:
# Add the completion labels as a column next to the letter features (aligned on
# the row index), then make the combined frame the working `df`.
one_hot_encoded = one_hot_encoded.assign(completion=df["completion"])
df = one_hot_encoded
df.head()

Unnamed: 0,97,98,99,100,101,102,103,104,105,106,...,114,115,116,117,118,119,120,121,122,completion
0,1,0,1,0,1,0,0,1,1,0,...,1,1,1,0,0,0,0,1,0,Specialized clinics
1,1,0,1,0,0,0,1,1,1,0,...,1,1,1,0,0,0,0,1,0,Clinical pharmacology
2,1,0,1,0,0,0,1,1,1,0,...,1,0,0,0,0,0,0,1,0,Clinical pharmacology
3,1,1,0,0,0,0,1,1,0,0,...,0,0,1,0,0,0,0,1,0,Laboratory services
4,1,1,1,0,1,0,0,1,1,0,...,1,0,1,0,0,0,0,0,0,Physical medicine and rehabilitation


In [7]:
#df = df.drop(["query", "padded_query"], axis=1)

In [17]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the completion names, then replace the names with their
# integer codes. The fitted encoder is kept so the codes can be mapped back.
label_encoder = LabelEncoder().fit(df["completion"])
df["completion"] = label_encoder.transform(df["completion"])

In [19]:
# Write the letter-presence features plus encoded completion labels to disk;
# index=False leaves the row index out of the CSV.
df.to_csv("../data/processed/completions_encoded.csv", index=False)

In [12]:
#one_hot_encoded = df["encoded_query"].str.get_dummies(",")

In [13]:
#one_hot_encoded["completion"] = df["completion"]

In [14]:
#df = one_hot_encoded
#df.columns = df.columns.astype(str)

In [15]:
#df.head()

Unnamed: 0,100,101,102,103,104,105,107,108,109,110,...,122,32,48,69,78,84,97,98,99,completion
0,0,1,0,1,1,1,0,1,1,1,...,0,1,1,0,0,0,1,0,1,11
1,1,1,0,0,1,0,1,1,0,1,...,0,1,1,0,0,0,1,0,1,1
2,0,1,0,0,1,0,0,0,1,1,...,0,1,1,0,0,0,1,0,0,11
3,1,1,0,0,0,1,0,0,0,1,...,0,1,1,0,0,0,1,0,1,12
4,1,1,0,1,0,1,0,0,1,1,...,0,1,1,0,0,0,1,0,1,15


In [20]:
# Map each completion name to the integer code the fitted encoder assigns it.
class_names = label_encoder.classes_
class_codes = label_encoder.transform(class_names)
label_mapping = {name: code for name, code in zip(class_names, class_codes)}
label_mapping

{'Clinical pharmacology': 0,
 'Dental care': 1,
 'Emergency medical care': 2,
 'General medical care': 3,
 'Infectious disease care': 4,
 'Internal medicine care': 5,
 'Laboratory services': 6,
 'Nuclear medicine': 7,
 'Occupational and sports medicine': 8,
 'Oncological care': 9,
 'Ophthalmological care': 10,
 'Otorhinolaryngological care': 11,
 'Pediatric care': 12,
 'Physical medicine and rehabilitation': 13,
 'Psychological and psychiatric care': 14,
 'Radiological diagnostics': 15,
 'Reproductive medicine': 16,
 'Specialized clinics': 17,
 'Surgical care': 18,
 "Women's health": 19}

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split
# repeatable for a given input row order.
# NOTE(review): the split is not stratified by completion label — confirm that
# is intended given the small per-class counts.
split_options = {"test_size": 0.3, "random_state": 42}
train_df, test_df = train_test_split(df, **split_options)

In [22]:
# Write both partitions to disk without the row index column.
for frame, path in (
    (train_df, "../data/processed/train.csv"),
    (test_df, "../data/processed/test.csv"),
):
    frame.to_csv(path, index=False)