In [60]:
import pandas as pd
import json

# Load the JSON data.
with open("../data/util/json/autocomplete_mappings.json", "r") as file:
    search_results = json.load(file)

# Create lists to hold data.
queries = []
completions = []

# Iterate over the search results and extract queries and completions.
for completion, query_list in search_results.items():
    for query in query_list:
        queries.append(query)
        completions.append(completion)

# Create a DataFrame
data = {"query": queries, "completion": completions}
df = pd.DataFrame(data)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [61]:
df.head()

Unnamed: 0,query,completion
0,ENT care center,Otorhinolaryngological care
1,health check,General medical care
2,reproductive health,Reproductive medicine
3,ear nose throat checkup,Otorhinolaryngological care
4,radiological diagnostics,Radiological diagnostics


In [62]:
df["completion"].value_counts()

completion
Nuclear medicine                        28
Internal medicine care                  28
Infectious disease care                 27
Laboratory services                     26
Specialized clinics                     25
Surgical care                           25
Reproductive medicine                   24
Dental care                             21
Otorhinolaryngological care             20
Clinical pharmacology                   20
Pediatric care                          20
Emergency medical care                  20
Occupational and sports medicine        20
General medical care                    20
Physical medicine and rehabilitation    20
Radiological diagnostics                20
Psychological and psychiatric care      20
Ophthalmological care                   19
Oncological care                        18
Women's health                          18
Name: count, dtype: int64

In [63]:
len(df)

439

In [64]:
alphabet = "abcdefghijklmnopqrstuvwxyz"
one_hot_encoded = pd.DataFrame(columns=[ord(char) for char in list(alphabet)])
one_hot_encoded.head()

Unnamed: 0,97,98,99,100,101,102,103,104,105,106,...,113,114,115,116,117,118,119,120,121,122


In [65]:
from collections import Counter
for idx, query in enumerate(df["query"]):
    counter = Counter(query)
    encoded_query = [counter[char] for char in alphabet]
    one_hot_encoded.loc[idx] = encoded_query

In [66]:
one_hot_encoded["completion"] = df["completion"]
df = one_hot_encoded
df.head()

Unnamed: 0,97,98,99,100,101,102,103,104,105,106,...,114,115,116,117,118,119,120,121,122,completion
0,1,0,2,0,3,0,0,0,0,0,...,2,0,1,0,0,0,0,0,0,Otorhinolaryngological care
1,1,0,2,0,2,0,0,3,0,0,...,0,0,1,0,0,0,0,0,0,General medical care
2,1,0,1,1,3,0,0,2,1,0,...,2,0,2,1,1,0,0,0,0,Reproductive medicine
3,2,0,2,0,3,0,0,2,0,0,...,2,1,2,1,0,0,0,0,0,Otorhinolaryngological care
4,3,0,2,2,0,0,2,0,4,0,...,1,2,1,0,0,0,0,0,0,Radiological diagnostics


In [67]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df["completion"] = label_encoder.fit_transform(df["completion"])

In [68]:
df.to_csv("../data/processed/completions_encoded.csv", index=False)

In [69]:
df.head()

Unnamed: 0,97,98,99,100,101,102,103,104,105,106,...,114,115,116,117,118,119,120,121,122,completion
0,1,0,2,0,3,0,0,0,0,0,...,2,0,1,0,0,0,0,0,0,11
1,1,0,2,0,2,0,0,3,0,0,...,0,0,1,0,0,0,0,0,0,3
2,1,0,1,1,3,0,0,2,1,0,...,2,0,2,1,1,0,0,0,0,16
3,2,0,2,0,3,0,0,2,0,0,...,2,1,2,1,0,0,0,0,0,11
4,3,0,2,2,0,0,2,0,4,0,...,1,2,1,0,0,0,0,0,0,15


In [70]:
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
label_mapping

{'Clinical pharmacology': 0,
 'Dental care': 1,
 'Emergency medical care': 2,
 'General medical care': 3,
 'Infectious disease care': 4,
 'Internal medicine care': 5,
 'Laboratory services': 6,
 'Nuclear medicine': 7,
 'Occupational and sports medicine': 8,
 'Oncological care': 9,
 'Ophthalmological care': 10,
 'Otorhinolaryngological care': 11,
 'Pediatric care': 12,
 'Physical medicine and rehabilitation': 13,
 'Psychological and psychiatric care': 14,
 'Radiological diagnostics': 15,
 'Reproductive medicine': 16,
 'Specialized clinics': 17,
 'Surgical care': 18,
 "Women's health": 19}

In [71]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

In [72]:
train_df.to_csv("../data/processed/train.csv", index=False)
test_df.to_csv("../data/processed/test.csv", index=False)

In [73]:
cpp_train_features = train_df.drop(["completion"], axis=1)
cpp_train_features.reset_index(drop=True, inplace=True)

cpp_train_labels = train_df["completion"]
cpp_train_labels.reset_index(drop=True, inplace=True)

In [74]:
cpp_train_features.head()

Unnamed: 0,97,98,99,100,101,102,103,104,105,106,...,113,114,115,116,117,118,119,120,121,122
0,0,0,0,0,1,0,0,0,1,0,...,0,1,0,1,0,0,0,0,0,0
1,1,0,2,1,4,0,1,0,2,0,...,0,1,1,1,0,0,0,0,1,0
2,2,0,2,0,3,0,0,2,2,0,...,0,2,3,1,0,1,0,0,2,0
3,1,0,1,1,1,0,1,0,2,0,...,0,0,3,3,0,0,0,0,0,0
4,3,0,2,0,3,0,0,2,1,0,...,0,1,1,2,0,0,0,0,1,0


In [75]:
cpp_train_labels.head()

0     5
1     2
2    13
3    15
4    17
Name: completion, dtype: int32

In [76]:
cpp_train_features.to_csv("../data/processed/cpp_train_features.csv")
cpp_train_labels.to_csv("../data/processed/cpp_train_labels.csv")