### Run in Google Colab

In [None]:
%%capture
!pip install fasttext
!pip install emoji
!pip install translators --upgrade

In [None]:
import os
from google.colab import drive

# Mount Google Drive, then switch the working directory to the project folder
# so the relative "dataset/..." paths used by later cells resolve inside it.
DRIVE_MOUNT_POINT = '/content/drive'
PROJECT_DIR = "/content/drive/MyDrive/Healthcare/"

drive.mount(DRIVE_MOUNT_POINT)
os.chdir(PROJECT_DIR)

Mounted at /content/drive


### Import Libraries and Helper Functions

In [2]:

import os
import csv
import time
import json
import emoji
import fasttext
import numpy as np
import pandas as pd
import string
# import mlflow
import json
import pickle

from datetime import date
import translators as ts

from sklearn.model_selection import train_test_split
from sklearn.metrics import (f1_score,
                             precision_score,
                             recall_score, 
                             accuracy_score)

Using Indonesia server backend.


In [3]:
def remove_newline(sentence: str) -> str:
    """Replace line breaks with single spaces and collapse repeated whitespace.

    Args:
        sentence: Text that may span several lines.

    Returns:
        The text on one line, with words separated by exactly one space and
        no leading or trailing whitespace.
    """
    # str.split() without arguments splits on any run of whitespace (including
    # "\n" and "\r\n"), so words that were separated only by a line break stay
    # separate words instead of being glued together.
    return " ".join(sentence.split())

def remove_underscore(sentence: str) -> str:
    """Turn every underscore into a space and squeeze repeated whitespace.

    Args:
        sentence: Text that may contain underscores (e.g. ``"skin_rash"``).

    Returns:
        The text with underscores replaced by spaces, words separated by a
        single space, and no leading or trailing whitespace.
    """
    spaced = " ".join(sentence.split("_"))
    tokens = spaced.split()
    return " ".join(tokens)

def remove_punctuation(sentence: str) -> str:
    """Strip ASCII punctuation, lowercase the text, and normalize whitespace.

    Args:
        sentence: Input text.

    Returns:
        The lowercased text without any character from ``string.punctuation``,
        with words separated by a single space.
    """
    kept_chars = "".join(ch for ch in sentence if ch not in string.punctuation)
    words = kept_chars.lower().split()
    return " ".join(words)


def clean_nonASCII(sentence: str) -> str:
    """Remove emoji and all other non-ASCII characters, then lowercase.

    Each emoji is first replaced by a space so that words on either side of it
    stay separated; every remaining non-ASCII character is then dropped.

    Args:
        sentence: Input text.

    Returns:
        The lowercased, ASCII-only text with words separated by a single space.
    """
    # emoji.get_emoji_regexp() only exists in older releases of the `emoji`
    # package; newer releases provide replace_emoji() instead. Prefer the new
    # API and fall back to the old one so either installed version works.
    if hasattr(emoji, "replace_emoji"):
        without_emoji = emoji.replace_emoji(sentence, replace=" ")
    else:
        without_emoji = emoji.get_emoji_regexp().sub(" ", sentence)

    # encode(..., "ignore") silently discards anything outside the ASCII range.
    ascii_only = without_emoji.encode("ascii", "ignore").decode()
    return " ".join(ascii_only.lower().split())

# Multiprocessing helpers
from multiprocessing import Pool
from functools import partial
from os import cpu_count

# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to a single worker so n_cpu is always a usable positive integer.
n_cpu = cpu_count() or 1
 
def parallelize(data, func, num_of_processes=n_cpu):
    """Split ``data`` into chunks, run ``func`` on each chunk in a process pool,
    and concatenate the results.

    Args:
        data: Object accepted by ``np.array_split`` (presumably a pandas
            Series or DataFrame — confirm against callers).
        func: Callable applied to each chunk via ``Pool.map``. Its return
            values are passed to ``pd.concat``.
        num_of_processes: Number of chunks and of worker processes.
            Defaults to ``n_cpu``.

    Returns:
        The result of ``pd.concat`` over the per-chunk results, in chunk order.
    """
    data_split = np.array_split(data, num_of_processes)
    pool = Pool(num_of_processes)
    try:
        chunk_results = pool.map(func, data_split)
    finally:
        # Always shut the workers down, even when func raises in a worker;
        # otherwise the pool's processes would be left running.
        pool.close()
        pool.join()
    return pd.concat(chunk_results)
 
def run_on_subset(func, data_subset):
    """Apply ``func`` to ``data_subset``, showing a progress bar when available.

    Args:
        func: Callable passed to ``progress_apply`` / ``apply``.
        data_subset: Object exposing ``apply`` (e.g. a pandas Series).

    Returns:
        Whatever ``progress_apply(func)`` or ``apply(func)`` returns.
    """
    # progress_apply is only present after tqdm.pandas() has registered it on
    # pandas objects; fall back to the plain apply when it is missing so the
    # helper still works without tqdm.
    apply_with_progress = getattr(data_subset, "progress_apply", None)
    if apply_with_progress is None:
        return data_subset.apply(func)
    return apply_with_progress(func)
 
def parallelize_on_rows(data, func, num_of_processes=n_cpu):
    """Run ``func`` over ``data`` through ``parallelize``.

    ``func`` is bound as the first argument of ``run_on_subset``; the resulting
    callable is what ``parallelize`` executes on every chunk of ``data``.

    Args:
        data: Forwarded unchanged to ``parallelize``.
        func: Callable handed to ``run_on_subset`` for each chunk.
        num_of_processes: Forwarded to ``parallelize``. Defaults to ``n_cpu``.

    Returns:
        The value returned by ``parallelize``.
    """
    chunk_worker = partial(run_on_subset, func)
    return parallelize(data, chunk_worker, num_of_processes)

def process_slug(subs):
    """Build a slug from ``subs``.

    The text is lowercased, then each character listed below is substituted:
    spaces become hyphens, parentheses/commas/periods are dropped, comparison
    signs become ``gte``/``gt``/``lte``/``lt``, ``/`` becomes ``atau`` and
    ``&`` becomes ``dan``. Consecutive spaces are not collapsed, so they
    produce consecutive hyphens.

    Args:
        subs: Source string.

    Returns:
        The slug string.
    """
    substitutions = str.maketrans({
        ' ': '-',
        '(': '',
        ')': '',
        '≥': 'gte',
        '>': 'gt',
        '≤': 'lte',
        '<': 'lt',
        '/': 'atau',
        '&': 'dan',
        ',': '',
        '.': '',
    })
    return subs.lower().translate(substitutions)

def lower(subs):
    """Return ``subs`` converted to lowercase."""
    return subs.lower()

### Explore Dataset

In [4]:
import pandas as pd

# Load the disease and symptom tables; both paths are relative to the
# current working directory.
disease_df, symptom_df = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/disease.csv', 'dataset/symptom.csv')
)

In [5]:
# Translate each cleaned symptom name (gejala) into Indonesian.
# Newer `translators` releases expose translate_text(...) instead of the
# per-engine google(...) helper, so call whichever one the installed version has.
# NOTE(review): confirm the translate_text signature against the installed version.
symptom_df['Clean Symptom (ID)'] = symptom_df['Clean Symptom'].apply(
    lambda x: (
        ts.translate_text(x, translator='google', to_language='id')
        if hasattr(ts, 'translate_text')
        else ts.google(x, to_language='id')
    )
)

In [6]:
# Display symptom_df, which now includes the 'Clean Symptom (ID)' column.
symptom_df

Unnamed: 0,Symptom,weight,Clean Symptom,Clean Symptom (ID)
0,itching,1,itching,gatal
1,skin_rash,3,skin rash,ruam kulit
2,nodal_skin_eruptions,4,nodal skin eruptions,letusan kulit nodal
3,continuous_sneezing,4,continuous sneezing,bersin terus menerus
4,shivering,5,shivering,gemetaran
...,...,...,...,...
128,inflammatory_nails,2,inflammatory nails,Kuku radang
129,blister,4,blister,lepuh
130,red_sore_around_nose,2,red sore around nose,sakit merah di sekitar hidung
131,yellow_crust_ooze,3,yellow crust ooze,Kapten kuning


In [7]:
# Replace every NaN value with the single-space string " ".
disease_df = disease_df.fillna(" ")

In [8]:
# Display disease_df after its NaN values were replaced with " ".
disease_df

Unnamed: 0,Disease,Description,Disease Slug,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,drug-reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,Malaria,An infectious disease caused by protozoan para...,malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,Allergy,An allergy is an immune system response to a f...,allergy,apply calamine,cover area with bandage,,use ice to compress itching
3,Hypothyroidism,"Hypothyroidism, also called underactive thyroi...",hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
4,Psoriasis,Psoriasis is a common skin disorder that forms...,psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths
5,GERD,"Gastroesophageal reflux disease, or GERD, is a...",gerd,avoid fatty spicy food,avoid lying down after eating,maintain healthy weight,exercise
6,Chronic cholestasis,"Chronic cholestatic diseases, whether occurrin...",chronic-cholestasis,cold baths,anti itch medicine,consult doctor,eat healthy
7,hepatitis A,Hepatitis A is a highly contagious liver infec...,hepatitis-a,Consult nearest hospital,wash hands through,avoid fatty spicy food,medication
8,Osteoarthristis,Osteoarthritis is the most common form of arth...,osteoarthristis,acetaminophen,consult nearest hospital,follow up,salt baths
9,(vertigo) Paroymsal Positional Vertigo,Benign paroxysmal positional vertigo (BPPV) is...,vertigo-paroymsal--positional-vertigo,lie down,avoid sudden change in body,avoid abrupt head movment,relax


In [9]:
# Translate the description, the disease name, and every precaution into Indonesian.
def _translate_to_id(text):
    """Translate ``text`` to Indonesian with the Google backend of `translators`.

    Newer `translators` releases expose translate_text(...) instead of the
    per-engine google(...) helper, so use whichever one is installed.
    NOTE(review): confirm the translate_text signature against the installed version.
    """
    if hasattr(ts, 'translate_text'):
        return ts.translate_text(text, translator='google', to_language='id')
    return ts.google(text, to_language='id')


# One translated "<column> (ID)" column is added per source column, in this order.
for source_column in ['Description', 'Disease', 'Precaution_1',
                      'Precaution_2', 'Precaution_3', 'Precaution_4']:
    disease_df[source_column + ' (ID)'] = disease_df[source_column].apply(_translate_to_id)

In [10]:
# Preview the first rows of disease_df, including the translated "(ID)" columns.
disease_df.head()

Unnamed: 0,Disease,Description,Disease Slug,Precaution_1,Precaution_2,Precaution_3,Precaution_4,Description (ID),Disease (ID),Precaution_1 (ID),Precaution_2 (ID),Precaution_3 (ID),Precaution_4 (ID)
0,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,drug-reaction,stop irritation,consult nearest hospital,stop taking drug,follow up,Reaksi obat yang merugikan (ADR) adalah cedera...,Reaksi obat,Hentikan iritasi,Konsultasikan dengan rumah sakit terdekat,berhenti minum obat,menindaklanjuti
1,Malaria,An infectious disease caused by protozoan para...,malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out,Penyakit menular yang disebabkan oleh parasit ...,Malaria,Konsultasikan dengan rumah sakit terdekat,Hindari makanan berminyak,Hindari Makanan Non Veget,Jauhkan nyamuk
2,Allergy,An allergy is an immune system response to a f...,allergy,apply calamine,cover area with bandage,,use ice to compress itching,Alergi adalah respons sistem kekebalan tubuh t...,Alergi,Terapkan Calamine,Area penutup dengan perban,,Gunakan es untuk mengompres gatal
3,Hypothyroidism,"Hypothyroidism, also called underactive thyroi...",hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep,"Hipotiroidisme, juga disebut tiroid yang kuran...",Hipotiroidisme,mengurangi stres,latihan,makan yang sehat,tidur nyenyak
4,Psoriasis,Psoriasis is a common skin disorder that forms...,psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths,Psoriasis adalah kelainan kulit umum yang memb...,Psoriasis,Cuci tangan dengan air sabun hangat,Hentikan Pendarahan Menggunakan Tekanan,Konsultasikan dengan dokter,mandi garam


In [11]:
# Write both translated tables to CSV without the pandas index column.
for frame, destination in (
    (symptom_df, 'dataset/symptom_id.csv'),
    (disease_df, 'dataset/disease_id.csv'),
):
    frame.to_csv(destination, index=False)