### For Running in Google Colab

In [None]:
%%capture
!pip install fasttext
!pip install emoji

In [None]:
# Colab-only setup: mount Google Drive and work from the project folder.
# Outside Colab the `google.colab` package cannot be imported; in that case the
# cell is skipped so relative paths resolve against the current working directory.
import os

try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')
    os.chdir("/content/drive/MyDrive/Healthcare/")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Helper Functions

In [1]:
# Import all libraries used by the notebook and its helper functions
import os
import csv
import time
import json
import emoji
import fasttext
import numpy as np
import pandas as pd
import string
# import mlflow
import json
import pickle

from datetime import date

from sklearn.model_selection import train_test_split
from sklearn.metrics import (f1_score,
                             precision_score,
                             recall_score, 
                             accuracy_score)

In [2]:
# Helper functions for text processing
def remove_newline(sentence: str) -> str:
    """Replace line breaks with a single space and normalise whitespace.

    Args:
        sentence: Input text, possibly spanning several lines.

    Returns:
        The text on one line, words separated by exactly one space and with
        no leading or trailing whitespace.
    """
    # str.split() without arguments splits on every whitespace run, including
    # "\n" and "\r\n". Deleting the newline first (as before) would glue the
    # last word of one line to the first word of the next.
    return " ".join(sentence.split())

def remove_underscore(sentence: str) -> str:
    """Turn every underscore into a space, then collapse whitespace runs to one space."""
    words = sentence.replace("_", " ").split()
    return " ".join(words)

def remove_punctuation(sentence: str) -> str:
    """Delete ASCII punctuation, lower-case the text and collapse whitespace."""
    kept = "".join(ch for ch in sentence if ch not in string.punctuation)
    return " ".join(kept.lower().split())


def clean_nonASCII(sentence: str) -> str:
    """Remove emoji and all other non-ASCII characters, then lower-case.

    Args:
        sentence: Input text.

    Returns:
        Lower-cased text containing only ASCII characters, with words
        separated by single spaces.
    """
    # emoji >= 2.0 removed get_emoji_regexp() in favour of replace_emoji();
    # support both so the notebook runs with whichever version pip installed.
    if hasattr(emoji, "replace_emoji"):
        without_emoji = emoji.replace_emoji(sentence, replace=" ")
    else:
        without_emoji = emoji.get_emoji_regexp().sub(" ", sentence)

    # Encoding with errors="ignore" silently drops every remaining non-ASCII character.
    ascii_only = without_emoji.encode("ascii", "ignore").decode()
    return " ".join(ascii_only.lower().split())

#multiprocessing
from multiprocessing import  Pool
from functools import partial
from os import cpu_count
 
# os.cpu_count() may return None when the count cannot be determined;
# fall back to one worker so the defaults below are always a valid int.
n_cpu = cpu_count() or 1


def parallelize(data, func, num_of_processes=n_cpu):
    """Apply ``func`` to row-wise chunks of ``data`` in a process pool.

    Args:
        data: pandas Series/DataFrame (or array-like) to split into chunks.
        func: Picklable callable that receives one chunk and returns a pandas
            object.
        num_of_processes: Number of chunks and of worker processes.

    Returns:
        ``pd.concat`` of the per-chunk results, in chunk order.
    """
    if hasattr(data, "iloc"):
        # Split by position so every chunk stays a pandas object with its
        # original index, independent of how np.array_split treats pandas input.
        positions = np.array_split(np.arange(len(data)), num_of_processes)
        data_split = [data.iloc[chunk] for chunk in positions]
    else:
        data_split = np.array_split(data, num_of_processes)

    pool = Pool(num_of_processes)
    try:
        results = pool.map(func, data_split)
    finally:
        # Always release the worker processes, even when ``func`` raises.
        pool.close()
        pool.join()
    return pd.concat(results)


def run_on_subset(func, data_subset):
    """Apply ``func`` to one chunk, showing a tqdm progress bar when possible.

    ``progress_apply`` only exists on pandas objects after ``tqdm.pandas()``
    has been called; when it is missing, plain ``apply`` is used instead.
    """
    apply_fn = getattr(data_subset, "progress_apply", data_subset.apply)
    return apply_fn(func)


def parallelize_on_rows(data, func, num_of_processes=n_cpu):
    """Run ``func`` over ``data`` via ``run_on_subset``, chunked across processes."""
    return parallelize(data, partial(run_on_subset, func), num_of_processes)

def process_slug(subs):
    """Convert a disease name into a lowercase, hyphen-separated slug.

    Surrounding whitespace is stripped first, so padded names such as
    'Hypertension ' and 'Hypertension' produce the same slug instead of
    'hypertension-' vs 'hypertension'. Interior whitespace is left as is:
    every single space becomes one '-'.

    Args:
        subs: Raw name, e.g. 'Paralysis (brain hemorrhage)'.

    Returns:
        The slug, e.g. 'paralysis-brain-hemorrhage'.
    """
    # No replacement text contains a character that is itself a key, so a
    # single translate() pass is equivalent to the chained replace() calls.
    replacements = {
        ' ': '-',
        '(': None,
        ')': None,
        '≥': 'gte',
        '>': 'gt',
        '≤': 'lte',
        '<': 'lt',
        '/': 'atau',  # Indonesian for "or"
        '&': 'dan',   # Indonesian for "and"
        ',': None,
        '.': None,
    }
    return subs.strip().lower().translate(str.maketrans(replacements))

def lower(subs):
    """Return ``subs`` converted to lowercase."""
    return subs.lower()

### Explore Dataset

In [1]:
# Load the four raw CSV files that make up the disease/symptom dataset.
(
    dataset_df,
    symptom_desc_df,
    symptom_prec_df,
    symptom_severity_df,
) = map(
    pd.read_csv,
    (
        'dataset/dataset.csv',
        'dataset/symptom_Description.csv',
        'dataset/symptom_precaution.csv',
        'dataset/Symptom-severity.csv',
    ),
)

In [3]:
# Summarise the main dataset: column names, non-null counts and dtypes.
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Disease     4920 non-null   object
 1   Symptom_1   4920 non-null   object
 2   Symptom_2   4920 non-null   object
 3   Symptom_3   4920 non-null   object
 4   Symptom_4   4572 non-null   object
 5   Symptom_5   3714 non-null   object
 6   Symptom_6   2934 non-null   object
 7   Symptom_7   2268 non-null   object
 8   Symptom_8   1944 non-null   object
 9   Symptom_9   1692 non-null   object
 10  Symptom_10  1512 non-null   object
 11  Symptom_11  1194 non-null   object
 12  Symptom_12  744 non-null    object
 13  Symptom_13  504 non-null    object
 14  Symptom_14  306 non-null    object
 15  Symptom_15  240 non-null    object
 16  Symptom_16  192 non-null    object
 17  Symptom_17  72 non-null     object
dtypes: object(18)
memory usage: 692.0+ KB


In [10]:
dataset_df['Disease'].unique() # List the distinct diseases (the labels used for modeling)

array(['Fungal infection', 'Allergy', 'GERD', 'Chronic cholestasis',
       'Drug Reaction', 'Peptic ulcer diseae', 'AIDS', 'Diabetes',
       'Gastroenteritis', 'Bronchial Asthma', 'Hypertension ', 'Migraine',
       'Cervical spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice',
       'Malaria', 'Chicken pox', 'Dengue', 'Typhoid', 'hepatitis A',
       'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E',
       'Alcoholic hepatitis', 'Tuberculosis', 'Common Cold', 'Pneumonia',
       'Dimorphic hemmorhoids(piles)', 'Heart attack', 'Varicose veins',
       'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia',
       'Osteoarthristis', 'Arthritis',
       '(vertigo) Paroymsal  Positional Vertigo', 'Acne',
       'Urinary tract infection', 'Psoriasis', 'Impetigo'], dtype=object)

In [11]:
len(dataset_df['Disease'].unique()) # number of labels (multiclass classification)

41

In [None]:
# Strip leading/trailing whitespace from every disease label. Handling only
# 'Diabetes ' leaves other padded names (e.g. 'Hypertension ') untouched, and
# those would not match their unpadded spelling in the description table.
dataset_df['Disease'] = dataset_df['Disease'].str.strip()

In [4]:
# Preview the first rows: one disease plus up to 17 symptom columns per row.
dataset_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [7]:
# Replace missing symptoms (NaN) with a single space so every cell is a string.
dataset_df = dataset_df.fillna(" ")

# Create one new column holding all symptoms of the row (used for training).
# The symptoms are joined with an explicit space: plain `+` concatenation only
# keeps them apart when the raw values happen to carry their own leading space.
# Repeated spaces are collapsed later by the text-cleaning helpers.
symptom_columns = [col for col in dataset_df.columns if col.startswith("Symptom_")]
dataset_df['TEXT'] = dataset_df[symptom_columns].apply(" ".join, axis=1)

In [8]:
# Preview the dataset again to inspect the newly added TEXT column.
dataset_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,TEXT
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,itching skin_rash nodal_skin_eruptions dischro...
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,skin_rash nodal_skin_eruptions dischromic _pa...
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,itching nodal_skin_eruptions dischromic _patch...
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,,itching skin_rash dischromic _patches ...
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,,itching skin_rash nodal_skin_eruptions ...


In [6]:
# Preview the symptom-description table (columns: Disease, Description).
symptom_desc_df.head()

Unnamed: 0,Disease,Description
0,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...
1,Malaria,An infectious disease caused by protozoan para...
2,Allergy,An allergy is an immune system response to a f...
3,Hypothyroidism,"Hypothyroidism, also called underactive thyroi..."
4,Psoriasis,Psoriasis is a common skin disorder that forms...


In [9]:
# Summarise the description table: column names, non-null counts and dtypes.
symptom_desc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Disease      41 non-null     object
 1   Description  41 non-null     object
dtypes: object(2)
memory usage: 784.0+ bytes


In [7]:
# List the distinct disease names present in the description table.
symptom_desc_df['Disease'].unique()

array(['Drug Reaction', 'Malaria', 'Allergy', 'Hypothyroidism',
       'Psoriasis', 'GERD', 'Chronic cholestasis', 'hepatitis A',
       'Osteoarthristis', '(vertigo) Paroymsal  Positional Vertigo',
       'Hypoglycemia', 'Acne', 'Diabetes', 'Impetigo', 'Hypertension',
       'Peptic ulcer diseae', 'Dimorphic hemorrhoids(piles)',
       'Common Cold', 'Chicken pox', 'Cervical spondylosis',
       'Hyperthyroidism', 'Urinary tract infection', 'Varicose veins',
       'AIDS', 'Paralysis (brain hemorrhage)', 'Typhoid', 'Hepatitis B',
       'Fungal infection', 'Hepatitis C', 'Migraine', 'Bronchial Asthma',
       'Alcoholic hepatitis', 'Jaundice', 'Hepatitis E', 'Dengue',
       'Hepatitis D', 'Heart attack', 'Pneumonia', 'Arthritis',
       'Gastroenteritis', 'Tuberculosis'], dtype=object)

In [8]:
len(symptom_desc_df['Disease'].unique()) # same number of labels as in the main dataset dataframe

41

In [12]:
# Preview the symptom-precaution table (Disease plus Precaution_1..Precaution_4).
symptom_prec_df.head()

Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Drug Reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,Malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,Allergy,apply calamine,cover area with bandage,,use ice to compress itching
3,Hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
4,Psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths


In [14]:
# Summarise the precaution table: column names, non-null counts and dtypes.
symptom_prec_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Disease       41 non-null     object
 1   Precaution_1  41 non-null     object
 2   Precaution_2  41 non-null     object
 3   Precaution_3  40 non-null     object
 4   Precaution_4  40 non-null     object
dtypes: object(5)
memory usage: 1.7+ KB


In [13]:
# List the distinct disease names present in the precaution table.
# NOTE: some names differ from symptom_desc_df (trailing spaces in 'Diabetes ' and
# 'Hypertension ', 'hemmorhoids' vs 'hemorrhoids'); this matters when the two
# tables are merged on the Disease column.
symptom_prec_df['Disease'].unique()

array(['Drug Reaction', 'Malaria', 'Allergy', 'Hypothyroidism',
       'Psoriasis', 'GERD', 'Chronic cholestasis', 'hepatitis A',
       'Osteoarthristis', '(vertigo) Paroymsal  Positional Vertigo',
       'Hypoglycemia', 'Acne', 'Diabetes ', 'Impetigo', 'Hypertension ',
       'Peptic ulcer diseae', 'Dimorphic hemmorhoids(piles)',
       'Common Cold', 'Chicken pox', 'Cervical spondylosis',
       'Hyperthyroidism', 'Urinary tract infection', 'Varicose veins',
       'AIDS', 'Paralysis (brain hemorrhage)', 'Typhoid', 'Hepatitis B',
       'Fungal infection', 'Hepatitis C', 'Migraine', 'Bronchial Asthma',
       'Alcoholic hepatitis', 'Jaundice', 'Hepatitis E', 'Dengue',
       'Hepatitis D', 'Heart attack', 'Pneumonia', 'Arthritis',
       'Gastroenteritis', 'Tuberculosis'], dtype=object)

In [15]:
# Number of distinct disease names in the precaution table.
len(symptom_prec_df['Disease'].unique())

41

In [16]:
# Preview the symptom-severity table (columns: Symptom, weight).
symptom_severity_df.head()

Unnamed: 0,Symptom,weight
0,itching,1
1,skin_rash,3
2,nodal_skin_eruptions,4
3,continuous_sneezing,4
4,shivering,5


In [11]:
# Number of rows in the symptom-severity table.
len(symptom_severity_df)

133

In [12]:
# Show the symptoms that carry the highest severity weight.
symptom_severity_df.loc[
    symptom_severity_df['weight'].eq(symptom_severity_df['weight'].max())
]

Unnamed: 0,Symptom,weight
25,high_fever,7
46,swelling_of_stomach,7
56,chest_pain,7
57,weakness_in_limbs,7
113,coma,7


#### Cleaning the dataset

In [17]:
# Add a readable symptom name (underscores replaced by spaces), then save the table.
symptom_severity_df['Clean Symptom'] = symptom_severity_df['Symptom'].map(remove_underscore)

symptom_severity_df.to_csv("dataset/symptom.csv", index=False)

In [14]:
# Derive a slug (lowercase, '-' instead of spaces) from each disease name in both lookup tables.
symptom_desc_df['Disease Slug'] = symptom_desc_df['Disease'].map(process_slug)
symptom_prec_df['Disease Slug'] = symptom_prec_df['Disease'].map(process_slug)

In [18]:
# Write both lookup tables to CSV without the index column.
symptom_desc_df.to_csv("dataset/symptom_desc.csv", index=False)
symptom_prec_df.to_csv("dataset/symptom_prec.csv", index=False)

In [19]:
# Merge symptom description and precaution into one dataframe.
# The two tables spell some disease names differently (e.g. 'Diabetes' vs
# 'Diabetes '), so an inner join on the raw name + slug silently drops them.
# Match on the whitespace-stripped name and keep the slug of the description table.
desc_clean = symptom_desc_df.assign(Disease=symptom_desc_df["Disease"].str.strip())
prec_clean = (
    symptom_prec_df
    .assign(Disease=symptom_prec_df["Disease"].str.strip())
    .drop(columns=["Disease Slug"])
)
symptom_df = pd.merge(desc_clean, prec_clean, on="Disease")

# Report names that still have no counterpart (genuinely different spellings),
# because the inner join leaves them out of the result.
unmatched = sorted(
    set(desc_clean["Disease"]).symmetric_difference(prec_clean["Disease"])
)
if unmatched:
    print(f"WARNING: {len(unmatched)} disease name(s) without a match were dropped: {unmatched}")

symptom_df.to_csv('dataset/disease.csv',index=False)

### Prepare Dataset

In [20]:
# Normalise the symptom text for modeling; each cleaning step feeds the next.
dataset_df["TEXT"] = (
    dataset_df["TEXT"]
    .apply(remove_underscore)
    .apply(lower)
    .apply(remove_punctuation)
    .apply(remove_newline)
    .apply(clean_nonASCII)
)

# The training label is the slug form of the disease name.
dataset_df['LABEL'] = dataset_df["Disease"].apply(process_slug)

# Keep only the two columns needed for training.
dataset_df = dataset_df.loc[:, ["TEXT", "LABEL"]]



In [21]:
# Hold out 30% of the rows for testing; stratifying on LABEL keeps the class
# proportions equal in both parts, and the fixed seed makes the split repeatable.
train_dataset, test_dataset = train_test_split(
    dataset_df,
    stratify=dataset_df["LABEL"],
    test_size=0.3,
    random_state=42,
)

In [22]:
# Display the training split (TEXT and LABEL columns).
train_dataset

Unnamed: 0,TEXT,LABEL
2585,stomach pain acidity ulcers on tongue vomiting...,gerd
1917,chills fatigue cough high fever sweating malai...,pneumonia
1180,vomiting headache nausea spinning movements lo...,vertigo-paroymsal--positional-vertigo
72,weight loss restlessness lethargy irregular su...,diabetes
1601,skin rash pus filled pimples blackheads scurring,acne
...,...,...
1288,vomiting indigestion loss of appetite abdomina...,peptic-ulcer-diseae
2205,chills vomiting sweating headache nausea diarr...,malaria
2499,skin rash joint pain skin peeling silver like ...,psoriasis
4697,joint pain vomiting fatigue high fever yellowi...,hepatitis-e


In [29]:
# Save the train and test splits as CSV.
# exist_ok=True creates the folder when it is missing and does nothing otherwise,
# replacing the separate exists-check followed by makedirs.
# NOTE(review): earlier cells use the lowercase 'dataset/' folder; on a
# case-sensitive filesystem './Dataset/' is a different directory — confirm intended.
os.makedirs('./Dataset/', exist_ok=True)

train_dataset.to_csv('./Dataset/train_dataset.csv', index=False)
test_dataset.to_csv('./Dataset/test_dataset.csv', index=False)

In [28]:
# Display the test split (TEXT and LABEL columns).
test_dataset

Unnamed: 0,TEXT,LABEL
4706,fatigue mood swings weight loss restlessness s...,hyperthyroidism
1812,skin rash chills joint pain vomiting fatigue h...,dengue
3876,joint pain vomiting fatigue yellowish skin dar...,hepatitis-d
4125,chills vomiting fatigue weight loss cough high...,tuberculosis
3361,skin rash high fever blister red sore around n...,impetigo
...,...,...
3855,continuous sneezing shivering chills watering ...,allergy
1122,cramps bruising obesity swollen legs swollen b...,varicose-veins
1914,chills cough high fever breathlessness sweatin...,pneumonia
4667,joint pain neck pain knee pain hip joint pain ...,osteoarthristis


In [27]:
# Create the training file for fastText (non-TensorFlow):
# one example per line in the form "__label__<LABEL> <TEXT>".
import os

trainset_path = './Dataset/train_fasttext.txt'

X_train = train_dataset["TEXT"]
y_train = train_dataset["LABEL"]

# Make the cell runnable on its own: the target folder may not exist yet.
os.makedirs(os.path.dirname(trainset_path), exist_ok=True)

# Explicit UTF-8 so the file does not depend on the platform's default encoding.
with open(trainset_path, "w", encoding="utf-8") as f:
    f.writelines(f"__label__{y} {x}\n" for x, y in zip(X_train, y_train))