### Importing the Libraries

In [1]:
import pandas as pd
import numpy as np

### Importing the Dataset

In [2]:
data = pd.read_csv('Health_dataset.csv', header= None)

In [3]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
1,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
2,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,


### Cleaning the Dataset

In [4]:
# Removing the first row
data.drop(index = 0, axis = 0, inplace = True)

In [5]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
1,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
2,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
5,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [6]:
data.shape

(4920, 18)

In [7]:
# Creating the symptom dataset in which only symptoms are present
symptom_df = data.iloc[:, 1:]

In [8]:
symptom_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
1,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
2,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
4,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
5,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [9]:
# Resetting the index
symptom_df.reset_index(inplace = True, drop = True)

In [10]:
symptom_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [11]:
symptom_df.shape

(4920, 17)

In [12]:
# Creating the list of all symptoms for each disease
symptom = []
for i in range(0,4920):
    symptom.append([str(symptom_df.values[i,j]) for j in range(0,17)])

In [14]:
symptom[0]

['itching',
 ' skin_rash',
 ' nodal_skin_eruptions',
 ' dischromic _patches',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan']

In [15]:
# Getting the unique symptom names
column = pd.unique(symptom_df.iloc[:,:].values.ravel('K'))

In [16]:
column

array(['itching', ' skin_rash', ' continuous_sneezing', ' shivering',
       ' stomach_pain', ' acidity', ' vomiting', ' indigestion',
       ' muscle_wasting', ' patches_in_throat', ' fatigue',
       ' weight_loss', ' sunken_eyes', ' cough', ' headache',
       ' chest_pain', ' back_pain', ' weakness_in_limbs', ' chills',
       ' joint_pain', ' yellowish_skin', ' constipation',
       ' pain_during_bowel_movements', ' breathlessness', ' cramps',
       ' weight_gain', ' mood_swings', ' neck_pain', ' muscle_weakness',
       ' stiff_neck', ' pus_filled_pimples', ' burning_micturition',
       ' bladder_discomfort', ' high_fever', ' nodal_skin_eruptions',
       ' ulcers_on_tongue', ' loss_of_appetite', ' restlessness',
       ' dehydration', ' dizziness', ' weakness_of_one_body_side',
       ' lethargy', ' nausea', ' abdominal_pain', ' pain_in_anal_region',
       ' sweating', ' bruising', ' cold_hands_and_feets', ' anxiety',
       ' knee_pain', ' swelling_joints', ' blackheads',
  

In [17]:
column.shape

(132,)

In [18]:
# Creating a new dataframe having symptom names as columns
new_df = pd.DataFrame(columns = column)
new_df.head()

Unnamed: 0,itching,skin_rash,continuous_sneezing,shivering,stomach_pain,acidity,vomiting,indigestion,muscle_wasting,patches_in_throat,...,abnormal_menstruation,receiving_unsterile_injections,coma,sinus_pressure,palpitations,stomach_bleeding,runny_nose,congestion,blood_in_sputum,loss_of_smell


In [19]:
# Creating the encoded list of arrays for each symptom
new_list = np.zeros((4920,132), dtype = int)

for row in range(0,4920):
    for col in range(0,132):
        for sym in range(0,17):
            if column[col] == symptom[row][sym]:
                new_list[row][col] = 1

In [20]:
new_list.shape

(4920, 132)

In [21]:
new_list

array([[1, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [22]:
# Encoding this array to the new_df dataframe of symptoms
for i in range(len(new_list)):
    new_df.loc[i] = new_list[i]

In [23]:
new_df.head()

Unnamed: 0,itching,skin_rash,continuous_sneezing,shivering,stomach_pain,acidity,vomiting,indigestion,muscle_wasting,patches_in_throat,...,abnormal_menstruation,receiving_unsterile_injections,coma,sinus_pressure,palpitations,stomach_bleeding,runny_nose,congestion,blood_in_sputum,loss_of_smell
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# We need to add disease name to it
disease_df = pd.read_csv('Health_dataset.csv')

In [25]:
disease = disease_df['Disease']

In [26]:
disease.head()

0    Fungal infection
1    Fungal infection
2    Fungal infection
3    Fungal infection
4    Fungal infection
Name: Disease, dtype: object

In [27]:
disease.shape

(4920,)

In [28]:
new_df.shape

(4920, 132)

In [29]:
# Concatenating the Disease column with the previous dataframe
disease_concat_df = pd.concat([disease,new_df], axis=1)
disease_concat_df.head()

Unnamed: 0,Disease,itching,skin_rash,continuous_sneezing,shivering,stomach_pain,acidity,vomiting,indigestion,muscle_wasting,...,abnormal_menstruation,receiving_unsterile_injections,coma,sinus_pressure,palpitations,stomach_bleeding,runny_nose,congestion,blood_in_sputum,loss_of_smell
0,Fungal infection,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Saving this new dataframe
disease_concat_df.to_csv("./Final Dataset/final_healthdata.csv", index=False)