In [91]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
data = pd.read_csv('./symptoms-disease-dataset.csv')

# Remove spaces & convert to lowercase
data.columns = data.columns.str.replace(" ", "_").str.lower()
data.head()


Unnamed: 0,disease,fever,cough,fatigue,difficulty_breathing,age,gender,blood_pressure,cholesterol_level,outcome_variable
0,Influenza,Yes,No,Yes,Yes,19,Female,Low,Normal,Positive
1,Common Cold,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
2,Eczema,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
3,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
4,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive


In [92]:
# Remove duplicate rows
data.drop_duplicates(inplace=True)
print(f"Remaining duplicates: {data.duplicated().sum()}")
print(data.dtypes)

Remaining duplicates: 0
disease                 object
fever                   object
cough                   object
fatigue                 object
difficulty_breathing    object
age                      int64
gender                  object
blood_pressure          object
cholesterol_level       object
outcome_variable        object
dtype: object


In [93]:
# Convert the data points to numerical values
# Convert Yes/No to 1/0
binary_cols = ["fever", "cough", "fatigue", "difficulty_breathing"]
for col in binary_cols:
    data[col] = data[col].map({"Yes": 1, "No": 0})

# convert Gender
data["gender"] = data["gender"].map({"Male": 1, "Female": 0})

# convert Blood Pressure
data["blood_pressure"] = data["blood_pressure"].map({"Low": 0, "Normal": 1, "High": 2})

# convert Cholesterol Level
data["cholesterol_level"] = data["cholesterol_level"].map({"Low": 0, "Normal": 1, "High": 2})

# convert Outcome Variable
data["outcome_variable"] = data["outcome_variable"].map({"Positive": 1, "Negative": 0})

# Verify changes
data.head()


Unnamed: 0,disease,fever,cough,fatigue,difficulty_breathing,age,gender,blood_pressure,cholesterol_level,outcome_variable
0,Influenza,1,0,1,1,19,0,0,1,1
1,Common Cold,0,1,1,0,25,0,1,1,0
2,Eczema,0,1,1,0,25,0,1,1,0
3,Asthma,1,1,0,1,25,1,1,1,1
5,Eczema,1,0,0,0,25,0,1,1,1


In [94]:
# Encoding disease column
encoder = LabelEncoder()
data["disease"] = encoder.fit_transform(data["disease"])

# Save the encoder mapping for later
disease_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print(disease_mapping)


{'Acne': 0, 'Allergic Rhinitis': 1, "Alzheimer's Disease": 2, 'Anemia': 3, 'Anxiety Disorders': 4, 'Appendicitis': 5, 'Asthma': 6, 'Atherosclerosis': 7, 'Autism Spectrum Disorder (ASD)': 8, 'Bipolar Disorder': 9, 'Bladder Cancer': 10, 'Brain Tumor': 11, 'Breast Cancer': 12, 'Bronchitis': 13, 'Cataracts': 14, 'Cerebral Palsy': 15, 'Chickenpox': 16, 'Cholecystitis': 17, 'Cholera': 18, 'Chronic Kidney Disease': 19, 'Chronic Obstructive Pulmonary Disease (COPD)': 20, 'Chronic Obstructive Pulmonary...': 21, 'Cirrhosis': 22, 'Colorectal Cancer': 23, 'Common Cold': 24, 'Conjunctivitis (Pink Eye)': 25, 'Coronary Artery Disease': 26, "Crohn's Disease": 27, 'Cystic Fibrosis': 28, 'Dementia': 29, 'Dengue Fever': 30, 'Depression': 31, 'Diabetes': 32, 'Diverticulitis': 33, 'Down Syndrome': 34, 'Eating Disorders (Anorexia,...': 35, 'Ebola Virus': 36, 'Eczema': 37, 'Endometriosis': 38, 'Epilepsy': 39, 'Esophageal Cancer': 40, 'Fibromyalgia': 41, 'Gastroenteritis': 42, 'Glaucoma': 43, 'Gout': 44, 'HIV

In [95]:
# Normalize the age column

scaler = MinMaxScaler()
data["age"] = scaler.fit_transform(data[["age"]])

# Verify scaling
data.head()


Unnamed: 0,disease,fever,cough,fatigue,difficulty_breathing,age,gender,blood_pressure,cholesterol_level,outcome_variable
0,56,1,0,1,1,0.0,0,0,1,1
1,24,0,1,1,0,0.084507,0,1,1,0
2,37,0,1,1,0,0.084507,0,1,1,0
3,6,1,1,0,1,0.084507,1,1,1,1
5,37,1,0,0,0,0.084507,0,1,1,1


In [96]:
data.describe

<bound method NDFrame.describe of      disease  fever  cough  fatigue  difficulty_breathing       age  gender  \
0         56      1      0        1                     1  0.000000       0   
1         24      0      1        1                     0  0.084507       0   
2         37      0      1        1                     0  0.084507       0   
3          6      1      1        0                     1  0.084507       1   
5         37      1      0        0                     0  0.084507       0   
..       ...    ...    ...      ...                   ...       ...     ...   
341      106      1      1        1                     0  0.718310       0   
342      114      0      0        1                     0  0.718310       0   
343      101      1      0        1                     0  0.859155       0   
345      101      1      0        1                     0  0.929577       1   
347      101      1      0        1                     0  1.000000       0   

     blood_pressu

In [97]:
data.to_csv('./cleaned-symptoms-data.csv', index=False)