In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import random

This dataset is available here: https://www.kaggle.com/datasets/itachi9604/disease-symptom-description-dataset/data

In [3]:
df = pd.read_csv("data/disease-symptom-description-dataset.csv")

In [4]:
df

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning_movements,loss_of_balance,unsteadiness,,,,,,,,,,,
4916,Acne,skin_rash,pus_filled_pimples,blackheads,scurring,,,,,,,,,,,,,
4917,Urinary tract infection,burning_micturition,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,,,,,,,,,,,,,
4918,Psoriasis,skin_rash,joint_pain,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,,,,,,,,,,,


In [5]:
# Flatten the symptom columns into a single list and remove NaN values
all_symptoms = pd.unique(df.iloc[:, 1:].values.ravel('K'))

# Filter out NaN values
all_symptoms = all_symptoms[~pd.isna(all_symptoms)]

# Display the unique symptoms
print(all_symptoms)

['itching' ' skin_rash' ' continuous_sneezing' ' shivering'
 ' stomach_pain' ' acidity' ' vomiting' ' indigestion' ' muscle_wasting'
 ' patches_in_throat' ' fatigue' ' weight_loss' ' sunken_eyes' ' cough'
 ' headache' ' chest_pain' ' back_pain' ' weakness_in_limbs' ' chills'
 ' joint_pain' ' yellowish_skin' ' constipation'
 ' pain_during_bowel_movements' ' breathlessness' ' cramps' ' weight_gain'
 ' mood_swings' ' neck_pain' ' muscle_weakness' ' stiff_neck'
 ' pus_filled_pimples' ' burning_micturition' ' bladder_discomfort'
 ' high_fever' ' nodal_skin_eruptions' ' ulcers_on_tongue'
 ' loss_of_appetite' ' restlessness' ' dehydration' ' dizziness'
 ' weakness_of_one_body_side' ' lethargy' ' nausea' ' abdominal_pain'
 ' pain_in_anal_region' ' sweating' ' bruising' ' cold_hands_and_feets'
 ' anxiety' ' knee_pain' ' swelling_joints' ' blackheads'
 ' foul_smell_of urine' ' skin_peeling' ' blister' ' dischromic _patches'
 ' watering_from_eyes' ' extra_marital_contacts' ' diarrhoea'
 ' loss_of

In [6]:
df_original = pd.read_csv('data/disease_symptoms_patient_profile.csv')  # Original dataset
df_additional = df

# Clean column names by stripping any spaces
df_additional.columns = df_additional.columns.str.strip()

# Define the symptom columns in the original dataset
symptom_columns = ['Fever', 'Cough', 'Fatigue', 'Difficulty Breathing']

# Function to map symptoms from the second dataset to the original dataset
def map_symptoms_to_columns(symptoms):
    # Initialize all symptoms as 'No' by default
    mapped = {
        'Fever': 'Yes' if 'fever' in symptoms else 'No',
        'Cough': 'Yes' if 'cough' in symptoms else 'No',
        'Fatigue': 'Yes' if 'fatigue' in symptoms else 'No',
        'Difficulty Breathing': 'Yes' if 'difficulty_breathing' in symptoms else 'No'
    }
    return mapped

# Function to randomly assign missing data for Age, Gender, Blood Pressure, Cholesterol Level
def assign_missing_data(row):
    row['Age'] = random.randint(18, 90)  # Random age between 18 and 90
    row['Gender'] = random.choice(['Male', 'Female'])  # Random gender
    row['Blood Pressure'] = random.choice(['Low', 'Normal', 'High'])  # Random blood pressure
    row['Cholesterol Level'] = random.choice(['Normal', 'High'])  # Random cholesterol level
    
#     from sklearn.impute import SimpleImputer

#     ## Impute missing values for numerical columns 
#     numerical_columns = ['Age']
#     imputer_num = SimpleImputer(strategy='mean')  # Impute with the mean for numerical columns
#     df_combined[numerical_columns] = imputer_num.fit_transform(df_combined[numerical_columns])

#     # Impute missing values for categorical columns 
#     categorical_columns = ['Gender', 'Blood Pressure', 'Cholesterol Level']
#     imputer_cat = SimpleImputer(strategy='most_frequent')  # Impute with the most frequent value for categorical columns
#     df_combined[categorical_columns] = imputer_cat.fit_transform(df_combined[categorical_columns])

    return row

# Create a list to store new rows
new_rows = []

# Iterate through the second dataset and map the symptoms
for index, row in df_additional.iterrows():
    disease = row['Disease']
    symptoms = row.dropna().tolist()[1:]  # Drop 'Disease' and NaN values (Symptoms start from index 1)
    
    # Map the symptoms to the relevant columns in the original dataset
    mapped_symptoms = map_symptoms_to_columns(symptoms)
    
    # Create a new row with mapped symptoms and fill missing values
    new_row = {
        'Disease': disease,
        'Fever': mapped_symptoms['Fever'],
        'Cough': mapped_symptoms['Cough'],
        'Fatigue': mapped_symptoms['Fatigue'],
        'Difficulty Breathing': mapped_symptoms['Difficulty Breathing'],
        'Age': None,  # This will be filled later
        'Gender': None,  # This will be filled later
        'Blood Pressure': None,  # This will be filled later
        'Cholesterol Level': None,  # This will be filled later
        'Outcome Variable': 'Positive'
    }
    
    new_rows.append(new_row)

# Create a DataFrame from the new rows
new_data = pd.DataFrame(new_rows)

# Apply the function to fill missing data (Age, Gender, Blood Pressure, Cholesterol Level)
new_data = new_data.apply(assign_missing_data, axis=1)

# Combine the new data with the original dataset
df_combined = pd.concat([df_original, new_data], ignore_index=True)

# Verify the final DataFrame
print(df_combined.head())
print(df_combined.shape)  # Check the size of the combined dataset

# Optionally, check for any missing data in the combined dataset
print(df_combined.isna().sum())


       Disease Fever Cough Fatigue Difficulty Breathing  Age  Gender  \
0    Influenza   Yes    No     Yes                  Yes   19  Female   
1  Common Cold    No   Yes     Yes                   No   25  Female   
2       Eczema    No   Yes     Yes                   No   25  Female   
3       Asthma   Yes   Yes      No                  Yes   25    Male   
4       Asthma   Yes   Yes      No                  Yes   25    Male   

  Blood Pressure Cholesterol Level Outcome Variable  
0            Low            Normal         Positive  
1         Normal            Normal         Negative  
2         Normal            Normal         Negative  
3         Normal            Normal         Positive  
4         Normal            Normal         Positive  
(5269, 10)
Disease                 0
Fever                   0
Cough                   0
Fatigue                 0
Difficulty Breathing    0
Age                     0
Gender                  0
Blood Pressure          0
Cholesterol Level       

In [7]:
df_combined

Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,Influenza,Yes,No,Yes,Yes,19,Female,Low,Normal,Positive
1,Common Cold,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
2,Eczema,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
3,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
4,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
...,...,...,...,...,...,...,...,...,...,...
5264,(vertigo) Paroymsal Positional Vertigo,No,No,No,No,67,Male,Low,High,Positive
5265,Acne,No,No,No,No,64,Female,Normal,High,Positive
5266,Urinary tract infection,No,No,No,No,73,Female,Low,High,Positive
5267,Psoriasis,No,No,No,No,42,Female,Normal,High,Positive


In [8]:
df_combined.to_csv("data/combined_datasets.csv")