In [1]:
import pandas as pd

# Read the raw symptom table from disk.
df = pd.read_csv("filled_dataset.csv")

# Treat the literal string '0' as missing, then fill any remaining gaps with pd.NA
# so every missing cell is recognised by dropna()/isna() below.
df.replace('0', pd.NA, inplace=True)
df.fillna(value=pd.NA, inplace=True)

# Build {disease: set of symptoms}.
# 1) per disease and per column, collect the non-missing values into a set;
# 2) per disease, merge those per-column sets into a single pooled set.
per_column_sets = df.groupby('Disease').agg(lambda column: set(column.dropna()))
pooled_sets = per_column_sets.apply(lambda sets_in_row: set().union(*sets_in_row), axis=1)
disease_symptom_map = pooled_sets.to_dict()

# Clean symptom name
def clean_symptom(symptom):
    """Normalise a single symptom label.

    Returns ``None`` when ``symptom`` is missing according to pandas
    (``None``, ``NaN``, ``pd.NA``). Otherwise the value is converted to
    ``str``, stripped of surrounding whitespace, lower-cased, and its
    underscores are replaced by spaces.
    """
    if pd.notna(symptom):
        text = str(symptom).strip()
        return text.lower().replace('_', ' ')
    return None

# Function to replace missing symptoms and ensure 10 unique ones
def replace_and_extend(row, target=10):
    """Return the row's disease followed by exactly ``target`` symptom slots.

    The row's own symptoms are cleaned with ``clean_symptom`` and de-duplicated
    (first occurrence wins). If fewer than ``target`` remain, the gaps are filled
    with other symptoms recorded for the same disease in ``disease_symptom_map``;
    any slots still empty are set to ``None``.

    Parameters
    ----------
    row : pandas.Series
        One record containing a ``'Disease'`` entry plus symptom entries.
    target : int, default 10
        Number of symptom slots in the result.

    Returns
    -------
    pandas.Series
        ``[disease, symptom_1, ..., symptom_target]`` with a default integer index.
    """
    disease = row['Disease']

    # Clean the row's own symptoms; dict.fromkeys drops repeats but keeps order.
    own = dict.fromkeys(
        clean_symptom(s) for s in row.drop('Disease').tolist() if pd.notna(s)
    )
    # Cap at `target`: a row that already has more unique symptoms must not
    # produce a longer Series than its siblings (the caller assigns a fixed
    # number of column names afterwards).
    chosen = list(own)[:target]
    seen = set(chosen)

    # The pool stores raw labels, so clean them *before* comparing. Collecting
    # into a set removes raw variants that clean to the same text; .get() keeps
    # a disease that is absent from the map from raising KeyError.
    pool = {
        clean_symptom(s)
        for s in disease_symptom_map.get(disease, ())
        if pd.notna(s)
    }

    # Sorted iteration makes the fill-in choice reproducible across runs
    # (plain set order for strings varies with hash randomisation).
    for candidate in sorted(pool):
        if len(chosen) >= target:
            break
        if candidate not in seen:
            chosen.append(candidate)
            seen.add(candidate)

    # Pad with None when the disease does not have enough distinct symptoms.
    chosen.extend([None] * (target - len(chosen)))

    return pd.Series([disease] + chosen)

# Rebuild every row as: disease + ten symptom slots.
df_filled = df.apply(replace_and_extend, axis=1)

# Name the output columns 'Disease', 'Symptom_1', ..., 'Symptom_10'.
symptom_headers = [f'Symptom_{n}' for n in range(1, 11)]
df_filled.columns = ['Disease', *symptom_headers]

# Write the result to CSV without the index column.
output_path = "final_filled_10_symptoms.csv"
df_filled.to_csv(output_path, index=False)
print("Saved as 'final_filled_10_symptoms.csv'")

Saved as 'final_filled_10_symptoms.csv'


In [4]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Read the 10-symptom dataset
df = pd.read_csv('final_filled_10_symptoms.csv')

# Step 2: Discard columns that hold no values at all
df = df.dropna(axis=1, how='all')

# Step 3: Fill every remaining gap with that column's most frequent value.
# NOTE(review): the imputer is fitted on the whole frame, so the 'Disease'
# target column is imputed as well if it has gaps - confirm that is intended.
imputer = SimpleImputer(strategy='most_frequent')
df_imputed = imputer.fit_transform(df)
df_cleaned = pd.DataFrame(df_imputed, columns=df.columns)

# Step 4: Turn disease names into integer class labels
label_encoder = LabelEncoder()
df_cleaned['Disease'] = label_encoder.fit_transform(df_cleaned['Disease'])

# Step 5: Integer-encode the first six symptom columns, one encoder per column.
# The encoders are kept in a dict so later cells can encode new inputs.
# NOTE(review): only Symptom_1..Symptom_6 are used; any further symptom
# columns in the file are ignored as features.
symptom_cols = [f'Symptom_{idx}' for idx in range(1, 7)]
symptom_encoders = {col: LabelEncoder() for col in symptom_cols}
for col, le in symptom_encoders.items():
    df_cleaned[col] = le.fit_transform(df_cleaned[col])

# Step 6: Feature matrix and target vector
X = df_cleaned[symptom_cols]
y = df_cleaned['Disease']

# Step 7: 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 8: Fit a 100-tree random forest with a fixed seed
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 9: Score the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 99.70%


In [5]:
# Imported at the top of the cell (it used to sit between statements) so the
# cell's dependencies are visible at a glance.
import numpy as np

# Example new input symptoms
new_input = {
    'Symptom_1': 'vomiting',
    'Symptom_2': 'fatigue',
    'Symptom_3': 'high fever',
    'Symptom_4': 'headache',
    'Symptom_5': 'nausea',
    'Symptom_6': 'loss of appetite'
}

# Encode each symptom with the LabelEncoder fitted on the matching column.
encoded_input = []

for col in symptom_cols:
    value = new_input[col]
    if value in symptom_encoders[col].classes_:
        encoded_value = symptom_encoders[col].transform([value])[0]
    else:
        # Best-effort fallback. NOTE(review): 0 is also a code the encoder
        # assigns to one of its known labels, so an unknown symptom is treated
        # as that label rather than as "unknown".
        print(f"Warning: '{value}' not seen during training. Using 0 as default.")
        encoded_value = 0  # or handle differently
    encoded_input.append(encoded_value)

# Shape as a single sample (1 row, one column per symptom).
encoded_input = np.array(encoded_input).reshape(1, -1)

# The model was fitted on a DataFrame with named columns; predict with the same
# column names so the feature order is explicit and scikit-learn does not have
# to guess (newer versions warn when names are missing).
input_frame = pd.DataFrame(encoded_input, columns=symptom_cols)

# Predict the disease and map the class index back to its name
predicted_label = model.predict(input_frame)[0]
predicted_disease = label_encoder.inverse_transform([predicted_label])[0]

print(f"Predicted Disease: {predicted_disease}")


Predicted Disease: Hypoglycemia




In [6]:
# User-friendly command-line interface
print("Please enter six symptoms (exactly as they appeared in the dataset).")

# Ask for one symptom per feature column the model was trained on.
# Input is normalised the same way the dataset labels were cleaned
# (trimmed, lower-cased, underscores turned into spaces), so an entry such as
# "High_Fever" still matches the stored label "high fever".
user_input = {}
for col in symptom_cols:
    typed = input(f"Enter {col}: ")
    user_input[col] = typed.strip().lower().replace('_', ' ')

# Encode user input with the per-column encoders
encoded_input = []
for col in symptom_cols:
    value = user_input[col]
    classes = symptom_encoders[col].classes_
    if value in classes:
        encoded_value = symptom_encoders[col].transform([value])[0]
    else:
        # Best-effort fallback. NOTE(review): 0 is also the code of one of the
        # encoder's known labels, so the unknown entry is treated as that label.
        print(f"Warning: '{value}' not seen during training. Using default (0).")
        encoded_value = 0
    encoded_input.append(encoded_value)

# Build a one-row frame carrying the training column names. This keeps the
# feature order explicit and removes this cell's reliance on `np` having been
# imported by an earlier cell.
input_frame = pd.DataFrame([encoded_input], columns=symptom_cols)
predicted_label = model.predict(input_frame)[0]
predicted_disease = label_encoder.inverse_transform([predicted_label])[0]

print(f"\n🩺 Predicted Disease: {predicted_disease}")


Please enter six symptoms (exactly as they appeared in the dataset).


Enter Symptom_1:  vomiting
Enter Symptom_2:  fatigue
Enter Symptom_3:  high fever
Enter Symptom_4:  headache
Enter Symptom_5:  nausea
Enter Symptom_6:  loss of appetite



🩺 Predicted Disease: Hypoglycemia




In [7]:
import tkinter as tk
from tkinter import ttk, messagebox
import numpy as np

# Top-level window
root = tk.Tk()
root.title("Disease Predictor from Symptoms")
root.geometry("400x400")

# Heading shown above the dropdowns
heading = tk.Label(root, text="Select 6 symptoms:", font=("Arial", 12, "bold"))
heading.pack(pady=10)

# One read-only dropdown per symptom slot; the StringVars are read back
# when a prediction is requested.
symptom_vars = []
dropdowns = []

for slot in range(1, 7):
    # Each dropdown lists the labels known to *its own* column encoder,
    # sorted, with the first one pre-selected.
    choices = sorted(symptom_encoders[f'Symptom_{slot}'].classes_)
    selection = tk.StringVar()
    selection.set(choices[0])
    symptom_vars.append(selection)

    tk.Label(root, text=f"Symptom {slot}:").pack()
    combo = ttk.Combobox(root, textvariable=selection, values=choices, state="readonly")
    combo.pack(pady=5)
    dropdowns.append(combo)

# Prediction function
def predict_disease():
    """Predict a disease from the current dropdown selections and show it.

    Each selected label is encoded with the encoder of its symptom column,
    the codes are passed to ``model`` as a single sample, and the predicted
    class is translated back to a disease name. The result is shown in an
    info dialog; any exception raised along the way is shown in an error
    dialog instead of propagating.
    """
    try:
        codes = [
            symptom_encoders[f'Symptom_{slot}'].transform([chosen.get()])[0]
            for slot, chosen in enumerate(symptom_vars, start=1)
        ]

        sample = np.array(codes).reshape(1, -1)
        label = model.predict(sample)[0]
        name = label_encoder.inverse_transform([label])[0]

        messagebox.showinfo("Prediction Result", f"🩺 Predicted Disease: {name}")
    except Exception as err:
        messagebox.showerror("Error", f"Prediction failed: {str(err)}")

# Button that runs predict_disease on click
predict_btn = tk.Button(
    root,
    text="Predict Disease",
    command=predict_disease,
    bg="green",
    fg="white",
    font=("Arial", 12, "bold"),
)
predict_btn.pack(pady=20)

# Start Tk's event loop
root.mainloop()




In [10]:
import pickle

# Serialise the trained `model` object to disk with pickle.
# NOTE(review): only the model is written here; `label_encoder` and
# `symptom_encoders` are not saved by this cell - confirm whether whatever
# loads this file needs them too.
model_filename = '1disease_model.pkl'

with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)


In [12]:
import pandas as pd

# Reload the saved dataset and keep only the six symptom columns.
# Note: this rebinds `df` and `X`, which earlier cells used for the
# training data.
df = pd.read_csv("final_filled_10_symptoms.csv")
feature_names = [f'Symptom_{n}' for n in range(1, 7)]
X = df[feature_names]


In [13]:
# Plain Python list of the column labels of X
columns = list(X.columns)

In [14]:
import pickle

# Write the `columns` list to disk with pickle.
columns_filename = 'columns.pkl'

with open(columns_filename, 'wb') as columns_file:
    pickle.dump(columns, columns_file)
