In [1]:
import pandas as pd
import random
import re
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# Load dataset
df = pd.read_csv("C:/Users/sooda/Downloads/Healthcare_dataset/dataset.csv")

In [7]:
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [9]:
# Extract symptoms and disease
symptom_cols = [col for col in df.columns if 'Symptom' in col]
df['combined_symptoms'] = df[symptom_cols].apply(lambda row: ' '.join([str(x).strip() for x in row if pd.notna(x)]), axis=1)

In [11]:
# Prepare symptom sentence
def symptoms_to_sentence(symptoms_str):
    symptoms = symptoms_str.split()
    selected = random.sample(symptoms, min(2, len(symptoms)))
    return f"I have been experiencing {', '.join(selected)}."

df['symptom_sentence'] = df['combined_symptoms'].apply(symptoms_to_sentence)

In [15]:
# Build Symptom Phrase Dictionary
all_symptoms = set()
for col in symptom_cols:
    all_symptoms.update(df[col].dropna().unique())

symptom_map = {}
for sym in all_symptoms:
    with_space = sym.replace('_', ' ').lower()
    symptom_map[with_space] = sym.lower()

In [17]:
# Function to normalize input
def normalize_input(text, symptom_map):
    text = text.lower()
    for phrase, underscored in symptom_map.items():
        text = re.sub(rf'\b{re.escape(phrase)}\b', underscored, text)
    return text

In [19]:
# Vectorize original (clean) symptom sentences
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['symptom_sentence'])
y = df['Disease']

In [21]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [25]:
# Predict & Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.818089430894309
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       0.71      0.83      0.77        18
                                   AIDS       1.00      1.00      1.00        30
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       0.85      0.92      0.88        25
                                Allergy       1.00      1.00      1.00        24
                              Arthritis       1.00      0.78      0.88        23
                       Bronchial Asthma       0.82      1.00      0.90        33
                   Cervical spondylosis       0.92      1.00      0.96        23
                            Chicken pox       0.65      0.62      0.63        21
                    Chronic cholestasis       0.28      0.53      0.36        15
                            Common Cold       0.94      0.70      0.80        23

In [27]:
# Save model & vectorizer
with open('chatbot_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# Save symptom_map
with open('symptom_map.pkl', 'wb') as f:
    pickle.dump(symptom_map, f)