In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Load the train and test splits from the Kaggle input dataset.
train_df = pd.read_csv("/kaggle/input/dataset2/symptom-disease-train-dataset.csv")
test_df = pd.read_csv("/kaggle/input/dataset2/symptom-disease-test-dataset.csv")

# Drop every row that has a missing value in any column. The names are
# rebound to the cleaned frames instead of mutating with inplace=True, which
# has no performance benefit and hides state changes.
train_df = train_df.dropna()
test_df = test_df.dropna()

# Keep only test rows whose label also occurs in the training set; the label
# encoder used further down is fitted on the training labels only.
# This filter shrinks the evaluation set, so report how many rows it removes
# instead of discarding them silently.
valid_labels = set(train_df['label'])
seen_label_mask = test_df['label'].isin(valid_labels)
n_unseen = int((~seen_label_mask).sum())
if n_unseen:
    print(f"⚠️ Dropped {n_unseen} of {len(test_df)} test rows whose label is absent from the training set")
test_df = test_df[seen_label_mask]

# Split each frame into its raw text (features) and its label (target).
X_train_text = train_df['text']
y_train = train_df['label']
X_test_text = test_df['text']
y_test = test_df['label']

# Turn the raw text into TF-IDF features. The vocabulary (English stop words
# removed, at most 10000 terms) is learned from the training text only and is
# then applied unchanged to the test text.
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Map the labels to integer codes. The encoder is fitted on the training
# labels alone; the test labels are only transformed with it.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Train a 100-tree Random Forest on the TF-IDF features.
# random_state pins the result so reruns are reproducible; n_jobs=-1 builds
# the trees on all available cores, which only affects wall-clock time and
# not the fitted model.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train_encoded)

# Evaluate the model on the test features and report overall accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test_encoded, y_pred)
print(f"\n✅ Test Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n")

# Restrict the report to the encoded classes that occur in the test labels or
# in the predictions. Both arrays hold codes produced via label_encoder (the
# model was fitted on encoded training labels), so every value is already a
# valid index into label_encoder.classes_ and needs no extra bounds filter.
used_classes = np.union1d(y_test_encoded, y_pred)
valid_indices = used_classes

# classes_ may hold non-string values (the labels print as integers), while
# the report needs string display names, so convert explicitly.
target_names = [str(label_encoder.classes_[i]) for i in valid_indices]

# zero_division=0 makes undefined precision/recall (a class with no true or no
# predicted samples) show up as 0 instead of emitting a warning.
print(classification_report(y_test_encoded, y_pred, labels=valid_indices, target_names=target_names, zero_division=0))


# Persist the three fitted objects, each to its own file, so they can be
# loaded together later: the classifier, the TF-IDF vectorizer and the label
# encoder.
artifacts_by_path = {
    "/kaggle/working/disease_model.pkl": model,
    "/kaggle/working/vectorizer.pkl": vectorizer,
    "/kaggle/working/label_encoder.pkl": label_encoder,
}
for artifact_path, artifact in artifacts_by_path.items():
    joblib.dump(artifact, artifact_path)
print("\n✅ Model, vectorizer, and label encoder saved to /kaggle/working/")



✅ Test Accuracy: 98.49%

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        22
           7       1.00      1.00      1.00        21
          27       1.00      1.00      1.00        21
          33       1.00      1.00      1.00        27
          35       1.00      1.00      1.00        27
          72       0.97      1.00      0.99        36
         149       0.93      1.00      0.96        37
         186       0.95      1.00      0.98        20
         193       0.97      0.94      0.96        34
         207       1.00      1.00      1.00        23
         221       0.00      0.00      0.00         0
         234       0.97      1.00      0.98        28
         275       0.97      0.94      0.96        36
         284       1.00      0.65      0.79        17
         285       1.00      1.00      1.00        26
         297       1.00      1.00      1.00        32
         308       0.94      1.

In [2]:
import json

# Load the mapping file. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default encoding.
with open('/kaggle/input/dataset2/mapping.json', 'r', encoding='utf-8') as f:
    raw_map = json.load(f)

# The keys of the mapping are the disease names.
disease_names = list(raw_map.keys())

# Print the total number of diseases
print(f"✅ Total diseases: {len(disease_names)}\n")

# Print every disease name, numbered from 1
for i, name in enumerate(disease_names, 1):
    print(f"{i}. {name}")


✅ Total diseases: 1082

1. (Vertigo) Paroymsal  Positional Vertigo
2. Abdominal Aortic Aneurysm
3. Acanthosis Nigricans
4. Achalasia
5. Achilles Tendinitis
6. Achilles Tendon Rupture
7. Acl Injury
8. Acne
9. Acoustic Neuroma
10. Acromegaly
11. Actinic Keratosis
12. Acute Coronary Syndrome
13. Acute Flaccid Myelitis
14. Acute Liver Failure
15. Acute Lymphocytic Leukemia
16. Acute Myelogenous Leukemia
17. Acute Sinusitis
18. Addisons Disease
19. Adenomyosis
20. Adhd
21. Adjustment Disorders
22. Adrenal Cancer
23. Adult Adhd
24. Adult Congenital Heart Disease
25. Adult Stills Disease
26. Age Spots
27. Agoraphobia
28. Aids
29. Airplane Ear
30. Albinism
31. Alcohol Intolerance
32. Alcohol Poisoning
33. Alcohol Use Disorder
34. Alcoholic Hepatitis
35. Allergies
36. Allergy
37. Alpha Gal Syndrome
38. Alzheimers Disease
39. Ambiguous Genitalia
40. Ameloblastoma
41. Amenorrhea
42. Amnesia
43. Ampullary Cancer
44. Amyloidosis
45. Amyotrophic Lateral Sclerosis
46. Anal Cancer
47. Anal Fissure
48.