In [None]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules

df = pd.read_csv('datasets/dataset.csv')
s_description = pd.read_csv('datasets/symptom_Description.csv')
s_precaution = pd.read_csv('datasets/symptom_precaution.csv')
s_severity = pd.read_csv('datasets/symptom_severity.csv')    

### 1. Data Preparation and cleaning for apriori algorithm

In [323]:
# Get number of transactions, where each transaction is a disease with its symptoms
total_transactions = df.groupby('Disease').size().shape[0]
print(f"Total Unique Disease Profiles (Transactions): {total_transactions}")

Total Unique Disease Profiles (Transactions): 41


In [324]:
symptoms_columns = [col for col in df.columns if 'Symptom' in col]
df_long = df.melt(id_vars=['Disease'], 
                  value_vars=symptoms_columns, 
                  value_name='Symptom')
df_long.head()

Unnamed: 0,Disease,variable,Symptom
0,Fungal infection,Symptom_1,itching
1,Fungal infection,Symptom_1,skin_rash
2,Fungal infection,Symptom_1,itching
3,Fungal infection,Symptom_1,itching
4,Fungal infection,Symptom_1,itching


In [325]:
# Data Cleaning
df_long['Symptom'] = df_long['Symptom'].str.lower().str.strip()
df_long.dropna(subset=['Symptom'], inplace=True)
df_long['Symptom'] = df_long['Symptom'].str.replace('_', ' ')

In [326]:
# Normalizing synonyms by creating a mapping dictionary
synonym_map = {
    # ------------------ FEVER / CHILLS / TEMP ------------------
    'high fever': 'fever',
    'mild fever': 'fever',
    'shivering': 'chills',
    'cold hands and feets': 'cold extremities',
    'toxic look (typhos)': 'fever',
    'pyrexia': 'fever',
    
    # ------------------ PAIN / ACHES ------------------
    'stomach pain': 'abdominal pain',
    'belly pain': 'abdominal pain',
    'chest pain': 'pain',
    'back pain': 'pain',
    'muscle pain': 'pain',
    'pain behind the eyes': 'headache',
    'hip joint pain': 'joint pain',
    'knee pain': 'joint pain',
    'painful walking': 'pain',
    'neck pain': 'stiff neck', 
    
    # ------------------ WEAKNESS / FATIGUE / BALANCE ------------------
    'weakness in limbs': 'muscle weakness',
    'muscle wasting': 'muscle weakness',
    'lethargy': 'fatigue',
    'malaise': 'fatigue',
    'unsteadiness': 'dizziness',
    'loss of balance': 'dizziness',
    'spinning movements': 'dizziness',
    'altered sensorium': 'dizziness',
    'weakness of one body side': 'muscle weakness', 
    
    # ------------------ SKIN / RASHES ------------------
    'pus filled pimples': 'skin rash',
    'nodal skin eruptions': 'skin rash',
    'skin peeling': 'skin rash',
    'blister': 'skin rash',
    'scurring': 'skin rash',
    'silver like dusting': 'skin rash',
    'red spots over body': 'skin rash',
    'dischromic patches': 'skin rash', 
    'yellow crust ooze': 'skin rash', 
    'internal itching': 'itching', 
    
    # ------------------ DIGESTIVE / EXCRETORY / DISCHARGE ------------------
    'nausea': 'vomiting',
    'acidity': 'indigestion',
    'irritation in anus': 'pain in anal region',
    'foul smell of urine': 'abnormal urine',
    'yellow urine': 'abnormal urine',
    'dark urine': 'abnormal urine',
    'spotting urination': 'burning micturition',
    'continuous feel of urine': 'polyuria',
    'mucoid sputum': 'phlegm',
    'rusty sputum': 'phlegm',
    'blood in sputum': 'phlegm',
    'ulcers on tongue': 'patches in throat',
    
    # ------------------ SWELLING / FLUID ------------------
    'puffy face and eyes': 'fluid overload',
    'swollen extremeties': 'fluid overload',
    'swollen legs': 'fluid overload',
    'swelling of stomach': 'distention of abdomen',
    'swelling joints': 'joint pain',
    'swollen blood vessels': 'prominent veins on calf',
    'swelled lymph nodes': 'swollen lymph nodes',
    
    # ------------------ RESPIRATORY / NASAL ------------------
    'runny nose': 'continuous sneezing',
    'congestion': 'continuous sneezing',
    'sinus pressure': 'headache',
    'throat irritation': 'cough', 
    
    # ------------------ CARDIOVASCULAR ------------------
    'fast heart rate': 'palpitations'
}

print("Before Normalization:", df_long['Symptom'].nunique())
df_long['Symptom'] = df_long['Symptom'].replace(synonym_map)
print("After Normalization:", df_long['Symptom'].nunique())

Before Normalization: 131
After Normalization: 80


In [327]:
# 4. One-Hot Encode Transactions
basket_sets = pd.get_dummies(df_long['Symptom'], prefix='symptom_') # this function is used to convert categorical variables into dummy or indicator variables
basket_sets['Disease'] = df_long['Disease'] # this adds the 'Disease' column back to the one-hot encoded DataFrame

In [328]:
# This creates the final transaction matrix ('basket')
basket = basket_sets.groupby('Disease').max().reset_index().set_index('Disease')
# Final check to ensure binary format (0 or 1)
basket = basket.clip(upper=1)

In [329]:
print("Data organization complete. The 'basket' DataFrame is ready for Apriori mining.")
print(f"Resulting 'basket' structure: {basket.shape[0]} diseases (baskets) and {basket.shape[1]} symptoms (items).")
print("\nFirst 5 rows of the 'basket' DataFrame (Disease vs. Symptom presence):")
basket.head()

Data organization complete. The 'basket' DataFrame is ready for Apriori mining.
Resulting 'basket' structure: 41 diseases (baskets) and 80 symptoms (items).

First 5 rows of the 'basket' DataFrame (Disease vs. Symptom presence):


Unnamed: 0_level_0,symptom__abdominal pain,symptom__abnormal menstruation,symptom__abnormal urine,symptom__acute liver failure,symptom__anxiety,symptom__blackheads,symptom__bladder discomfort,symptom__bloody stool,symptom__blurred and distorted vision,symptom__breathlessness,...,symptom__sunken eyes,symptom__sweating,symptom__swollen lymph nodes,symptom__visual disturbances,symptom__vomiting,symptom__watering from eyes,symptom__weight gain,symptom__weight loss,symptom__yellowing of eyes,symptom__yellowish skin
Disease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(vertigo) Paroymsal Positional Vertigo,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
AIDS,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Acne,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Alcoholic hepatitis,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
Allergy,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False


References: http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/ 

In [330]:
import numpy as np
MIN_SUPPORT = 0.15

MIN_ABSOLUTE_COUNT = int(np.ceil(basket.shape[0] * MIN_SUPPORT))

print(f"\n--- Apriori Analysis Parameters ---")
print(f"Total Transactions (Diseases): {basket.shape[0]}")
print(f"Minimum Support (MIN_SUPPORT): {MIN_SUPPORT} ({MIN_SUPPORT*100:.0f}%)")
print(f"Minimum Absolute Count Required: {MIN_ABSOLUTE_COUNT} diseases")
print("-" * 40)

frequent_itemsets = apriori(basket, min_support=MIN_SUPPORT, use_colnames=True)

# Add itemset length column
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))

# Filter for co-occurrence patterns (itemsets with size >= 2)
co_occurring_symptoms = frequent_itemsets[frequent_itemsets['length'] >= 2].copy()

# Sort the results by support for clear analysis of the strongest co-occurrence patterns
co_occurring_symptoms.sort_values(by='support', ascending=False, inplace=True) # Sort by support descending
co_occurring_symptoms.reset_index(drop=True, inplace=True) # Reset index after sorting

print("\n--- Top Frequent Co-occurring Symptom Combinations ---")
print("These are symptom combinations that frequently appear together in the same disease profile.")
# Format output for better readability
co_occurring_symptoms['support'] = co_occurring_symptoms['support'].round(4)
co_occurring_symptoms['itemsets'] = co_occurring_symptoms['itemsets'].apply(lambda x: tuple(s.replace('symptom__', '').strip() for s in x))
print(co_occurring_symptoms[['support', 'itemsets']])

# --- Interpretation Summary ---
print("\n--- Interpretation Summary ---")
if not co_occurring_symptoms.empty:
    top_pattern = co_occurring_symptoms.iloc[0]
    symptoms = [s.replace('symptom__', '').strip() for s in list(top_pattern['itemsets'])] 
    
    print(f"The strongest co-occurrence pattern found is: {symptoms}")
    print(f"Support: {top_pattern['support']:.4f}")
    print(f"This pattern appears together in {top_pattern['support']*100:.2f}% of all diseases in the dataset (i.e., in {MIN_ABSOLUTE_COUNT} or more diseases).")
else:
    print("No frequent itemsets of size 2 or more were found. The data is highly sparse at MIN_SUPPORT=0.1.")


--- Apriori Analysis Parameters ---
Total Transactions (Diseases): 41
Minimum Support (MIN_SUPPORT): 0.15 (15%)
Minimum Absolute Count Required: 7 diseases
----------------------------------------

--- Top Frequent Co-occurring Symptom Combinations ---
These are symptom combinations that frequently appear together in the same disease profile.
   support                               itemsets
0   0.2195             (vomiting, abdominal pain)
1   0.2195                       (fatigue, fever)
2   0.1951                    (fatigue, vomiting)
3   0.1951           (loss of appetite, vomiting)
4   0.1707       (yellowish skin, abdominal pain)
5   0.1707            (fatigue, loss of appetite)
6   0.1707                      (vomiting, fever)
7   0.1707  (loss of appetite, yellowing of eyes)
8   0.1707             (yellowish skin, vomiting)

--- Interpretation Summary ---
The strongest co-occurrence pattern found is: ['vomiting', 'abdominal pain']
Support: 0.2195
This pattern appears together

### Data augmentation

In [331]:
import numpy as np
from mlxtend.frequent_patterns import apriori

# Create the mirrored dataset
basket_mirrored = basket.copy() 

# 1. Inject Noise: Randomly flip a small percentage (5%) of '1' entries to '0' 
noise_rate = 0.05

# Generate a random mask with the same shape as the mirrored basket, where each entry has a probability of 'noise_rate' to be True
mask = np.random.rand(*basket_mirrored.shape) < noise_rate
basket_mirrored = basket_mirrored.mask(mask & (basket_mirrored == 1), 0)

# 2. Rename and concatenate
basket_mirrored.index = basket_mirrored.index.astype(str) + '_NOISY_MIRROR'
basket_augmented = pd.concat([basket, basket_mirrored])

# --- End of Augmentation ---
MIN_SUPPORT = 0.15 # same support as above
MIN_ABSOLUTE_COUNT = int(np.ceil(basket_augmented.shape[0] * MIN_SUPPORT))

print(f"\n--- Apriori Analysis Parameters (Stochastically Augmented) ---")
print(f"Total Transactions (Diseases): {basket_augmented.shape[0]} (N=41 Original + N=41 Noisy Mirror)")
print(f"Minimum Support (MIN_SUPPORT): {MIN_SUPPORT} ({MIN_SUPPORT*100:.0f}%)")
print(f"Minimum Absolute Count Required: {MIN_ABSOLUTE_COUNT} diseases")
print("-" * 40)

# Run Apriori on the new stochastically augmented basket
frequent_itemsets = apriori(basket_augmented, min_support=MIN_SUPPORT, use_colnames=True)

# Add itemset length column
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))

# Filter for co-occurrence patterns (itemsets with size >= 2)
co_occurring_symptoms = frequent_itemsets[frequent_itemsets['length'] >= 2].copy()

print("\n--- Top 10 Frequent Co-occurring Symptom Combinations ---")
print("These are symptom combinations that frequently appear together in the same disease profile.")

# Check if we have any results before formatting
if len(co_occurring_symptoms) > 0:
    # Sort the results by support for clear analysis of the strongest co-occurrence patterns
    co_occurring_symptoms.sort_values(by='support', ascending=False, inplace=True)
    co_occurring_symptoms.reset_index(drop=True, inplace=True)
    
    # Format output for better readability - convert support to numeric first
    co_occurring_symptoms['support'] = pd.to_numeric(co_occurring_symptoms['support']).round(4)
    co_occurring_symptoms['itemsets'] = co_occurring_symptoms['itemsets'].apply(lambda x: tuple(s.replace('symptom__', '').strip() for s in x))
    print(co_occurring_symptoms[['support', 'itemsets']].head(10))
    
    # --- Interpretation Summary ---
    print("\n--- Interpretation Summary ---")
    top_pattern = co_occurring_symptoms.iloc[0]
    # Clean up symptom names by removing prefix and extra underscores
    symptoms = [s.replace('symptom__', '').replace('_', ' ').strip() for s in list(top_pattern['itemsets'])] 
    
    print(f"The strongest co-occurrence pattern found is: {symptoms}")
    print(f"Support: {top_pattern['support']:.4f}")
    print(f"This pattern appears together in {top_pattern['support']*100:.2f}% of all diseases in the stochastically augmented dataset (i.e., in {MIN_ABSOLUTE_COUNT} or more transactions).")
else:
    print("No frequent itemsets of size 2 or more were found.")
    print(f"Try reducing MIN_SUPPORT below {MIN_SUPPORT} to find patterns.")
    print("Current dataset might be too sparse for the chosen support threshold.")


--- Apriori Analysis Parameters (Stochastically Augmented) ---
Total Transactions (Diseases): 82 (N=41 Original + N=41 Noisy Mirror)
Minimum Support (MIN_SUPPORT): 0.15 (15%)
Minimum Absolute Count Required: 13 diseases
----------------------------------------

--- Top 10 Frequent Co-occurring Symptom Combinations ---
These are symptom combinations that frequently appear together in the same disease profile.
   support                               itemsets
0   0.2073             (vomiting, abdominal pain)
1   0.2073                       (fatigue, fever)
2   0.1829                    (fatigue, vomiting)
3   0.1829           (loss of appetite, vomiting)
4   0.1707       (yellowish skin, abdominal pain)
5   0.1707            (fatigue, loss of appetite)
6   0.1707  (loss of appetite, yellowing of eyes)
7   0.1585                      (vomiting, fever)
8   0.1585             (yellowish skin, vomiting)

--- Interpretation Summary ---
The strongest co-occurrence pattern found is: ['vomitin

