# 02 — Association Rule Mining (Medical Symptoms)

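We mine frequent symptom itemsets with the Apriori algorithm, derive association rules from them, and export both (plus a chart of the most common symptoms) to the outputs directory.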


In [1]:
import pandas as pd
from pathlib import Path
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from collections import Counter
import matplotlib.pyplot as plt

# Input CSV with an 'items' column holding comma-separated symptom names
# (one transaction per row); all artifacts of this notebook go to OUTDIR.
DATA_PATH = Path('../outputs/symptom_cleaned_transactions.csv')
OUTDIR = Path('../outputs')

df = pd.read_csv(DATA_PATH)

# Parse each 'items' cell into a list of symptom tokens.
# - Missing cells (read as NaN by pandas) are skipped instead of crashing on `.split`.
# - Tokens are stripped so 'a, b' and 'a,b' produce the same items.
# - Empty tokens and rows that end up with no tokens are dropped, so they
#   cannot show up as a spurious '' item in the one-hot encoding.
transactions = [
    tokens
    for tokens in (
        [sym.strip() for sym in str(row).split(',') if sym.strip()]
        for row in df['items'].dropna()
    )
    if tokens
]
len(transactions)

4920

## One-hot encoding


In [2]:
# Learn the item vocabulary from the transactions, then encode each
# transaction as one row with a column per distinct item.
te = TransactionEncoder()
te.fit(transactions)
te_array = te.transform(transactions)

# Label the encoded matrix with the item names discovered during fitting.
basket = pd.DataFrame(data=te_array, columns=te.columns_)
basket.head()

Unnamed: 0,abdominal pain,abnormal menstruation,acidity,acute liver failure,altered sensorium,anxiety,back pain,belly pain,blackheads,bladder discomfort,...,vomiting,watering from eyes,weakness in limbs,weakness of one body side,weight gain,weight loss,yellow crust ooze,yellow urine,yellowing of eyes,yellowish skin
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Frequent itemsets


In [3]:
# Frequent itemsets of up to three items with support >= 0.02, annotated with
# their size and ordered from most to least supported.
itemsets = (
    apriori(basket, min_support=0.02, use_colnames=True, max_len=3)
    .assign(length=lambda found: found['itemsets'].apply(len))
    .sort_values('support', ascending=False)
)
itemsets.head(10)

Unnamed: 0,support,itemsets,length
41,0.392683,(fatigue),1
121,0.389024,(vomiting),1
45,0.276829,(high fever),1
60,0.234146,(loss of appetite),1
71,0.232927,(nausea),1
44,0.230488,(headache),1
0,0.209756,(abdominal pain),1
602,0.19878,"(high fever, fatigue)",2
929,0.19878,"(nausea, vomiting)",2
130,0.185366,(yellowish skin),1


In [4]:
# The 'itemsets' column holds frozensets, which would be written to CSV as
# "frozenset({...})" reprs. Flatten them to sorted, comma-joined strings on a
# copy (the in-memory `itemsets` frame must keep its frozensets for
# association_rules) so the file is readable without eval().
itemsets_out = itemsets.copy()
itemsets_out['itemsets'] = itemsets_out['itemsets'].apply(lambda s: ', '.join(sorted(s)))
itemsets_out.to_csv(OUTDIR / 'frequent_itemsets.csv', index=False)

## Association rules


In [5]:
# Rules with confidence >= 0.6, keeping only those whose lift exceeds 1.2,
# ranked by lift and then confidence (both descending).
rules = (
    association_rules(itemsets, metric='confidence', min_threshold=0.6)
    .loc[lambda found: found['lift'] > 1.2]
    .sort_values(['lift', 'confidence'], ascending=False)
)
rules.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
1047,"(depression, abnormal menstruation)",(enlarged thyroid),0.02439,0.02439,0.02439,1.0,41.0,1.0,0.023795,inf,1.0,1.0,1.0,1.0
1050,(enlarged thyroid),"(depression, abnormal menstruation)",0.02439,0.02439,0.02439,1.0,41.0,1.0,0.023795,inf,1.0,1.0,1.0,1.0
1051,"(swollen extremeties, enlarged thyroid)",(brittle nails),0.02439,0.02439,0.02439,1.0,41.0,1.0,0.023795,inf,1.0,1.0,1.0,1.0
1052,"(enlarged thyroid, brittle nails)",(swollen extremeties),0.02439,0.02439,0.02439,1.0,41.0,1.0,0.023795,inf,1.0,1.0,1.0,1.0
1053,"(swollen extremeties, brittle nails)",(enlarged thyroid),0.02439,0.02439,0.02439,1.0,41.0,1.0,0.023795,inf,1.0,1.0,1.0,1.0
1054,(enlarged thyroid),"(swollen extremeties, brittle nails)",0.02439,0.02439,0.02439,1.0,41.0,1.0,0.023795,inf,1.0,1.0,1.0,1.0
1055,(swollen extremeties),"(enlarged thyroid, brittle nails)",0.02439,0.02439,0.02439,1.0,41.0,1.0,0.023795,inf,1.0,1.0,1.0,1.0
1056,(brittle nails),"(swollen extremeties, enlarged thyroid)",0.02439,0.02439,0.02439,1.0,41.0,1.0,0.023795,inf,1.0,1.0,1.0,1.0
1061,"(chest pain, loss of appetite)",(blood in sputum),0.02439,0.02439,0.02439,1.0,41.0,1.0,0.023795,inf,1.0,1.0,1.0,1.0
1062,(blood in sputum),"(chest pain, loss of appetite)",0.02439,0.02439,0.02439,1.0,41.0,1.0,0.023795,inf,1.0,1.0,1.0,1.0


In [6]:
def _join_items(itemset):
    """Render a set of item names as a sorted, comma-separated string."""
    return ', '.join(sorted(itemset))


# Build an export copy in which both sides of each rule are plain strings;
# `rules` itself is left untouched.
rules_out = rules.assign(
    antecedents=rules['antecedents'].apply(_join_items),
    consequents=rules['consequents'].apply(_join_items),
)
rules_out.to_csv(OUTDIR / 'association_rules.csv', index=False)

## Visualization: most common symptoms


In [7]:
# Count how often each individual symptom occurs across all transactions and
# keep the 15 most frequent ones.
counter = Counter(sym for t in transactions for sym in t)
top = counter.most_common(15)
items, counts = zip(*top)

# Horizontal bar chart; sequences are reversed so the most frequent symptom
# is drawn at the top. The chart shows single-symptom frequencies (not
# co-occurrence), and the title and axis label say so.
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(items[::-1], counts[::-1])
ax.set_title(f'Top {len(items)} Most Common Symptoms')
ax.set_xlabel('Number of occurrences')
fig.tight_layout()
fig.savefig(OUTDIR / 'top_symptoms.png', dpi=200)
# Close the figure after saving so it is not rendered inline or kept in memory.
plt.close(fig)