# 01 — Data Cleaning & Standardization (Medical Symptoms)

We clean and standardize symptom data and convert patient cases into transactions.


In [5]:
import pandas as pd
from pathlib import Path

DATA_PATH = Path('../data/dataset.csv')
OUT_PATH = Path('../outputs/symptom_cleaned_transactions.csv')
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


## Step 1 — Remove non-symptom columns and clean text


In [6]:
symptom_cols = [c for c in df.columns if c.lower().startswith('symptom')]
df_sym = df[symptom_cols].copy()

def clean_symptom(x):
    if pd.isna(x): return None
    return str(x).lower().strip().replace('_',' ')

for c in symptom_cols:
    df_sym[c] = df_sym[c].apply(clean_symptom)

df_sym.head()

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,
1,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
2,itching,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
3,itching,skin rash,dischromic patches,,,,,,,,,,,,,,
4,itching,skin rash,nodal skin eruptions,,,,,,,,,,,,,,


## Step 2 — Build transactions (one row = one patient)


In [7]:
transactions = []
for _, row in df_sym.iterrows():
    items = sorted(set(x for x in row if x))
    if len(items) >= 2:
        transactions.append(items)

print('Total transactions:', len(transactions))
print('Example:', transactions[0])

Total transactions: 4920
Example: ['dischromic  patches', 'itching', 'nodal skin eruptions', 'skin rash']


## Step 3 — Save cleaned transactions


In [8]:
out_df = pd.DataFrame({
    'transaction_id': range(1, len(transactions)+1),
    'items': [','.join(t) for t in transactions]
})
out_df.to_csv(OUT_PATH, index=False)
OUT_PATH

PosixPath('../outputs/symptom_cleaned_transactions.csv')