In [1]:
import pandas as pd
import json
from pathlib import Path

# Load one JSONL file (first batch of 100 reports)
file = Path("../data/raw/openfda_page_000.jsonl")

# JSONL holds one JSON document per line. Blank / whitespace-only lines
# (e.g. a stray empty line in the file) are skipped, because
# json.loads("") raises JSONDecodeError and would abort the whole load.
with file.open("r", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

# Flatten nested JSON into a DataFrame; nested dict keys become
# dot-separated column names (e.g. "patient.patientweight").
df = pd.json_normalize(records)
df.head()

Unnamed: 0,safetyreportversion,safetyreportid,primarysourcecountry,occurcountry,transmissiondateformat,transmissiondate,reporttype,serious,receivedateformat,receivedate,...,patient.patientonsetageunit,patient.patientagegroup,seriousnesshospitalization,seriousnessother,patient.patientweight,seriousnesslifethreatening,patient.summary.narrativeincludeclinical,seriousnessdeath,seriousnessdisabling,authoritynumb
0,2,10003304,US,US,102,20141212,1,2,102,20140312,...,,,,,,,,,,
1,3,10003310,US,US,102,20151125,1,2,102,20140312,...,801.0,6.0,,,,,,,,
2,1,10003349,US,US,102,20141002,1,1,102,20140312,...,,,1.0,1.0,,,,,,
3,3,10003432,US,US,102,20151125,1,1,102,20140312,...,801.0,6.0,1.0,,,,,,,
4,2,10003582,US,US,102,20141002,1,2,102,20140312,...,801.0,,,,82.1,,,,,


In [2]:
# Expand reactions into one row per (report, reaction) pair.
# Report-level fields are repeated on every reaction row of that report.
CLEAN_COLUMNS = [
    "safetyreportid",
    "receivedate",
    "sex",
    "age",
    "age_unit",
    "seriousnessdeath",
    "seriousnesshospitalization",
    "reaction",
]

rows = []
for rec in records:
    # `or {}` also covers a "patient" key that is present but null;
    # rec.get("patient", {}) would hand back None there and crash on .get().
    patient = rec.get("patient") or {}
    base = {
        "safetyreportid": rec.get("safetyreportid"),
        "receivedate": rec.get("receivedate"),
        "sex": patient.get("patientsex"),
        "age": patient.get("patientonsetage"),
        "age_unit": patient.get("patientonsetageunit"),
        "seriousnessdeath": rec.get("seriousnessdeath"),
        "seriousnesshospitalization": rec.get("seriousnesshospitalization"),
    }

    # reactions (list of dicts); a report with no reaction entries
    # contributes no rows to the output.
    for reaction in patient.get("reaction") or []:
        row = {**base, "reaction": reaction.get("reactionmeddrapt")}
        rows.append(row)

# Passing the column list keeps the schema stable even when `rows` is empty,
# so selecting df_clean["reaction"] afterwards does not raise KeyError.
df_clean = pd.DataFrame(rows, columns=CLEAN_COLUMNS)
df_clean.head(10)


Unnamed: 0,safetyreportid,receivedate,sex,age,age_unit,seriousnessdeath,seriousnesshospitalization,reaction
0,10003304,20140312,2,,,,,Drug hypersensitivity
1,10003310,20140312,2,66.0,801.0,,,Back pain
2,10003349,20140312,2,,,,1.0,Cerebrovascular accident
3,10003349,20140312,2,,,,1.0,Blood pressure increased
4,10003349,20140312,2,,,,1.0,Pain
5,10003432,20140312,2,84.0,801.0,,1.0,Oedema peripheral
6,10003432,20140312,2,84.0,801.0,,1.0,Fluid retention
7,10003582,20140312,1,58.0,801.0,,,Hypertension
8,10003582,20140312,1,58.0,801.0,,,Dehydration
9,10003582,20140312,1,58.0,801.0,,,Night sweats


In [3]:
# Tally how often each reaction term occurs (most frequent first)
# and display the first 20 entries of that tally.
reaction_counts = df_clean["reaction"].value_counts()
reaction_counts.iloc[:20]


reaction
Rash                            9
Drug interaction                8
Dizziness                       7
Type 2 diabetes mellitus        7
Fatigue                         7
Pain                            6
Headache                        6
Insomnia                        5
Weight increased                5
Dyspnoea                        5
Nausea                          5
Diarrhoea                       5
Haemoglobin decreased           5
Gastrointestinal haemorrhage    5
Hypertension                    5
Contusion                       4
Oedema peripheral               4
Drug administration error       4
Platelet count decreased        3
Blood count abnormal            3
Name: count, dtype: int64

In [4]:
# Write the one-row-per-reaction table to disk as CSV (without the index).
out_path = Path("../data/processed/aspirin_clean.csv")
# to_csv does not create missing directories; make sure the target folder
# exists so a fresh checkout without data/processed does not fail here.
out_path.parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(out_path, index=False)
print(f"Saved cleaned data to {out_path}")

Saved cleaned data to ../data/processed/aspirin_clean.csv
