In [47]:
import pandas as pd
import os

In [48]:
# Directory that holds the raw FAERS 2020 Q4 extract (the *20Q4.txt files).
# Set the RX_RISK_RADAR_DATA environment variable to run this notebook on
# another machine; when it is unset, the original local path is used.
data_path = os.environ.get(
    "RX_RISK_RADAR_DATA",
    "/Users/a.avira/Pet/Portfolio_Projects/Rx_Risk_Radar/data/raw",
)

In [49]:
# Echo the configured raw-data directory to confirm the setting.
print(data_path)

/Users/a.avira/Pet/Portfolio_Projects/Rx_Risk_Radar/data/raw


In [50]:
# Sanity check: confirm the raw-data directory is reachable and list the files in it.
os.listdir(data_path)

['THER20Q4.txt',
 'DEMO20Q4.txt',
 'RPSR20Q4.txt',
 'INDI20Q4.txt',
 'DRUG20Q4.txt',
 'FDA-FAERS-Data-Dictionary.pdf',
 'OUTC20Q4.txt',
 'REAC20Q4.txt']

In [67]:
# Load the three '$'-delimited FAERS 2020 Q4 tables used in this notebook.
def _read_faers_table(filename):
    """Read one FAERS text file from ``data_path`` into a DataFrame.

    Args:
        filename: Name of a file inside ``data_path`` (e.g. "DEMO20Q4.txt").

    Returns:
        pandas.DataFrame parsed with '$' as the field separator, latin1
        decoding, and low_memory=False so each column's dtype is inferred
        from the whole file rather than chunk by chunk.
    """
    return pd.read_csv(
        os.path.join(data_path, filename),
        sep='$',
        encoding='latin1',
        low_memory=False,
    )


demo = _read_faers_table("DEMO20Q4.txt")
drug = _read_faers_table("DRUG20Q4.txt")
reac = _read_faers_table("REAC20Q4.txt")


In [68]:
# The FAERS .txt files are large and contain mixed data types (e.g., strings and numbers in the same column).
# With the pandas default (low_memory=True) a file is parsed internally in chunks, so a column's dtype can be
# inferred differently from chunk to chunk, which triggers DtypeWarning messages.
# Setting low_memory=False tells pandas to process the entire file at once before inferring data types.
# This avoids the DtypeWarning messages and ensures more accurate and consistent column type detection.

In [62]:
# List every DEMO column with its positional index, so a column referred to
# by number (e.g. column 21) can be matched to its name.
for i, col in enumerate(demo.columns):
    print(f"{i}: {col}")

0: primaryid
1: caseid
2: caseversion
3: i_f_code
4: event_dt
5: mfr_dt
6: init_fda_dt
7: fda_dt
8: rept_cod
9: auth_num
10: mfr_num
11: mfr_sndr
12: lit_ref
13: age
14: age_cod
15: age_grp
16: sex
17: e_sub
18: wt
19: wt_cod
20: rept_dt
21: to_mfr
22: occp_cod
23: reporter_country
24: occr_country


In [63]:
# Check the shape and preview
print("Demo shape:", demo.shape)
demo.head()

Demo shape: (436148, 25)


Unnamed: 0,primaryid,caseid,caseversion,i_f_code,event_dt,mfr_dt,init_fda_dt,fda_dt,rept_cod,auth_num,...,age_grp,sex,e_sub,wt,wt_cod,rept_dt,to_mfr,occp_cod,reporter_country,occr_country
0,100046573,10004657,3,F,20120731.0,20201030.0,20140312,20201103,EXP,,...,,F,Y,81.63,KG,20201103.0,,LW,US,US
1,100046962,10004696,2,F,,20201019.0,20140312,20201021,EXP,,...,,M,Y,,,20201021.0,,HP,PL,PL
2,100048793,10004879,3,F,20050908.0,20201030.0,20140312,20201102,EXP,,...,,F,Y,,,20201102.0,,LW,US,US
3,100051383,10005138,3,F,1999.0,20201018.0,20140312,20201020,EXP,,...,,F,Y,83.0,KG,20201020.0,,LW,US,US
4,100075524,10007552,4,F,199908.0,20201018.0,20140313,20201021,EXP,,...,,F,Y,90.7,KG,20201021.0,,LW,US,US


In [69]:
# Check the shape and preview
print("Drug shape:", drug.shape)
drug.head()

Drug shape: (1918927, 20)


Unnamed: 0,primaryid,caseid,drug_seq,role_cod,drugname,prod_ai,val_vbm,route,dose_vbm,cum_dose_chr,cum_dose_unit,dechal,rechal,lot_num,exp_dt,nda_num,dose_amt,dose_unit,dose_form,dose_freq
0,100046573,10004657,1,PS,LIPITOR,ATORVASTATIN CALCIUM,1,Oral,"40 MG, UNK",,,D,,,,20702.0,40.0,MG,FILM-COATED TABLET,
1,100046573,10004657,2,C,TOPROL XL,METOPROLOL SUCCINATE,1,,UNK,,,,,,,,,,,
2,100046962,10004696,1,PS,QUETIAPINE.,QUETIAPINE,1,Unknown,500 MG,,,,,,,78679.0,500.0,MG,,
3,100046962,10004696,2,I,CITALOPRAM,CITALOPRAM HYDROBROMIDE,1,Unknown,10 MG,,,,,,,77040.0,10.0,MG,,
4,100046962,10004696,3,I,CITALOPRAM,CITALOPRAM HYDROBROMIDE,1,,,,,,,,,77040.0,,,,


In [70]:
# Check the shape and preview
print("Reac shape:", reac.shape)
reac.head()

Reac shape: (1522657, 4)


Unnamed: 0,primaryid,caseid,pt,drug_rec_act
0,100046573,10004657,Type 2 diabetes mellitus,
1,100046962,10004696,Abnormal behaviour,
2,100046962,10004696,Drug interaction,
3,100046962,10004696,Energy increased,
4,100046962,10004696,Irritability,


## 🧠 Why join the DEMO, DRUG, and REAC tables?

Each FAERS file contains a part of a case:

- `DEMO`: patient info (age, sex, country)
- `DRUG`: drugs involved (names, role codes)
- `REAC`: adverse reactions reported

All are linked by `primaryid`, which represents one safety report.

Joining them gives a full view of:
- What happened (reaction)
- Who it happened to (patient)
- What drug(s) were involved

This allows:
- Analyzing side effects by demographics
- Identifying drugs with high-risk profiles
- Building visualizations for drug safety trends

In [64]:
# Quick check
print("Demo shape:", demo.shape)
print("Drug shape:", drug.shape)
print("Reac shape:", reac.shape)

Demo shape: (436148, 25)
Drug shape: (1918927, 20)
Reac shape: (1522657, 4)
