In [1]:
import pandas as pd
import numpy as np
import os
import warnings
from scipy.stats import nbinom, gamma, chi2_contingency
from scipy.special import digamma
from scipy.optimize import minimize


In [2]:
def download_faer_files(root_dir):
    # Define root directory where FAERS datasets are located
    quarters = ['Q1', 'Q2', 'Q3', 'Q4']

    # Adjust range from 2013 to 2025
    all_quarters = [f"{str(y)[2:]}{q}" for y in range(2013, 2026) for q in quarters]

    # Limit to only those quarters that exist (sanity check)
    existing_quarters = [q for q in all_quarters if os.path.exists(os.path.join(root_dir, f"DEMO{q}.txt"))]

    # Store DataFrames in a dict
    data = {}

    for q in existing_quarters:
        try:
            data[q] = {
                'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
                'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
                'reac': pd.read_csv(os.path.join(root_dir, f"REAC{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
                'outc': pd.read_csv(os.path.join(root_dir, f"OUTC{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
                'indi': pd.read_csv(os.path.join(root_dir, f"INDI{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
                'rpsr': pd.read_csv(os.path.join(root_dir, f"RPSR{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
                'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
            }
            print(f"Loaded {q} successfully.")
        except Exception as e:
            print(f"Error loading data for {q}: {e}")

    return data



In [3]:
def generate_periods(start_year, start_quarter, end_year, end_quarter):
    periods = []
    quarters = ['Q1', 'Q2', 'Q3', 'Q4']

    start_year = int(start_year)
    end_year = int(end_year)

    # Loop through the years and quarters to generate all periods in the range
    for year in range(start_year, end_year + 1):
        start_qtr = start_quarter if year == start_year else 'Q1'
        end_qtr = end_quarter if year == end_year else 'Q4'

        for qtr in quarters[quarters.index(start_qtr):quarters.index(end_qtr) + 1]:
            periods.append(f"{str(year)[-2:]}{qtr}")

    return periods


In [4]:
def create_dataframes(start_year, start_quarter, end_year, end_quarter):
    # Generate periods based on user input
    periods = generate_periods(start_year, start_quarter, end_year, end_quarter)

    # List of table types to be processed (can easily be extended)
    table_types = ['demo', 'drug', 'reac', 'outc', 'indi', 'rpsr', 'ther']

    # Initialize a dictionary to store lists of DataFrames for each table
    data_dict = {table: [] for table in table_types}

    # Loop through each period and collect data for each table type
    for period in periods:
        if period in data:  # Check if data for the period exists
            for table in table_types:
                if table in data[period]:
                    data_dict[table].append(data[period][table])
        else:
            print(f"Warning: No data available for {period}")

    # Concatenate data for each table type into a single DataFrame
    merged_data = {
        table: pd.concat(data_dict[table], ignore_index=True) if data_dict[table] else pd.DataFrame()
        for table in table_types
    }

    demo = merged_data['demo']
    drug = merged_data['drug']
    reac = merged_data['reac']
    outc = merged_data['outc']
    indi = merged_data['indi']
    rpsr = merged_data['rpsr']
    ther = merged_data['ther']

    return demo, drug, reac, outc, indi, rpsr, ther


In [5]:
def preprocess_drug_df(drug):
    drug = drug[['primaryid', 'caseid', 'role_cod', 'drugname', 'prod_ai']]
    drug = drug[drug['role_cod'] == 'PS']

    drug = drug[pd.notnull(drug['drugname'])]  # Drops Nulls
    drug['drugname'] = drug['drugname'].str.strip().str.lower()  # Stips whitespace, Transforms to lowercase
    drug = drug[~drug['drugname'].isin(['unknown'])]  # Drops unknowns
    drug['drugname'] = drug['drugname'].str.replace('\\', '/')  # Standardizes slashes to '/'
    drug['drugname'] = drug['drugname'].map(
        lambda x: x[:-1] if str(x).endswith(".") else x)  # Removes periods at the end of drug names

    return drug


In [7]:
def preprocess_reac_df(reac):
    reac = reac[pd.notnull(reac['pt'])] # Drops Nulls
    reac['pt'] = reac['pt'].str.strip().str.lower()  # Transforms to lowercase
    reac = reac[~reac['pt'].isin(['unknown'])]  # Drops unknowns
    reac['pt'] = reac['pt'].map(
        lambda x: x[:-1] if str(x).endswith(".") else x)  # Removes periods at the end of drug names

    return reac


In [17]:
def preprocess_demo_df(demo):
    demo = demo[['primaryid', 'caseid', 'caseversion', 'age_cod', 'age', 'sex', 'wt']]
    demo = demo.sort_values('caseversion', ascending=False).drop_duplicates(subset='caseid', keep='last')

    demo = demo[pd.notnull(demo['age'])]
    demo = demo[demo.age_cod != 'dec'].reset_index(drop=True)
    demo['age'] = demo['age'].apply(pd.to_numeric, errors='coerce')
    demo['age'] = np.where(demo['age_cod'] == 'MON', demo['age'] * 1 / 12, demo['age'])  # mounth
    demo['age'] = np.where(demo['age_cod'] == 'WK', demo['age'] * 1 / 52, demo['age'])  # week
    demo['age'] = np.where(demo['age_cod'] == 'DY', demo['age'] * 1 / 365, demo['age'])  # day
    demo['age'] = np.where(demo['age_cod'] == 'HR', demo['age'] * 1 / 8760, demo['age'])  # hour
    demo = demo.drop(['age_cod'], axis=1)

    return demo


In [9]:
# Function to compute Odds Ratio (OR) and Confidence Interval (CI)
def compute_or_and_ci(a, b, c, d):
    # Odds Ratio
    or_val = (a * d) / (b * c)

    # Log(OR) and standard error
    log_or = np.log(or_val)
    se = np.sqrt(1 / a + 1 / b + 1 / c + 1 / d)

    # Confidence Interval
    ci_low = np.exp(log_or - 1.96 * se)
    ci_high = np.exp(log_or + 1.96 * se)

    return or_val, ci_low, ci_high


# Function to compute Proportional Reporting Ratio (PRR), Standard Error (SE), and Confidence Interval (CI)
def compute_prr_and_ci(a, b, c, d):
    # Proportional Reporting Ratio (PRR)
    prr = (a / (a + b)) / (c / (c + d))

    # Standard Error (SE)
    se = np.sqrt(1 / a + 1 / c - 1 / (a + b) - 1 / (c + d))

    # Confidence Interval for PRR
    ln_prr = np.log(prr)
    ci_low = np.exp(ln_prr - 1.96 * se)
    ci_high = np.exp(ln_prr + 1.96 * se)

    return prr, se, ci_low, ci_high


# Function to add statistics (OR and PRR) to the DataFrame
def add_stats(df):
    # Add 0.5 to all columns to avoid zero counts
    a = df['Count_query_drug'] + 0.5
    b = df['No_AE_query_drug'] + 0.5
    c = df['Count_non_query_drug'] + 0.5
    d = df['No_AE_non_query_drug'] + 0.5

    # Compute Odds Ratio (OR) and Confidence Interval (CI) using vectorized operations
    or_vals, ci_low_or, ci_high_or = compute_or_and_ci(a, b, c, d)

    # Compute Proportional Reporting Ratio (PRR), SE, and Confidence Interval (CI) using vectorized operations
    prr_vals, se_prr, ci_low_prr, ci_high_prr = compute_prr_and_ci(a, b, c, d)

    # Perform chi-squared test for each row in a vectorized way
    p_values = df.apply(lambda row: chi2_contingency([[row['Count_query_drug'], row['No_AE_query_drug']],
                                                      [row['Count_non_query_drug'], row['No_AE_non_query_drug']]])[1],
                        axis=1)

    """
    ebgm_result = phv_ebgm(a, b, c, d, alpha = 0.05) 
    ebgm_result = ebgm_result.rename(columns={
        'ebgm': 'ebgm',
        'ci_low': 'ci_lower_ebgm',
        'ci_high': 'ci_upper_ebgm'
    })
    """

    # Add new columns for OR, PRR, and p-value statistics
    df['odds_ratio'] = or_vals
    df['ci_lower_or'] = ci_low_or
    df['ci_upper_or'] = ci_high_or
    df['prr'] = prr_vals
    df['se_prr'] = se_prr
    df['ci_lower_prr'] = ci_low_prr
    df['ci_upper_prr'] = ci_high_prr
    df['p_value'] = p_values
    # df = pd.concat([df, ebgm_result], axis=1)

    print(df)
    return df



In [22]:
root_dir = '/Users/jodie/Documents/BMI 212/faers-cohort-generation/FAERS-data-toolkit-master/FAERSdata'
data = download_faer_files(root_dir)


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 13Q1 successfully.


  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 13Q2 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 13Q3 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 13Q4 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 14Q1 successfully.


  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 14Q2 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 14Q3 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 14Q4 successfully.


  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 15Q1 successfully.


  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 15Q2 successfully.


  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 15Q3 successfully.


  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 15Q4 successfully.


  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 16Q1 successfully.


  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 16Q2 successfully.


  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 16Q3 successfully.


  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 16Q4 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 17Q1 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 17Q2 successfully.


  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 17Q3 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 17Q4 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 18Q1 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 18Q2 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 18Q3 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 18Q4 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 19Q1 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 19Q2 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 19Q3 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 19Q4 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 20Q1 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 20Q2 successfully.


  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 20Q3 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 20Q4 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 21Q1 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 21Q2 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'ther': pd.read_csv(os.path.join(root_dir, f"THER{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 21Q3 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 21Q4 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 22Q1 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 22Q2 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 22Q3 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 22Q4 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 23Q1 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 23Q2 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 23Q3 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 23Q4 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 24Q1 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 24Q2 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 24Q3 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),
  'drug': pd.read_csv(os.path.join(root_dir, f"DRUG{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 24Q4 successfully.


  'demo': pd.read_csv(os.path.join(root_dir, f"DEMO{q}.txt"), delimiter='$', encoding='ISO-8859-1'),


Loaded 25Q1 successfully.


In [27]:
start_year = 2024
start_quarter = 'Q1'
end_year = 2024
end_quarter = 'Q1'

demo, drug, reac, outc, indi, rpsr, ther = create_dataframes(start_year, start_quarter, end_year, end_quarter)


In [28]:
demo = preprocess_demo_df(demo)
drug = preprocess_drug_df(drug)
reac = preprocess_reac_df(reac)


In [30]:
query_drug = 'edaravone'

drug = pd.merge(drug, demo, on='primaryid', how='left')

# Finds reports related to the query drug
query_drug_df = drug[
    drug['drugname'].str.lower().str.contains(query_drug, na=False) |
    drug['prod_ai'].str.lower().str.contains(query_drug, na=False)
]

In [31]:
query_drug_df

Unnamed: 0,primaryid,caseid_x,role_cod,drugname,prod_ai,caseid_y,caseversion,age,sex,wt
42622,226230622,22623062,PS,radicava ors,EDARAVONE,,,,,
45502,227044184,22704418,PS,edaravone,EDARAVONE,22704418.0,4.0,69.0,F,
49057,228023685,22802368,PS,edaravone,EDARAVONE,22802368.0,5.0,86.0,M,
74696,232449572,23244957,PS,edaravone,EDARAVONE,,,,,
77305,232688512,23268851,PS,edaravone,EDARAVONE,23268851.0,2.0,60.0,M,
...,...,...,...,...,...,...,...,...,...,...
375654,236561181,23656118,PS,radicava ors,EDARAVONE,,,,,
375655,236561191,23656119,PS,radicava ors,EDARAVONE,,,,,
385129,236665261,23666526,PS,radicava,EDARAVONE,,,,,
385130,236665271,23666527,PS,radicava,EDARAVONE,,,,,


In [32]:
# Get AE counts for cases where query drug is mentioned
query_drug_ids = query_drug_df['primaryid'].unique()
query_drug_reac = reac[reac['primaryid'].isin(query_drug_ids)]
ae_counts = query_drug_reac['pt'].value_counts().reset_index()
ae_counts.columns = ['Adverse_Event', 'Count']

# Get AE counts for cases where query drug is not mentioned
non_query_drug = drug[~drug['primaryid'].isin(query_drug_ids)]
non_query_ids = non_query_drug['primaryid'].unique()
non_query_reac = reac[reac['primaryid'].isin(non_query_ids)]
non_ae_counts = non_query_reac['pt'].value_counts().reset_index()
non_ae_counts.columns = ['Adverse_Event', 'Count']

ae_counts.columns = ['Adverse_Event', 'Count_query_drug']
non_ae_counts.columns = ['Adverse_Event', 'Count_non_query_drug']

# Merge AE counts for query drug and non-query drug
ae_comparison = pd.merge(
    ae_counts,
    non_ae_counts,
    on='Adverse_Event',
    how='outer'
)

# Filter to AEs with at least 3 reports for the query drug
ae_filtered = ae_comparison[
    (ae_comparison['Count_query_drug'].notna()) &
    (ae_comparison['Count_query_drug'] >= 3)
].copy()

In [33]:
# Calculates the number of reports that did not include a specific adverse
# event for both the query drug and non-query drugs
query_num = query_drug_ids.shape[0]
non_num = non_query_ids.shape[0]
ae_filtered['No_AE_query_drug'] = query_num - ae_filtered['Count_query_drug']
ae_filtered['No_AE_non_query_drug'] = non_num - ae_filtered['Count_non_query_drug']


In [34]:
ae_filtered_new = add_stats(ae_filtered)

                                          Adverse_Event  Count_query_drug  \
493                       amyotrophic lateral sclerosis              11.0   
2216                               cerebral haemorrhage               3.0   
2219                                cerebral infarction               6.0   
3019                                              death              29.0   
3322                                disease progression              23.0   
3542                                           dyspnoea               3.0   
4569                          gastrointestinal disorder               3.0   
4995                                           headache               4.0   
5713   inappropriate schedule of product administration               3.0   
6820                                            malaise               3.0   
8006                                      off label use               5.0   
8909                               pneumonia aspiration               5.0   