In [2]:
import pandas as pd 
import numpy as np
import html
import matplotlib.pyplot as plt
import json
import requests
import warnings
import pickle
warnings.filterwarnings("ignore")

def is_meaningful_review(text):
    """
    Check if the review is meaningful (not just symbols or extremely short).

    A review counts as meaningful when it is a string longer than 15
    characters that contains at least one alphabetic character.

    Args:
    text (str): The review text to evaluate. Non-string values (for example
        the float NaN pandas uses for a missing cell, or None) are accepted
        and treated as not meaningful.

    Returns:
    bool: True if the text is considered a meaningful review, False otherwise.
    """
    # pandas hands missing cells to .apply() as float NaN; len() on those
    # would raise TypeError, so reject anything that is not a string first.
    if not isinstance(text, str):
        return False
    return len(text) > 15 and any(char.isalpha() for char in text)

def clean_dataset(file_path):
    """
    Load and clean the dataset, filtering out invalid entries and decoding HTML entities in reviews.

    A row is kept only when all of the following hold:
    - 'drugName' is present;
    - 'condition' is a string that is not purely digits and does not contain
      the text " users found this comment helpful.";
    - 'review' is a string that passes is_meaningful_review.

    Args:
    file_path (str): The path to the tab-separated dataset file. The first
        column is used as the index.

    Returns:
    DataFrame: The cleaned pandas DataFrame (an independent copy, with HTML
        entities in 'review' decoded).
    """
    # Load the dataset
    data = pd.read_csv(file_path, sep='\t', index_col=0)

    def _valid_condition(value):
        # Rejects missing values, bare numbers and the leaked "N users found..." text.
        return (
            isinstance(value, str)
            and not value.isdigit()
            and " users found this comment helpful." not in value
        )

    def _valid_review(value):
        # Missing reviews arrive as float NaN; never pass those on to the length check.
        return isinstance(value, str) and is_meaningful_review(value)

    keep = (
        data['drugName'].notna()
        & data['condition'].notna()
        & data['condition'].apply(_valid_condition)
        & data['review'].apply(_valid_review)
    )

    # Take an explicit copy so the column assignment below writes to an
    # independent frame instead of a slice of `data` (chained assignment).
    cleaned_data = data[keep].copy()

    # Decode HTML entities in the review column
    cleaned_data['review'] = cleaned_data['review'].apply(html.unescape)

    return cleaned_data

In [3]:
### Reviews dataset
# Clean the test split and the train split, in that order, then stack them.
raw_df_1, raw_df_2 = (
    clean_dataset(split_path)
    for split_path in (
        "../application/data/drugsComTest_raw.tsv",
        "../application/data/drugsComTrain_raw.tsv",
    )
)
df = pd.concat([raw_df_1, raw_df_2], ignore_index=True)
# Match key: lower-cased first space-delimited word of the drug name.
df["drug"] = [drug_name.split(" ")[0].lower() for drug_name in df["drugName"]]

In [4]:
# Flatten the Drugs@FDA dump into one row per product, tagging every product
# with the application number of the application it is listed under.
with open("/Users/sid98/OMSA/04_CSE_6242_DVA/ProjectFiles/FinalFDAData/drug-drugsfda-0001-of-0001.json", "r", encoding="utf-8") as file:
    fda = json.load(file)
products = []
cnt = 0  # number of applications skipped because an expected key was missing
for item in fda["results"]:
    try:
        for pr in item["products"]:
            pr["applnNo"] = item["application_number"]
            products.append(pr)
    except KeyError:
        # Only a missing "products"/"application_number" key is expected here;
        # anything else (including Ctrl-C) should surface instead of being counted.
        cnt += 1
products = pd.DataFrame(products)

In [5]:
# Load the NDC dump's "results" records and keep a fixed subset of columns.
with open("/Users/sid98/OMSA/04_CSE_6242_DVA/ProjectFiles/FinalFDAData/drug-ndc-0001-of-0001.json", "r") as file:
    ndc = pd.DataFrame(json.load(file)["results"])
ndc = ndc[[
    'product_ndc',
    'generic_name',
    'labeler_name',
    'brand_name',
    'active_ingredients',
    'finished',
    'dosage_form',
    'product_type',
    'route',
    'marketing_start_date',
    'product_id',
    'application_number',
    'brand_name_base',
    'pharm_class',
]]

In [6]:
def _brand_key(name):
    """Return the lower-cased first space-delimited word of ``name``, or NaN when ``name`` is not a string."""
    return name.split(" ")[0].lower() if isinstance(name, str) else np.nan


def _brand_to_application(frame, application_col):
    """Return a Series indexed by ``brandName`` holding the first non-missing ``application_col`` value per brand."""
    return (
        frame[["brandName", application_col]]
        # Drop rows missing either the key or the application number so a NaN
        # can never end up as a mapping value (it would serialise as invalid
        # JSON and be sent to the API as the literal "nan").
        .dropna()
        .drop_duplicates(subset=["brandName"])
        .set_index("brandName")[application_col]
    )


# Build the review-drug -> application-number mapping.
reviewDrugs = list(df["drug"].unique())
products["brandName"] = products["brand_name"].apply(_brand_key)
ndc["brandName"] = ndc["brand_name"].apply(_brand_key)

# Lookups are consulted in order: the Drugs@FDA products table wins, and the
# NDC table only fills in drugs that were not matched there.
mapping = {}
for lookup in (
    _brand_to_application(products, "applnNo"),
    _brand_to_application(ndc, "application_number"),
):
    for drug in reviewDrugs:
        if drug in lookup.index and drug not in mapping:
            mapping[drug] = lookup[drug]

# sorted() gives the same order on every run; a bare set's iteration order
# for strings is not stable across interpreter sessions.
appNos = sorted(set(mapping.values()))
products = products[products.applnNo.isin(appNos)].reset_index(drop=True)
ndc = ndc[ndc.application_number.isin(appNos)].reset_index(drop=True)

In [7]:
def getLabels(appNo):
    """
    Fetch selected label sections for one application from the openFDA drug label endpoint.

    A single label is requested (limit=1, sorted by effective_time descending).
    For every section present in that label, the first entry of the section is
    taken and its leading words are dropped: as many words as the section name
    has underscore-separated parts (e.g. 3 for "indications_and_usage").

    Args:
    appNo (str): Application number to search for.

    Returns:
    dict: {"applnNo": appNo, <section>: text or NaN, ...} with one key per
        section, in a fixed order. Sections missing from the label are NaN.

    Raises:
    KeyError: if the response JSON has no "results" key.
    """
    sections = (
        "indications_and_usage",
        "contraindications",
        "precautions",
        "pregnancy",
        "nursing_mothers",
        "pediatric_use",
        "adverse_reactions",
    )
    endpoint = "https://api.fda.gov/drug/label.json?limit=1&sort=effective_time:desc"
    query = f"openfda.application_number:{appNo}"
    label = requests.get(f"{endpoint}&search={query}").json()["results"][0]

    record = {"applnNo": appNo}
    for section in sections:
        if section not in label:
            record[section] = np.nan
            continue
        # Number of words to skip == number of words in the section name.
        skip = section.count("_") + 1
        words = label[section][0].split(" ")
        record[section] = " ".join(words[skip:])
    return record

def adverseEvents(appNo):
    """
    Count adverse-event reports per seriousness field for one application.

    For each seriousness field the openFDA drug event endpoint is asked for a
    count of that field's values among reports mentioning ``appNo``. Only the
    bucket whose term equals 1 is kept. A field whose request fails, or whose
    response has no "results", is skipped.

    Args:
    appNo (str): Application number to search for.

    Returns:
    DataFrame: One row per kept bucket with columns "count", "event" (the
        field name) and "applnNo". An empty DataFrame without columns is
        returned when nothing was found.
    """
    baseUrl = "https://api.fda.gov/drug/event.json?limit=1000"
    search = f"patient.drug.openfda.application_number:{appNo}"
    fields = ["serious", "seriousnessdeath", "seriousnessdisabling", "seriousnesshospitalization", "seriousnesslifethreatening"]

    records = []
    for field in fields:
        try:
            url = f"{baseUrl}&count={field}&search={search}"
            buckets = requests.get(url).json()["results"]
        except (requests.RequestException, KeyError, ValueError):
            # Best-effort per field: network failure, a body that is not JSON,
            # or a response without "results" just skips this field.
            # KeyboardInterrupt/SystemExit are deliberately NOT caught, so a
            # long scraping run can still be stopped.
            continue
        records.extend(
            {"count": bucket["count"], "event": field}
            for bucket in buckets
            if bucket["term"] == 1
        )

    res = pd.DataFrame(records)
    if len(res) != 0:
        res.loc[:, "applnNo"] = appNo
    return res

def getReactions(appNo):
    """
    Collect reported reactions for one application, split by sex, age group and weight band.

    For each combination of sex code ("1", "2") and age-group code ("1"-"6")
    the openFDA drug event endpoint is first asked which patient weights occur
    (limit=1000). Those weights are sorted into three bands (below 45 -> "l",
    above 95 -> "h", everything else -> "n"). For each non-empty band a
    reaction count is requested with limit=3, restricted to exactly the
    weights in that band.

    Args:
    appNo (str): Application number to search for.

    Returns:
    DataFrame: One row per returned reaction with columns "age", "sex",
        "weight", "reaction", "count" and "applnNo". An empty DataFrame
        without columns is returned when nothing was found, and also as soon
        as any weight query answers HTTP 429 (rows gathered so far are dropped).

    Raises:
    KeyError: if a reaction-count response has no "results" key.
    """
    weight_count_url = 'https://api.fda.gov/drug/event.json?count=patient.patientweight'
    reaction_count_url = 'https://api.fda.gov/drug/event.json?count=patient.reaction.reactionmeddrapt.exact'
    common_filter = f'_exists_:patient.patientweight+AND+_exists_:patient.patientsex+AND+_exists_:patient.patientagegroup+AND+patient.drug.openfda.application_number:{appNo}'

    rows = []
    for sex in ("1", "2"):
        for age_group in ("1", "2", "3", "4", "5", "6"):
            demographic = f'{common_filter}+AND+patient.patientsex:{sex}+AND+patient.patientagegroup:{age_group}'
            weight_response = requests.get(f'{weight_count_url}&search={demographic}&limit=1000')

            if weight_response.status_code == 429:
                print("Exceeded Rate limit")
                return pd.DataFrame()
            if weight_response.status_code != 200:
                continue

            observed = [entry["term"] for entry in weight_response.json()["results"]]

            # Quoted weight values per band; dict order fixes the query order l, n, h.
            bands = {"l": [], "n": [], "h": []}
            for value in observed:
                if value < 45:
                    band = "l"
                elif value > 95:
                    band = "h"
                else:
                    band = "n"
                bands[band].append(f'"{value}"')

            for band, quoted in bands.items():
                if not quoted:
                    continue
                alternatives = "+OR+".join(quoted)
                weight_clause = f"patient.patientweight:({alternatives})"
                reaction_url = f'{reaction_count_url}&search={demographic}+AND+{weight_clause}&limit=3'
                for hit in requests.get(reaction_url).json()["results"]:
                    rows.append({
                        "age": age_group,
                        "sex": sex,
                        "weight": band,
                        "reaction": hit["term"],
                        "count": hit["count"],
                    })

    table = pd.DataFrame(rows)
    if len(table) > 0:
        table.loc[:, "applnNo"] = appNo
    return table


In [8]:
drugs = df.drug.unique()
# Later cells slice appNos by position to resume the scrape in batches, so the
# order must be the same in every kernel session. A bare set does not
# guarantee that for strings, so sort it. key=str keeps the sort well-defined
# even if a non-string value (e.g. NaN) slipped into the mapping.
appNos = sorted({mapping[drug] for drug in drugs if drug in mapping}, key=str)

In [9]:
# labels = []
# missingLabels = []
# for i,appNo in enumerate(appNos):
#     try:
#         labels.append(getLabels(appNo))
#     except KeyError:
#         missingLabels.append(appNo)

#     if i%200 == 0:
#         print(i)

# events = pd.DataFrame()
# missingEvents = []
# for i,appNo in enumerate(appNos[:500]):
#     try:
#         events = pd.concat([events, adverseEvents(appNo)], ignore_index=True)
#     except KeyError:
#         missingEvents.append(appNo)
#     if i%200 == 0:
#         print(i)
# events.to_csv("./Events.csv")

# reactions = pd.DataFrame()
# errors = []
# for i,appNo in enumerate(appNos):
#     try:
#         reactions = pd.concat([reactions, getReactions(appNo)], ignore_index=True)
#     except:
#         errors.append(appNo)
# reactions.to_csv(f"./LookupData/ReactionsData/Reactions.csv")

In [9]:
# Persist the review-drug -> application-number mapping as JSON.
with open("./DrugMapping.json", "w") as mapping_out:
    json.dump(mapping, fp=mapping_out)

In [11]:
# Persist the list of application numbers so later sessions can reuse it.
with open("./AppNos.pickle", "wb") as appnos_out:
    pickle.dump(appNos, appnos_out)

In [15]:
# Merge every per-batch reactions CSV into a single lookup table.
import os

reactions_dir = "./LookupData/ReactionsData/"
# os.listdir() returns entries in arbitrary order and also lists non-data
# entries (e.g. hidden files), so read only the CSVs, in sorted order.
# The frames get their own name: reusing `df` here would overwrite the
# reviews DataFrame built earlier in the notebook.
batch_frames = [
    pd.read_csv(os.path.join(reactions_dir, file_name), index_col=0)
    for file_name in sorted(os.listdir(reactions_dir))
    if file_name.endswith(".csv")
]
# One concat at the end instead of growing the frame inside the loop.
reactions = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()
reactions.to_csv("/Users/sid98/OMSA/04_CSE_6242_DVA/ProjectFiles/LookupData/Reactions_3.csv")

In [10]:
# Merge every per-batch reactions CSV into a single lookup table.
import os

reactions_dir = "./LookupData/ReactionsData/"
# os.listdir() returns entries in arbitrary order and also lists non-data
# entries (e.g. hidden files), so read only the CSVs, in sorted order.
# The frames get their own name: reusing `df` here would overwrite the
# reviews DataFrame built earlier in the notebook.
batch_frames = [
    pd.read_csv(os.path.join(reactions_dir, file_name), index_col=0)
    for file_name in sorted(os.listdir(reactions_dir))
    if file_name.endswith(".csv")
]
# One concat at the end instead of growing the frame inside the loop.
reactions = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()
reactions.to_csv("/Users/sid98/OMSA/04_CSE_6242_DVA/ProjectFiles/LookupData/Reactions_5.csv")

In [13]:
# Size of one batch window: `total_calls` application numbers per batch,
# looking at batch number `iteration` (0-based).
total_calls = 400
iteration = 5
batch_start = iteration * total_calls
len(appNos[batch_start:batch_start + total_calls])

266

In [16]:
# NOTE(review): `reaction` is not assigned anywhere in the visible notebook
# (only `reactions` is), so this cell presumably relied on leftover kernel
# state and would raise NameError after a kernel restart — confirm or remove.
reaction

50.0

In [19]:
# Scratch arithmetic: presumably the number of application numbers left from
# index 2250 onward, assuming len(appNos) == 2266 — TODO confirm.
2266 - 2250

16

In [20]:
# Fetch reactions for the tail of the application list (index 2250 onward).
reaction_frames = []
errors = []  # application numbers whose fetch raised
try:
    for i, appNo in enumerate(appNos[2250:]):
        try:
            reaction_frames.append(getReactions(appNo))
        except Exception:
            # Best-effort scrape: record the failing application and move on.
            # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt
            # stop the loop instead of being logged as a failed application.
            errors.append(appNo)
        if i % 10 == 0:
            print(i)
finally:
    # Built in `finally` so the rows fetched so far survive an interrupt;
    # a single concat replaces growing the frame on every iteration.
    reactions_last = (
        pd.concat(reaction_frames, ignore_index=True)
        if reaction_frames
        else pd.DataFrame()
    )

0
10


In [24]:
# Save the final batch alongside the other per-batch reaction files.
reactions_last.to_csv(path_or_buf="./LookupData/ReactionsData/Reactions_6_1.csv")