In [202]:
import pandas as pd 
import numpy as np
import html
import matplotlib.pyplot as plt
from rapidfuzz import process
import warnings
import requests
import fitz 
import re
import json
warnings.filterwarnings("ignore")

In [203]:
def is_meaningful_review(text):
    """
    Check if the review is meaningful (not just symbols or extremely short).
    Args:
    text (str): The review text to evaluate.

    Returns:
    bool: True if the text is considered a meaningful review, False otherwise.
    """
    return len(text) > 15 and any(char.isalpha() for char in text)

def clean_dataset(file_path):
    """
    Load and clean the dataset, filtering out invalid entries and decoding HTML entities in reviews.
    Args:
    file_path (str): The path to the dataset file.

    Returns:
    DataFrame: The cleaned pandas DataFrame.
    """
    # Load the dataset
    data = pd.read_csv(file_path, sep='\t', index_col=0)

    # Clean the data
    cleaned_data = data[
        data['drugName'].notna() & 
        data['condition'].notna() & 
        data['condition'].apply(lambda x: isinstance(x, str) and not x.isdigit() and " users found this comment helpful." not in x) &
        data['review'].apply(is_meaningful_review)
    ]

    # Decode HTML entities in the review column
    cleaned_data['review'] = cleaned_data['review'].apply(html.unescape)

    return cleaned_data

### Original Dataset

In [204]:
raw_df_1 = clean_dataset("./application/data/drugsComTest_raw.tsv")
raw_df_2 = clean_dataset("./application/data/drugsComTrain_raw.tsv")
df = pd.concat([raw_df_1, raw_df_2], ignore_index=True)
df.loc[:,"drugName"] = df.drugName.apply(lambda x: x.lower())
df.loc[:,"drugName2"] = df.drugName.apply(lambda x: x.split(" ")[0].lower())

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

#### Kaggle Brand Name Dataset

In [None]:
medsDataset = pd.read_csv("./application/data/medicine_dataset.csv", index_col=0)
medsDataset = medsDataset[["Chemical Class", "Action Class", "Therapeutic Class"] + ["name", "substitute0", "substitute1", "substitute2", "substitute3", "substitute4"]]
for col in medsDataset:
    medsDataset[col] = medsDataset[col].apply(lambda x: x.lower() if str(x) != "nan" else "")
medsDataset = medsDataset.drop_duplicates().reset_index(drop=True)
medsNames = list(set(np.concatenate(medsDataset.values).ravel().tolist()))
medsNames = [item for item in medsNames]
medsNames2 = [item.split(" ")[0] for item in medsNames]

#### FDA Product Dataset

In [None]:
with open("./application/data/Products.txt", "r") as file:
    lines = file.readlines()
prodDataset = []
columns = lines[0].split("\t")
for line in lines[1:]:
    temp = {}
    for col,item in zip(columns, line.split("\t")):
        temp[col] = item
    prodDataset.append(temp)
prodDataset = pd.DataFrame(prodDataset)
prodDataset = prodDataset[["Form", "DrugName", "ActiveIngredient"]]
prodDataset = prodDataset.drop_duplicates().reset_index()
prodDataset["drugName"] = prodDataset.DrugName.apply(lambda x: x.lower())
prodDataset["drugName2"] = prodDataset.DrugName.apply(lambda x: x.split(" ")[0].lower())

In [None]:
merged_df = df.merge(prodDataset, left_on="drugName2", right_on="drugName2", how="left")

In [None]:
ing = merged_df["ActiveIngredient"].apply(lambda x: x.lower() if str(x) != "nan" else "").unique()
ing2 = merged_df["ActiveIngredient"].apply(lambda x: x.split(" ")[0].lower() if str(x) != "nan" else "").unique()

print(len(set(ing).intersection(set(medsNames))))
print(len(set(ing2).intersection(set(medsNames))))
print(len(set(ing2).intersection(set(medsNames2))))

7
8
109


In [None]:
with open("/Users/sid98/Downloads/drug-ndc-0001-of-0001.json", "r") as file:
    products = json.load(file)

mapping = {}
mapping2 = {}
for item in products["results"]:
    try:
        id = re.sub(r'[^0-9]', '', item["application_number"])
        mapping[id] = item["brand_name"].lower()
        mapping2[id] = item["generic_name"].lower()
    except:
        continue

In [None]:
medsDataset[medsDataset["Chemical Class"].str.contains("levetir")]["Chemical Class"].value_counts()

Chemical Class
levetiracetam derivative    126
Name: count, dtype: int64

In [None]:
len(set(res2["brandName"].unique()).intersection(set(medsNames2)))

348

In [None]:
drugs = list(df["drugName2"].unique())

In [None]:
def getBrandNames(drug):
    url = f"https://api.fda.gov/drug/ndc.json?search=generic_name:{drug}*&limit=1000"
    resp = requests.get(url)
    if resp.status_code == 200:
        resp = resp.json()["results"]
        resp_df = pd.DataFrame(resp)
        cols_to_keep = ["brand_name_base", "generic_name", "labeler_name", "brand_name", "active_ingredients", "route"]
        missing_cols = list(set(cols_to_keep).difference(set(resp_df.columns)))
        print(missing_cols)
        resp_df = resp_df[list(set(resp_df.columns).intersection(set(cols_to_keep)))]
        for col in missing_cols:
            resp_df[col] = np.nan
        resp_df["active_ingredients"] = resp_df.active_ingredients.apply(lambda x: x[0]["name"] if str(x) != "nan" else np.nan)
        resp_df["route"] = resp_df["route"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)
        resp_df = resp_df.drop_duplicates().dropna().reset_index(drop=True)
    else:
        return pd.DataFrame()
    return resp_df

In [None]:

for drug in drugs[300:]:
    res = getBrandNames(drug)

[]
[]
[]
[]
['route', 'brand_name']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['route']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


KeyboardInterrupt: 