In [198]:
import pandas as pd 
import numpy as np
import html
import matplotlib.pyplot as plt
from rapidfuzz import process
import warnings
import requests
import fitz 
import re
import json
warnings.filterwarnings("ignore")

In [199]:
def is_meaningful_review(text):
    """
    Check if the review is meaningful (not just symbols or extremely short).
    Args:
    text (str): The review text to evaluate.

    Returns:
    bool: True if the text is considered a meaningful review, False otherwise.
    """
    return len(text) > 15 and any(char.isalpha() for char in text)

def clean_dataset(file_path):
    """
    Load and clean the dataset, filtering out invalid entries and decoding HTML entities in reviews.
    Args:
    file_path (str): The path to the dataset file.

    Returns:
    DataFrame: The cleaned pandas DataFrame.
    """
    # Load the dataset
    data = pd.read_csv(file_path, sep='\t', index_col=0)

    # Clean the data
    cleaned_data = data[
        data['drugName'].notna() & 
        data['condition'].notna() & 
        data['condition'].apply(lambda x: isinstance(x, str) and not x.isdigit() and " users found this comment helpful." not in x) &
        data['review'].apply(is_meaningful_review)
    ]

    # Decode HTML entities in the review column
    cleaned_data['review'] = cleaned_data['review'].apply(html.unescape)

    return cleaned_data

In [200]:
### Original Dataset
raw_df_1 = clean_dataset("./application/data/drugsComTest_raw.tsv")
raw_df_2 = clean_dataset("./application/data/drugsComTrain_raw.tsv")
df = pd.concat([raw_df_1, raw_df_2], ignore_index=True)
df.loc[:,"drugName"] = df.drugName.apply(lambda x: x.lower())
df.loc[:,"drugName2"] = df.drugName.apply(lambda x: x.split(" ")[0].lower())

In [203]:
df.to_csv("./Review_Dataset.csv")

In [195]:
def getBrandNames(drug, CNT):
    url = f"https://api.fda.gov/drug/ndc.json?search=generic_name:{drug}*+brand_name:{drug}*&limit=1000"
    resp = requests.get(url)
    if resp.status_code == 200:
        resp = resp.json()["results"]
        resp_df = pd.DataFrame(resp)
        cols_to_keep = ["brand_name_base", "generic_name", "labeler_name", "brand_name", "active_ingredients", "route"]
        missing_cols = list(set(cols_to_keep).difference(set(resp_df.columns)))
        resp_df = resp_df[list(set(resp_df.columns).intersection(set(cols_to_keep)))]
        for col in missing_cols:
            resp_df[col] = np.nan
        resp_df["active_ingredients"] = resp_df.active_ingredients.apply(lambda x: x[0]["name"] if str(x) != "nan" else np.nan)
        resp_df["route"] = resp_df["route"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)
        resp_df = resp_df.drop_duplicates().dropna().reset_index(drop=True)
    else:
        CNT += 1
        return pd.DataFrame(), CNT
    return resp_df, CNT

In [196]:
mapping = {}
drugs = list(df.drugName2.unique())
print(len(drugs))

2964


In [197]:
for i,drug in enumerate(drugs):
    res, CNT = getBrandNames(drug, CNT)
    mapping[drug] = res
    if i%300 == 0:
        print(i)

0


KeyboardInterrupt: 

In [162]:
medsDataset = pd.read_csv("./application/data/medicine_dataset.csv", index_col=0)
medsDataset = medsDataset[["Chemical Class", "Action Class", "Therapeutic Class"] + ["name", "substitute0", "substitute1", "substitute2", "substitute3", "substitute4"]]
for col in medsDataset:
    medsDataset[col] = medsDataset[col].apply(lambda x: x.lower() if str(x) != "nan" else "")
    medsDataset[col+"2"] = medsDataset[col].apply(lambda x: x.split(" ")[0].lower() if str(x) != "nan" else "")
medsDataset = medsDataset.drop_duplicates().reset_index(drop=True)

In [186]:
for col in medsDataset:
    print(col, len(medsDataset[medsDataset[col].str.contains("mavik")]))

Chemical Class 0
Action Class 0
Therapeutic Class 0
name 0
substitute0 0
substitute1 0
substitute2 0
substitute3 0
substitute4 0
Chemical Class2 0
Action Class2 0
Therapeutic Class2 0
name2 0
substitute02 0
substitute12 0
substitute22 0
substitute32 0
substitute42 0


In [190]:
df[df.drugName2 == "mavik"]

Unnamed: 0,drugName,condition,review,rating,date,usefulCount,drugName2
211658,mavik,Heart Failure,"""Developed over several weeks two side effects...",2.0,"June 23, 2014",10,mavik


In [78]:
merged_df["brandName"] = merged_df["ApplNo"].apply(lambda x: mapping[x] if x in mapping else "")
merged_df["genericName"] = merged_df["ApplNo"].apply(lambda x: mapping2[x] if x in mapping2 else "")

merged_df["brandName2"] = merged_df["ApplNo"].apply(lambda x: mapping[x].split(" ")[0] if x in mapping else "")
merged_df["genericName2"] = merged_df["ApplNo"].apply(lambda x: mapping2[x].split(" ")[0] if x in mapping2 else "")

In [79]:
temp = list(merged_df.genericName.unique())
temp = [item for item in temp if item != ""]

temp2 = list(merged_df.brandName.unique())
temp2 = [item for item in temp2 if item != ""]

temp3 = list(merged_df.brandName2.unique())
temp3 = [item for item in temp3 if item != ""]

temp4 = list(merged_df.brandName2.unique())
temp4 = [item for item in temp4 if item != ""]


In [84]:
medsDataset[medsDataset["Chemical Class"].isin(temp)].shape[0], medsDataset[medsDataset["Chemical Class"].isin(temp2)].shape[0], medsDataset[medsDataset["Chemical Class"].isin(temp3)].shape[0], medsDataset[medsDataset["Chemical Class"].isin(temp4)].shape[0]

(461, 461, 534, 534)

In [85]:
medsDataset[medsDataset["Action Class"].isin(temp)].shape[0], medsDataset[medsDataset["Action Class"].isin(temp2)].shape[0], medsDataset[medsDataset["Action Class"].isin(temp3)].shape[0], medsDataset[medsDataset["Action Class"].isin(temp4)].shape[0]

(0, 0, 182, 182)