In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import html
import json
import requests
import warnings
warnings.filterwarnings("ignore")
import os
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from sklearn.preprocessing import MinMaxScaler

os.chdir("/Users/sid98/OMSA/04_CSE_6242_DVA/ProjectFiles/dva-project/")

def is_meaningful_review(text):
    """
    Check if the review is meaningful (not just symbols or extremely short).
    Args:
    text (str): The review text to evaluate.

    Returns:
    bool: True if the text is considered a meaningful review, False otherwise.
    """
    return len(text) > 15 and any(char.isalpha() for char in text)

def clean_dataset(file_path):
    """
    Load and clean the dataset, filtering out invalid entries and decoding HTML entities in reviews.
    Args:
    file_path (str): The path to the dataset file.

    Returns:
    DataFrame: The cleaned pandas DataFrame.
    """
    # Load the dataset
    data = pd.read_csv(file_path, sep='\t', index_col=0)

    # Clean the data
    cleaned_data = data[
        data['drugName'].notna() & 
        data['condition'].notna() & 
        data['condition'].apply(lambda x: isinstance(x, str) and not x.isdigit() and " users found this comment helpful." not in x) &
        data['review'].apply(is_meaningful_review)
    ]

    # Decode HTML entities in the review column
    cleaned_data['review'] = cleaned_data['review'].apply(html.unescape)

    return cleaned_data

In [2]:
sentimentData = pd.read_csv("./application/data/final_result_df_v1.csv")

labels = pd.read_csv("./application/data/LookupData/Labels.csv", index_col=0)
reactions = pd.read_csv("././application/data/LookupData/Reactions.csv", index_col=0)
events = pd.read_csv("./application/data/LookupData/Events.csv", index_col=0)

raw_df_1 = clean_dataset("./application/data/drugsComTest_raw.tsv")
raw_df_2 = clean_dataset("./application/data/drugsComTrain_raw.tsv")
reviewsDf = pd.concat([raw_df_1, raw_df_2], ignore_index=True)
reviewsDf["drug"] = reviewsDf["drugName"].apply(lambda x: x.split(" ")[0].lower())

with open("./application/data/LookupData/DrugMapping.json") as file:
    mapping = json.load(file)

tokenizer = AutoTokenizer.from_pretrained("Clinical-AI-Apollo/Medical-NER")
model = AutoModelForTokenClassification.from_pretrained("Clinical-AI-Apollo/Medical-NER")
nlp_token_class = pipeline('ner', model=model, tokenizer=tokenizer, grouped_entities=True)

#### Pick Conditions For Testing

In [37]:
def findData(condition):
    drugs = sentimentData[sentimentData.condition == condition].drugName.apply(lambda x: x.split(" ")[0].lower())
    res = []
    for drug in drugs:
        if drug in mapping:
            appNo = mapping[drug]
            res.append({
                "drug": drug,
                "applnNo": appNo, 
                "Labels": 1 if appNo in list(labels.applnNo.values) else 0, 
                "Events": 1 if appNo in list(events.applnNo.values) else 0, 
                "Reactions": 1 if appNo in list(reactions.applnNo.values) else 0
                })
    res = pd.DataFrame(res)
    return res

In [38]:
bestMatches = []
for cond in sentimentData.condition.unique():
    res = findData(cond)
    totalDrugs = len(res)
    drugsWithAllValues = (res.iloc[:,2:].sum(axis=1) == 3).sum()

    bestMatches.append({
        "condition": cond,
        "totalDrugs": totalDrugs,
        "drugsWithAllValues": drugsWithAllValues
    })

In [39]:
bestMatches = pd.DataFrame(bestMatches).sort_values(by=["totalDrugs", "drugsWithAllValues"], ascending=False).reset_index(drop=True)
bestMatches = bestMatches[~bestMatches.condition.str.contains("</span>")].reset_index(drop=True)
bestMatches[((bestMatches.totalDrugs >= 4) & (bestMatches.drugsWithAllValues >= 3))]

Unnamed: 0,condition,totalDrugs,drugsWithAllValues
0,Basal Cell Carcinoma,5,5
1,Actinic Keratosis,5,4
2,Perioral Dermatitis,5,4
3,Angina,5,4
4,Social Anxiety Disorde,5,4
5,Emergency Contraception,5,4
6,"Diabetes, Type 2",5,4
7,"Glaucoma, Open Angle",5,4
8,Polycystic Ovary Syndrome,5,3
9,Atrophic Vaginitis,5,3


In [40]:
conditions = ["Perioral Dermatitis", "Left Ventricular Dysfunction", "Diabetes, Type 2", "Emergency Contraception", "Cluster Headaches", "Breast Cancer, Adjuvant"]

#### Helper Functions

In [11]:
def getData(condition):
    ### Get the drugs from sentiment data
    drugs = sentimentData[sentimentData.condition == condition].drugName.apply(lambda x: x.split(" ")[0].lower()).values

    ### Get all other information from the reviews data
    REVIEWS = reviewsDf[reviewsDf.drug.isin(drugs)]
    REVIEWS["date"] = REVIEWS["date"].apply(lambda x:pd.to_datetime(pd.to_datetime(x)))
    REVIEWS = REVIEWS.sort_values(by=["drug", "date"], ascending=True).reset_index(drop=True)
    for i in range(len(REVIEWS)):
        REVIEWS.loc[i,"sentiment"] = np.random.randint(-1,2) ### For some reason sentiment against each is not available in the dataset
        
    ### Filter the Lookup Dataset based on appNos
    appNos = [mapping[drug] for drug in drugs]
    LABELS = labels[labels.applnNo.isin(appNos)].set_index("applnNo")
    EVENTS = events[events.applnNo.isin(appNos)].reset_index(drop=True)
    REACTIONS = reactions[reactions.applnNo.isin(appNos)].reset_index(drop=True)

    return appNos, drugs, REVIEWS, LABELS, EVENTS, REACTIONS

def temporalRating(reviewsDf, drugs):
    fig = px.line(reviewsDf[reviewsDf.drug.isin(drugs[:3])], x="date", y="rating", color='drug')
    fig.show()

def sentimentAnalysis(reviewsDf, drugs):
    sentimentDf = reviewsDf.groupby(["drug", "sentiment"])["usefulCount"].sum().reset_index()
    sentimentDf = sentimentDf.sort_values(by=["drug", "sentiment"], ascending=True)
    sentimentDf["sentiment"] = sentimentDf["sentiment"].apply(lambda x: "Positive" if x == 1 else ("Neutral" if x == 0 else "Negative"))
    fig = px.histogram(sentimentDf[sentimentDf.drug.isin(drugs)], x="sentiment", y="usefulCount",
             color='drug', barmode='group')
    fig.show()

def getLabelCards(labels):
    cols = ["indications_and_usage", "contraindications", "pregnancy", "nursing_mothers", "pediatric_use"]
    names = ["DETAILED_DESCRIPTION", "DISEASE_DISORDER", "SIGN_SYMPTOM", "SIGN_SYMPTOM", "SIGN_SYMPTOM"]

    LABELS = []
    for appNo in appNos:
        temp = {"appNo": appNo}
        for col,name in zip(cols, names):
            try:
                if str(labels.loc[appNo, col]) != "nan":
                    tags = pd.DataFrame(nlp_token_class(labels.loc[appNo,col]))
                    tags = tags[((tags.entity_group == name) & (tags.score >= 0.8))].sort_values(by="score", ascending=False)
                    if len(tags) > 0:
                        signs = " || ".join([item for item in tags.word.unique()[:3] if "type" not in item])
                        temp[col] = signs
                else:
                    temp[col] = np.nan
            except KeyError:
                continue
        LABELS.append(temp)
    LABELS = pd.DataFrame(LABELS)
    return LABELS

def getAdverseEvents(events):
    labels = ["seriousnessdeath", "seriousnessdisabling", "seriousnesshospitalization", "seriousnesslifethreatening"]
    events2 = events[events.event.isin(labels)].drop_duplicates()
    events2
    fig = make_subplots(rows=1, cols=events2.applnNo.nunique(), specs=[[{'type':'domain'} for _ in range(events2.applnNo.nunique())]])
    for i,appNo in enumerate(events2.applnNo.unique()):
        fig.add_trace(go.Pie(labels=labels, values=list(events2[events2.applnNo == appNo]["count"].values)), 1, i+1)
    fig.show()

def getReactions(reactions, appNo):
    ageMap = {
        1 : "Neonate",
        2 : "Infant",
        3 : "Child",
        4 : "Adolescent",
        5 : "Adult",
        6 : "Elderly"}

    sexMap = {1: "Male", 2: "Female"}

    weightMap = {"l": "Underweight", "n": "Normal", "h": "Overweight"}

    df = reactions[((reactions.applnNo == appNo) & (reactions["count"] > 7))]
    df["sex"] = df["sex"].apply(lambda x: sexMap[x])
    df["age"] = df["age"].apply(lambda x: ageMap[x])
    df["weight"] = df["weight"].apply(lambda x: weightMap[x])

    fig = px.treemap(df, path=[px.Constant("all"), "sex", "age", "weight", "reaction"], values="count")

    fig.update_traces(root_color="lightgrey")
    fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
    fig.show()

def compareDrugs(REVIEWS, LABELS, EVENTS):
    REVIEWS_AGG = REVIEWS[REVIEWS.drug.isin(drugs)].groupby("drug").agg(
    AvgRating = ("rating", "mean"),
    Usefulness = ("usefulCount", "sum"),
    CommonSentiment = ("sentiment", pd.Series.mode))

    PRECAUTIONS = LABELS[["contraindications"]]
    PRECAUTIONS["precautionsCount"] = PRECAUTIONS["contraindications"].apply(lambda x: len(str(x).split("||")) if str(x) != "nan" else np.nan)
    PRECAUTIONS = PRECAUTIONS[["precautionsCount"]]

    SERIOUSCASES = EVENTS[EVENTS.event == "serious"].drop_duplicates().set_index("applnNo")
    SERIOUSCASES = SERIOUSCASES[["count"]]

    reverseMapping = {}
    for appNo in appNos:
        reverseMapping[appNo] = drugs[appNos.index(appNo)]
    PRECAUTIONS = PRECAUTIONS.rename(index=reverseMapping)
    SERIOUSCASES = SERIOUSCASES.rename(index=reverseMapping)

    AGG_DATA = REVIEWS_AGG.merge(PRECAUTIONS, left_index=True, right_index=True, how="left").merge(SERIOUSCASES, left_index=True, right_index=True, how="left")
    AGG_DATA = AGG_DATA.reset_index()
    AGG_DATA.CommonSentiment = AGG_DATA.CommonSentiment.apply(lambda x: x[0] if type(x) == np.ndarray else x)

    scaler = MinMaxScaler()
    AGG_DATA.iloc[:,1:] = scaler.fit_transform(AGG_DATA.iloc[:,1:].values)*100

    categories = AGG_DATA.columns[1:]
    fig = go.Figure()
    for drug in drugs:
        fig.add_trace(go.Scatterpolar(
            r=AGG_DATA[AGG_DATA.drug == drug].values[0][1:],
            theta=categories,
            fill='toself',
            name=drug))
    fig.show()

In [14]:
def vizualize(condition):
    appNos, drugs, REVIEWS, LABELS, EVENTS, REACTIONS = getData(condition)
    print(f"--------- For {condition}, these are the top recommended drugs...")
    for i in range(len(drugs)):
        print(f"{i+1}. {drugs[i]} - {appNos[i]}")
    

    print("Temporal Analysis of the rating for the top 3 drugs...")
    temporalRating(REVIEWS, drugs[:3])

    print("Sentiment Analysis for the top 3 drugs...")
    sentimentAnalysis(REVIEWS, drugs[:3])

    print("Precautionary Labels to be kept in mind for certain patients...")
    LABEL_CARDS = getLabelCards(LABELS)
    display(LABEL_CARDS)

    print("Adverse Events that has been reported in the past [Anything below 50k incidents is considered of low probability]...")
    getAdverseEvents(EVENTS)

    print(f"Reactions and Side effects to be aware of for different demographics for {drugs[0]}")
    getReactions(REACTIONS, appNos[0])

    print("Comparitive Study of top 5 drugs...")
    compareDrugs(REVIEWS, LABELS, EVENTS)

In [15]:
conditions = ["Perioral Dermatitis", "Left Ventricular Dysfunction", "Diabetes, Type 2", "Emergency Contraception", "Cluster Headaches", "Breast Cancer, Adjuvant"]

#### LVD

In [16]:
condition = conditions[1]
vizualize(condition)

--------- For Left Ventricular Dysfunction, these are the top recommended drugs...
1. coreg - NDA022012
2. enalapril - ANDA214467
3. toprol-xl - NDA019962
4. benazepril - ANDA076333
5. vasotec - NDA018998
Temporal Analysis of the rating for the top 3 drugs...


Sentiment Analysis for the top 3 drugs...


Precautionary Labels to be kept in mind for certain patients...


Unnamed: 0,appNo,contraindications,pregnancy,nursing_mothers,pediatric_use
0,NDA022012,cardiogenic shock || asthmaticus,hypotension || hypoglycemia || respiratory dep...,,dizziness || dyspnea
1,ANDA214467,,oliguria || hyperkalemia || anuria,,hypotension || oliguria
2,NDA019962,,hypotension || hypoglycemia || bradycardia,,
3,ANDA076333,,,,
4,NDA018998,,,,


Adverse Events that has been reported in the past [Anything below 50k incidents is considered of low probability]...


Reactions and Side effects to be aware of for different demographics for coreg


Comparitive Study of top 5 drugs...


### Diabetes, Type 2

In [19]:
condition = conditions[4]
vizualize(condition)

--------- For Cluster Headaches, these are the top recommended drugs...
1. deltasone - NDA009986
2. cyproheptadine - ANDA085245
3. verelan - NDA019614
4. verelan - NDA019614
5. imitrex - NDA020626
Temporal Analysis of the rating for the top 3 drugs...


Sentiment Analysis for the top 3 drugs...


Precautionary Labels to be kept in mind for certain patients...


Unnamed: 0,appNo
0,NDA022012
1,ANDA214467
2,NDA019962
3,ANDA076333
4,NDA018998


Adverse Events that has been reported in the past [Anything below 50k incidents is considered of low probability]...


Reactions and Side effects to be aware of for different demographics for deltasone


Comparitive Study of top 5 drugs...


ValueError: Found array with 0 sample(s) (shape=(0, 5)) while a minimum of 1 is required by MinMaxScaler.