In [233]:
import pandas as pd 
import numpy as np
import html
import matplotlib.pyplot as plt
from rapidfuzz import process

import warnings
warnings.filterwarnings("ignore")

In [234]:
import pandas as pd

def is_meaningful_review(text):
    """
    Check if the review is meaningful (not just symbols or extremely short).
    Args:
    text (str): The review text to evaluate.

    Returns:
    bool: True if the text is considered a meaningful review, False otherwise.
    """
    return len(text) > 15 and any(char.isalpha() for char in text)

def clean_dataset(file_path):
    """
    Load and clean the dataset, filtering out invalid entries and decoding HTML entities in reviews.
    Args:
    file_path (str): The path to the dataset file.

    Returns:
    DataFrame: The cleaned pandas DataFrame.
    """
    # Load the dataset
    data = pd.read_csv(file_path, sep='\t', index_col=0)

    # Clean the data
    cleaned_data = data[
        data['drugName'].notna() & 
        data['condition'].notna() & 
        data['condition'].apply(lambda x: isinstance(x, str) and not x.isdigit() and " users found this comment helpful." not in x) &
        data['review'].apply(is_meaningful_review)
    ]

    # Decode HTML entities in the review column
    cleaned_data['review'] = cleaned_data['review'].apply(html.unescape)

    return cleaned_data

In [142]:
raw_df_1 = clean_dataset("./application/data/drugsComTest_raw.tsv")
raw_df_2 = clean_dataset("./application/data/drugsComTrain_raw.tsv")
df = pd.concat([raw_df_1, raw_df_2], ignore_index=True)
data = pd.read_csv("./application/data/medicine_dataset.csv", index_col=0)

In [143]:
drugNames = list(df.drugName.apply(lambda x: x.lower()).unique())

In [144]:
classes = []
for col in ["Chemical Class", "Action Class", "Therapeutic Class"]:
    classes += list(data[col].apply(lambda x: x.lower() if str(x) != "nan" else "").unique())
drugs = []
for col in ["name", "substitute0", "substitute1", "substitute2", "substitute3", "substitute4"]:
    drugs += list(data[col].apply(lambda x: x.lower() if str(x) != "nan" else "").unique())

classes = list(set(classes))
drugs = list(set(drugs))

In [151]:
medsDataset = data[["Chemical Class", "Action Class", "Therapeutic Class"] + ["name", "substitute0", "substitute1", "substitute2", "substitute3", "substitute4"]]
for col in medsDataset:
    medsDataset[col] = medsDataset[col].apply(lambda x: x.lower() if str(x) != "nan" else "")
medsDataset = medsDataset.drop_duplicates().reset_index(drop=True)

In [152]:
drugDataset = pd.DataFrame(index=[i for i in range(len(drugNames))], columns=["drugName"])
drugDataset["drugName"] = drugNames

In [154]:
medsDataset.to_csv("./application/data/MedsData.csv")
drugDataset.to_csv("./application/data/DrugsData.csv")

In [173]:
medsDataset = medsDataset.reset_index()

In [180]:
drugs = drugDataset.drugName.values
meds = medsDataset.name.values
match, score, ind = process.extractOne(drugs[0], meds)

('migrazine 10mg tablet', 72.0, 129663)

In [181]:
drugs[0]

'mirtazapine'

In [187]:
temp = pd.DataFrame(index=drugs)
for col in ["name", "substitute0", "substitute1", "substitute2", "substitute3", "substitute4"] + ["Chemical Class", "Action Class", "Therapeutic Class"]:
    
    vals = medsDataset[col].values
    print(col)
    for drug in drugs:
        match, score, ind = process.extractOne(drug, vals)
        temp.loc[drug, f"{col}_match"] = match
        temp.loc[drug, f"{col}_score"] = score
        temp.loc[drug, f"{col}_ind"] = ind

name
substitute0
substitute1
substitute2
substitute3


KeyboardInterrupt: 

In [195]:
temp.loc[:,"maxScore"] = temp[["name_score", "substitute0_score", "substitute1_score", "substitute2_score", "substitute3_score"]].idxmax(axis=1)

In [199]:
temp["bestMatch"] = temp.apply(lambda x: x[x["maxScore"].split("_")[0] + "_match"], axis=1)

In [201]:
temp[["bestMatch"]].reset_index()

Unnamed: 0,index,bestMatch
0,mirtazapine,migrazine 10mg tablet
1,mesalamine,davaindia mesalazine 1200mg tablet pr
2,bactrim,bactrim ds tablet
3,contrave,contragesic tablet
4,cyclafem 1 / 35,azulix 1 mf forte tablet pr
...,...,...
3630,oxytocin,otocin-c ear drop
3631,iluvien,calciluvin syrup
3632,mavik,avikind cv 500mg/125mg tablet
3633,aldomet,aldome capsule


In [210]:
columns = lines[0].split("\t")

In [213]:
res = []
for line in lines[1:]:
    temp = {}
    for col,item in zip(columns, line.split("\t")):
        temp[col] = item
    res.append(temp)
res = pd.DataFrame(res)

In [219]:
res["drugName"] = res["DrugName"].apply(lambda x: x.lower())

In [228]:
existingDrugs = list(df["drugName"].apply(lambda x: x.lower()).unique())
newDrugs = list(res["drugName"].unique())

In [229]:
len(existingDrugs)

3635

In [231]:
len(list(set(existingDrugs).intersection(set(newDrugs))))

1914

In [235]:
res.drugName = 

Unnamed: 0,ApplNo,ProductNo,Form,Strength,ReferenceDrug,DrugName,ActiveIngredient,ReferenceStandard\n,drugName
0,000004,004,SOLUTION/DROPS;OPHTHALMIC,1%,0,PAREDRINE,HYDROXYAMPHETAMINE HYDROBROMIDE,0\n,paredrine
1,000159,001,TABLET;ORAL,500MG,0,SULFAPYRIDINE,SULFAPYRIDINE,0\n,sulfapyridine
2,000552,001,INJECTABLE;INJECTION,"20,000 UNITS/ML",0,LIQUAEMIN SODIUM,HEPARIN SODIUM,0\n,liquaemin sodium
3,000552,002,INJECTABLE;INJECTION,"40,000 UNITS/ML",0,LIQUAEMIN SODIUM,HEPARIN SODIUM,0\n,liquaemin sodium
4,000552,003,INJECTABLE;INJECTION,"5,000 UNITS/ML",0,LIQUAEMIN SODIUM,HEPARIN SODIUM,0\n,liquaemin sodium
...,...,...,...,...,...,...,...,...,...
47343,761354,001,INJECTABLE;INJECTION,80MG/4ML,0,TOFIDENCE,TOCILIZUMAB-BAVI,0\n,tofidence
47344,761355,001,INJECTABLE;INJECTION,8MG(0.07ML;114.3MG/ML),0,EYLEA HD,AFLIBERCEPT,0\n,eylea hd
47345,761358,001,INJECTABLE;INJECTION,120MG/ML,0,ZYMFENTRA,INFLIXIMAB-DYYB,0\n,zymfentra
47346,761362,001,INJECTABLE;SUBCUTANEOUS,60MG/ML,0,JUBBONTI,DENOSUMAB-BBDZ,0\n,jubbonti
