# Drug Recommender System

### DATA PREPROCESSING

In [19]:
#import required packages
import pandas as pd
import numpy as np
import utils
import json

In [20]:
def condition_filter(condition):
    if type(condition) != str: return True
    if 'comment' in condition: return True
    if condition == 'Other': return True
    return False

def drug_filter(drug):
    return False

In [21]:
uc1_fname = "Data/drugsComTrain_raw.csv"
uc2_fname = "Data/drugsComTest_raw.csv"
wb_fname = "Data/webmd.csv"
uc1 = pd.read_csv(uc1_fname)
uc2 = pd.read_csv(uc2_fname)
wb = pd.read_csv(wb_fname)

condition_set = set()
for condition_dataset in [uc1['condition']]:
    for condition in condition_dataset:
        if condition_filter(condition):continue
        condition_set.add(condition)
c2id = {} 
for condition in condition_set:
    c2id[condition] = len(c2id)
json.dump(c2id, open('Data/c2id.json','w'))

drug_set = set()
for drug_dataset in [uc1['drugName']]:
    for drug in drug_dataset:
        if drug_filter(drug):continue
        drug_set.add(drug)
d2id = {} 
for drug in drug_set:
    d2id[drug] = len(d2id)
json.dump(d2id, open('Data/d2id.json','w'))

### RUNNING RECOMMENDER SYSTEM

In [None]:
np.random.seed(0)

class RS:
    POS_THD = 5.5
    def __init__(self, N_condition, N_drug, range_of_compute = 10):
        self.N_drug = N_drug
        self.N_condition = N_condition
        self.rating = np.zeros((N_condition, N_drug))
        self.range_of_compute = range_of_compute
        self.condition2id = None
        self.drug2id = None
    
    def save_dict(self, condition2id_dict, drug2id_dict):
        self.condition2id = condition2id_dict
        self.drug2id = drug2id_dict

    def train(self, train_data):
        count = np.zeros((self.N_condition, self.N_drug))
        for k in range(len(train_data)):
            i,j,r,c = train_data[k]
            c = 1
            self.rating[i][j] += (r-self.POS_THD) * c
            count[i][j] += c

    def eval(self, test_data):
        total_score = 0
        model_score = 0
        for k in range(len(test_data)):
            i,j,r,c = test_data[k]
            c = 1
            total_score += c
            posc,negc = self.recommend(i)
            if r >= self.POS_THD:
                if j in posc:
                    model_score += c
            else:
                if j in negc:
                    model_score += c
        return model_score / total_score

    def recommend(self, condition):
        condition_id = condition
        array = self.rating[condition_id,:]
        order = np.argsort(-array)
        pos_choices = order[0:self.range_of_compute]
        neg_choices = order[-self.range_of_compute-1:-1]
        return [pos_choices, neg_choices]

def read_data_csv(dataset_fname):
    df = pd.DataFrame(pd.read_csv(dataset_fname))
    condition_name = df['condition'].to_numpy()
    drug_name = df['drugName'].to_numpy()
    rating = df['rating'].to_numpy()
    usefulCount = df['usefulCount'].to_numpy()


    return [condition_name, drug_name, rating, usefulCount]

def build_dataset(data_fname, c2id, d2id):
    c,d,r,u = read_data_csv(data_fname)
    dataset = []
    n = 0
    for i in range(len(c)):
        if c[i] not in c2id or d[i] not in d2id:
            n += 1
            continue
        dataset.append([c2id[c[i]], d2id[d[i]], r[i], u[i]])
    dataset = np.array(dataset)
    print(f"Discard {n} samples in dataset {data_fname}")
    return dataset

def case_study(c2id, d2id, train_dataset, test_dataseti, rs):
    for cname in ['Post Traumatic Stress Disorde', 'Birth Control',\
                  'Depression', 'Smoking Cessation',\
                  'High Blood Pressure','Acne', 'Anxiety', 'Pain']:
        i = c2id[cname]
        re_list,_ = rs.recommend(i)
        dnames = []
        for t in re_list:
            for k in d2id:
                if d2id[k] == t: dnames.append(k)
        print("Condition: {}, recommend drugs: {}\n".format(cname, dnames))

In [23]:
train_fname = "Data/drugsComTrain_raw.csv"
test_fname = "Data/drugsComTest_raw.csv"
c2id = json.load(open("Data/c2id.json"))
d2id = json.load(open("Data/d2id.json"))
train_dataset = build_dataset(train_fname, c2id, d2id)
test_dataset = build_dataset(test_fname, c2id, d2id)

rs = RS(len(c2id), len(d2id))
rs.train(train_dataset)
performance = rs.eval(test_dataset)
print(performance * 100)

case_study(c2id, d2id, train_dataset, test_dataset, rs)

Discard 1799 samples in dataset Data/drugsComTrain_raw.csv
Discard 869 samples in dataset Data/drugsComTest_raw.csv
49.53210957143127
Condition: Post Traumatic Stress Disorde, recommend drugs: ['Zoloft', 'Sertraline', 'Prazosin', 'Quetiapine', 'Asenapine', 'Desvenlafaxine', 'Lamotrigine', 'Pristiq', 'Saphris', 'Paroxetine']

Condition: Birth Control, recommend drugs: ['Levonorgestrel', 'Skyla', 'Mirena', 'Copper', 'Etonogestrel', 'NuvaRing', 'Ethinyl estradiol / etonogestrel', 'Ethinyl estradiol / norgestimate', 'Ethinyl estradiol / levonorgestrel', 'Implanon']

Condition: Depression, recommend drugs: ['Bupropion', 'Escitalopram', 'Citalopram', 'Sertraline', 'Desvenlafaxine', 'Pristiq', 'Lexapro', 'Fluoxetine', 'Zoloft', 'Venlafaxine']

Condition: Smoking Cessation, recommend drugs: ['Varenicline', 'Chantix', 'Bupropion', 'Nicotine', 'Zyban', 'Nicoderm CQ', 'Commit', 'Nicotrol Inhaler', 'Nicorette', 'Habitrol']

Condition: High Blood Pressure, recommend drugs: ['Olmesartan', 'Nebivolol