# Drug Reviews

In [19]:
import numpy as np, pandas as pd
import os, sys
import math
import shutil
import zipfile
import string
import random
import json

# Paths and Variables

In [20]:
# Base name shared by every file this notebook writes.
dataset_name = "drug_reviews"

In [21]:
# Raw inputs are read from input_dir; generated files are placed under output_dir.
input_dir = './raw'
output_dir = './processed'

# CSV paths for the full dataset and for each split, all derived from dataset_name.
outp_fname, outp_train_fname, outp_test_fname, outp_test_key_fname = (
    os.path.join(output_dir, f'{dataset_name}{suffix}.csv')
    for suffix in ('', '_train', '_test', '_test_key')
)
# Path of the sample inference request (JSON).
outp_infer_instances = os.path.join(output_dir, f'{dataset_name}_infer_req.json')

In [22]:
# File names of the two raw tab-separated inputs, looked up inside input_dir.
inp_fname_train, inp_fname_test = 'drugLibTrain_raw.tsv', 'drugLibTest_raw.tsv'

# Read data into a DataFrame

In [23]:
# Load both raw TSV files the same way, then stack them into one frame with a
# fresh 0..n-1 index (the split is redone later in this notebook).
data1, data2 = (
    pd.read_csv(os.path.join(input_dir, fname), delimiter="\t")
    for fname in (inp_fname_train, inp_fname_test)
)
data = pd.concat([data1, data2], ignore_index=True)
print(data.shape)
data.head()

(4143, 9)


Unnamed: 0.1,Unnamed: 0,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview
0,2202,enalapril,4,Highly Effective,Mild Side Effects,management of congestive heart failure,slowed the progression of left ventricular dys...,"cough, hypotension , proteinuria, impotence , ...","monitor blood pressure , weight and asses for ..."
1,3117,ortho-tri-cyclen,1,Highly Effective,Severe Side Effects,birth prevention,Although this type of birth control has more c...,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...","I Hate This Birth Control, I Would Not Suggest..."
2,1146,ponstel,10,Highly Effective,No Side Effects,menstrual cramps,I was used to having cramps so badly that they...,Heavier bleeding and clotting than normal.,I took 2 pills at the onset of my menstrual cr...
3,3947,prilosec,3,Marginally Effective,Mild Side Effects,acid reflux,The acid reflux went away for a few months aft...,"Constipation, dry mouth and some mild dizzines...",I was given Prilosec prescription at a dose of...
4,1951,lyrica,2,Marginally Effective,Severe Side Effects,fibromyalgia,I think that the Lyrica was starting to help w...,I felt extremely drugged and dopey. Could not...,See above


# Prepare Data

In [24]:
# Column roles used throughout: row identifier, label to predict, model input text.
id_col, target_col, text_col = "id", "effectiveness", "text"

In [25]:
# Class distribution of the target label.
data.loc[:, target_col].value_counts()

Highly Effective          1741
Considerably Effective    1238
Moderately Effective       572
Ineffective                329
Marginally Effective       263
Name: effectiveness, dtype: int64

In [26]:
# The first column arrives as "Unnamed: 0" (see the preview above); relabel it
# as id_col and leave every other column name untouched.
data.columns = [id_col, *data.columns[1:]]
data.head()

Unnamed: 0,id,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview
0,2202,enalapril,4,Highly Effective,Mild Side Effects,management of congestive heart failure,slowed the progression of left ventricular dys...,"cough, hypotension , proteinuria, impotence , ...","monitor blood pressure , weight and asses for ..."
1,3117,ortho-tri-cyclen,1,Highly Effective,Severe Side Effects,birth prevention,Although this type of birth control has more c...,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...","I Hate This Birth Control, I Would Not Suggest..."
2,1146,ponstel,10,Highly Effective,No Side Effects,menstrual cramps,I was used to having cramps so badly that they...,Heavier bleeding and clotting than normal.,I took 2 pills at the onset of my menstrual cr...
3,3947,prilosec,3,Marginally Effective,Mild Side Effects,acid reflux,The acid reflux went away for a few months aft...,"Constipation, dry mouth and some mild dizzines...",I was given Prilosec prescription at a dose of...
4,1951,lyrica,2,Marginally Effective,Severe Side Effects,fibromyalgia,I think that the Lyrica was starting to help w...,I felt extremely drugged and dopey. Could not...,See above


In [27]:
# Use text concatenated from 3 fields: benefitsReview, sideEffectsReview, commentsReview.
# A review field with no value is NaN after read_csv; str(NaN) would inject the
# literal token "nan" into the text, so blank those cells out before joining.
review_parts = (
    data[['benefitsReview', 'sideEffectsReview', 'commentsReview']]
    .fillna('')
    .astype(str)
)
# Vectorised concatenation: same "a b c" layout as before, without a row-wise apply.
data[text_col] = (
    review_parts['benefitsReview'] + ' '
    + review_parts['sideEffectsReview'] + ' '
    + review_parts['commentsReview']
)

In [28]:
# Keep only the identifier, the label and the combined text.
data = data.loc[:, [id_col, target_col, text_col]]
data.head()

Unnamed: 0,id,effectiveness,text
0,2202,Highly Effective,slowed the progression of left ventricular dys...
1,3117,Highly Effective,Although this type of birth control has more c...
2,1146,Highly Effective,I was used to having cramps so badly that they...
3,3947,Marginally Effective,The acid reflux went away for a few months aft...
4,1951,Marginally Effective,I think that the Lyrica was starting to help w...


# Shuffle Data

In [29]:
# Randomly permute all rows; the fixed seed makes the resulting order reproducible.
data = data.sample(n=len(data), random_state=42)
print(data.shape)
data.head()

(4143, 3)


Unnamed: 0,id,effectiveness,text
2351,2260,Marginally Effective,"Initially, Lexapro helped alleviate some of th..."
1966,2324,Highly Effective,Started taking this drug for two reasons - per...
1582,2858,Highly Effective,short term relief from breathing problems. esp...
296,3456,Highly Effective,'Was diagnosed with H1N1 and was ill for at li...
149,4075,Ineffective,No impact whatsoever. It was like I wasn't ev...


# Insert Id Column

In [30]:
# Fallback: generate sequential ids only when the frame has no id column yet.
if id_col not in data.columns:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())

# Utility to Save DF as a zipped file

In [31]:
def save_df_to_zipped_csv(df, ftype=None, name=None, out_dir=None):
    """Write ``df`` as a single CSV file inside a zip archive.

    The archive is written to ``<out_dir>/<name>[_<ftype>].zip`` and contains
    one member named ``<name>[_<ftype>].csv``. The DataFrame index is not
    written.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save.
    ftype : str, optional
        Suffix appended to the base name after an underscore (e.g. ``"train"``).
        When ``None`` no suffix is added.
    name : str, optional
        Base file name. Defaults to the notebook-level ``dataset_name``.
    out_dir : str, optional
        Destination directory. Defaults to the notebook-level ``output_dir``.
        It is created if it does not exist.
    """
    # Fall back to the notebook globals only when the caller gave no override.
    base_name = dataset_name if name is None else name
    target_dir = output_dir if out_dir is None else out_dir

    suffix = '' if ftype is None else f'_{ftype}'
    stem = f'{base_name}{suffix}'

    # to_csv does not create missing parent directories; on a fresh checkout
    # the output folder may not exist yet.
    os.makedirs(target_dir, exist_ok=True)

    compression_opts = dict(method='zip', archive_name=f'{stem}.csv')
    df.to_csv(
        os.path.join(target_dir, f'{stem}.zip'),
        index=False,
        compression=compression_opts,
    )

# Save Main Data File

In [32]:
# # save original file as csv
# data.to_csv(outp_fname, index=False)

# Persist the complete shuffled dataset as a zip archive (no split suffix).
save_df_to_zipped_csv(df=data, ftype=None)

# Train Test Split

In [33]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the test set; the fixed seed keeps the split reproducible.
test_size = 0.1
data_train, data_test = train_test_split(data, random_state=42, test_size=test_size)
print(data_train.shape, data_test.shape)

# Keep the held-out labels in their own frame (id + target), then strip the
# target from the test frame so it carries inputs only.
data_test_key = data_test.loc[:, [id_col, target_col]].copy()
data_test = data_test.drop(target_col, axis=1)

# # Save original files as csv
# data_train.to_csv(outp_train_fname, index=False)
# data_test.to_csv(outp_test_fname, index=False)
# data_test_key.to_csv(outp_test_key_fname, index=False)

(3728, 3) (415, 3)


In [34]:
# Write each split to its own zip archive, suffixed with the split name.
for frame, split_name in (
    (data_train, "train"),
    (data_test, "test"),
    (data_test_key, "test_key"),
):
    save_df_to_zipped_csv(frame, split_name)

# JSON inference request instance

In [35]:
# Use the first held-out row (target column already dropped) as the sample payload.
instance = data_test.iloc[0].to_dict()
infer_req_instance_dict = {"instances": [dict(instance)]}
print(infer_req_instance_dict)

# ensure_ascii=False keeps non-ASCII characters readable in the written file.
with open(outp_infer_instances, mode='w', encoding='utf8') as f:
    json.dump(infer_req_instance_dict, f, ensure_ascii=False, indent=2)

{'instances': [{'id': 2725, 'text': 'Sleep very well, at least 8 hours unless something wakes me up. Hiatus hernia not there on last endoscopy. major digestion problems gone. some slight acid reflux/indidgestion if eating big/heavy/rich meal. otherwise excellent.\r\r\nNot happy with side effects if miss dose or try to come off-very dizzy, sick, hallucinating etc. so have to take without fail before bed every night. slightly dry mouth. heavy sleep but good. 20 mg a day at night.'}]}
