This is the first code for running in AWS

In [1]:
import urllib.request,io
import requests
import urllib
import json
import sys
import zipfile
import shutil
import timeit
import os
import pickle

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Make dirs
if not os.path.exists('data'):
    os.makedirs('data')
    
if not os.path.exists('progress'):
    os.makedirs('progress')
    
if not os.path.exists('progress/wrangled'):
    os.makedirs('progress/wrangled')

if not os.path.exists('progress/modeling'):
    os.makedirs('progress/modeling')
    
if not os.path.exists('progress/wrangled/percent_1'):
    os.makedirs('progress/wrangled/percent_1')

## 1. Downloading the Data

In [4]:
# Archive of downloadable links from FDA
# Data are available for download at this site: https://open.fda.gov/tools/downloads/

# For a cursory look at the data, see this link
# https://open.fda.gov/apis/drug/event/explore-the-api-with-an-interactive-chart/

# For information on entries see this link
# https://open.fda.gov/apis/drug/event/searchable-fields

jsonurl = 'https://api.fda.gov/download.json'
with urllib.request.urlopen(jsonurl) as url:
    data = json.loads(url.read().decode())

In [5]:
# Disclaimer
data['meta']['disclaimer']

'Do not rely on openFDA to make decisions regarding medical care. While we make every effort to ensure that data is accurate, you should assume all results are unvalidated. We may limit or otherwise restrict your access to the API in line with our Terms of Service.'

In [6]:
# Entries of downloadable files
# 760 in total

# Index key:
# 2004 Q1: 4:8
# 2004 Q2: 8:13
# 2004 Q3: 13:18
# 2004 Q4: 18:23
# 2005 Q1: 23:28
# 2005 Q2: 28:33
# 2005 Q3: 33:38
# 2005 Q4: 38:43
# 2006 Q1: 43:49
# 2006 Q2: 49:54
# 2006 Q3: 54:59
# 2006 Q4: 59:64
# 2007 Q1: 64:70
# 2007 Q2: 70:76
# 2007 Q3: 76:83
# 2007 Q4: 83:89
# 2008 Q1: 89:97
# 2008 Q2: 97:104
# 2008 Q3: 104:111
# 2008 Q4: 111:117
# 2009 Q1: 117:124
# 2009 Q2: 124:132
# 2009 Q3: 132:140
# 2009 Q4: 140:147
# 2010 Q1: 147:156
# 2010 Q2: 156:165
# 2010 Q3: 165:175
# 2010 Q4: 175:185
# 2011 Q1: 185:196
# 2011 Q2: 196:208
# 2011 Q3: 208:219
# 2011 Q4: 219:229
# 2012 Q1: 229:238
# 2012 Q2: 238:249
# 2012 Q3: 249:256
# 2012 Q4: 256:271
# 2013 Q1: 271:289
# 2013 Q2: 289:303
# 2013 Q3: 303:319
# 2013 Q4: 319:338
# 2014 Q1: 338:358
# 2014 Q2: 358:374
# 2014 Q3: 374:391
# 2014 Q4: 391:408
# 2015 Q1: 408:433
# 2015 Q2: 433:456
# 2015 Q3: 456:488
# 2015 Q4: 488:513
# 2016 Q1: 513:542
# 2016 Q2: 542:566
# 2016 Q3: 566:590
# 2016 Q4: 590:613
# 2017 Q1: 613:640
# 2017 Q2: 640:666
# 2017 Q3: 666:693
# 2017 Q4: 693:720
# 2018 Q1: 720:753
# 2018 Q2: 753:791

partitions = data['results']['drug']['event']['partitions']
# Display a few partitions
partitions[271:]

[{'size_mb': '1.90',
  'records': 12000,
  'display_name': '2013 Q1 (part 1 of 18)',
  'file': 'https://download.open.fda.gov/drug/event/2013q1/drug-event-0001-of-0018.json.zip'},
 {'size_mb': '1.68',
  'records': 12000,
  'display_name': '2013 Q1 (part 2 of 18)',
  'file': 'https://download.open.fda.gov/drug/event/2013q1/drug-event-0002-of-0018.json.zip'},
 {'size_mb': '1.69',
  'records': 12000,
  'display_name': '2013 Q1 (part 3 of 18)',
  'file': 'https://download.open.fda.gov/drug/event/2013q1/drug-event-0003-of-0018.json.zip'},
 {'size_mb': '2.92',
  'records': 12000,
  'display_name': '2013 Q1 (part 4 of 18)',
  'file': 'https://download.open.fda.gov/drug/event/2013q1/drug-event-0004-of-0018.json.zip'},
 {'size_mb': '2.34',
  'records': 12000,
  'display_name': '2013 Q1 (part 5 of 18)',
  'file': 'https://download.open.fda.gov/drug/event/2013q1/drug-event-0005-of-0018.json.zip'},
 {'size_mb': '3.51',
  'records': 12000,
  'display_name': '2013 Q1 (part 6 of 18)',
  'file': 'http

In [7]:
total_records = 0
total_size = 0
urls = []
partitions_slice = partitions
for entry in partitions_slice:
    urls.append(entry['file'])
    total_records = total_records + entry['records']
    total_size = total_size + float(entry['size_mb'])
print('{records:0.2f} million records for a total size of {size:0.2f} GB'.format(records = total_records/10**6, size = total_size/1000))

9.14 million records for a total size of 43.07 GB


In [8]:
# Save compressed files in data folder
file_name = []
j = 0
#for i in range(0,len(urls)): # This goes through all the data
print('Progress...')
for i in range(601,602):
    start_time = timeit.default_timer()
    print('File {num} with size = {size} MB downloading...'.format(num = i, size = partitions[i]['size_mb']))
    url = urls[i] # Get the url
    # Define the filename for the data folder
    file_name.append('./data/' + url.split('/')[-2] + '_' + url.split('/')[-1]) 
    # Download the data and save it to the data folder
    with urllib.request.urlopen(url) as response, open(file_name[j], 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
    j += 1
    elapsed = timeit.default_timer() - start_time
    print('Completed in {time:0.2f} seconds'.format(time = elapsed))
print('Done.')

Progress...
File 601 with size = 8.62 MB downloading...
Completed in 1.00 seconds
Done.


In [9]:
# Unzip the data
data_2 = []
data_inter = None  
j = 0
# Run through sample
print('Progress...')
for j in range(0,len(file_name)):
    start_time = timeit.default_timer()
    with zipfile.ZipFile((file_name[j]), "r") as z: # get the .zip file
        for filename in z.namelist():  
            with z.open(filename) as f:  # Open the file
                print('File {iterable} of {total} loading...'.format(iterable = j+1, total = len(file_name)))  # print a progress check
                data_inter = f.read()   # read the file
                data_2.append(json.loads(data_inter.decode("utf-8"))) # decode and save
                elapsed = timeit.default_timer() - start_time
                print('{num_records} records loaded in {time:0.2f} seconds.'.format(num_records = len(data_2[j]['results']), time= elapsed))
print('Done.')        

Progress...
File 1 of 1 loading...
12000 records loaded in 1.14 seconds.
Done.


In [10]:
# File information
type_file = type(data_2)
type_entry = type(data_2[0]['results'][0])

size = 0
for i in range(0, len(data_2)):
    for j in range(0, len(data_2[i]['results'])):
        size = size + sys.getsizeof(data_2[i]['results'][j])
print('data_2 is a {} where entries of data_2[0][\'results\'][0] are {}'.format(type_file, type_entry))
print('Approximate Size of packed file is {size:.2f} MB'.format(size = size/10**6))

data_2 is a <class 'list'> where entries of data_2[0]['results'][0] are <class 'dict'>
Approximate Size of packed file is 8.81 MB


**Save**

In [11]:
# General, filenames used in a general set of records
#with open("./progress/file_name.txt", "wb") as fp:   #Pickling
#    pickle.dump(file_name, fp)

In [12]:
# General 
#with open("./progress/data_2.txt", "wb") as fp:   #Pickling
#    pickle.dump(data_2, fp)

In [13]:
#with open("./progress/file_name_sampleforunique.txt", "wb") as fp:   #Pickling
#    pickle.dump(file_name, fp)

In [14]:
#with open("./progress/data_sampleforunique.txt", "wb") as fp:   #Pickling
#    pickle.dump(data_2, fp)

**Load**

In [15]:
# General
#with open("./progress/file_name.txt", "rb") as fp:   # Unpickling
#    file_name = pickle.load(fp)

In [16]:
# General
#with open("./progress/data_2.txt", "rb") as fp:   # Unpickling
#    data_2 = pickle.load(fp)

In [17]:
# 2008
#with open("./progress/file_name_2008.txt", "rb") as fp:   # Unpickling
#    file_name = pickle.load(fp)

In [18]:
# 2008
#with open("./progress/data_2008.txt", "rb") as fp:   # Unpickling
#    data_2 = pickle.load(fp)

**File Information**

In [19]:
type_file = type(data_2)
type_entry = type(data_2[0]['results'][0])
txt_size = os.path.getsize('./progress/data_2.txt')

size = 0
for i in range(0, len(data_2)):
    for j in range(0, len(data_2[i]['results'])):
        size = size + sys.getsizeof(data_2[i]['results'][j])
print('Number of files = {}'.format(len(file_name)))
print('data_2 is a {} where entries of data_2[0][\'results\'][0] are {}'.format(type_file, type_entry))
print('Approximate Size of packed file is {size:.2f} MB'.format(size = size/10**6))
print('Total Size of stored txt file is {size:.2f} MB'.format(size = txt_size/10**6))

Number of files = 1
data_2 is a <class 'list'> where entries of data_2[0]['results'][0] are <class 'dict'>
Approximate Size of packed file is 8.81 MB
Total Size of stored txt file is 1054.11 MB


**Select the files to save**

In [20]:
model_num = ['1_1', '1_2', '1_3']
model_num = model_num[0]
save_all = False # save all the relevant files
save_df = False
save_entries = False
save_entries_len = False
save_unique_gn = False
save_unique_di = False

## 2. Convert to a DataFrame

In [21]:
from helper_funcs.munging_func import section_1, section_2, section_3, section_4, section_5, section_6, section_7

In [22]:
df_1 = section_1(data_2, file_name)
df_2 = section_2(data_2)
df_3 = section_3(data_2)
df_4 = section_4(data_2)
df_5 = section_5(data_2)
df_6 = section_6(data_2)
df_7 = section_7(data_2)

Section 1, file 1 of 1...
15.09 ms to complete
Converting lists to dataframe...
507.80 ms to complete.

Section 2, file 1 of 1...
159.74 ms to complete
Converting lists to dataframe...
584.76 ms to complete.

Section 3, file 1 of 1...
437.04 ms to complete
Converting lists to dataframe...
567.50 ms to complete.

Section 4, file 1 of 1...
18.15 ms to complete
Converting lists to dataframe...
495.98 ms to complete.

Section 5, file 1 of 1...
51.87 ms to complete
Converting lists to dataframe...
513.14 ms to complete.

Section 6, file 1 of 1...
42.56 ms to complete
Converting lists to dataframe...
552.12 ms to complete.

Section 7, file 1 of 1...
30.80 ms to complete
Converting lists to dataframe...
633.44 ms to complete.



In [23]:
df = df_1.join(df_2).join(df_3).join(df_4).join(df_5).join(df_6).join(df_7)

In [24]:
df.shape

(12000, 88)

## 3. Customize DataFrame

In [25]:
# Drop unnecessary columns
df = df[['patient_onset_age', 'patient_weight', 'patient_sex', 'generic_name', 'drug_char', 
         'drug_indication', 'admin_route', 'reaction_medDRA_pt', 'serious', 'seriousness_congential_anomali', 
         'seriousness_death', 'seriousness_disabling', 'seriousness_hospitalization', 
         'seriousness_lifethreatening', 'seriousness_other']]

In [26]:
df.head()

Unnamed: 0,patient_onset_age,patient_weight,patient_sex,generic_name,drug_char,drug_indication,admin_route,reaction_medDRA_pt,serious,seriousness_congential_anomali,seriousness_death,seriousness_disabling,seriousness_hospitalization,seriousness_lifethreatening,seriousness_other
0,,,2.0,_.._NILOTINIB,_1,_CHRONIC MYELOID LEUKAEMIA,_048,_._Nasopharyngitis,2,0,0,0,0,0,0
1,,,1.0,_.._IMATINIB MESYLATE,_1,_PRODUCT USED FOR UNKNOWN INDICATION,_065,_._Regurgitation_._Feeling abnormal,2,0,0,0,0,0,0
2,36.0,,2.0,_.._MEPERIDINE HYDROCHLORIDE,_1,,,_._Muscle twitching_._Drug hypersensitivity,2,0,0,0,0,0,0
3,,,2.0,_.._NILOTINIB,_1,_PRODUCT USED FOR UNKNOWN INDICATION,_048,_._Death,1,0,1,0,0,0,0
4,,,,_.._LEVOTHYROXINE SODIUM_.__.._VENLAFAXINE_.._...,_2_.__2_.__1_._,_HYPOTHYROIDISM_.__DEPRESSION_.__PAIN MANAGEME...,_065_.__048_.__062_._,_._Product adhesion issue_._Wrong technique in...,2,0,0,0,0,0,0


In [27]:
# Check percent of missing values
df.replace('NA', np.nan).isna().sum()/len(df)

patient_onset_age                 0.508583
patient_weight                    0.875333
patient_sex                       0.113083
generic_name                      0.002333
drug_char                         0.000000
drug_indication                   0.016000
admin_route                       0.023583
reaction_medDRA_pt                0.000000
serious                           0.000000
seriousness_congential_anomali    0.000000
seriousness_death                 0.000000
seriousness_disabling             0.000000
seriousness_hospitalization       0.000000
seriousness_lifethreatening       0.000000
seriousness_other                 0.000000
dtype: float64

**NA comments**

- Lots of NA in the patient characteristics (Age, Weight, sex). Will likely have to run separate models for each one.
- If generic_name is gone, the row has to go. No use with it. 
- If drug_indication is NA, we can fill the one-hot-encoders with zeros. Just no reason to take the drug!

In [28]:
# Drop missing generic names
df['generic_name'] = df['generic_name'].replace('NA', np.nan)
df = df.dropna(axis=0).reset_index()
df.shape

(11972, 16)

## 4. Feature Engineering

In [29]:
from helper_funcs.wrangling_func import unique_gen, entries_col_to_lists, value_gen, data_wrangling, percent_gen
from helper_funcs.wrangling_func import one_hot_encode_drugs, entry_condenser
from helper_funcs.conversion_dicts import quals, patientsex, admin_route_dict, drug_char_dict, unii_dict
from helper_funcs.conversion_dicts import dict_LLT_PT, dict_PT_HLT, dict_HLT_HLGT, dict_HLGT_SOC

  from collections import Sequence


In [30]:
df_ML = pd.DataFrame()

In [31]:
# List for the length of all the entry types
entries_len = []

**generic_name**

In [32]:
# One-hot-encode with raw input (~2000 drugs)
# df_tmp = one_hot_encode_drugs('generic_name', df)
# df_tmp.head()

In [33]:
test_col = 'generic_name'

### One-hot-encode with condensed input
# Generate list of unique drugs
unique_entries = unique_gen(test_col, df)
# Condense this list to a lower dimensional space
cond_dict = entry_condenser(unique_entries, 25)
# Get raw columns as a nested list
col_as_list = entries_col_to_lists(test_col, df)
# Convert raw column to lower-d space
col_list_cond = [[[cond_dict[l1] for l1 in l2 if l1 in cond_dict.keys()] for l2 in l3] for l3 in col_as_list]
# Unnest the lowest list
col_list_cond = [[x for y in entry for x in y] for entry in col_list_cond]

# One hot encoder
# For more information, see the following link:
# https://stackoverflow.com/questions/46864816/convert-data-frame-of-comma-separated-strings-to-one-hot-encoded
print('One Hot Encoding...')
df_tmp = pd.Series(col_list_cond).str.join(',').str.split(',',expand = True).apply(pd.Series.value_counts, 1).iloc[:,:].fillna(0)
print('Done.')
df_tmp.head()

Converting series to list...
Removing commas...
Converting to list of lists
One Hot Encoding...
Done.


Unnamed: 0,Unnamed: 1,5%,ABIRATERONE ACETATE,ACETAMINOPHEN AND DIPHENH,AFATINIB,ALENDRONATE SODIUM,ALTEPLASE,ALUMINUM HYDROXIDE MAGNES,AMLODIPINE BESYLATE AND A,ARIPIPRAZOLE,...,PRAZOSIN HYDROCHLORIDE,PYRITHIONE ZINC,SITAGLIPTIN,SODIUM CITRATE,SODIUM FLUORIDE,SOTALOL HYDROCHLORIDE,SUMATRIPTAN SUCCINATE,THALIDOMIDE,TIOTROPIUM BROMIDE,VITAMIN
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Save unique list of generic_names
unique_entries_generic_name = unique_entries

In [35]:
# clean up
df_tmp = df_tmp.drop(['5%'], axis=1)
df_tmp = df_tmp.drop([''], axis = 1)

In [36]:
# start the final df
entries_len.append(len(df_tmp.columns))
df_ML = df_tmp.copy()

In [37]:
df_ML.head()

Unnamed: 0,ABIRATERONE ACETATE,ACETAMINOPHEN AND DIPHENH,AFATINIB,ALENDRONATE SODIUM,ALTEPLASE,ALUMINUM HYDROXIDE MAGNES,AMLODIPINE BESYLATE AND A,ARIPIPRAZOLE,ATROPINE,ATROPINE SULFATE,...,PRAZOSIN HYDROCHLORIDE,PYRITHIONE ZINC,SITAGLIPTIN,SODIUM CITRATE,SODIUM FLUORIDE,SOTALOL HYDROCHLORIDE,SUMATRIPTAN SUCCINATE,THALIDOMIDE,TIOTROPIUM BROMIDE,VITAMIN
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**drug_char**

In [38]:
# one-hot-encode
df_tmp = one_hot_encode_drugs('drug_char', df)
df_tmp.head()

Converting series to list...
Removing commas...
Converting to list of lists
One Hot Encoding...
Done.


Unnamed: 0,Concominant,Interacting,Suspect,Unknown
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,2.0,0.0,1.0,1.0


In [39]:
# clean up
df_tmp = df_tmp.drop(['Unknown'], axis = 1)

In [40]:
# join to the rest
entries_len.append(len(df_tmp.columns))
df_ML = pd.concat([df_ML, df_tmp], axis=1, join_axes=[df_ML.index])

In [41]:
df_ML.head()

Unnamed: 0,ABIRATERONE ACETATE,ACETAMINOPHEN AND DIPHENH,AFATINIB,ALENDRONATE SODIUM,ALTEPLASE,ALUMINUM HYDROXIDE MAGNES,AMLODIPINE BESYLATE AND A,ARIPIPRAZOLE,ATROPINE,ATROPINE SULFATE,...,SODIUM CITRATE,SODIUM FLUORIDE,SOTALOL HYDROCHLORIDE,SUMATRIPTAN SUCCINATE,THALIDOMIDE,TIOTROPIUM BROMIDE,VITAMIN,Concominant,Interacting,Suspect
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0


**drug_indication**

In [42]:
# All drug_indications (~7000)
# df_tmp = one_hot_encode_drugs('drug_indication', df)
# df_tmp.head()

In [43]:
test_col = 'drug_indication'

### One-hot-encode with condensed input
# Generate list of unique drugs
unique_entries = unique_gen(test_col, df)
# Condense this list to a lower dimensional space
cond_dict = entry_condenser(unique_entries, 25)
# Get raw columns as a nested list
col_as_list = entries_col_to_lists(test_col, df)
# Convert raw column to lower-d space
col_list_cond = [[[cond_dict[l1] for l1 in l2 if l1 in cond_dict.keys()] for l2 in l3] for l3 in col_as_list]
# Unnest the lowest list
col_list_cond = [[x for y in entry for x in y] for entry in col_list_cond]

# One hot encoder
# For more information, see the following link:
# https://stackoverflow.com/questions/46864816/convert-data-frame-of-comma-separated-strings-to-one-hot-encoded
print('One Hot Encoding...')
df_tmp = pd.Series(col_list_cond).str.join(',').str.split(',',expand = True).apply(pd.Series.value_counts, 1).iloc[:,:].fillna(0)
print('Done.')
df_tmp.head()

Converting series to list...
Removing commas...
Converting to list of lists
One Hot Encoding...
Done.


Unnamed: 0,Unnamed: 1,ACNE,AGITATION,ANAEMIA,ANTIINFLAMMATORY THERAPY,ANTIVIRAL PROPHYLAXIS,ASSISTED FERTILISATION,AUTOIMMUNE HEPATITIS,BACTERIAL INFECTION,BIPOLAR I DISORDER,...,STENT PLACEMENT,SWELLING,THROMBOCYTOPENIA,THYROID CANCER,TOBACCO USER,URGE INCONTINENCE,VASCULAR STENT RESTENOSIS,VIRAL INFECTION,VITAMIN SUPPLEMENTATION,WHITE BLOOD CELL COUNT
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Save unique list of drug indication
unique_entries_drug_indication = unique_entries

In [45]:
# clean up
df_tmp = df_tmp.drop([''], axis = 1)

In [46]:
# join to the rest
entries_len.append(len(df_tmp.columns))
df_ML = pd.concat([df_ML, df_tmp], axis=1, join_axes=[df_ML.index])

In [47]:
df_ML.head()

Unnamed: 0,ABIRATERONE ACETATE,ACETAMINOPHEN AND DIPHENH,AFATINIB,ALENDRONATE SODIUM,ALTEPLASE,ALUMINUM HYDROXIDE MAGNES,AMLODIPINE BESYLATE AND A,ARIPIPRAZOLE,ATROPINE,ATROPINE SULFATE,...,STENT PLACEMENT,SWELLING,THROMBOCYTOPENIA,THYROID CANCER,TOBACCO USER,URGE INCONTINENCE,VASCULAR STENT RESTENOSIS,VIRAL INFECTION,VITAMIN SUPPLEMENTATION,WHITE BLOOD CELL COUNT
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**admin_route**

In [48]:
# one-hot-encode
df_tmp = one_hot_encode_drugs('admin_route', df)
df_tmp.head()

Converting series to list...
Removing commas...
Converting to list of lists
One Hot Encoding...
Done.


Unnamed: 0,Buccal,Cutaneous,Endocervical,Epidural,Intra-uterine,Intradermal,Intramuscular,Intraocular,Intraperitoneal,Intrathecal,...,Subcutaneous,Subdermal,Sublingual,Sunconjunctival,Topical,Transdermal,Transmammary,Transplacental,Unknown,Vaginal
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [49]:
# Clean up
df_tmp = df_tmp.drop(['Unknown'], axis = 1)
df_tmo = df_tmp.drop(['Not Listed'], axis = 1)

In [50]:
# join to the rest
entries_len.append(len(df_tmp.columns))
df_ML = pd.concat([df_ML, df_tmp], axis=1, join_axes=[df_ML.index])

In [51]:
df_ML.head()

Unnamed: 0,ABIRATERONE ACETATE,ACETAMINOPHEN AND DIPHENH,AFATINIB,ALENDRONATE SODIUM,ALTEPLASE,ALUMINUM HYDROXIDE MAGNES,AMLODIPINE BESYLATE AND A,ARIPIPRAZOLE,ATROPINE,ATROPINE SULFATE,...,Respiratory (inhalation),Subcutaneous,Subdermal,Sublingual,Sunconjunctival,Topical,Transdermal,Transmammary,Transplacental,Vaginal
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


**reaction_medDRA_pt**

In [57]:
model_num = "X_3"

In [58]:
test_col = 'reaction_medDRA_pt'
print('Working on "{}"...'.format(test_col))
start_time = timeit.default_timer()

### One-hot-encode with condensed input
# Generate list of unique drugs
unique_entries = unique_gen(test_col, df)
# Condense this list to a lower dimensional space
cond_dict = entry_condenser(unique_entries, 25)
# Get raw columns as a nested list
col_as_list = entries_col_to_lists(test_col, df)
# Convert raw column to lower-d space
col_list_cond = [[[cond_dict[l1] for l1 in l2 if l1 in cond_dict.keys()] for l2 in l3] for l3 in col_as_list]
# Unnest the lowest list
col_list_cond = [[x for y in entry for x in y] for entry in col_list_cond]

# One hot encoder
# For more information, see the following link:
# https://stackoverflow.com/questions/46864816/convert-data-frame-of-comma-separated-strings-to-one-hot-encoded
print('One Hot Encoding...')
if model_num == "X_1":
    df_tmp = pd.Series(col_list_cond).str.join(',').str.split(',',expand = True).apply(pd.Series.value_counts, 1).iloc[:,:].fillna(0)

if model_num == "X_3":
    col_as_list = [[x for y in entry for x in y] for entry in col_as_list]
    df_tmp = pd.Series(col_as_list).str.join(',').str.split(',',expand = True).apply(pd.Series.value_counts, 1).iloc[:,:].fillna(0)

print('Done.')
df_tmp.head()

Converting series to list...
Removing commas...
Converting to list of lists
One Hot Encoding...
Done.


Unnamed: 0,Abasia,Abdominal discomfort,Abdominal distension,Abdominal hernia,Abdominal infection,Abdominal mass,Abdominal neoplasm,Abdominal pain,Abdominal pain lower,Abdominal pain upper,...,Wound infection,Wound secretion,Wrist fracture,Wrong device used,Wrong drug administered,Wrong patient received medication,Wrong technique in product usage process,Yawning,Yellow skin,Zinc deficiency
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [55]:
# clean up
df_tmp = df_tmp.drop([''], axis = 1)

In [56]:
# joing to the rest
entries_len.append(len(df_tmp.columns))
df_ML = pd.concat([df_ML, df_tmp], axis=1, join_axes=[df_ML.index])

In [57]:
df_ML.head()

Unnamed: 0,ABIRATERONE ACETATE,AFATINIB,ALENDRONATE SODIUM,ALTEPLASE,ALUMINUM HYDROXIDE MAGNES,AMLODIPINE BESYLATE AND A,ARIPIPRAZOLE,ATROPINE,ATROPINE SULFATE,BOSENTAN,...,Respiratory (inhalation),Subcutaneous,Subdermal,Sublingual,Sunconjunctival,Topical,Transdermal,Transmammary,Transplacental,Vaginal
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## 5. Save the dfs for Modeling and SQL

In [54]:
df_ML.shape

(69557, 1786)

In [55]:
# Add in the outcome variables
df_outcomes = df[['serious', 'seriousness_congential_anomali', 
         'seriousness_death', 'seriousness_disabling', 'seriousness_hospitalization', 
         'seriousness_lifethreatening', 'seriousness_other']]

df_ML = pd.concat([df_ML, df_outcomes], axis = 1, join_axes = [df_ML.index])

In [None]:
# Convert all columns to numeric
df_ML = df_ML.apply(pd.to_numeric)
df_ML['serious'] = df_ML['serious'] - 1

In [None]:
# Visualize the sparsity of the matrix
import matplotlib.pyplot as plt
fig, axs = plt.subplots(figsize = (8,8))
axs.spy(df_ML.head(500), markersize=1)
plt.show()

In [None]:
# Save as a csv
if save_df or save_all:
    df_ML_filename = './progress/modeling/df_ML_model_' + model_num + '.csv'
    df_ML.to_csv(df_ML_filename)

In [None]:
# save the entries
if save_entries or save_all:
    entries_filename = './progress/modeling/entries_' + model_num + '.txt'
    with open(entries_filename, "wb") as fp:   #Pickling
        pickle.dump(entries, fp)

In [None]:
# save the length of entries sections
if save_entries_len or save_all:
    entires_len_filename = "./progress/modeling/entries_len_" + model_num + ".txt"
    with open(entires_len_filename, "wb") as fp:   #Pickling
        pickle.dump(entries_len, fp)

In [None]:
# save the unique entries for generic_name
if save_unique_gn or save_all:
    with open("./progress/modeling/unique_generic_name.txt", "wb") as fp:
        pickle.dump(unique_entries_generic_name, fp)

In [None]:
# save the unique entries for drug_indication
if save_unique_di or save_all:
    with open("./progress/modeling/unique_drug_indication.txt", "wb") as fp:
        pickle.dump(unique_entries_drug_indication, fp)