In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import re
import os
from functools import reduce

## Load data

In [2]:
# Tab-separated FDA product table; `~` is expanded to the current user's home directory.
name_data = os.path.expanduser(
    '~/Dropbox/Workspace/projects/drug-name-generator/fda-drug-data/Products.txt'
)

In [3]:
# Read the tab-separated product table. `error_bad_lines` was deprecated in
# pandas 1.3 and removed in 2.0; `on_bad_lines='warn'` keeps the same behaviour:
# rows with the wrong number of fields are skipped and reported instead of
# aborting the load.
data = pd.read_csv(name_data, sep='\t', on_bad_lines='warn')

b'Skipping line 34802: expected 8 fields, saw 9\nSkipping line 34803: expected 8 fields, saw 9\nSkipping line 34804: expected 8 fields, saw 9\n'


In [4]:
# Preview the first rows to check that the columns parsed as expected.
data.head()

Unnamed: 0,ApplNo,ProductNo,Form,Strength,ReferenceDrug,DrugName,ActiveIngredient,ReferenceStandard
0,4,4,SOLUTION/DROPS;OPHTHALMIC,1%,0,PAREDRINE,HYDROXYAMPHETAMINE HYDROBROMIDE,0.0
1,159,1,TABLET;ORAL,500MG,0,SULFAPYRIDINE,SULFAPYRIDINE,0.0
2,552,1,INJECTABLE;INJECTION,"20,000 UNITS/ML",0,LIQUAEMIN SODIUM,HEPARIN SODIUM,0.0
3,552,2,INJECTABLE;INJECTION,"40,000 UNITS/ML",0,LIQUAEMIN SODIUM,HEPARIN SODIUM,0.0
4,552,3,INJECTABLE;INJECTION,"5,000 UNITS/ML",0,LIQUAEMIN SODIUM,HEPARIN SODIUM,0.0


In [5]:
# Pull out the two text columns the rest of the notebook works with.
names = data['DrugName']
ingredients = data['ActiveIngredient']

# Report how many distinct values each column holds.
print('There are {} unique names.'.format(len(set(names))))
print('There are {} unique ingredients.'.format(len(set(ingredients))))

There are 7198 unique names.
There are 2760 unique ingredients.


## Clean data for blacklisted words and regexes

### Blacklist: tokens, ingredients

Filter out packaging words and active-ingredient tokens, as well as tokens with idiosyncratic punctuation, numbers, and other non-letter characters.

In [6]:
# Words associated with packaging that should never count as part of a name.
blacklist_words = [
    'in', 'water', 'plastic', 'container', 'w/',
    'preservative', 'free', 'kit', 'and', 'xr',
]

# Lower-case every active-ingredient string and split it on single spaces.
tokenized_ingredients = [
    ingredient_name.lower().split(' ') for ingredient_name in ingredients
]

# Flatten into one sorted list of unique ingredient tokens.
ingredient_tokens = sorted({
    token for token_list in tokenized_ingredients for token in token_list
})

# Ingredient tokens (e.g. sodium, hydrobromide) are banned as well.
blacklist_words.extend(ingredient_tokens)

# Lookup table of banned tokens: every banned word is a key mapped to True.
blacklist = dict.fromkeys(blacklist_words, True)
def blacklisted_word(word):
    """Tell whether a token is banned by the word blacklist.

    The lookup is case-insensitive: ``word`` is lower-cased before it is
    checked against the keys of the module-level ``blacklist`` mapping
    (packaging words plus ingredient tokens).

    Returns True when the lower-cased word is a key of ``blacklist``, else False.
    """
    lowered = word.lower()
    return lowered in blacklist

In [7]:
# Build a list of unique (drug name, ingredient tokens) tuples, keeping the
# ingredient list of the first row seen for each drug name.
drug_list = []
seen = set()
# Pair rows by position: `names[i]` is a label lookup on the Series and only
# lines up with `tokenized_ingredients` while the index is the default 0..n-1.
for drug_name, ingredient_list in zip(names, tokenized_ingredients):
    if drug_name in seen:
        continue
    seen.add(drug_name)
    drug_list.append((drug_name, ingredient_list))

# Sort by drug name; names are unique here, so the comparison never falls
# through to the ingredient lists.
drug_list = sorted(drug_list)

In [8]:
# Peek at the first five (drug name, ingredient tokens) pairs.
drug_list[:5]

[('8-HOUR BAYER', ['aspirin']),
 ('8-MOP', ['methoxsalen']),
 ('A-HYDROCORT', ['hydrocortisone', 'sodium', 'succinate']),
 ('A-METHAPRED', ['methylprednisolone', 'sodium', 'succinate']),
 ('A-N STANNOUS AGGREGATED ALBUMIN',
  ['technetium', 'tc-99m', 'albumin', 'aggregated', 'kit'])]

### Blacklist: regexps for non-alphanumeric tokens

In [9]:
# Patterns are applied with re.match, i.e. anchored at the start of the token.
blacklist_regex = [
    r'.*\d+.*',  # contains at least one digit
    r'.*%+.*',  # contains a percent sign
    r'.*\/+.*', # contains a forward slash
    r'.*\-.*',  # contains a hyphen
    r'.*\W.*',  # contains any non-word character (anything but letters, digits, underscore)
    r'\b\w{1,3}\b',  # is, or starts with, a run of only 1-3 word characters
]
def blacklisted_regex(word):
    """Tell whether a token is rejected by any pattern in ``blacklist_regex``.

    Each pattern is tried with ``re.match`` against ``word``.

    Returns True as soon as one pattern matches, False when none does
    (including for the empty string).
    """
    return any(re.match(pattern, word) for pattern in blacklist_regex)

### Blacklist: regular English words and Scrabble words

In [10]:
# Third-party checkers used by blacklisted_scrabble.
import twl
import enchant

# en_US dictionary object; blacklisted_scrabble calls its check() method.
d = enchant.Dict('en_US')

In [11]:
def blacklisted_scrabble(word):
    """Tell whether a token is an ordinary word according to both checkers.

    ``word`` is lower-cased first. The result is truthy only when
    ``twl.check`` accepts it AND the ``d`` dictionary's ``check`` accepts it;
    the dictionary is not consulted when ``twl.check`` already rejects the word.
    """
    lowered = word.lower()
    in_word_list = twl.check(lowered)
    return in_word_list and d.check(lowered)

In [12]:
# Predicates consulted by check_blacklist; each takes a token and returns a
# truthy value when the token is banned.
blacklist_funcs = [blacklisted_word, blacklisted_regex, blacklisted_scrabble]

In [13]:
def check_blacklist(word):
    """Return True if word passes every blacklist check, i.e. it should be kept.

    Despite the name, this is a *keep* predicate (suitable for ``filter``):
    it is True only when none of the functions in ``blacklist_funcs`` flags
    the word, and False when at least one of them does.
    """
    # any() stops at the first blacklist that flags the word, so the remaining
    # checks are not run for words that are already rejected.
    return not any(func(word) for func in blacklist_funcs)

In [14]:
def filter_name(name):
    """Strip blacklisted words out of a (possibly multi-word) drug name.

    The name is split on single spaces, every token for which
    ``check_blacklist`` returns a falsy value is dropped, and the survivors
    are joined back together with single spaces.
    """
    kept = [token for token in name.split(' ') if check_blacklist(token)]
    return ' '.join(kept)

### Filter names

In [15]:
# Filter every drug name, break the surviving text into whitespace-separated
# tokens, drop duplicates, and sort the result.
clean_names = sorted({
    token
    for name, _ in drug_list
    for token in filter_name(name).split()
})

In [16]:
# Show the first ten cleaned names as a spot check.
clean_names[:10]

['ABBOJECT',
 'ABELCET',
 'ABILIFY',
 'ABITREXATE',
 'ABLAVAR',
 'ABLYSINOL',
 'ABRAXANE',
 'ABREVA',
 'ABSORBASE',
 'ABSORICA']

In [17]:
# Number of unique cleaned names.
len(clean_names)

3663

## Export names as readily-trainable data for an LSTM model

In [19]:
# Write one cleaned name per line, each terminated by a newline.
export_path = os.path.expanduser('~/Dropbox/Workspace/projects/drug-name-generator/fda-drug-names.txt')
with open(export_path, 'w') as out_file:
    out_file.writelines(name + '\n' for name in clean_names)