In [3]:
import numpy as np
import sklearn.cluster
from Levenshtein import distance
import pickle

In [5]:
# Read the pickled `generic_name_unique` object back from disk (pickle needs binary mode).
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open("./progress/wrangled/generic_name_unique.txt", mode="rb") as pickle_file:
    generic_name_unique = pickle.load(pickle_file)

In [6]:
# Echo the unpickled object so its contents can be inspected in the cell output.
generic_name_unique

['CHROMIUM PICOLINATE',
 'FOSAPREPITANT DIMEGLUMINE',
 'PALIPERIDONE PALMITATE',
 'AZATHIOPRINE SODIUM',
 'ST JOHNS WORT',
 'RITUXIMAB',
 'DARBEPOETIN ALFA',
 'BURN RELIEF',
 'ACETAMINOPHEN ASPIRIN (NSAID) AND CAFFEINE',
 'MELALEUCA',
 'BORAGE OIL',
 'ZICONOTIDE',
 'DIPHENHYDRAMINE CITRATE AND IBUPROFEN',
 'MAGNESIUM SULFATE',
 'ZINC',
 'CAIRINA MOSCHATA HEART/LIVER AUTOLYSATE',
 'NITROFURANTOIN MONOHYDRATE AND NITROFURANTOIN MACROCRYSTALLINE',
 'CHARCOAL ACTIVATED',
 'LIDOCAINE HYDROCHLORIDEEPINEPHRINE BITARTRATE',
 'FULVESTRANT',
 'CHORIONIC GONADOTROPIN',
 'BENZOYL PEROXIDE',
 'ATORVASTATIN CALCIUM FILM COATED',
 'ATAZANAVIR SULFATE',
 'QUINUPRISTIN AND DALFOPRISTIN',
 'GALANTAMINE HYDROBROMIDE',
 'DIGITALIS',
 'PROBENECID',
 'OXYTOCIN',
 'AMLODIPINE BESYLATE AND VALSARTAN',
 'VANDETANIB',
 'CORN AND CALLUS REMOVER KIT',
 'FLUVOXAMINE MALEATE',
 'FOXGLOVE',
 'COUGH SYRUP',
 'MOXIFLOXACIN',
 'CLOCORTOLONE PIVALATE',
 'ACONITUM NAP AESCULUS HIPP FLOS ARSENICUM ALB AVENA BELLADONNA CAM

In [51]:
# Approach adapted from:
# https://stats.stackexchange.com/questions/123060/clustering-a-long-list-of-strings-words-into-similarity-groups
char_lim = 25

words_og = generic_name_unique

# Keep only the first `char_lim` characters of every name (None keeps the names whole)
words = words_og if char_lim is None else [w[:char_lim] for w in words_og]

# Similarity matrix: negated pairwise Levenshtein distance (0 on the diagonal)
words = np.asarray(words)
lev_similarity = -1 * np.array([[distance(w1, w2) for w1 in words] for w2 in words])

# Cluster directly on the precomputed similarity matrix
affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.5)
affprop.fit(lev_similarity)

# For every clustered name, record the exemplar of the cluster it landed in.
# The two lists stay index-aligned: gen_name_list[i] belongs to cluster_list[i].
cluster_list = []
gen_name_list = []
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
    # Distinct (sorted) names that were assigned to this cluster
    cluster = np.unique(words[affprop.labels_ == cluster_id])
    cluster_str = ", ".join(cluster)
    print(" - *%s:* %s" % (exemplar, cluster_str))
    for name in cluster:
        gen_name_list.append(name)
        cluster_list.append(exemplar)

 - *MAGNESIUM SULFATE:*  LACHESIS  NAJA  NUX VOM, MAGNESIUM CITRATE, MAGNESIUM HYDROXIDE, MAGNESIUM OXIDE, MAGNESIUM SALICYLATE, MAGNESIUM SULFATE, MAGNESIUM SULFATE HEPTAHY, MAGNESIUM SULFATE UNSPECI, SELENIUM SULFIDE
 - *CHORIONIC GONADOTROPIN:* CHORIOGONADOTROPIN ALFA, CHORIONIC GONADOTROPIN, FIBRINOGEN HUMAN THROMBIN
 - *AMLODIPINE BESYLATE AND V:* AMLODIPINE BESYLATE, AMLODIPINE BESYLATE AND A, AMLODIPINE BESYLATE AND B, AMLODIPINE BESYLATE AND O, AMLODIPINE BESYLATE AND V
 - *PSEUDOEPHEDRINE HCL:* PHENYLEPHRINE HCL, PSEUDOEPHEDRINE, PSEUDOEPHEDRINE HCL, PSEUDOEPHEDRINE HYDROCHLO
 - *ONE STEP WART REMOVER:* ONE STEP CLEAR WART REMOV, ONE STEP CORN REMOVER, ONE STEP WART REMOVER, ONE STEP WART REMOVER CLE, ONE STEP WART REMOVER STR, ONE-STEP CORN REMOVER CLE, ONE-STEP WART REMOVER TAN
 - *BECLOMETHASONE DIPROPIONA:* BECLOMETHASONE DIPROPIONA, BETAMETHASONE DIPROPIONAT, DEXAMETHASONE SODIUM PHOS
 - *CAFFEINE MAGNESIUM SALICY:* CAFFEINE MAGNESIUM SALICY, CHOLINE MAGNESIUM TRISALI
 - 

In [52]:
# Echo the names collected by the clustering loop (truncated to `char_lim` characters when a limit is set).
gen_name_list

[' LACHESIS  NAJA  NUX VOM',
 'MAGNESIUM CITRATE',
 'MAGNESIUM HYDROXIDE',
 'MAGNESIUM OXIDE',
 'MAGNESIUM SALICYLATE',
 'MAGNESIUM SULFATE',
 'MAGNESIUM SULFATE HEPTAHY',
 'MAGNESIUM SULFATE UNSPECI',
 'SELENIUM SULFIDE',
 'CHORIOGONADOTROPIN ALFA',
 'CHORIONIC GONADOTROPIN',
 'FIBRINOGEN HUMAN THROMBIN',
 'AMLODIPINE BESYLATE',
 'AMLODIPINE BESYLATE AND A',
 'AMLODIPINE BESYLATE AND B',
 'AMLODIPINE BESYLATE AND O',
 'AMLODIPINE BESYLATE AND V',
 'PHENYLEPHRINE HCL',
 'PSEUDOEPHEDRINE',
 'PSEUDOEPHEDRINE HCL',
 'PSEUDOEPHEDRINE HYDROCHLO',
 'ONE STEP CLEAR WART REMOV',
 'ONE STEP CORN REMOVER',
 'ONE STEP WART REMOVER',
 'ONE STEP WART REMOVER CLE',
 'ONE STEP WART REMOVER STR',
 'ONE-STEP CORN REMOVER CLE',
 'ONE-STEP WART REMOVER TAN',
 'BECLOMETHASONE DIPROPIONA',
 'BETAMETHASONE DIPROPIONAT',
 'DEXAMETHASONE SODIUM PHOS',
 'CAFFEINE MAGNESIUM SALICY',
 'CHOLINE MAGNESIUM TRISALI',
 ' FLUOR',
 'ATENOLOL',
 'BENADRYL',
 'BREATHEZE',
 'CEFPROZIL',
 'DANAZOL',
 'ETHANOL',
 'FENTANYL'

In [53]:
# Echo the exemplar recorded for each entry of gen_name_list (same order, same length).
cluster_list

['MAGNESIUM SULFATE',
 'MAGNESIUM SULFATE',
 'MAGNESIUM SULFATE',
 'MAGNESIUM SULFATE',
 'MAGNESIUM SULFATE',
 'MAGNESIUM SULFATE',
 'MAGNESIUM SULFATE',
 'MAGNESIUM SULFATE',
 'MAGNESIUM SULFATE',
 'CHORIONIC GONADOTROPIN',
 'CHORIONIC GONADOTROPIN',
 'CHORIONIC GONADOTROPIN',
 'AMLODIPINE BESYLATE AND V',
 'AMLODIPINE BESYLATE AND V',
 'AMLODIPINE BESYLATE AND V',
 'AMLODIPINE BESYLATE AND V',
 'AMLODIPINE BESYLATE AND V',
 'PSEUDOEPHEDRINE HCL',
 'PSEUDOEPHEDRINE HCL',
 'PSEUDOEPHEDRINE HCL',
 'PSEUDOEPHEDRINE HCL',
 'ONE STEP WART REMOVER',
 'ONE STEP WART REMOVER',
 'ONE STEP WART REMOVER',
 'ONE STEP WART REMOVER',
 'ONE STEP WART REMOVER',
 'ONE STEP WART REMOVER',
 'ONE STEP WART REMOVER',
 'BECLOMETHASONE DIPROPIONA',
 'BECLOMETHASONE DIPROPIONA',
 'BECLOMETHASONE DIPROPIONA',
 'CAFFEINE MAGNESIUM SALICY',
 'CAFFEINE MAGNESIUM SALICY',
 'MENTHOL',
 'MENTHOL',
 'MENTHOL',
 'MENTHOL',
 'MENTHOL',
 'MENTHOL',
 'MENTHOL',
 'MENTHOL',
 'MENTHOL',
 'MENTHOL',
 'MENTHOL',
 'MENTHOL',

In [55]:
# Lookup table: clustered name -> exemplar of its cluster.
# If a name occurs more than once, the exemplar paired with its last occurrence is kept.
gen_name_dict = {name: exemplar for name, exemplar in zip(gen_name_list, cluster_list)}

In [60]:
# Translate every name to its cluster exemplar. The dictionary keys are the
# truncated names, so the same `char_lim` slice is applied before the lookup;
# names whose truncated form is not a key are skipped.
result = [
    gen_name_dict[key]
    for key in (w[:char_lim] for w in generic_name_unique)
    if key in gen_name_dict
]

In [None]:
# Reads the same pickle file as the load cell near the top of the notebook,
# rebinding `generic_name_unique` to a freshly unpickled object.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open("./progress/wrangled/generic_name_unique.txt", mode="rb") as pickle_file:
    generic_name_unique = pickle.load(pickle_file)

In [71]:
def entry_condenser(unique_col, char_lim=None):
    """
    Cluster similar strings and map each one to the exemplar of its cluster.

    Pairwise Levenshtein distances between the (optionally truncated) entries
    are negated to form a similarity matrix, which is clustered with
    scikit-learn's AffinityPropagation. Approach adapted from
    https://stats.stackexchange.com/questions/123060/clustering-a-long-list-of-strings-words-into-similarity-groups

    Parameters
    ----------
    unique_col : sequence of str
        Entries to condense.
    char_lim : int or None, optional
        When given, only the first `char_lim` characters of each entry are
        used for clustering. The keys and values of the returned dictionary
        are then the *truncated* strings, so callers must apply the same
        slice (`entry[:char_lim]`) before looking an entry up.

    Returns
    -------
    dict
        Maps each (truncated) entry to the (truncated) exemplar of its
        cluster. Entries that become identical after truncation share a
        single key. An empty input gives an empty dict and a single entry
        maps to itself. If the clustering does not converge, a warning is
        issued and every entry maps to itself (no condensing).
    """
    # Cut off all but the first set of characters
    if char_lim is not None:
        words = [w[:char_lim] for w in unique_col]
    else:
        words = unique_col
    words = np.asarray(words)

    # Fewer than two entries: there is nothing to cluster.
    n_words = len(words)
    if n_words == 0:
        return {}
    if n_words == 1:
        only_word = str(words[0])
        return {only_word: only_word}

    # Similarity matrix = negated Levenshtein distance. The distance is
    # symmetric and zero on the diagonal, so each pair is computed only once.
    lev_similarity = np.zeros((n_words, n_words), dtype=int)
    for i in range(n_words):
        for j in range(i + 1, n_words):
            pair_similarity = -distance(words[i], words[j])
            lev_similarity[i, j] = pair_similarity
            lev_similarity[j, i] = pair_similarity

    # Fit an affinity model
    affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.5)
    affprop.fit(lev_similarity)
    labels = affprop.labels_
    center_indices = affprop.cluster_centers_indices_

    # When the fit does not converge scikit-learn reports no cluster centers
    # and labels every sample -1; indexing the centers would then fail.
    if center_indices is None or len(center_indices) == 0:
        import warnings  # local import: only needed on this fallback path

        warnings.warn(
            "AffinityPropagation did not converge; returning an identity mapping.",
            RuntimeWarning,
        )
        return {str(word): str(word) for word in words}

    # Group the words with their respective clusters. Cluster ids are visited
    # in ascending order, so if duplicated entries ended up in different
    # clusters the highest-numbered cluster wins (same as the previous
    # dict(zip(...)) construction).
    gen_name_dict = {}
    for cluster_id in np.unique(labels):
        exemplar = str(words[center_indices[cluster_id]])
        for name in np.unique(words[labels == cluster_id]):
            gen_name_dict[str(name)] = exemplar

    return gen_name_dict