In [None]:
!pip install pandas
!pip install numpy
!pip install seaborn 
!pip install zipfile

In [3]:
import requests
import pandas as pd
import seaborn as sns 
import numpy as np
import zipfile as zp
import re

In [4]:
data = pd.read_csv('merged_data.csv')

In [5]:
data.columns

Index(['Unnamed: 0', 'ndc_description', 'ndc', 'nadac_per_unit',
       'effective_date', 'pricing_unit', 'pharmacy_type_indicator', 'otc',
       'explanation_code', 'classification_for_rate_setting',
       'corresponding_generic_drug_nadac_per_unit',
       'corresponding_generic_drug_effective_date', 'as_of_date', 'year',
       'package_size', 'product_name', 'units_reimbursed',
       'number_of_prescriptions', 'total_amount_reimbursed',
       'medicaid_amount_reimbursed', 'non_medicaid_amount_reimbursed',
       'matching_product_ndc', 'PRODUCTID', 'PRODUCTNDC', 'PRODUCTTYPENAME',
       'PROPRIETARYNAME', 'PROPRIETARYNAMESUFFIX', 'NONPROPRIETARYNAME',
       'DOSAGEFORMNAME', 'ROUTENAME', 'STARTMARKETINGDATE', 'ENDMARKETINGDATE',
       'MARKETINGCATEGORYNAME', 'APPLICATIONNUMBER', 'LABELERNAME',
       'SUBSTANCENAME', 'ACTIVE_NUMERATOR_STRENGTH', 'ACTIVE_INGRED_UNIT',
       'PHARM_CLASSES', 'DEASCHEDULE', 'NDC_EXCLUDE_FLAG',
       'LISTING_RECORD_CERTIFIED_THROUGH', 'PRODU

#### Removing columns with no predictive power due to lack of generalizability (unique IDs)

In [6]:
data = data.drop(columns=['Unnamed: 0', 'as_of_date', 'PRODUCTNDC', 'PRODUCTNDC_no_hyphens', 'PRODUCTNDC_zeros', 'matching_product_ndc', 'PRODUCTID'])

#### Removing columns which are overly sparse (less than 1000 non-null vals

In [7]:
data = data.drop(columns=['corresponding_generic_drug_nadac_per_unit', 'corresponding_generic_drug_effective_date', 'PROPRIETARYNAMESUFFIX', 'DEASCHEDULE'])

#### Check highly correlated columns and remove one to avoid overcorrelated columns

In [8]:
correlation = data['classification_for_rate_setting'].corr(data['MARKETINGCATEGORYNAME'])

TypeError: unsupported operand type(s) for /: 'str' and 'int'

#### Adding separate month, day, and, year columns so each is a specialized feature and dropping effective date

In [9]:
data['effective_date'] = pd.to_datetime(data['effective_date'], errors='coerce')
data['month'] = data['effective_date'].dt.month
data['day'] = data['effective_date'].dt.day

#pre merge, I changed some to 2020 since that was what they were in other dataset, so changing back
data.loc[data['effective_date'].dt.year == 2019, 'year'] = 2019
data = data.drop(columns=['effective_date'])

#### Dealing with categorical data

In [10]:
unique_vals = set(data['pharmacy_type_indicator'].values)
print(unique_vals)
# there is only one value in every row, this feature can be removed
data = data.drop(columns=['pharmacy_type_indicator'])

{'C/I'}


In [11]:
unique_vals = set(data['classification_for_rate_setting'].values)
print(unique_vals)
# there are four possibles values for every row, this feature can be one_hot_encoded
encoded_df = pd.get_dummies(data['classification_for_rate_setting'], prefix='classification_for_rate', drop_first = False)
data = pd.concat([data, encoded_df], axis=1)
data.drop('classification_for_rate_setting', axis=1, inplace=True)

{'G', 'B', 'B-ANDA', 'B-BIO'}


In [12]:
data

Unnamed: 0,ndc_description,ndc,nadac_per_unit,pricing_unit,otc,explanation_code,year,package_size,product_name,units_reimbursed,...,PHARM_CLASSES,NDC_EXCLUDE_FLAG,LISTING_RECORD_CERTIFIED_THROUGH,FeatureVec,month,day,classification_for_rate_B,classification_for_rate_B-ANDA,classification_for_rate_B-BIO,classification_for_rate_G
0,GEODON 80 MG CAPSULE,49035860,29.01797,EA,N,1,2024,60,geodon 80m,11012.0,...,"Amide Local Anesthetic [EPC], Amides [CS], Ant...",N,20241231.0,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,9,18,1,0,0,0
1,ZITHROMAX 250 MG Z-PAK TABLET,69306075,2.08064,EA,N,4,2022,75,zithromax,1022.0,...,"Centrally-mediated Muscle Relaxation [PE], Mus...",N,20251231.0,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,2,23,1,0,0,0
2,HYDROCHLOROTHIAZIDE 12.5 MG CP,378081001,0.03721,EA,N,1,2019,1,hydrochlor,211969.0,...,,N,20241231.0,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,12,18,0,0,0,1
3,HYDROCHLOROTHIAZIDE 12.5 MG CP,378081005,0.03721,EA,N,1,2019,5,hydrochlor,2393477.0,...,,N,20241231.0,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,12,18,0,0,0,1
4,CLOZAPINE 100 MG TABLET,378086001,0.57622,EA,N,1,2019,1,clozapine,2589373.0,...,"Anti-Inflammatory Agents, Non-Steroidal [CS], ...",N,20251231.0,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,12,18,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20006,FOSAPREPITANT 150 MG VIAL,83634077610,38.50800,EA,N,"1, 5",2024,10,fosaprepit,17.0,...,"Cytochrome P450 2C9 Inducers [MoA], Cytochrome...",N,20251231.0,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,8,21,0,0,0,1
20007,FLUPHENAZINE 2.5 MG TABLET,90096012201,2.30140,EA,N,4,2022,1,fluphenazi,2402.0,...,"Phenothiazine [EPC], Phenothiazines [CS]",N,20251231.0,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,4,13,0,0,0,1
20008,FLUPHENAZINE 5 MG TABLET,90096012301,2.98385,EA,N,1,2022,1,fluphenazi,585.0,...,"Phenothiazine [EPC], Phenothiazines [CS]",N,20251231.0,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,4,13,0,0,0,1
20009,FLUPHENAZINE 10 MG TABLET,90096012401,3.52974,EA,N,1,2022,1,fluphenazi,3350.0,...,"Phenothiazine [EPC], Phenothiazines [CS]",N,20251231.0,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,4,13,0,0,0,1


In [13]:
unique_vals = set(data['pricing_unit'].values)
print(unique_vals)
# there are three possibles values for every row, this feature can be one_hot_encoded
encoded_df = pd.get_dummies(data['pricing_unit'], prefix='pricing', drop_first = False)
data = pd.concat([data, encoded_df], axis=1)
data.drop('pricing_unit', axis=1, inplace=True) 

{'GM', 'ML', 'EA'}


In [14]:
data

Unnamed: 0,ndc_description,ndc,nadac_per_unit,otc,explanation_code,year,package_size,product_name,units_reimbursed,number_of_prescriptions,...,FeatureVec,month,day,classification_for_rate_B,classification_for_rate_B-ANDA,classification_for_rate_B-BIO,classification_for_rate_G,pricing_EA,pricing_GM,pricing_ML
0,GEODON 80 MG CAPSULE,49035860,29.01797,N,1,2024,60,geodon 80m,11012.0,180,...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,9,18,1,0,0,0,1,0,0
1,ZITHROMAX 250 MG Z-PAK TABLET,69306075,2.08064,N,4,2022,75,zithromax,1022.0,183,...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,2,23,1,0,0,0,1,0,0
2,HYDROCHLOROTHIAZIDE 12.5 MG CP,378081001,0.03721,N,1,2019,1,hydrochlor,211969.0,6167,...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,12,18,0,0,0,1,1,0,0
3,HYDROCHLOROTHIAZIDE 12.5 MG CP,378081005,0.03721,N,1,2019,5,hydrochlor,2393477.0,61521,...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,12,18,0,0,0,1,1,0,0
4,CLOZAPINE 100 MG TABLET,378086001,0.57622,N,1,2019,1,clozapine,2589373.0,36643,...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,12,18,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20006,FOSAPREPITANT 150 MG VIAL,83634077610,38.50800,N,"1, 5",2024,10,fosaprepit,17.0,17,...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,8,21,0,0,0,1,1,0,0
20007,FLUPHENAZINE 2.5 MG TABLET,90096012201,2.30140,N,4,2022,1,fluphenazi,2402.0,36,...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,4,13,0,0,0,1,1,0,0
20008,FLUPHENAZINE 5 MG TABLET,90096012301,2.98385,N,1,2022,1,fluphenazi,585.0,11,...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,4,13,0,0,0,1,1,0,0
20009,FLUPHENAZINE 10 MG TABLET,90096012401,3.52974,N,1,2022,1,fluphenazi,3350.0,56,...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...,4,13,0,0,0,1,1,0,0


In [15]:
unique_vals = set(data['MARKETINGCATEGORYNAME'].values)
print(unique_vals)
'''This has 10 values, if we one hot encode and add each as a new col,
this might overfit since we don't have that many rows to begin with, 
design decision needs to be made here: we can store one col with a
corresponding int, but this assumes some implicit relation between the 
categories, which might not be true, possible steps below:
- Set col to a target int in range 0-9, 
    - Issue: assuming implicit relation, linear model will treat dist as 
    having importance
- Continue one-hot encoding
    - Issue: might overfit since we add 10 cols to a dataset that alr does
    not have many cols
- We could learn embeddings for each of the categories with a neural net and store
these
    - I don't know any clear issues, but we would do this pre model
*** FOR NOW, I ONE HOT ENCODED '''
encoded_df = pd.get_dummies(data['MARKETINGCATEGORYNAME'], prefix='marketing_category', drop_first = False)
data = pd.concat([data, encoded_df], axis=1)
data.drop('MARKETINGCATEGORYNAME', axis=1, inplace=True) 


{'UNAPPROVED DRUG FOR USE IN DRUG SHORTAGE', 'OTC MONOGRAPH DRUG', 'BLA', 'OTC MONOGRAPH NOT FINAL', 'UNAPPROVED DRUG OTHER', 'ANDA', 'OTC MONOGRAPH FINAL', 'UNAPPROVED HOMEOPATHIC', 'NDA', 'NDA AUTHORIZED GENERIC'}


In [16]:
data

Unnamed: 0,ndc_description,ndc,nadac_per_unit,otc,explanation_code,year,package_size,product_name,units_reimbursed,number_of_prescriptions,...,marketing_category_ANDA,marketing_category_BLA,marketing_category_NDA,marketing_category_NDA AUTHORIZED GENERIC,marketing_category_OTC MONOGRAPH DRUG,marketing_category_OTC MONOGRAPH FINAL,marketing_category_OTC MONOGRAPH NOT FINAL,marketing_category_UNAPPROVED DRUG FOR USE IN DRUG SHORTAGE,marketing_category_UNAPPROVED DRUG OTHER,marketing_category_UNAPPROVED HOMEOPATHIC
0,GEODON 80 MG CAPSULE,49035860,29.01797,N,1,2024,60,geodon 80m,11012.0,180,...,0,0,0,0,0,0,1,0,0,0
1,ZITHROMAX 250 MG Z-PAK TABLET,69306075,2.08064,N,4,2022,75,zithromax,1022.0,183,...,1,0,0,0,0,0,0,0,0,0
2,HYDROCHLOROTHIAZIDE 12.5 MG CP,378081001,0.03721,N,1,2019,1,hydrochlor,211969.0,6167,...,0,0,0,0,1,0,0,0,0,0
3,HYDROCHLOROTHIAZIDE 12.5 MG CP,378081005,0.03721,N,1,2019,5,hydrochlor,2393477.0,61521,...,0,0,0,0,1,0,0,0,0,0
4,CLOZAPINE 100 MG TABLET,378086001,0.57622,N,1,2019,1,clozapine,2589373.0,36643,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20006,FOSAPREPITANT 150 MG VIAL,83634077610,38.50800,N,"1, 5",2024,10,fosaprepit,17.0,17,...,1,0,0,0,0,0,0,0,0,0
20007,FLUPHENAZINE 2.5 MG TABLET,90096012201,2.30140,N,4,2022,1,fluphenazi,2402.0,36,...,1,0,0,0,0,0,0,0,0,0
20008,FLUPHENAZINE 5 MG TABLET,90096012301,2.98385,N,1,2022,1,fluphenazi,585.0,11,...,1,0,0,0,0,0,0,0,0,0
20009,FLUPHENAZINE 10 MG TABLET,90096012401,3.52974,N,1,2022,1,fluphenazi,3350.0,56,...,1,0,0,0,0,0,0,0,0,0


In [17]:
unique_vals = set(data['DOSAGEFORMNAME'].values)
print(len(unique_vals))
print(unique_vals)
'''This has 97 values, same issue as above
- Set col to a target int in range 0-9, 
    - Issue: assuming implicit relation, linear model will treat dist as 
    having importance
- Continue one-hot encoding
    - Issue: might overfit since we add 10 cols to a dataset that alr does
    not have many cols
- We could learn embeddings for each of the categories with a neural net and store
these
    - I don't know any clear issues, but we would do this pre model
*** FOR NOW, I ONE HOT ENCODED '''
encoded_df = pd.get_dummies(data['DOSAGEFORMNAME'], prefix='dosage_form', drop_first = False)
data = pd.concat([data, encoded_df], axis=1)
data.drop('DOSAGEFORMNAME', axis=1, inplace=True)

97
{'PASTE, DENTIFRICE', 'INJECTION, POWDER, LYOPHILIZED, FOR SOLUTION', 'SHAMPOO', 'CAPSULE, GELATIN COATED', 'GRANULE, FOR SUSPENSION, EXTENDED RELEASE', 'TABLET, MULTILAYER, EXTENDED RELEASE', 'FOR SOLUTION', 'SUSPENSION', 'TABLET, SUGAR COATED', 'GEL, METERED', 'GRANULE, DELAYED RELEASE', 'SOLUTION', 'SOLUTION, GEL FORMING / DROPS', 'INJECTION, POWDER, FOR SOLUTION', 'JELLY', 'AEROSOL, FOAM', 'SUSPENSION/ DROPS', 'PASTILLE', 'SYSTEM', 'SPRAY', 'CAPSULE, DELAYED RELEASE', 'TABLET, CHEWABLE', 'INJECTION, EMULSION', 'FOR SUSPENSION', 'ELIXIR', 'SPRAY, METERED', 'AEROSOL, METERED', 'FILM, EXTENDED RELEASE', 'GEL, DENTIFRICE', 'CREAM', 'AEROSOL, SPRAY', 'TABLET, EFFERVESCENT', 'SWAB', 'LIQUID', 'POWDER', 'CREAM, AUGMENTED', 'INJECTION, SOLUTION', 'CAPSULE, COATED PELLETS', 'CLOTH', 'TABLET, FOR SUSPENSION', 'FILM, SOLUBLE', 'TABLET, ORALLY DISINTEGRATING, DELAYED RELEASE', 'POWDER, METERED', 'SUPPOSITORY', 'POWDER, FOR SUSPENSION', 'OIL', 'MOUTHWASH', 'GRANULE, FOR SUSPENSION', 'CONCENT

In [18]:
unique_vals = set(data['ROUTENAME'].values)
print(len(unique_vals))
print(unique_vals)
'''This has 97 values, same issue as above
- Set col to a target int in range 0-9, 
    - Issue: assuming implicit relation, linear model will treat dist as 
    having importance
- Continue one-hot encoding
    - Issue: might overfit since we add 10 cols to a dataset that alr does
    not have many cols
- We could learn embeddings for each of the categories with a neural net and store
these
    - I don't know any clear issues, but we would do this pre model
*** FOR NOW, I ONE HOT ENCODED '''
encoded_df = pd.get_dummies(data['ROUTENAME'], prefix='route', drop_first = False)
data = pd.concat([data, encoded_df], axis=1)
data.drop('ROUTENAME', axis=1, inplace=True)

50
{'INTRA-ARTICULAR; INTRALESIONAL; INTRAMUSCULAR; SOFT TISSUE', 'OPHTHALMIC', 'INTRAOCULAR', 'INTRAVENOUS; PARENTERAL', 'BUCCAL', 'INTRADERMAL', 'CUTANEOUS', 'INTRAVENOUS', 'INTRA-ARTICULAR; INTRALESIONAL; INTRAMUSCULAR; INTRAVENOUS; SOFT TISSUE', 'SUBCUTANEOUS', 'RECTAL', 'ORAL; TOPICAL', 'PARENTERAL', 'TRANSDERMAL', 'ORAL; RECTAL', 'INTRAVITREAL', 'INTRA-ARTERIAL; INTRAMUSCULAR; INTRATHECAL; INTRAVENOUS', 'INTRA-ARTICULAR; INTRAMUSCULAR; SOFT TISSUE', 'RECTAL; TOPICAL', 'INTRA-ARTERIAL; INTRAMUSCULAR', 'INTRA-ARTERIAL; INTRAMUSCULAR; INTRAVENOUS', 'INTRAVENOUS; SUBCUTANEOUS', 'OROPHARYNGEAL', 'RESPIRATORY (INHALATION)', 'TOPICAL', 'EPIDURAL; INFILTRATION; INTRACAUDAL; PERINEURAL', 'INTRA-ARTERIAL; INTRALESIONAL; INTRAMUSCULAR; SOFT TISSUE', nan, 'ORAL; ORAL', 'INTRAMUSCULAR; SUBCUTANEOUS', 'INTRA-ARTICULAR; INTRALESIONAL; INTRAMUSCULAR; INTRASYNOVIAL; SOFT TISSUE', 'BUCCAL; SUBLINGUAL', 'SUBLINGUAL', 'ORAL; SUBLINGUAL', 'PERINEURAL', 'ORAL', 'INTRABRONCHIAL', 'ORAL; RESPIRATORY (IN

In [19]:
unique_vals = set(data['PHARM_CLASSES'].values)
# print(len(unique_vals))
# print(unique_vals)
'''This data I think is formatted badly and has like 600 unique vals, so I made a new col for each bracket entry
so that the data is more standardized and easier to encode. EG: instead of there being a str:
Osmotic Laxative [EPC], Osmotic Activity [MoA], Increased Large Intestinal Motility [PE], there
will be a new column for EPC, MoA, and PE that contains this data'''

def extract_pharm_classes(row):
    if not isinstance(row, str):  
        return {}
    classes = re.findall(r'([^\[\],]+) \[(\w+)\]', row)
    class_dict = {}
    for value, key in classes:
        key = key.strip()  
        value = value.strip()  
        if key not in class_dict:
            class_dict[key] = []
        class_dict[key].append(value)
    return {k: ', '.join(v) for k, v in class_dict.items()}  

extracted_data = data['PHARM_CLASSES'].apply(extract_pharm_classes)
extracted_df = pd.json_normalize(extracted_data)
data = pd.concat([data, extracted_df], axis=1)
data.drop('PHARM_CLASSES', axis=1, inplace=True)

In [20]:
unique_epc = set(data['EPC'].values)
print(len(unique_epc))
unique_ext = set(data['EXT'].values)
print(len(unique_ext))
unique_moa = set(data['MoA'].values)
print(len(unique_moa))
unique_cs = set(data['CS'].values)
print(len(unique_cs))
unique_pe = set(data['PE'].values)
print(len(unique_pe))
# damn they are all still too large to encode, tough, will figure out later, could NN embed

457
4
261
151
96


In [21]:
data

Unnamed: 0,ndc_description,ndc,nadac_per_unit,otc,explanation_code,year,package_size,product_name,units_reimbursed,number_of_prescriptions,...,route_SUBCUTANEOUS,route_SUBLINGUAL,route_TOPICAL,route_TRANSDERMAL,route_VAGINAL,EPC,CS,PE,MoA,EXT
0,GEODON 80 MG CAPSULE,49035860,29.01797,N,1,2024,60,geodon 80m,11012.0,180,...,0,0,1,0,0,"Amide Local Anesthetic, Antiarrhythmic",Amides,Local Anesthesia,,
1,ZITHROMAX 250 MG Z-PAK TABLET,69306075,2.08064,N,4,2022,75,zithromax,1022.0,183,...,0,0,0,0,0,Muscle Relaxant,,Centrally-mediated Muscle Relaxation,,
2,HYDROCHLOROTHIAZIDE 12.5 MG CP,378081001,0.03721,N,1,2019,1,hydrochlor,211969.0,6167,...,0,0,0,0,0,,,,,
3,HYDROCHLOROTHIAZIDE 12.5 MG CP,378081005,0.03721,N,1,2019,5,hydrochlor,2393477.0,61521,...,0,0,0,0,0,,,,,
4,CLOZAPINE 100 MG TABLET,378086001,0.57622,N,1,2019,1,clozapine,2589373.0,36643,...,0,0,0,0,0,"Nonsteroidal Anti-inflammatory Drug, Platelet ...",Non-Steroidal,"Decreased Platelet Aggregation, Decreased Pros...",Cyclooxygenase Inhibitors,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20006,FOSAPREPITANT 150 MG VIAL,83634077610,38.50800,N,"1, 5",2024,10,fosaprepit,17.0,17,...,0,0,0,0,0,Substance P/Neurokinin-1 Receptor Antagonist,,,"Cytochrome P450 2C9 Inducers, Cytochrome P450 ...",
20007,FLUPHENAZINE 2.5 MG TABLET,90096012201,2.30140,N,4,2022,1,fluphenazi,2402.0,36,...,0,0,0,0,0,Phenothiazine,Phenothiazines,,,
20008,FLUPHENAZINE 5 MG TABLET,90096012301,2.98385,N,1,2022,1,fluphenazi,585.0,11,...,0,0,0,0,0,Phenothiazine,Phenothiazines,,,
20009,FLUPHENAZINE 10 MG TABLET,90096012401,3.52974,N,1,2022,1,fluphenazi,3350.0,56,...,0,0,0,0,0,Phenothiazine,Phenothiazines,,,


In [22]:
unique_vals = set(data['otc'].values)
data['otc'] = data['otc'].map({'Y': 1, 'N': 0})

In [23]:
data.drop('NDC_EXCLUDE_FLAG', axis=1, inplace=True)

In [24]:
data

Unnamed: 0,ndc_description,ndc,nadac_per_unit,otc,explanation_code,year,package_size,product_name,units_reimbursed,number_of_prescriptions,...,route_SUBCUTANEOUS,route_SUBLINGUAL,route_TOPICAL,route_TRANSDERMAL,route_VAGINAL,EPC,CS,PE,MoA,EXT
0,GEODON 80 MG CAPSULE,49035860,29.01797,0,1,2024,60,geodon 80m,11012.0,180,...,0,0,1,0,0,"Amide Local Anesthetic, Antiarrhythmic",Amides,Local Anesthesia,,
1,ZITHROMAX 250 MG Z-PAK TABLET,69306075,2.08064,0,4,2022,75,zithromax,1022.0,183,...,0,0,0,0,0,Muscle Relaxant,,Centrally-mediated Muscle Relaxation,,
2,HYDROCHLOROTHIAZIDE 12.5 MG CP,378081001,0.03721,0,1,2019,1,hydrochlor,211969.0,6167,...,0,0,0,0,0,,,,,
3,HYDROCHLOROTHIAZIDE 12.5 MG CP,378081005,0.03721,0,1,2019,5,hydrochlor,2393477.0,61521,...,0,0,0,0,0,,,,,
4,CLOZAPINE 100 MG TABLET,378086001,0.57622,0,1,2019,1,clozapine,2589373.0,36643,...,0,0,0,0,0,"Nonsteroidal Anti-inflammatory Drug, Platelet ...",Non-Steroidal,"Decreased Platelet Aggregation, Decreased Pros...",Cyclooxygenase Inhibitors,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20006,FOSAPREPITANT 150 MG VIAL,83634077610,38.50800,0,"1, 5",2024,10,fosaprepit,17.0,17,...,0,0,0,0,0,Substance P/Neurokinin-1 Receptor Antagonist,,,"Cytochrome P450 2C9 Inducers, Cytochrome P450 ...",
20007,FLUPHENAZINE 2.5 MG TABLET,90096012201,2.30140,0,4,2022,1,fluphenazi,2402.0,36,...,0,0,0,0,0,Phenothiazine,Phenothiazines,,,
20008,FLUPHENAZINE 5 MG TABLET,90096012301,2.98385,0,1,2022,1,fluphenazi,585.0,11,...,0,0,0,0,0,Phenothiazine,Phenothiazines,,,
20009,FLUPHENAZINE 10 MG TABLET,90096012401,3.52974,0,1,2022,1,fluphenazi,3350.0,56,...,0,0,0,0,0,Phenothiazine,Phenothiazines,,,


In [25]:
data.to_csv('updated.csv')

## Creating Embeddings for Pharma Classes and NDC Descriptions using Sentence Transformers

In [28]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Obtaining dependency information for sentence-transformers from https://files.pythonhosted.org/packages/8b/c8/990e22a465e4771338da434d799578865d6d7ef1fdb50bd844b7ecdcfa19/sentence_transformers-3.3.1-py3-none-any.whl.metadata
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Obtaining dependency information for transformers<5.0.0,>=4.41.0 from https://files.pythonhosted.org/packages/51/51/b87caa939fedf307496e4dbf412f4b909af3d9ca8b189fc3b65c1faa456f/transformers-4.46.3-py3-none-any.whl.metadata
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Obtaining dependency information for huggingface-hub>=0.20.0 from https://files.pythonhosted.org/packages/95/9b/3068f

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2') 

In [47]:
def compute_embedding(cat_list):
    if isinstance(cat_list, list):
        embeddings = [model.encode(entry) for entry in cat_list]
        return np.mean(embeddings, axis=0)
    else:
        return None

In [53]:
def compute_ndc_embedding(description):
    return model.encode(description)

In [48]:
def parse_entry(entry):
    if isinstance(entry, str):
        return [item.strip() for item in entry.split(',')]
    elif isinstance(entry, list):
        return entry
    else:
        return None  

In [49]:
classes = ['EPC', 'CS', 'PE', 'MoA', 'EXT']

In [37]:
for entry in classes:
    data[entry] = data[entry].apply(parse_entry)

In [50]:
from tqdm.notebook import tqdm
tqdm.pandas()

for entry in classes:
    data[entry + 'embed'] = data[entry].progress_apply(compute_embedding)

  0%|          | 0/20011 [00:00<?, ?it/s]

  0%|          | 0/20011 [00:00<?, ?it/s]

  0%|          | 0/20011 [00:00<?, ?it/s]

  0%|          | 0/20011 [00:00<?, ?it/s]

  0%|          | 0/20011 [00:00<?, ?it/s]

In [54]:
data['ndc_desc_emb'] = data['ndc_description'].progress_apply(compute_ndc_embedding)

  0%|          | 0/20011 [00:00<?, ?it/s]

#### Note that these embeddings produced are dimension 384, if we try to run linear regression they will dominate the other columns and features. We can use PCA to reduce the dimensionality of these embeddings as one option or use more complex models and skip linear regression.

## Making a copy of data to drop extraneous categorical rows

In [91]:
data_quant = data.copy()

In [92]:
data_quant.drop(columns=['ndc', 'ndc_description', 'EPC', 'CS', 'PE', 'MoA', 'EXT', 'product_name'], inplace=True)

In [93]:
data_quant.to_csv('feature_data.csv')

KeyboardInterrupt: 

In [94]:
print(data_quant.columns)

Index(['nadac_per_unit', 'otc', 'explanation_code', 'year', 'package_size',
       'units_reimbursed', 'number_of_prescriptions',
       'total_amount_reimbursed', 'medicaid_amount_reimbursed',
       'non_medicaid_amount_reimbursed',
       ...
       'route_SUBLINGUAL', 'route_TOPICAL', 'route_TRANSDERMAL',
       'route_VAGINAL', 'EPCembed', 'CSembed', 'PEembed', 'MoAembed',
       'EXTembed', 'ndc_desc_emb'],
      dtype='object', length=193)


In [70]:
data_quant

Unnamed: 0,nadac_per_unit,otc,explanation_code,year,package_size,units_reimbursed,number_of_prescriptions,total_amount_reimbursed,medicaid_amount_reimbursed,non_medicaid_amount_reimbursed,...,route_SUBLINGUAL,route_TOPICAL,route_TRANSDERMAL,route_VAGINAL,EPCembed,CSembed,PEembed,MoAembed,EXTembed,ndc_desc_emb
0,29.01797,0,1,2024,60,11012.0,180,312410.53,290516.00,21894.53,...,0,1,0,0,"[-0.03303871, 0.00667294, -0.021961331, 0.0228...","[-0.043458767, -0.012143133, -0.028138781, 0.0...","[0.02637117, 0.021051187, -0.00019766699, 0.03...",,,"[-0.06438778, 0.024081187, 0.025814006, 0.0019..."
1,2.08064,0,4,2022,75,1022.0,183,3242.93,3226.72,16.21,...,0,0,0,0,"[-0.05030161, 0.030173745, -0.039565608, 0.072...",,"[0.029134838, -0.08867282, 0.068927705, 0.0445...",,,"[-0.0797675, 0.08760375, -0.06344953, -0.04261..."
2,0.03721,0,1,2019,1,211969.0,6167,22643.53,21684.04,959.49,...,0,0,0,0,,,,,,"[-0.04260633, 0.08583381, -0.093360834, -0.009..."
3,0.03721,0,1,2019,5,2393477.0,61521,285542.60,278197.21,7345.39,...,0,0,0,0,,,,,,"[-0.04260633, 0.08583381, -0.093360834, -0.009..."
4,0.57622,0,1,2019,1,2589373.0,36643,1797460.34,1747856.31,49604.03,...,0,0,0,0,"[-0.009911468, 0.0055803265, -0.020000152, 0.0...","[0.013976374, 0.0225846, -0.069378324, -0.0085...","[-0.01796911, 0.010103727, -0.027078105, -0.00...","[-0.07781293, 0.04164032, -0.10795523, -0.0139...",,"[0.03504522, 0.015969142, -0.05325891, -0.1042..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20006,38.50800,0,"1, 5",2024,10,17.0,17,335.72,335.72,0.00,...,0,0,0,0,"[-0.058145504, -0.061000016, -0.05172419, 0.02...",,,"[-0.09779025, -0.032893844, -0.01081433, -0.03...",,"[0.059947435, 0.06513382, -0.06388991, -0.0293..."
20007,2.30140,0,4,2022,1,2402.0,36,4476.00,4475.00,1.00,...,0,0,0,0,"[-0.052654374, 0.03590369, -0.029726284, 0.041...","[-0.0530932, 0.020536307, -0.04406372, 0.04932...",,,,"[0.073880926, 0.010453613, -0.06618901, -0.042..."
20008,2.98385,0,1,2022,1,585.0,11,1158.57,1158.57,0.00,...,0,0,0,0,"[-0.052654374, 0.03590369, -0.029726284, 0.041...","[-0.0530932, 0.020536307, -0.04406372, 0.04932...",,,,"[0.05517018, 0.01762526, -0.060653675, -0.0488..."
20009,3.52974,0,1,2022,1,3350.0,56,7713.98,7484.28,229.70,...,0,0,0,0,"[-0.052654374, 0.03590369, -0.029726284, 0.041...","[-0.0530932, 0.020536307, -0.04406372, 0.04932...",,,,"[0.042986672, 0.029119378, -0.09341928, -0.065..."


In [95]:
object_columns = data_quant.select_dtypes(include=['object']).columns
print(object_columns)

Index(['explanation_code', 'PRODUCTTYPENAME', 'PROPRIETARYNAME',
       'NONPROPRIETARYNAME', 'APPLICATIONNUMBER', 'LABELERNAME',
       'SUBSTANCENAME', 'ACTIVE_NUMERATOR_STRENGTH', 'ACTIVE_INGRED_UNIT',
       'FeatureVec', 'EPCembed', 'CSembed', 'PEembed', 'MoAembed', 'EXTembed',
       'ndc_desc_emb'],
      dtype='object')


In [78]:
for col in object_columns:
    print(f"Column: {col}")
    print(data[col].head())  
    print()

Column: explanation_code
0    1
1    4
2    1
3    1
4    1
Name: explanation_code, dtype: object

Column: PRODUCTTYPENAME
0             HUMAN OTC DRUG
1    HUMAN PRESCRIPTION DRUG
2             HUMAN OTC DRUG
3             HUMAN OTC DRUG
4             HUMAN OTC DRUG
Name: PRODUCTTYPENAME, dtype: object

Column: PROPRIETARYNAME
0    Equate Pain Relieving Cream Lidocaine
1            Cyclobenzaprine Hydrochloride
2                  HEB Lubricant Eye Drops
3                  HEB Lubricant Eye Drops
4                            Aspirin 81 mg
Name: PROPRIETARYNAME, dtype: object

Column: NONPROPRIETARYNAME
0                        Lidocaine
1    Cyclobenzaprine Hydrochloride
2    carboxymethylcellulose sodium
3    carboxymethylcellulose sodium
4                          Aspirin
Name: NONPROPRIETARYNAME, dtype: object

Column: APPLICATIONNUMBER
0       part348
1    ANDA078722
2          M018
3          M018
4          M013
Name: APPLICATIONNUMBER, dtype: object

Column: LABELERNAME
0    Wal

## Removing columns that have been deemed irrelevant, unneccessary, and still categorical: explanation_code, PRODUCTTYPENAME, PROPRIETARYNAME, NONPROPRIETARYNAME, APPLICATIONNUMBER, LABELERNAME, SUBSTANCENAME, ACTIVE_INGRED_UNIT

In [96]:
data_quant.drop(columns=['explanation_code', 'PRODUCTTYPENAME', 'PROPRIETARYNAME', 'NONPROPRIETARYNAME', 'APPLICATIONNUMBER', 'LABELERNAME', 'SUBSTANCENAME', 'ACTIVE_INGRED_UNIT'], inplace=True)

In [97]:
object_columns = data_quant.select_dtypes(include=['object']).columns
print(object_columns)

Index(['ACTIVE_NUMERATOR_STRENGTH', 'FeatureVec', 'EPCembed', 'CSembed',
       'PEembed', 'MoAembed', 'EXTembed', 'ndc_desc_emb'],
      dtype='object')


In [100]:
data_quant['ACTIVE_NUMERATOR_STRENGTH'] = data_quant['ACTIVE_NUMERATOR_STRENGTH'].apply(
    lambda x: np.array([x]) if isinstance(x, (int, float)) else np.array(x) if isinstance(x, list) else np.array([])
)

for entry in classes:
    data_quant[entry + 'embed'] = data_quant[entry + 'embed'].apply(lambda x: np.array(x) if isinstance(x, list) else np.array([]))

In [110]:
data_quant['FeatureVec'].iloc[0]

'[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n 0 0 0 0 0 0 0 0 0 0 0 0 0]'

In [114]:
def parse_feature(value):
    if value is None or value.strip() == "":
        return np.array([])  
    value = value.strip("[]")  
    if value.strip():  
        return np.array(list(map(float, value.split())), dtype=float)  # Convert to array of floats
    return np.array([]) 

data_quant['FeatureVec'] = data_quant['FeatureVec'].apply(parse_feature)


In [115]:
data_quant['FeatureVec'].apply(type).value_counts()

<class 'numpy.ndarray'>    20011
Name: FeatureVec, dtype: int64