## Load Dataset

In [9]:
import pandas as pd
import numpy as np
import janitor
import seaborn as sns
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
import os

  from imghdr import tests


In [10]:
import warnings

# Suppress warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)
warnings.simplefilter(action="ignore", category=pd.errors.SettingWithCopyWarning)
warnings.simplefilter(action="ignore", category=DeprecationWarning)

In [None]:
# Root of the project checkout. Set the AMP_REPO_PATH environment variable to
# run the notebook from another machine; the fallback is the path the notebook
# was originally written against, so existing behaviour is unchanged.
REPO_PATH = os.environ.get(
    "AMP_REPO_PATH",
    r"C:\Users\shiri\OneDrive\Documents\Python\ml-projects\data-606\pharmaceutical-drug-price-prediction",
)
RAW_DATA_DIR = os.path.join(REPO_PATH, "data", "raw")


def load_amp_year(year, data_dir=RAW_DATA_DIR):
    """Read one yearly AMP extract into a DataFrame.

    Parameters
    ----------
    year : int
        Year of the extract; the file read is ``AMP_<year>.csv``.
    data_dir : str, optional
        Directory containing the raw CSV files. Defaults to ``RAW_DATA_DIR``.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(os.path.join(data_dir, f"AMP_{year}.csv"))


amp_2018 = load_amp_year(2018)
amp_2019 = load_amp_year(2019)
amp_2020 = load_amp_year(2020)
amp_2021 = load_amp_year(2021)
amp_2022 = load_amp_year(2022)

**Average Manufacturer Price (AMP)** datasets from 2018 to 2022 are used to support analysis related to *ACA Full Medicaid Federal Upper Limits (FUL)*. These datasets help in evaluating reimbursement ceilings for multi-source drugs under Medicaid and enable tracking of pricing trends across multiple years.

## Basic Exploration

In [23]:
# Print the schema summary (dtypes, non-null counts, memory use) of each yearly frame
for yearly_frame in (amp_2018, amp_2019, amp_2020, amp_2021, amp_2022):
    yearly_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207663 entries, 0 to 207662
Data columns (total 12 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Product Group             207663 non-null  int64  
 1   Ingredient                207663 non-null  object 
 2   Strength                  207663 non-null  object 
 3   Dosage                    207663 non-null  object 
 4   Route                     207663 non-null  object 
 5   MDR Unit Type             207663 non-null  object 
 6   Weighted Average of AMPs  207663 non-null  float64
 7   ACA FUL                   207663 non-null  float64
 8   Package Size              207663 non-null  float64
 9   NDC                       207663 non-null  object 
 10  Year                      207663 non-null  int64  
 11  Month                     207663 non-null  int64  
dtypes: float64(3), int64(3), object(6)
memory usage: 19.0+ MB
<class 'pandas.core.frame.DataFrame'>
Rang

In [24]:
# Report the (rows, columns) shape of every yearly extract
for year, yearly_frame in zip(
    range(2018, 2023),
    (amp_2018, amp_2019, amp_2020, amp_2021, amp_2022),
):
    print(f"AMP {year} Shape:", yearly_frame.shape)

AMP 2018 Shape: (207663, 12)
AMP 2019 Shape: (207359, 12)
AMP 2020 Shape: (207940, 12)
AMP 2021 Shape: (216515, 12)
AMP 2022 Shape: (218080, 12)


In [25]:
# Show the column index of each yearly AMP frame, in chronological order
for yearly_frame in (amp_2018, amp_2019, amp_2020, amp_2021, amp_2022):
    print(yearly_frame.columns)


Index(['Product Group', 'Ingredient', 'Strength', 'Dosage', 'Route',
       'MDR Unit Type', 'Weighted Average of AMPs', 'ACA FUL', 'Package Size',
       'NDC', 'Year', 'Month'],
      dtype='object')
Index(['Product Group', 'Ingredient', 'Strength', 'Dosage', 'Route',
       'MDR Unit Type', 'Weighted Average of AMPs', 'ACA FUL', 'Package Size',
       'NDC', 'Year', 'Month'],
      dtype='object')
Index(['Product Group', 'Ingredient', 'Strength', 'Dosage', 'Route',
       'MDR Unit Type', 'Weighted Average of AMPs', 'ACA FUL', 'Package Size',
       'NDC', 'Year', 'Month'],
      dtype='object')
Index(['Product Group', 'Ingredient', 'Strength', 'Dosage', 'Route',
       'MDR Unit Type', 'Weighted Average of AMPs', 'ACA FUL', 'Package Size',
       'NDC', 'Year', 'Month'],
      dtype='object')
Index(['Product Group', 'Ingredient', 'Strength', 'Dosage', 'Route',
       'MDR Unit Type', 'Weighted Average of AMPs', 'ACA FUL', 'Package Size',
       'NDC', 'Year', 'Month'],
      dtype=

In [None]:
# For every yearly AMP frame, print how many distinct values each column holds
for yearly_frame in (amp_2018, amp_2019, amp_2020, amp_2021, amp_2022):
    print(yearly_frame.nunique())

Product Group                1366
Ingredient                    503
Strength                      418
Dosage                         77
Route                          14
MDR Unit Type                   7
Weighted Average of AMPs    14681
ACA FUL                     14103
Package Size                  105
NDC                         53362
Year                            1
Month                          12
dtype: int64
Product Group                1408
Ingredient                    516
Strength                      425
Dosage                         76
Route                          14
MDR Unit Type                   7
Weighted Average of AMPs    13792
ACA FUL                     13229
Package Size                  103
NDC                         37190
Year                            1
Month                          12
dtype: int64
Product Group                1498
Ingredient                    533
Strength                      446
Dosage                         82
Route                 

Over the years from **2018 to 2022**, the number of unique values increases across most columns, showing that the variety of drug products has grown. The number of unique drug **Ingredients** went up from **503 to 551**, which suggests new products or formulations. There are also gradual increases in the number of unique values for **Strength**, **Dosage**, and **Package Size**, indicating more options and variety. However, the **MDR Unit Type** stayed relatively stable, with only a small change in its values.

In [29]:
# Pair each yearly frame with its display label as (frame, label) tuples
amp_datasets = list(
    zip(
        (amp_2018, amp_2019, amp_2020, amp_2021, amp_2022),
        ("AMP 2018", "AMP 2019", "AMP 2020", "AMP 2021", "AMP 2022"),
    )
)

## Data Cleaning

### Missing values

In [30]:
def summarize_missing_data(df):
    """Print a per-column table of missing-value counts for ``df``.

    The table has one row per column of ``df`` and three columns:

    * ``Column Name`` - the column label.
    * ``Null or NaN Values`` - number of cells for which ``isnull()`` is true.
    * ``Blank Spaces`` - number of string cells that are empty or contain
      only whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        The table is written to standard output.
    """

    def _count_blank_strings(column):
        # Numeric columns cannot hold blank strings; skipping them avoids a
        # needless element-wise pass over large frames.
        if pd.api.types.is_numeric_dtype(column):
            return 0
        # Comparing against "" alone misses whitespace-only cells such as " ",
        # so strip each string before testing it.
        is_blank = column.map(
            lambda value: isinstance(value, str) and value.strip() == ""
        )
        return int(is_blank.sum())

    # df.items() yields columns in the same order as df.columns.
    blank_counts = [_count_blank_strings(column) for _, column in df.items()]

    # Create and print a summary dataframe
    print(
        pd.DataFrame(
            {
                "Column Name": df.columns,
                "Null or NaN Values": df.isnull().sum(),
                "Blank Spaces": blank_counts,
            }
        ).to_string(index=False)
    )


# Emit one missing-data table per yearly dataset, each under its own heading
for df, name in amp_datasets:
    heading = f"\n{name} Summary:"
    print(heading)
    summarize_missing_data(df)


AMP 2018 Summary:
             Column Name  Null or NaN Values  Blank Spaces
           Product Group                   0             0
              Ingredient                   0             0
                Strength                   0             0
                  Dosage                   0             0
                   Route                   0             0
           MDR Unit Type                   0             0
Weighted Average of AMPs                   0             0
                 ACA FUL                   0             0
            Package Size                   0             0
                     NDC                   0             0
                    Year                   0             0
                   Month                   0             0

AMP 2019 Summary:
             Column Name  Null or NaN Values  Blank Spaces
           Product Group                   0             0
              Ingredient                   0             0
                St

No missing values were found in the given datasets.

### Data Types

In [31]:
# Print the dtype of every column for each yearly frame, separated by a blank line
for yearly_frame in (amp_2018, amp_2019, amp_2020, amp_2021, amp_2022):
    print(yearly_frame.dtypes, "\n")

Product Group                 int64
Ingredient                   object
Strength                     object
Dosage                       object
Route                        object
MDR Unit Type                object
Weighted Average of AMPs    float64
ACA FUL                     float64
Package Size                float64
NDC                          object
Year                          int64
Month                         int64
dtype: object 

Product Group                 int64
Ingredient                   object
Strength                     object
Dosage                       object
Route                        object
MDR Unit Type                object
Weighted Average of AMPs    float64
ACA FUL                     float64
Package Size                float64
NDC                          object
Year                          int64
Month                         int64
dtype: object 

Product Group                 int64
Ingredient                   object
Strength                     obj

### Duplicate Rows

In [32]:
# Count fully repeated rows in each yearly frame
for year, yearly_frame in zip(
    range(2018, 2023),
    (amp_2018, amp_2019, amp_2020, amp_2021, amp_2022),
):
    repeated_rows = yearly_frame.duplicated().sum()
    print(f"Number of duplicate rows in amp_{year}: {repeated_rows}")

Number of duplicate rows in amp_2018: 0
Number of duplicate rows in amp_2019: 0
Number of duplicate rows in amp_2020: 0
Number of duplicate rows in amp_2021: 0
Number of duplicate rows in amp_2022: 4


In [33]:
# duplicated() with default arguments compares every column and marks only the
# second and later occurrences of a repeated row; the first occurrence of each
# repeated row is not part of this selection.
is_repeat_row = amp_2022.duplicated()
duplicate_rows_amp_2022 = amp_2022.loc[is_repeat_row]

# Heading for the table rendered as the cell output
print("Duplicate rows in amp_2022:")
duplicate_rows_amp_2022


Duplicate rows in amp_2022:


Unnamed: 0,Product Group,Ingredient,Strength,Dosage,Route,MDR Unit Type,Weighted Average of AMPs,ACA FUL,Package Size,NDC,Year,Month
46190,2869,CARBAMAZEPINE,400MG,"TABLET, EXTENDED RELEASE",ORAL,TAB,1.375235,2.406661,100.0,71930-0074-12,2022,3
47556,3262,CARBAMAZEPINE,100 MG,TAB ER 12H,ORAL,TAB,0.408808,0.715414,100.0,71930-0072-12,2022,3
47567,3263,CARBAMAZEPINE,200 MG,TAB ER 12H,ORAL,TAB,0.672132,1.176231,100.0,71930-0073-12,2022,3
53094,9154,SULFAMETHOXAZOLE/TRIMETHOPRIM,200-40MG/5,ORAL SUSP,ORAL,ML,0.025975,0.05588,473.0,62559-0550-16,2022,3


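`DataFrame.duplicated()` with its default arguments compares **all** columns, including `NDC` (National Drug Code), and flags only the second and later occurrences of a repeated row. The four rows shown above are therefore exact copies of earlier rows in `amp_2022`: each one matches an earlier row on every column, `NDC`, `Year`, and `Month` included. The fact that the four rows differ from *each other* in `NDC` does not make them distinct records. They are not removed in the steps shown in this notebook, so they remain in the merged dataset; consider `drop_duplicates()` if exact repeats are not intended.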

### Rename columns

In [34]:
# Rebind every yearly frame to its clean_names() result so the normalised
# column labels are used from here on
amp_2018, amp_2019, amp_2020, amp_2021, amp_2022 = (
    yearly_frame.clean_names()
    for yearly_frame in (amp_2018, amp_2019, amp_2020, amp_2021, amp_2022)
)

# Print the resulting column index of each frame to confirm the renaming
renamed_frames = [amp_2018, amp_2019, amp_2020, amp_2021, amp_2022]
frame_labels = ["AMP 2018", "AMP 2019", "AMP 2020", "AMP 2021", "AMP 2022"]
for name, df in zip(frame_labels, renamed_frames):
    print(f"\n{name} Columns:\n", df.columns)



AMP 2018 Columns:
 Index(['product_group', 'ingredient', 'strength', 'dosage', 'route',
       'mdr_unit_type', 'weighted_average_of_amps', 'aca_ful', 'package_size',
       'ndc', 'year', 'month'],
      dtype='object')

AMP 2019 Columns:
 Index(['product_group', 'ingredient', 'strength', 'dosage', 'route',
       'mdr_unit_type', 'weighted_average_of_amps', 'aca_ful', 'package_size',
       'ndc', 'year', 'month'],
      dtype='object')

AMP 2020 Columns:
 Index(['product_group', 'ingredient', 'strength', 'dosage', 'route',
       'mdr_unit_type', 'weighted_average_of_amps', 'aca_ful', 'package_size',
       'ndc', 'year', 'month'],
      dtype='object')

AMP 2021 Columns:
 Index(['product_group', 'ingredient', 'strength', 'dosage', 'route',
       'mdr_unit_type', 'weighted_average_of_amps', 'aca_ful', 'package_size',
       'ndc', 'year', 'month'],
      dtype='object')

AMP 2022 Columns:
 Index(['product_group', 'ingredient', 'strength', 'dosage', 'route',
       'mdr_unit_type',

### Removing inconsistencies

In [35]:
def lowercase_strings(frame):
    """Return a copy of ``frame`` with every ``str`` cell lower-cased.

    Cells that are not ``str`` instances (numbers, NaN, ...) are returned
    unchanged.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to transform. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same shape and labels as ``frame``.
    """
    # DataFrame.applymap is deprecated in favour of DataFrame.map; use the
    # newer method when the installed pandas provides it and fall back to
    # applymap otherwise. The check is made on the class so a column that
    # happens to be called "map" cannot shadow the method.
    method_name = "map" if hasattr(pd.DataFrame, "map") else "applymap"
    elementwise = getattr(frame, method_name)
    return elementwise(
        lambda value: value.lower() if isinstance(value, str) else value
    )


# Convert all string values to lowercase
amp_2018 = lowercase_strings(amp_2018)
amp_2019 = lowercase_strings(amp_2019)
amp_2020 = lowercase_strings(amp_2020)
amp_2021 = lowercase_strings(amp_2021)
amp_2022 = lowercase_strings(amp_2022)

amp_2018.head()

Unnamed: 0,product_group,ingredient,strength,dosage,route,mdr_unit_type,weighted_average_of_amps,aca_ful,package_size,ndc,year,month
0,700,furosemide,80mg,tablet,oral,tab,0.039469,0.069071,100.0,63304-0626-01,2018,12
1,9591,candesartan cilexetil,8mg,tablet,oral,tab,2.489034,4.35581,90.0,49884-0659-09,2018,12
2,2421,lamotrigine,200mg,tablet,oral,tab,0.400614,0.701075,500.0,68382-0010-05,2018,12
3,4219,losartan/hydrochlorothiazide,100mg-25mg,tablet,oral,tab,0.051319,0.089808,90.0,60505-2917-09,2018,12
4,354,clonidine hydrochloride,0.3mg,tablet,oral,tab,0.018478,0.05652,1.0,51079-0301-01,2018,12


## Data Integration

In [36]:
# Yearly frames to merge, paired with a label used in error messages
datasets = [amp_2018, amp_2019, amp_2020, amp_2021, amp_2022]
dataset_labels = ["AMP 2018", "AMP 2019", "AMP 2020", "AMP 2021", "AMP 2022"]

# Required columns that must be present in all datasets
required_columns = [
    "ndc",
    "product_group",
    "ingredient",
    "dosage",
    "route",
    "mdr_unit_type",
    "package_size",
]

# Fail loudly when a frame lacks a required column. A failed check used to skip
# the merge silently, leaving `amp_dataset` undefined (or stale from an earlier
# run) and surfacing only as a confusing error in later cells.
missing_by_dataset = {
    label: sorted(set(required_columns) - set(df.columns))
    for label, df in zip(dataset_labels, datasets)
    if not set(required_columns).issubset(df.columns)
}
if missing_by_dataset:
    raise ValueError(f"Required columns are missing: {missing_by_dataset}")

# Filter out rows with missing values in required columns
datasets_filtered = [df.dropna(subset=required_columns) for df in datasets]

# Merge all datasets into a single DataFrame with a fresh 0..n-1 index
amp_dataset = pd.concat(datasets_filtered, ignore_index=True)

# Verify the shape of the final DataFrame
print("Final merged dataset shape:", amp_dataset.shape)


Final merged dataset shape: (1057557, 12)


## Final Dataset

In [None]:
# Preview the first 10 rows of the merged dataset (rendered as the cell output)
amp_dataset.head(10)

Unnamed: 0,product_group,ingredient,strength,dosage,route,mdr_unit_type,weighted_average_of_amps,aca_ful,package_size,ndc,year,month
0,700,furosemide,80mg,tablet,oral,tab,0.039469,0.069071,100.0,63304-0626-01,2018,12
1,9591,candesartan cilexetil,8mg,tablet,oral,tab,2.489034,4.35581,90.0,49884-0659-09,2018,12
2,2421,lamotrigine,200mg,tablet,oral,tab,0.400614,0.701075,500.0,68382-0010-05,2018,12
3,4219,losartan/hydrochlorothiazide,100mg-25mg,tablet,oral,tab,0.051319,0.089808,90.0,60505-2917-09,2018,12
4,354,clonidine hydrochloride,0.3mg,tablet,oral,tab,0.018478,0.05652,1.0,51079-0301-01,2018,12
5,908,labetalol hydrochloride,100mg,tablet,oral,tab,0.061269,0.17316,100.0,43199-0037-01,2018,12
6,3944,hydrocodone bit/acetaminophen,5mg-325mg,tablet,oral,tab,0.066764,0.116837,100.0,00406-0123-01,2018,12
7,8584,vancomycin hydrochloride,250mg,capsule,oral,cap,2.714588,5.64538,50.0,62559-0391-50,2018,12
8,1533,verapamil hydrochloride,240mg,"tablet, extended release",oral,tab,0.14306,0.250355,100.0,68462-0260-01,2018,12
9,1563,amoxicillin,500mg,tablet,oral,tab,0.094358,0.165127,100.0,65862-0014-01,2018,12


In [None]:
# Preview the last 10 rows of the merged dataset (rendered as the cell output)
amp_dataset.tail(10)

Unnamed: 0,product_group,ingredient,strength,dosage,route,mdr_unit_type,weighted_average_of_amps,aca_ful,package_size,ndc,year,month
1057547,13927,clindamycin/tretinoin,1.2-0.025%,gel (gram),topical,gm,5.260167,9.205292,30.0,00472-1790-30,2022,12
1057548,13927,clindamycin/tretinoin,1.2-0.025%,gel (gram),topical,gm,5.260167,9.205292,60.0,00472-1790-60,2022,12
1057549,13927,clindamycin/tretinoin,1.2-0.025%,gel (gram),topical,gm,5.260167,9.205292,30.0,66993-0959-31,2022,12
1057550,13927,clindamycin/tretinoin,1.2-0.025%,gel (gram),topical,gm,5.260167,9.205292,60.0,66993-0959-61,2022,12
1057551,13927,clindamycin/tretinoin,1.2-0.025%,gel (gram),topical,gm,5.260167,9.205292,30.0,68682-0300-30,2022,12
1057552,13927,clindamycin/tretinoin,1.2-0.025%,gel (gram),topical,gm,5.260167,9.205292,60.0,68682-0300-60,2022,12
1057553,13927,clindamycin/tretinoin,1.2-0.025%,gel (gram),topical,gm,5.260167,9.205292,30.0,73473-0306-30,2022,12
1057554,13927,clindamycin/tretinoin,1.2-0.025%,gel (gram),topical,gm,5.260167,9.205292,60.0,73473-0306-60,2022,12
1057555,13927,clindamycin/tretinoin,1.2-0.025%,gel (gram),topical,gm,5.260167,9.205292,30.0,99207-0300-30,2022,12
1057556,13927,clindamycin/tretinoin,1.2-0.025%,gel (gram),topical,gm,5.260167,9.205292,60.0,99207-0300-60,2022,12


In [39]:
# Print column dtypes, non-null counts and memory usage of the merged dataset
amp_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1057557 entries, 0 to 1057556
Data columns (total 12 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   product_group             1057557 non-null  int64  
 1   ingredient                1057557 non-null  object 
 2   strength                  1057557 non-null  object 
 3   dosage                    1057557 non-null  object 
 4   route                     1057557 non-null  object 
 5   mdr_unit_type             1057557 non-null  object 
 6   weighted_average_of_amps  1057557 non-null  float64
 7   aca_ful                   1057557 non-null  float64
 8   package_size              1057557 non-null  float64
 9   ndc                       1057557 non-null  object 
 10  year                      1057557 non-null  int64  
 11  month                     1057557 non-null  int64  
dtypes: float64(3), int64(3), object(6)
memory usage: 96.8+ MB


In [40]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
amp_dataset.describe()

Unnamed: 0,product_group,weighted_average_of_amps,aca_ful,package_size,year,month
count,1057557.0,1057557.0,1057557.0,1057557.0,1057557.0,1057557.0
mean,3376.895,0.7538592,1.468958,225.8019,2020.028,6.505203
std,2765.578,3.907196,7.193195,605.9504,1.417847,3.455777
min,1.0,8e-05,0.002515,1.0,2018.0,1.0
25%,1611.0,0.049794,0.10463,30.0,2019.0,4.0
50%,2490.0,0.113698,0.24577,100.0,2020.0,7.0
75%,4569.0,0.361778,0.75634,100.0,2021.0,10.0
max,13927.0,278.122,486.7135,28000.0,2022.0,12.0


In [41]:
# Number of distinct values in each column of the merged dataset
amp_dataset.nunique()

Unnamed: 0,0
product_group,1725
ingredient,609
strength,509
dosage,92
route,14
mdr_unit_type,7
weighted_average_of_amps,71849
aca_ful,66200
package_size,131
ndc,66204


In [42]:
# count / unique / top / freq for the object-dtype (text) columns only
amp_dataset.describe(include="object")

Unnamed: 0,ingredient,strength,dosage,route,mdr_unit_type,ndc
count,1057557,1057557,1057557,1057557,1057557,1057557
unique,609,509,92,14,7,66204
top,gabapentin,10mg,tablet,oral,tab,49884-0659-09
freq,14490,78309,705944,1016674,824161,48


In [None]:
# Project root used to build the output location. Set the AMP_REPO_PATH
# environment variable to write somewhere else; the fallback is the path the
# notebook was originally written against, so existing behaviour is unchanged.
repo_path = os.environ.get(
    "AMP_REPO_PATH",
    r"C:\Users\shiri\OneDrive\Documents\Python\ml-projects\data-606\pharmaceutical-drug-price-prediction",
)
save_path = os.path.join(repo_path, "data", "interim")

# Ensure the directory exists (no error if it is already there)
os.makedirs(save_path, exist_ok=True)

# Save the dataset as a CSV file, without writing the row index as a column
csv_file_path = os.path.join(save_path, "amp_dataset.csv")
amp_dataset.to_csv(csv_file_path, index=False)

print(f"Dataset saved successfully at: {csv_file_path}")

Dataset saved successfully at: /content/drive/My Drive/DATA_606/data/amp_dataset.csv
