In [None]:
!pip install requests
!pip install pandas
!pip install numpy
!pip install seaborn 
!pip install zipfile

In [3]:
import requests
import pandas as pd
import seaborn as sns 
import numpy as np
import zipfile as zp

In [4]:
# DO NOT RUN - FOR STORAGE PURPOSES
# One-off helper kept for reference: it reads the full NADAC-2024 CSV out of its
# zip archive, cuts the rows into two halves, and writes each half to its own
# zip archive (the two part files that the loading cell later reads back).

# Path to the original zip file
original_zip_path = 'data/NADAC-2024.csv.zip'

# Extract the CSV file from the zip archive and load it into a DataFrame
with zp.ZipFile(original_zip_path, 'r') as zipf:
    with zipf.open('NADAC-2024.csv') as file:
        df_nadac = pd.read_csv(file)

# Split the DataFrame into two halves (the second half gets the extra row when
# the row count is odd)
midpoint = len(df_nadac) // 2
df_nadac_part1 = df_nadac.iloc[:midpoint]
df_nadac_part2 = df_nadac.iloc[midpoint:]

# Save each part to a separate zip file.
# ZipFile defaults to ZIP_STORED (no compression), so DEFLATE is requested
# explicitly to actually shrink the stored CSVs.
with zp.ZipFile('data/NADAC-2024_part1.csv.zip', 'w', compression=zp.ZIP_DEFLATED) as zipf:
    zipf.writestr('NADAC-2024_part1.csv', df_nadac_part1.to_csv(index=False))

with zp.ZipFile('data/NADAC-2024_part2.csv.zip', 'w', compression=zp.ZIP_DEFLATED) as zipf:
    zipf.writestr('NADAC-2024_part2.csv', df_nadac_part2.to_csv(index=False))

print("Split and saved NADAC-2024.csv into two zip files: NADAC-2024_part1.csv.zip and NADAC-2024_part2.csv.zip")


Split and saved NADAC-2024.csv into two zip files: NADAC-2024_part1.csv.zip and NADAC-2024_part2.csv.zip


READING IN DATASETS 

In [None]:
import pandas as pd
import zipfile as zp

# Archives to load. The NADAC data is stored as two part files that are
# stitched back into a single DataFrame after the loop.
zip_files = [
    'data/CMSspending.csv.zip',
    'data/fda_directory.xlsx.zip',
    'data/NADAC-2024_part1.csv.zip',
    'data/NADAC-2024_part2.csv.zip',
]

# name -> DataFrame for every dataset that is loaded
dataframes = {}

# NADAC halves, collected in the order they appear in zip_files
nadac_parts = []

for zip_path in zip_files:
    with zp.ZipFile(zip_path, 'r') as archive:
        # Only the first member is read (each archive is assumed to hold one file)
        member = archive.namelist()[0]

        # Step 1: parse the member according to its extension
        with archive.open(member) as handle:
            is_csv = member.endswith('.csv')
            if is_csv:
                frame = pd.read_csv(handle)
            elif member.endswith('.xlsx'):
                frame = pd.read_excel(handle, engine='openpyxl')
            else:
                raise ValueError(f"Unsupported file format: {member}")

    # Step 2: file the parsed frame. NADAC part CSVs are held back for merging;
    # everything else is keyed by the member name up to its first dot.
    if is_csv and 'NADAC-2024_part' in member:
        nadac_parts.append(frame)
    else:
        dataframes[member.split('.')[0]] = frame

# Rebuild the full NADAC table from its parts, renumbering the index from 0
if nadac_parts:
    dataframes['NADAC-2024'] = pd.concat(nadac_parts, ignore_index=True)
    print("Merged NADAC parts into a single DataFrame")

# Quick sanity check: show the first rows of every loaded dataset
for name, df in dataframes.items():
    print(f"\nDataFrame '{name}':")
    print(df.head())


In [9]:
# Give each loaded dataset its own variable name for the rest of the notebook
df_directory, df_spending, df_pricing = (
    dataframes[key] for key in ("fda_directory", "CMSspending", "NADAC-2024")
)

# The Purple Book file is read straight from CSV; header=3 tells pandas to take
# the column names from row index 3 instead of the first line of the file.
df_purplebook = pd.read_csv("data/purplebook-october-data.csv", header=3)

CLEANING AND MERGING DATA

In [None]:
# Cleaning purplebook and merging with FDA directory

# Keep only the rows that name a reference product. The explicit .copy() makes
# clean_purple an independent frame, so adding a column below does not write
# into a slice of df_purplebook (which raises SettingWithCopyWarning).
clean_purple = df_purplebook[df_purplebook['Ref. Product Proprietary Name'].notna()].copy()

# Every remaining row contributes 1, so the grouped sum is the number of
# Purple Book rows per reference product name.
clean_purple['Num Biosimilars'] = 1
grouped_purple = (
    clean_purple
    .groupby('Ref. Product Proprietary Name')['Num Biosimilars']
    .sum()
    .reset_index()
)

# Normalise the join keys on both sides the same way (lowercase, no spaces)
# so that differences in case or spacing do not prevent a match.
grouped_purple['Ref. Product Proprietary Name'] = (
    grouped_purple['Ref. Product Proprietary Name']
    .str.lower()                        # Convert to lowercase
    .str.replace(' ', '', regex=False)  # Remove spaces (literal match)
)

# NOTE: this overwrites the PROPRIETARYNAME column of df_directory in place.
df_directory['PROPRIETARYNAME'] = (
    df_directory['PROPRIETARYNAME']
    .str.lower()                        # Convert to lowercase
    .str.replace(' ', '', regex=False)  # Remove spaces (literal match)
)

# Left join: every directory row is kept; rows whose name has no match in
# grouped_purple get NaN in the columns brought over from it.
directory_purplebook = pd.merge(df_directory, grouped_purple, left_on='PROPRIETARYNAME', right_on="Ref. Product Proprietary Name", how='left')
directory_purplebook

EXTRANEOUS

In [3]:
# Datastore query endpoint on data.medicaid.gov; the download loop below passes
# this URL to requests.get together with its paging parameters.
url = "https://data.medicaid.gov/api/1/datastore/query/99315a95-37ac-4eee-946a-3c523b4c481e/0"

In [None]:
# Download every row available at `url`, one page at a time, and collect the
# rows into a single DataFrame.

# Number of rows requested per call
PAGE_SIZE = 5000

database = []
offset = 0
while True:
    # The paging key must be spelled 'offset'. With a misspelled key the offset
    # is never sent, so if the server ignores the unknown parameter every call
    # returns the first page and the loop never reaches an empty result.
    response = requests.get(
        url,
        params={'limit': PAGE_SIZE, 'offset': offset, 'format': 'json'},
        timeout=60,  # fail instead of hanging forever on a stalled connection
    )
    # Surface HTTP errors instead of trying to parse an error body as data
    response.raise_for_status()
    data = response.json().get('results', [])
    if not data:
        # An empty page means everything has been fetched
        break
    print('iteration')
    database.extend(data)
    # Advance by what was actually received, so no rows are skipped if the
    # server returns fewer rows than requested
    offset += len(data)

medicaid_df = pd.DataFrame(database)