# 1- Filter and Save Datasets Based on Selected Columns

In [None]:
import gc
import os

import pandas as pd

# Columns to keep from each raw yearly file
use_cols = ['Prscrbr_NPI', 'Prscrbr_Type', 'Tot_Benes', 'Tot_Clms',
            'Tot_30day_Fills', 'Tot_Day_Suply', 'Tot_Drug_Cst']

# Memory-efficient dtypes for the kept columns
dtype_map = {
    'Prscrbr_NPI': 'str',
    'Prscrbr_Type': 'category',
    'Tot_Benes': 'float32',
    'Tot_Clms': 'float32',
    'Tot_30day_Fills': 'float32',
    'Tot_Day_Suply': 'float32',
    # float32 holds only ~7 significant digits, so a cost such as
    # 1234567.89 would be written back as 1234567.9. Keep full precision
    # for the monetary column.
    'Tot_Drug_Cst': 'float64',
}


def filter_and_save(year):
    """Write a column-filtered copy of one year's raw prescriber file.

    Reads ``Medicare_Part_D_Prescribers_by_Provider_and_Drug_{year}.csv``
    from the working directory, keeping only ``use_cols`` with the dtypes in
    ``dtype_map``, and writes the result to
    ``outputs/Filtered_Medicare_{year}.csv``. Any failure is reported on
    stdout instead of being raised, so one bad year does not stop the others.

    Args:
        year: Year used to build the input and output file names.

    Returns:
        True if the filtered file was written, False otherwise.
    """
    file_name = f"Medicare_Part_D_Prescribers_by_Provider_and_Drug_{year}.csv"
    output_name = f"outputs/Filtered_Medicare_{year}.csv"

    try:
        print(f"Processing {file_name}")
        df = pd.read_csv(file_name, usecols=use_cols, dtype=dtype_map)
        # to_csv does not create missing parent directories; make sure the
        # output folder exists (only once the input has been read, so a
        # missing input leaves nothing behind).
        os.makedirs(os.path.dirname(output_name), exist_ok=True)
        df.to_csv(output_name, index=False)
        print(f"Saved filtered file: {output_name}")
        del df
        gc.collect()
    except Exception as e:
        print(f"Failed to process {file_name}: {e}")
        return False
    return True


for year in range(2017, 2020):
    filter_and_save(year)




# 2- Combine Datasets for Different Years

In [None]:
import pandas as pd
import gc

# Column names shared by every filtered yearly file
use_cols = [
    'Prscrbr_NPI',
    'Prscrbr_Type',
    'Tot_Benes',
    'Tot_Clms',
    'Tot_30day_Fills',
    'Tot_Day_Suply',
    'Tot_Drug_Cst',
]

# Function to combine and save a list of filtered CSVs
def combine_and_save(years, output_filename):
    """Concatenate the per-year filtered CSVs and write them to one file.

    For each entry in ``years`` the file
    ``outputs/Filtered_Medicare_{year}.csv`` is loaded and tagged with a
    ``Source_Year`` column. Files that cannot be loaded are reported on
    stdout and skipped, so the combined output may cover fewer years than
    requested.

    Args:
        years: Iterable of years whose filtered files should be combined.
        output_filename: Path of the combined CSV to write.

    Returns:
        None. Nothing is written when no input file could be loaded.
    """
    frames = []
    for year in years:
        file_name = f"outputs/Filtered_Medicare_{year}.csv"
        try:
            print(f"Loading {file_name}")
            # Read the NPI as text: it is an identifier, and numeric
            # inference would turn the column into floats (written back as
            # "1234567890.0") as soon as one value is missing, and would
            # strip any leading zeros.
            df = pd.read_csv(file_name, dtype={'Prscrbr_NPI': 'str'})
            df['Source_Year'] = year  # record which yearly file the row came from
            frames.append(df)
        except Exception as e:
            print(f"Failed to load {file_name}: {e}")

    if not frames:
        print(f"No data to save for {output_filename}")
        return

    combined_df = pd.concat(frames, ignore_index=True)
    combined_df.to_csv(output_filename, index=False)
    print(f"Saved combined file: {output_filename}")
    # Release the large frames explicitly before returning.
    del combined_df, frames
    gc.collect()

# Build the combined file for each group of years.
# The 2013-2016 group is currently disabled:
# combine_and_save([2013, 2014, 2015, 2016], "outputs/Combined_Medicare_2013_2016.csv")
combine_and_save(
    years=[2017, 2018, 2019],
    output_filename="outputs/Combined_Medicare_2017_2019.csv",
)
