In [1]:
import numpy as np
import os
import pandas as pd
import sqlalchemy as sa

from pandas.io.sql import SQLTable

def _execute_insert(self, conn, keys, data_iter):
    """Optional, but useful: helps Pandas write tables against Postgres much faster.
    See https://github.com/pydata/pandas/issues/8953 for more info
    """
    print("Using monkey-patched _execute_insert")
    data = [dict((k, v) for k, v in zip(keys, row)) for row in data_iter]
    conn.execute(self.insert_statement().values(data))

# Route pandas' SQL writes through the faster insert defined above.
SQLTable._execute_insert = _execute_insert

# The database URL is read from the OP_DWH environment variable so that no
# credentials live in the notebook.
OP_DWH = os.getenv('OP_DWH')
if not OP_DWH:
    # os.getenv returns None when the variable is missing; stop here with an
    # actionable message instead of an opaque failure inside create_engine.
    raise RuntimeError(
        "Environment variable OP_DWH is not set; "
        "it must contain the SQLAlchemy database URL."
    )
engine = sa.create_engine(OP_DWH)

## Importing Open Payments Data

In [2]:
# One general-payment CSV per reporting year. dtype=str reads every column as
# text, so nothing is coerced while loading.
open_payments_paths = [
    '../data/open_payments/General_Payment_Data___Detailed_Dataset_2013_Reporting_Year.csv',
    '../data/open_payments/General_Payment_Data___Detailed_Dataset_2014_Reporting_Year.csv',
    '../data/open_payments/General_Payment_Data___Detailed_Dataset_2015_Reporting_Year.csv',
    '../data/open_payments/General_Payment_Data___Detailed_Dataset_2016_Reporting_Year.csv',
]
df_2013, df_2014, df_2015, df_2016 = [
    pd.read_csv(csv_path, dtype=str) for csv_path in open_payments_paths
]

In [3]:
# Stack the four reporting years into one frame.
# ignore_index=True gives every row a unique label; without it each file's own
# 0..n-1 row numbers are repeated, leaving duplicate index labels.
df = pd.concat([df_2013, df_2014, df_2015, df_2016], sort=False, ignore_index=True)

In [4]:
# Preview the first rows of the combined frame (every column was read as text).
df.head()

Unnamed: 0,Change_Type,Covered_Recipient_Type,Teaching_Hospital_CCN,Teaching_Hospital_ID,Teaching_Hospital_Name,Physician_Profile_ID,Physician_First_Name,Physician_Middle_Name,Physician_Last_Name,Physician_Name_Suffix,...,Covered_or_Noncovered_Indicator_4,Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_4,Product_Category_or_Therapeutic_Area_4,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4,Associated_Drug_or_Biological_NDC_4,Covered_or_Noncovered_Indicator_5,Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_5,Product_Category_or_Therapeutic_Area_5,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5,Associated_Drug_or_Biological_NDC_5
0,UNCHANGED,Covered Recipient Physician,,,,174740,LISA,,CAPALDINI,,...,,,,,,,,,,
1,UNCHANGED,Covered Recipient Physician,,,,154459,UTKU,,KANDEMIR,,...,,,,,,,,,,
2,UNCHANGED,Covered Recipient Physician,,,,820445,CAROL,K,LEE,,...,,,,,,,,,,
3,UNCHANGED,Covered Recipient Physician,,,,801,PAUL,,HEIM,,...,,,,,,,,,,
4,UNCHANGED,Covered Recipient Physician,,,,338981,BRUCE,A.,CREE,,...,,,,,,,,,,


In [5]:
# Every column was loaded as text; cast the ones holding numbers or dates.
for numeric_col in (
    'Total_Amount_of_Payment_USDollars',
    'Number_of_Payments_Included_in_Total_Amount',
    'Program_Year',
):
    df[numeric_col] = pd.to_numeric(df[numeric_col])

for date_col in ('Date_of_Payment', 'Payment_Publication_Date'):
    df[date_col] = pd.to_datetime(df[date_col])

In [6]:
# Preview the frame again after the numeric and date conversions.
df.head()

Unnamed: 0,Change_Type,Covered_Recipient_Type,Teaching_Hospital_CCN,Teaching_Hospital_ID,Teaching_Hospital_Name,Physician_Profile_ID,Physician_First_Name,Physician_Middle_Name,Physician_Last_Name,Physician_Name_Suffix,...,Covered_or_Noncovered_Indicator_4,Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_4,Product_Category_or_Therapeutic_Area_4,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4,Associated_Drug_or_Biological_NDC_4,Covered_or_Noncovered_Indicator_5,Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_5,Product_Category_or_Therapeutic_Area_5,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5,Associated_Drug_or_Biological_NDC_5
0,UNCHANGED,Covered Recipient Physician,,,,174740,LISA,,CAPALDINI,,...,,,,,,,,,,
1,UNCHANGED,Covered Recipient Physician,,,,154459,UTKU,,KANDEMIR,,...,,,,,,,,,,
2,UNCHANGED,Covered Recipient Physician,,,,820445,CAROL,K,LEE,,...,,,,,,,,,,
3,UNCHANGED,Covered Recipient Physician,,,,801,PAUL,,HEIM,,...,,,,,,,,,,
4,UNCHANGED,Covered Recipient Physician,,,,338981,BRUCE,A.,CREE,,...,,,,,,,,,,


In [7]:
# Lower-case all column names; the following cells refer to columns by their lower-case names.
df.columns = df.columns.str.lower()

In [8]:
# Build a one-line address from its parts. Missing parts become empty strings
# so that a single NaN does not turn the whole address into NaN.
address = (
    df['recipient_primary_business_street_address_line1'].fillna('')
    + ' ' + df['recipient_primary_business_street_address_line2'].fillna('')
    + ', ' + df['recipient_city'].fillna('')
    + ', ' + df['recipient_state'].fillna('')
    + ' ' + df['recipient_zip_code'].fillna('')
)

# Remove the stray space left in front of a comma (e.g. when address line 2 is
# empty). The result is assigned back explicitly: an inplace replace on
# `df['full_address']` operates on a temporary Series under pandas
# Copy-on-Write and would leave the column unchanged.
df['full_address'] = address.replace({' ,': ','}, regex=True)

In [9]:
# Join the name parts with spaces. Every part is filled with '' first:
# string + NaN is NaN, so filling only the first name would blank the full
# name of anyone lacking a middle name, last name or suffix.
full_name = (
    df['physician_first_name'].fillna('')
    + ' ' + df['physician_middle_name'].fillna('')
    + ' ' + df['physician_last_name'].fillna('')
    + ' ' + df['physician_name_suffix'].fillna('')
)

# Collapse the runs of spaces left by empty parts and trim the ends.
full_name = full_name.str.replace(r'\s+', ' ', regex=True).str.strip()

# Rows with no name parts at all stay missing rather than becoming ''.
# Assigning the column directly (instead of an inplace replace on
# `df['physician_full_name']`) also works under pandas Copy-on-Write.
df['physician_full_name'] = full_name.where(full_name != '')

In [10]:
# Save the combined frame, including the derived address and name columns, as one CSV; index=False leaves out the row index.
df.to_csv('../data/open_payments/payment_data_all_years.csv', index=False)

In [11]:
# Row and column counts of the combined frame.
df.shape

(153199, 93)

In [None]:
# Write the frame to table `open_payments_data_all_years` in schema `data_ingest`,
# replacing any existing table, in chunks of 1000 rows and without the row index.
with engine.begin() as conn:
    df.to_sql(con=conn, schema='data_ingest', name='open_payments_data_all_years', if_exists='replace', index=False, chunksize=1000)

Using monkey-patched _execute_insert


As an alternative to the CSV export, the cell above pushes the combined data to a Postgres DB.

In [None]:
# Total payment amount per physician / address combination.
# dropna=False keeps rows whose key columns are missing (e.g. no middle name
# or no suffix); by default groupby drops every row with a NaN in any key,
# which would silently remove those physicians from the totals.
df_grouped = df.groupby(
    [
        'physician_first_name',
        'physician_middle_name',
        'physician_last_name',
        'physician_name_suffix',
        'physician_full_name',
        'full_address',
    ],
    dropna=False,
).agg({'total_amount_of_payment_usdollars': 'sum'})

In [None]:
# Turn the group keys from the index back into ordinary columns.
df_grouped = df_grouped.reset_index()

In [None]:
# Percentile position of each total on a 0-100 scale: (rank - 1) over
# (number of rows - 1), where tied totals share the highest rank of their group.
payment_totals = df_grouped['total_amount_of_payment_usdollars']
sz = payment_totals.size - 1
payment_ranks = payment_totals.rank(method='max')
df_grouped['pcnt_total_amount_of_payment_us_dollars'] = payment_ranks.apply(
    lambda rank: 100.0 * (rank - 1) / sz
)

In [None]:
# Scale the 0-100 percentile down by 20, so each whole unit spans 20 percentile points; the next cell rounds the value up.
df_grouped['quantile_total_amount_of_payment_us_dollars'] = df_grouped['pcnt_total_amount_of_payment_us_dollars'] / 20

In [None]:
# Round the scaled percentile up to a whole bucket number.
# NOTE(review): a percentile of exactly 0 rounds up to 0, so buckets appear to
# run 0-5 rather than 1-5 — confirm this is intended.
df_grouped['quantile_total_amount_of_payment_us_dollars'] = np.ceil(
    df_grouped['quantile_total_amount_of_payment_us_dollars']
)

In [None]:
# Export the per-physician totals with their percentile and bucket columns; index=False leaves out the row index.
df_grouped.to_csv('../output/open_payments_grouped_by_physician.csv', index=False)

## Importing Medicare Prescription Data

In [None]:
# One Part D prescriber CSV per year. dtype=str reads every column as text, so
# nothing is coerced while loading.
part_d_paths = [
    '../data/medicare_partd_prescriptions/Medicare_Provider_Utilization_and_Payment_Data__2013_Part_D_Prescriber.csv',
    '../data/medicare_partd_prescriptions/Medicare_Provider_Utilization_and_Payment_Data__2014_Part_D_Prescriber.csv',
    '../data/medicare_partd_prescriptions/Medicare_Provider_Utilization_and_Payment_Data__2015_Part_D_Prescriber.csv',
    '../data/medicare_partd_prescriptions/Medicare_Provider_Utilization_and_Payment_Data__2016_Part_D_Prescriber.csv',
]
df_2013, df_2014, df_2015, df_2016 = [
    pd.read_csv(csv_path, dtype=str) for csv_path in part_d_paths
]

In [None]:
# Stack the four Part D years into one frame, replacing the Open Payments frame
# previously bound to `df`.
# ignore_index=True gives every row a unique label; without it each file's own
# 0..n-1 row numbers are repeated, leaving duplicate index labels.
df = pd.concat([df_2013, df_2014, df_2015, df_2016], sort=False, ignore_index=True)

In [None]:
# Every column was loaded as text; cast the count, supply and cost measures
# (overall and for the ge65 subset) to numeric dtypes.
part_d_numeric_columns = [
    'bene_count',
    'total_claim_count',
    'total_30_day_fill_count',
    'total_day_supply',
    'total_drug_cost',
    'bene_count_ge65',
    'total_claim_count_ge65',
    'total_30_day_fill_count_ge65',
    'total_day_supply_ge65',
    'total_drug_cost_ge65',
]
for measure in part_d_numeric_columns:
    df[measure] = pd.to_numeric(df[measure])

In [None]:
# Preview the combined Part D frame after the numeric conversions.
df.head()

In [None]:
# Save the combined Part D frame as one CSV; index=False leaves out the row index.
df.to_csv('../data/medicare_partd_prescriptions/part_d_prescriber_all_years.csv', index=False)

In [None]:
# Write the combined Part D frame to the database, replacing any existing table.
# index=False: the row index carries no information and is likewise left out of
#   the CSV export of this frame.
# chunksize=1000: the patched _execute_insert sends all rows of a chunk in one
#   statement, so an unchunked write would put the entire frame into a single
#   insert; 1000 matches the Open Payments load.
# NOTE(review): unlike the Open Payments load, no schema is passed here, so the
#   table goes to the connection's default schema — confirm that is intended.
with engine.begin() as conn:
    df.to_sql(
        con=conn,
        name='medicare_part_d_prescriber_all_years',
        if_exists='replace',
        index=False,
        chunksize=1000,
    )