In [42]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("CMS-Refactor").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
spark

In [43]:
phase_1_path = "phase_1.csv"
dac_path = "../CMS-Classic/Data/Public/DAC_NationalDownloadableFile.csv"

phase_1 = spark.read.csv(
    phase_1_path,
    header=True,
    inferSchema=True
)

dac = spark.read.csv(
    dac_path,
    header=True,
    inferSchema=True
)

dac = dac.select([
    "NPI",
    "Facility Name",
    "org_pac_id"
])

                                                                                

In [44]:
from pyspark.sql.functions import *

dac_dropped = dac.dropna()
dac_dropped = dac_dropped.dropDuplicates()

# Convert org_pac_id to 'int'
dac_dropped = dac_dropped.withColumn(
    "org_pac_id",
    floor(dac_dropped.org_pac_id)
)

grouped_df = dac_dropped.groupBy('NPI').agg(
    collect_list("Facility Name").alias("Facility_Names"),
    collect_list("org_pac_id").alias("org_pac_ids")
)

# max_length = grouped_df.selectExpr("size(Facility_Names) as size").rdd.max()[0]
max_length = 4

for i in range(max_length):
    grouped_df = grouped_df.withColumn(f"Facility_Name_{i+1}", expr(f"Facility_Names[{i}]")).withColumn(f"Org_pac_id_{i+1}", expr(f"org_pac_ids[{i}]"))

grouped_df = grouped_df.drop("Facility_Names", "org_pac_ids")

In [45]:
merged_inner = phase_1.join(
    grouped_df,
    "NPI",
    "inner"
)
merged_inner.count()

                                                                                

8915

In [46]:
merged_left = phase_1.join(
    grouped_df,
    "NPI",
    "left"
)
merged_left.count()

10196

In [47]:
# Rearrange the columns
columns = phase_1.columns[:27]
g_cols = grouped_df.columns
g_cols.remove('NPI')
columns.extend(g_cols)
columns.extend(phase_1.columns[27:])

merged_inner = merged_inner.select(columns)
merged_left = merged_left.select(columns)

In [48]:
merged_inner.toPandas().to_csv("outputs/cms_dac_inner.csv", index=False)
merged_left.toPandas().to_csv("outputs/cms_dac_left.csv", index=False)

                                                                                

In [49]:
grouped_df_c = dac_dropped.groupBy('NPI').agg(
    collect_list("Facility Name").alias("Facility_Names"),
    collect_list("org_pac_id").alias("org_pac_ids")
)

max_length = grouped_df_c.selectExpr("size(Facility_Names) as size")# .rdd.max()

In [50]:
frequency = max_length.groupBy('size').count()

In [51]:
import pandas as pd

frequency_df = frequency.toPandas()

                                                                                

In [52]:
frequency_df = frequency_df.sort_values(by='count', ascending=False).reset_index(drop=True)
frequency_df.columns = ['Facility Count', 'Physician Count']
frequency_df.to_csv('Facility_frequency.csv', index=False)

In [53]:
weighted_avg = (frequency_df['Facility Count'] * frequency_df['Physician Count']).sum() / frequency_df['Physician Count'].sum()
weighted_avg

np.float64(1.1802600221990076)

In [54]:
frequency_df[frequency_df['Physician Count'] == frequency_df['Physician Count'].max()]

Unnamed: 0,Facility Count,Physician Count
0,1,1060138


In [55]:
merged_inner.columns

['NPI',
 'Provider Last Name',
 'Provider First Name',
 'Provider Middle Name',
 'Provider First Line Business Mailing Address',
 'Provider Business Mailing Address City Name',
 'Provider Business Mailing Address State Name',
 'Provider Business Mailing Address Postal Code',
 'Primary_Classification',
 'Primary_Specialization',
 'Definition10',
 'Notes11',
 'Display Name12',
 'Section13',
 'Secondary_Classification',
 'Secondary_Specialization',
 'Definition16',
 'Notes17',
 'Display Name18',
 'Section19',
 'Tertiary_Classification',
 'Tertiary_Specialization',
 'Definition22',
 'Notes23',
 'Display Name24',
 'Section25',
 'RINVOQ_CONSULTING',
 'Facility_Name_1',
 'Org_pac_id_1',
 'Facility_Name_2',
 'Org_pac_id_2',
 'Facility_Name_3',
 'Org_pac_id_3',
 'Facility_Name_4',
 'Org_pac_id_4',
 'RINVOQ_FOOD&BEVERAGE',
 'RINVOQ_OTHERS_GENERAL',
 'RINVOQ_EDUCATION',
 'RINVOQ_SPEAKER',
 'RINVOQ_TRAVEL',
 'SKYRIZI_CONSULTING',
 'SKYRIZI_FOOD&BEVERAGE',
 'SKYRIZI_OTHERS_GENERAL',
 'SKYRIZI_EDUCA