In [71]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("CMS-Refactor").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
spark

In [90]:
phase_1_path = "outputs/final_out.csv"
dac_path = "../CMS-Classic/Data/Public/DAC_NationalDownloadableFile.csv"

phase_1 = spark.read.csv(
    phase_1_path,
    header=True,
    inferSchema=True
)

dac = spark.read.csv(
    dac_path,
    header=True,
    inferSchema=True
)

dac = dac.select([
    "NPI",
    "Facility Name",
    "org_pac_id"
])

                                                                                

In [91]:
from pyspark.sql.functions import *

dac_dropped = dac.dropna()
dac_dropped = dac.dropDuplicates()

# Convert org_pac_id to 'int'
dac_dropped = dac_dropped.withColumn(
    "org_pac_id",
    floor(dac_dropped.org_pac_id)
)

grouped_df = dac_dropped.groupBy('NPI').agg(
    collect_list("Facility Name").alias("Facility_Names"),
    collect_list("org_pac_id").alias("org_pac_ids")
)

max_length = grouped_df.selectExpr("size(Facility_Names) as size").rdd.max()[0]

for i in range(max_length):
    grouped_df = grouped_df.withColumn(f"Facility_Name_{i+1}", expr(f"Facility_Names[{i}]")).withColumn(f"Org_pac_id_{i+1}", expr(f"org_pac_ids[{i}]"))

grouped_df = grouped_df.drop("Facility_Names", "org_pac_ids")

                                                                                

In [92]:
merged_inner = phase_1.join(
    grouped_df,
    "NPI",
    "inner"
)
merged_inner.count()

                                                                                

9439

In [93]:
merged_left = phase_1.join(
    grouped_df,
    "NPI",
    "left"
)
merged_left.count()

10196

In [94]:
# Rearrange the columns
columns = phase_1.columns[:27]
columns.remove('_c0')
g_cols = grouped_df.columns
g_cols.remove('NPI')
columns.extend(g_cols)
columns.extend(phase_1.columns[27:])

merged_inner = merged_inner.select(columns)
merged_left = merged_left.select(columns)

In [95]:
merged_inner.toPandas().to_csv("outputs/cms_dac_inner.csv")
merged_left.toPandas().to_csv("outputs/cms_dac_left.csv")

                                                                                