In [57]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

from pyspark.sql.functions import col, sum, round, countDistinct, max, variance

from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.clustering import KMeans



In [58]:
# Initialize Spark Session
spark = SparkSession.builder.appName("HealthCareModel").getOrCreate()

In [59]:
part_d_data_path = "../DataSet/Prescribers - by Provider and Drug/MUP_DPR_RY23_P04_V10_DY21_NPIBN.csv"
df = spark.read.csv(part_d_data_path, header=True, inferSchema=True)



                                                                                

In [60]:
df.printSchema()


root
 |-- Prscrbr_NPI: integer (nullable = true)
 |-- Prscrbr_Last_Org_Name: string (nullable = true)
 |-- Prscrbr_First_Name: string (nullable = true)
 |-- Prscrbr_City: string (nullable = true)
 |-- Prscrbr_State_Abrvtn: string (nullable = true)
 |-- Prscrbr_State_FIPS: string (nullable = true)
 |-- Prscrbr_Type: string (nullable = true)
 |-- Prscrbr_Type_Src: string (nullable = true)
 |-- Brnd_Name: string (nullable = true)
 |-- Gnrc_Name: string (nullable = true)
 |-- Tot_Clms: integer (nullable = true)
 |-- Tot_30day_Fills: double (nullable = true)
 |-- Tot_Day_Suply: integer (nullable = true)
 |-- Tot_Drug_Cst: double (nullable = true)
 |-- Tot_Benes: integer (nullable = true)
 |-- GE65_Sprsn_Flag: string (nullable = true)
 |-- GE65_Tot_Clms: integer (nullable = true)
 |-- GE65_Tot_30day_Fills: double (nullable = true)
 |-- GE65_Tot_Drug_Cst: double (nullable = true)
 |-- GE65_Tot_Day_Suply: integer (nullable = true)
 |-- GE65_Bene_Sprsn_Flag: string (nullable = true)
 |-- GE65_T

In [61]:
part_d_data_t = df.select(
    col("Prscrbr_NPI").alias("npi"),
    col("Prscrbr_City").alias("city"),
    col("Prscrbr_State_Abrvtn").alias("state"),
    col("Prscrbr_Last_Org_Name").alias("last_name"),
    col("Prscrbr_First_Name").alias("first_name"),
    col("Prscrbr_Type").alias("specialty"),
    col("Brnd_Name").alias("drug_name"),
    col("Gnrc_Name").alias("generic_name"),
    col("Tot_Drug_Cst").alias("total_drug_cost"),
    col("Tot_Clms").alias("total_claim_count"),
    col("Tot_Day_Suply").alias("total_day_supply")
)

In [63]:
# Step 1: Assign one DataFrame to another
part_d_pd1 = part_d_data_t

# Step 2: Select specific columns
part_d_drug_df = part_d_data_t.select("npi", "drug_name", "total_drug_cost", "total_claim_count", "total_day_supply", "specialty")

# Step 3: Change the data type of 'npi' to StringType
part_d_drug_df = part_d_drug_df.withColumn("npi", col("npi").cast(StringType()))

# Step 4: Select specific columns from another DataFrame
part_d_spec_df1 = part_d_data_t.select("npi", "specialty")

# Step 5: Show the first few rows (equivalent to head(0) in Pandas)
part_d_spec_df1.show()
part_d_drug_df.show()

+----------+-----------------+
|       npi|        specialty|
+----------+-----------------+
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
+----------+-----------------+
only showing top 20 rows

+----------+--------------------+---------------+-----------------+----------------+-----------------+
|       npi|           drug_name|total_drug_cost|total_claim_count|total_day_supply|        specialty|
+----------+------------

In [64]:
part_d_pd2 = part_d_data_t.select('npi',
                                  'city',
                                  'state',
                                  'last_name',
                                  'first_name',
                                  'specialty')

part_d_pd2.show()

+----------+--------+-----+---------+----------+-----------------+
|       npi|    city|state|last_name|first_name|        specialty|
+----------+--------+-----+---------+----------+-----------------+
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medic

In [65]:
part_d_pd_u = part_d_pd2.dropDuplicates()

# Show the result
part_d_pd_u.show()

[Stage 34:>                                                         (0 + 1) / 1]

+----------+-------------+-----+---------+----------+-------------------+
|       npi|         city|state|last_name|first_name|          specialty|
+----------+-------------+-----+---------+----------+-------------------+
|1003028002|      Durango|   CO|   Haynes|      Kent|            Dentist|
|1003041476|     Oak Park|   IL| Lindgren|     Kevin|Allergy/ Immunology|
|1003043555|   Washington|   PA|  Orlosky|     Julie|    Family Practice|
|1003046939|      Spencer|   IA|  Heckert|     Kathi| Nurse Practitioner|
|1003055781|     Hartford|   CT|     Rice|     Jenny|Physician Assistant|
|1003064825|        Parma|   OH| Phillips|    Cherie|   Vascular Surgery|
|1003091539|    Fullerton|   CA|      Woo|      Kiho|  Internal Medicine|
|1003095167|      El Paso|   IL|    Tyner|Jean-Marie| Nurse Practitioner|
|1003101932|   Cumberland|   MD|     Hong|     Feiyu|  Internal Medicine|
|1003119009|     Bellevue|   WA|  Swenson|   Jessica| Nurse Practitioner|
|1003127002|       Laredo|   TX|  Rami

                                                                                

In [66]:
group_cols = ['npi']

part_d_pd3 = (part_d_pd1.groupBy(group_cols)
             .agg(
                 F.sum("total_drug_cost").alias("sum_total_drug_cost"),
                 F.mean("total_drug_cost").alias("mean_total_drug_cost"),
                 F.max("total_drug_cost").alias("max_total_drug_cost"),
                 F.sum("total_claim_count").alias("sum_total_claim_count"),
                 F.mean("total_claim_count").alias("mean_total_claim_count"),
                 F.max("total_claim_count").alias("max_total_claim_count"),
                 F.sum("total_day_supply").alias("sum_total_day_supply"),
                 F.mean("total_day_supply").alias("mean_total_day_supply"),
                 F.max("total_day_supply").alias("max_total_day_supply")
             ))

# Cast to float if necessary (optional)
for col_name in part_d_pd3.columns:
    part_d_pd3 = part_d_pd3.withColumn(col_name, F.col(col_name).cast("float"))

# Show the result
part_d_pd3.show()



+------------+-------------------+--------------------+-------------------+---------------------+----------------------+---------------------+--------------------+---------------------+--------------------+
|         npi|sum_total_drug_cost|mean_total_drug_cost|max_total_drug_cost|sum_total_claim_count|mean_total_claim_count|max_total_claim_count|sum_total_day_supply|mean_total_day_supply|max_total_day_supply|
+------------+-------------------+--------------------+-------------------+---------------------+----------------------+---------------------+--------------------+---------------------+--------------------+
| 1.0030432E9|           18613.12|             930.656|            7077.18|                355.0|                 17.75|                 44.0|             21040.0|               1052.0|              3000.0|
|1.00307283E9|          282157.47|           3399.4878|           45438.26|               4100.0|              49.39759|                244.0|            203448.0|         

                                                                                

In [67]:
part_d_pd3.count()

                                                                                

1017417

In [69]:
part_d_all_pd = part_d_pd3.join(part_d_pd_u, on='npi', how='left')

# Show the result
part_d_all_pd.show()

[Stage 49:>                                                         (0 + 4) / 4]

+------------+-------------------+--------------------+-------------------+---------------------+----------------------+---------------------+--------------------+---------------------+--------------------+--------------+-----+--------------+----------+--------------------+
|         npi|sum_total_drug_cost|mean_total_drug_cost|max_total_drug_cost|sum_total_claim_count|mean_total_claim_count|max_total_claim_count|sum_total_day_supply|mean_total_day_supply|max_total_day_supply|          city|state|     last_name|first_name|           specialty|
+------------+-------------------+--------------------+-------------------+---------------------+----------------------+---------------------+--------------------+---------------------+--------------------+--------------+-----+--------------+----------+--------------------+
| 1.0030124E9|             577.08|              288.54|             373.53|                 70.0|                  35.0|                 39.0|               478.0|            

                                                                                

In [None]:
payment_data_set="../DataSet/PaymentDataSet/OP_DTL_OWNRSHP_PGYR2021_P06302023.csv"