In [56]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import col, sum, round, countDistinct, max, variance, log10, when
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import lit
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from scipy.stats import ttest_ind


In [57]:
# Initialize Spark Session
spark = SparkSession.builder.appName("HealthCareModel").getOrCreate()
target = 'is_fraud'

In [58]:
part_d_data_path = "../DataSet/PrescribersByProviderDrug/Medicare_Part_D_Prescribers_by_Provider_and_Drug_2021.csv"
df = spark.read.csv(part_d_data_path, header=True, inferSchema=True)



                                                                                

In [59]:
df.printSchema()


root
 |-- Prscrbr_NPI: integer (nullable = true)
 |-- Prscrbr_Last_Org_Name: string (nullable = true)
 |-- Prscrbr_First_Name: string (nullable = true)
 |-- Prscrbr_City: string (nullable = true)
 |-- Prscrbr_State_Abrvtn: string (nullable = true)
 |-- Prscrbr_State_FIPS: string (nullable = true)
 |-- Prscrbr_Type: string (nullable = true)
 |-- Prscrbr_Type_Src: string (nullable = true)
 |-- Brnd_Name: string (nullable = true)
 |-- Gnrc_Name: string (nullable = true)
 |-- Tot_Clms: integer (nullable = true)
 |-- Tot_30day_Fills: double (nullable = true)
 |-- Tot_Day_Suply: integer (nullable = true)
 |-- Tot_Drug_Cst: double (nullable = true)
 |-- Tot_Benes: integer (nullable = true)
 |-- GE65_Sprsn_Flag: string (nullable = true)
 |-- GE65_Tot_Clms: integer (nullable = true)
 |-- GE65_Tot_30day_Fills: double (nullable = true)
 |-- GE65_Tot_Drug_Cst: double (nullable = true)
 |-- GE65_Tot_Day_Suply: integer (nullable = true)
 |-- GE65_Bene_Sprsn_Flag: string (nullable = true)
 |-- GE65_T

In [60]:
part_d_data_t = df.select(
    col("Prscrbr_NPI").alias("npi"),
    col("Prscrbr_City").alias("city"),
    col("Prscrbr_State_Abrvtn").alias("state"),
    col("Prscrbr_Last_Org_Name").alias("last_name"),
    col("Prscrbr_First_Name").alias("first_name"),
    col("Prscrbr_Type").alias("specialty"),
    col("Brnd_Name").alias("drug_name"),
    col("Gnrc_Name").alias("generic_name"),
    col("Tot_Drug_Cst").alias("total_drug_cost"),
    col("Tot_Clms").alias("total_claim_count"),
    col("Tot_Day_Suply").alias("total_day_supply")
)

In [61]:
part_d_pd1 = part_d_data_t

In [62]:
part_d_drug_df = part_d_data_t.select("npi", "drug_name", "total_drug_cost", "total_claim_count", "total_day_supply", "specialty")

#Change the data type of 'npi' to StringType
part_d_drug_df = part_d_drug_df.withColumn("npi", col("npi").cast(StringType()))
part_d_drug_df.show()


+----------+--------------------+---------------+-----------------+----------------+-----------------+
|       npi|           drug_name|total_drug_cost|total_claim_count|total_day_supply|        specialty|
+----------+--------------------+---------------+-----------------+----------------+-----------------+
|1003000126|  Alendronate Sodium|         125.28|               11|             930|Internal Medicine|
|1003000126| Amlodipine Besylate|         812.86|               64|            5311|Internal Medicine|
|1003000126|            Atenolol|         220.84|               12|            1080|Internal Medicine|
|1003000126|Atorvastatin Calcium|         825.26|               46|            3660|Internal Medicine|
|1003000126|            Cefdinir|          175.5|               11|              89|Internal Medicine|
|1003000126|         Clopidogrel|         128.02|               14|             960|Internal Medicine|
|1003000126|             Eliquis|        7150.87|               15|      

In [63]:
part_d_spec_df1 = part_d_data_t.select("npi", "specialty")
part_d_spec_df1.show()

+----------+-----------------+
|       npi|        specialty|
+----------+-----------------+
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
|1003000126|Internal Medicine|
+----------+-----------------+
only showing top 20 rows



In [64]:
part_d_pd2 = part_d_data_t.select('npi',
                                  'city',
                                  'state',
                                  'last_name',
                                  'first_name',
                                  'specialty')

part_d_pd2.show()

+----------+--------+-----+---------+----------+-----------------+
|       npi|    city|state|last_name|first_name|        specialty|
+----------+--------+-----+---------+----------+-----------------+
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medicine|
|1003000126|Bethesda|   MD|Enkeshafi|   Ardalan|Internal Medic

In [65]:
#Removing Duplicates
part_d_pd_u = part_d_pd2.dropDuplicates()

part_d_pd_u.show()



+----------+-------------+-----+---------+----------+-------------------+
|       npi|         city|state|last_name|first_name|          specialty|
+----------+-------------+-----+---------+----------+-------------------+
|1003028002|      Durango|   CO|   Haynes|      Kent|            Dentist|
|1003041476|     Oak Park|   IL| Lindgren|     Kevin|Allergy/ Immunology|
|1003043555|   Washington|   PA|  Orlosky|     Julie|    Family Practice|
|1003046939|      Spencer|   IA|  Heckert|     Kathi| Nurse Practitioner|
|1003055781|     Hartford|   CT|     Rice|     Jenny|Physician Assistant|
|1003064825|        Parma|   OH| Phillips|    Cherie|   Vascular Surgery|
|1003091539|    Fullerton|   CA|      Woo|      Kiho|  Internal Medicine|
|1003095167|      El Paso|   IL|    Tyner|Jean-Marie| Nurse Practitioner|
|1003101932|   Cumberland|   MD|     Hong|     Feiyu|  Internal Medicine|
|1003119009|     Bellevue|   WA|  Swenson|   Jessica| Nurse Practitioner|
|1003127002|       Laredo|   TX|  Rami

                                                                                

In [66]:
#Group by npi

group_cols = ['npi']

part_d_pd3 = (part_d_pd1.groupBy(group_cols)
             .agg(
                 F.sum("total_drug_cost").alias("sum_total_drug_cost"),
                 F.mean("total_drug_cost").alias("mean_total_drug_cost"),
                 F.max("total_drug_cost").alias("max_total_drug_cost"),
                 F.sum("total_claim_count").alias("sum_total_claim_count"),
                 F.mean("total_claim_count").alias("mean_total_claim_count"),
                 F.max("total_claim_count").alias("max_total_claim_count"),
                 F.sum("total_day_supply").alias("sum_total_day_supply"),
                 F.mean("total_day_supply").alias("mean_total_day_supply"),
                 F.max("total_day_supply").alias("max_total_day_supply")
             ))

part_d_pd3.show()



+----------+-------------------+--------------------+-------------------+---------------------+----------------------+---------------------+--------------------+---------------------+--------------------+
|       npi|sum_total_drug_cost|mean_total_drug_cost|max_total_drug_cost|sum_total_claim_count|mean_total_claim_count|max_total_claim_count|sum_total_day_supply|mean_total_day_supply|max_total_day_supply|
+----------+-------------------+--------------------+-------------------+---------------------+----------------------+---------------------+--------------------+---------------------+--------------------+
|1003043209| 18613.119999999995|   930.6559999999997|            7077.18|                  355|                 17.75|                   44|               21040|               1052.0|                3000|
|1003072810|          282157.48|  3399.4877108433734|           45438.26|                 4100|    49.397590361445786|                  244|              203448|   2451.18072289156

                                                                                

In [67]:
part_d_pd3.count()

                                                                                

1017417

In [68]:
part_d_all_pd = part_d_pd3.join(part_d_pd_u, on='npi', how='left')
part_d_all_pd.show()



+----------+-------------------+--------------------+-------------------+---------------------+----------------------+---------------------+--------------------+---------------------+--------------------+-------------+-----+-------------+----------+-------------------+
|       npi|sum_total_drug_cost|mean_total_drug_cost|max_total_drug_cost|sum_total_claim_count|mean_total_claim_count|max_total_claim_count|sum_total_day_supply|mean_total_day_supply|max_total_day_supply|         city|state|    last_name|first_name|          specialty|
+----------+-------------------+--------------------+-------------------+---------------------+----------------------+---------------------+--------------------+---------------------+--------------------+-------------+-----+-------------+----------+-------------------+
|1003043209| 18613.119999999995|   930.6559999999997|            7077.18|                  355|                 17.75|                   44|               21040|               1052.0|       

                                                                                

In [69]:
#PAYMENT DATA

In [70]:
payment_data_path="../DataSet/PaymentDataSet/OP_DTL_OWNRSHP_PGYR2021_P06302023.csv"
pds = spark.read.csv(payment_data_path, header=True, inferSchema=True)

In [71]:
pds.printSchema()

root
 |-- Change_Type: string (nullable = true)
 |-- Physician_Profile_ID: integer (nullable = true)
 |-- Physician_NPI: integer (nullable = true)
 |-- Physician_First_Name: string (nullable = true)
 |-- Physician_Middle_Name: string (nullable = true)
 |-- Physician_Last_Name: string (nullable = true)
 |-- Physician_Name_Suffix: string (nullable = true)
 |-- Recipient_Primary_Business_Street_Address_Line1: string (nullable = true)
 |-- Recipient_Primary_Business_Street_Address_Line2: string (nullable = true)
 |-- Recipient_City: string (nullable = true)
 |-- Recipient_State: string (nullable = true)
 |-- Recipient_Zip_Code: string (nullable = true)
 |-- Recipient_Country: string (nullable = true)
 |-- Recipient_Province: string (nullable = true)
 |-- Recipient_Postal_Code: string (nullable = true)
 |-- Physician_Primary_Type: string (nullable = true)
 |-- Physician_Specialty: string (nullable = true)
 |-- Record_ID: integer (nullable = true)
 |-- Program_Year: integer (nullable = true)

In [72]:
pds.show()

+-----------+--------------------+-------------+--------------------+---------------------+-------------------+---------------------+-----------------------------------------------+-----------------------------------------------+----------------+---------------+------------------+-----------------+------------------+---------------------+----------------------+--------------------+---------+------------+-------------------------------+-----------------+--------------------+---------------------------------------------------------+-----------------------------------------------------------+-------------------------------------------------------------+--------------------------------------------------------------+----------------------------------------------------------------+------------------------------+--------------------------------------------------------+------------------------+
|Change_Type|Physician_Profile_ID|Physician_NPI|Physician_First_Name|Physician_Middle_Name|Physician

In [73]:
pds_df = pds.select(
    col("Physician_First_Name").alias("first_name"),
    col("Physician_Last_Name").alias("last_name"),
    col("Recipient_State").alias("state"),
    col("Recipient_City").alias("city"),
    col("Total_Amount_Invested_USDollars").alias("total_amount_of_payment_usd")
)

pds_df.show()

+----------+----------+-----+----------------+---------------------------+
|first_name| last_name|state|            city|total_amount_of_payment_usd|
+----------+----------+-----+----------------+---------------------------+
|     Faith|    Brosch|   DE|          NEWARK|                        0.0|
|      Troy|  Brothers|   FL|          Naples|                        0.0|
|      Anne|     Brown|   VA|        Leesburg|                        0.0|
|     David|     Brown|   FL|      FORT MYERS|                        0.0|
|    Ingrid|     Brown|   TX|      Round Rock|                        0.0|
|     Kevin|     Brown|   TX|      Round Rock|                        0.0|
|    Steven|     Brown|   FL|      Palm Coast|                        0.0|
|   Michael|   Buckley|   NC|         Raleigh|                        0.0|
|  Christin|Richardson|   NC|         Raleigh|                        0.0|
|     Kathy|Richardson|   NC|      Greensboro|                        0.0|
|    Kellyn|    Rielly|  

In [74]:
pds_df1 = (pds_df
           .groupBy('first_name', 'last_name', 'state', 'city')
           .agg(F.sum('total_amount_of_payment_usd').alias('sum_total_amount_of_payment_usd'))
           .withColumn('total_amount_of_payment_usd', F.col('sum_total_amount_of_payment_usd').cast('float')))

pds_df1.show()

+-----------+----------+-----+---------------+-------------------------------+---------------------------+
| first_name| last_name|state|           city|sum_total_amount_of_payment_usd|total_amount_of_payment_usd|
+-----------+----------+-----+---------------+-------------------------------+---------------------------+
|       Mary|   Goodwin|   NC|      Asheville|                            0.0|                        0.0|
|Christopher|      Ames|   CA|  SAN FRANCISCO|                            0.0|                        0.0|
|   Rajeshri|     Patel|   NY|     Long Beach|                        38348.0|                    38348.0|
|     George|    Ferzli|   NY|  Staten Island|                        70000.0|                    70000.0|
|    Michael| Ziebelman|   FL|   Winter Haven|                            0.0|                        0.0|
|    STEPHEN|     RAMEE|   LA|    NEW ORLEANS|                            0.0|                        0.0|
|       JOHN|    VUKICH|   WI|      W

In [75]:
# Convert all string columns to uppercase

for col_name in pds_df1.columns:
    if isinstance(pds_df1.schema[col_name].dataType, StringType):
        pds_df1 = pds_df1.withColumn(col_name, F.upper(F.col(col_name)))

pds_df1.show()

+-----------+----------+-----+---------------+-------------------------------+---------------------------+
| first_name| last_name|state|           city|sum_total_amount_of_payment_usd|total_amount_of_payment_usd|
+-----------+----------+-----+---------------+-------------------------------+---------------------------+
|       MARY|   GOODWIN|   NC|      ASHEVILLE|                            0.0|                        0.0|
|CHRISTOPHER|      AMES|   CA|  SAN FRANCISCO|                            0.0|                        0.0|
|   RAJESHRI|     PATEL|   NY|     LONG BEACH|                        38348.0|                    38348.0|
|     GEORGE|    FERZLI|   NY|  STATEN ISLAND|                        70000.0|                    70000.0|
|    MICHAEL| ZIEBELMAN|   FL|   WINTER HAVEN|                            0.0|                        0.0|
|    STEPHEN|     RAMEE|   LA|    NEW ORLEANS|                            0.0|                        0.0|
|       JOHN|    VUKICH|   WI|      W

In [76]:
#Left join on Part D & Payment

pay_part_d_fpd = part_d_all_pd.join(pds_df1, on=['last_name', 'first_name', 'city', 'state'], how='left')

pay_part_d_fpd.show()

[Stage 145:>                                                        (0 + 4) / 4]

+-------------+----------+-------------+-----+----------+-------------------+--------------------+-------------------+---------------------+----------------------+---------------------+--------------------+---------------------+--------------------+-------------------+-------------------------------+---------------------------+
|    last_name|first_name|         city|state|       npi|sum_total_drug_cost|mean_total_drug_cost|max_total_drug_cost|sum_total_claim_count|mean_total_claim_count|max_total_claim_count|sum_total_day_supply|mean_total_day_supply|max_total_day_supply|          specialty|sum_total_amount_of_payment_usd|total_amount_of_payment_usd|
+-------------+----------+-------------+-----+----------+-------------------+--------------------+-------------------+---------------------+----------------------+---------------------+--------------------+---------------------+--------------------+-------------------+-------------------------------+---------------------------+
|        A

                                                                                

In [77]:

pay_part_d_fpd.count()

                                                                                

1017417

In [78]:
###LEIE DATA PROCESSING

In [79]:
leie_data_path = "../DataSet/LEIE.csv"
leie_df = spark.read.csv(leie_data_path, header=True, inferSchema=True)
leie_df.show()

+--------+---------+-------+--------------------+------------------+------------------+----+----------+----+--------------------+----------+-----+-----+--------+--------+--------+----------+--------+
|LASTNAME|FIRSTNAME|MIDNAME|             BUSNAME|           GENERAL|         SPECIALTY|UPIN|       NPI| DOB|             ADDRESS|      CITY|STATE|  ZIP|EXCLTYPE|EXCLDATE|REINDATE|WAIVERDATE|WVRSTATE|
+--------+---------+-------+--------------------+------------------+------------------+----+----------+----+--------------------+----------+-----+-----+--------+--------+--------+----------+--------+
|    NULL|     NULL|   NULL|#1 MARKETING SERV...|    OTHER BUSINESS|        SOBER HOME|NULL|         0|NULL|239 BRIGHTON BEAC...|  BROOKLYN|   NY|11235|  1128a1|20200319|       0|         0|    NULL|
|    NULL|     NULL|   NULL|    1 BEST CARE, INC|    OTHER BUSINESS|HOME HEALTH AGENCY|NULL|         0|NULL|2161 UNIVERSITY A...|SAINT PAUL|   MN|55114|  1128b5|20230518|       0|         0|    NULL|


In [80]:
leie_df.printSchema()

root
 |-- LASTNAME: string (nullable = true)
 |-- FIRSTNAME: string (nullable = true)
 |-- MIDNAME: string (nullable = true)
 |-- BUSNAME: string (nullable = true)
 |-- GENERAL: string (nullable = true)
 |-- SPECIALTY: string (nullable = true)
 |-- UPIN: string (nullable = true)
 |-- NPI: integer (nullable = true)
 |-- DOB: integer (nullable = true)
 |-- ADDRESS: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- ZIP: string (nullable = true)
 |-- EXCLTYPE: string (nullable = true)
 |-- EXCLDATE: string (nullable = true)
 |-- REINDATE: integer (nullable = true)
 |-- WAIVERDATE: integer (nullable = true)
 |-- WVRSTATE: string (nullable = true)



In [81]:
leie_df1 = leie_df.select(
    col('NPI').alias("npi"),
    col('EXCLTYPE').alias("is_fraud"),
    col('EXCLDATE'),
    col('REINDATE'),
    col('WAIVERDATE')
)
leie_df1.show()

+----------+--------+--------+--------+----------+
|       npi|is_fraud|EXCLDATE|REINDATE|WAIVERDATE|
+----------+--------+--------+--------+----------+
|         0|  1128a1|20200319|       0|         0|
|         0|  1128b5|20230518|       0|         0|
|1972902351|  1128b8|20220320|       0|         0|
|         0|  1128a1|19880830|       0|         0|
|         0|  1128b7|19970620|       0|         0|
|         0|  1128b6|20230227|       0|         0|
|1922348218|  1128a1|20180419|       0|         0|
|         0|  1128b5|20090319|       0|         0|
|         0|  1128a1|19940524|       0|         0|
|         0|  1128b8|20020919|       0|         0|
|         0|  1128a1|20110818|       0|         0|
|         0|  1128b8|20030720|       0|         0|
|         0|  1128b5|19970903|       0|         0|
|         0|  1128a1|19910704|       0|         0|
|         0|  1128b8|19861204|       0|         0|
|         0|  1128a1|20190620|       0|         0|
|         0|  1128a1|19911229| 

In [82]:
#Select valid leie data

leie_df2 = leie_df1.filter(col('npi') != 0)
leie_df2.show()

+----------+--------+--------+--------+----------+
|       npi|is_fraud|EXCLDATE|REINDATE|WAIVERDATE|
+----------+--------+--------+--------+----------+
|1972902351|  1128b8|20220320|       0|         0|
|1922348218|  1128a1|20180419|       0|         0|
|1942476080|  1128b8|20170518|       0|         0|
|1275600959|  1128a1|20130320|       0|         0|
|1891731758|  1128b8|20170518|       0|         0|
|1265830335|  1128a1|20220818|       0|         0|
|1851631543|  1128b7|20190326|       0|         0|
|1902198435|  1128a1|20160120|       0|         0|
|1073916631|  1128b7|20210816|       0|         0|
|1437510278|  1128a1|20230420|       0|         0|
|1073682936|  1128b7|20150409|       0|         0|
|1902166028|  1128b8|20170518|       0|         0|
|1992906937|  1128b8|20140720|       0|         0|
|1104947944|  1128a1|20140820|       0|         0|
|1164669479|  1128a1|20161020|       0|         0|
|1043302250|  1128a1|20160720|       0|         0|
|1801231436|  1128a1|20211029| 

In [83]:
#Set is fraud as 1

leie_df2 = leie_df2.withColumn(target, lit(1))
# leie_df2 = leie_df2.withColumn('is_fraud', when(col('is_fraud').isNull(), lit(0.0)).otherwise(lit(1.0)))

leie_df2.show()

+----------+--------+--------+--------+----------+
|       npi|is_fraud|EXCLDATE|REINDATE|WAIVERDATE|
+----------+--------+--------+--------+----------+
|1972902351|       1|20220320|       0|         0|
|1922348218|       1|20180419|       0|         0|
|1942476080|       1|20170518|       0|         0|
|1275600959|       1|20130320|       0|         0|
|1891731758|       1|20170518|       0|         0|
|1265830335|       1|20220818|       0|         0|
|1851631543|       1|20190326|       0|         0|
|1902198435|       1|20160120|       0|         0|
|1073916631|       1|20210816|       0|         0|
|1437510278|       1|20230420|       0|         0|
|1073682936|       1|20150409|       0|         0|
|1902166028|       1|20170518|       0|         0|
|1992906937|       1|20140720|       0|         0|
|1104947944|       1|20140820|       0|         0|
|1164669479|       1|20161020|       0|         0|
|1043302250|       1|20160720|       0|         0|
|1801231436|       1|20211029| 

In [84]:
leie_df2.printSchema()

root
 |-- npi: integer (nullable = true)
 |-- is_fraud: integer (nullable = false)
 |-- EXCLDATE: string (nullable = true)
 |-- REINDATE: integer (nullable = true)
 |-- WAIVERDATE: integer (nullable = true)



In [85]:
#left join (payment + part D ) with leie transformed data

feature_1 = pay_part_d_fpd.join(leie_df2, on='npi', how='left')

In [86]:
feature_1.printSchema()

root
 |-- npi: integer (nullable = true)
 |-- last_name: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- sum_total_drug_cost: double (nullable = true)
 |-- mean_total_drug_cost: double (nullable = true)
 |-- max_total_drug_cost: double (nullable = true)
 |-- sum_total_claim_count: long (nullable = true)
 |-- mean_total_claim_count: double (nullable = true)
 |-- max_total_claim_count: integer (nullable = true)
 |-- sum_total_day_supply: long (nullable = true)
 |-- mean_total_day_supply: double (nullable = true)
 |-- max_total_day_supply: integer (nullable = true)
 |-- specialty: string (nullable = true)
 |-- sum_total_amount_of_payment_usd: double (nullable = true)
 |-- total_amount_of_payment_usd: float (nullable = true)
 |-- is_fraud: integer (nullable = true)
 |-- EXCLDATE: string (nullable = true)
 |-- REINDATE: integer (nullable = true)
 |-- WAIVERDATE: integer (nullable = true)



In [87]:
#Filling 0 for NA
feature_1 = feature_1.fillna(0)
feature_1.show()

[Stage 179:>                                                        (0 + 4) / 4]

+----------+-------------+----------+-------------+-----+-------------------+--------------------+-------------------+---------------------+----------------------+---------------------+--------------------+---------------------+--------------------+-------------------+-------------------------------+---------------------------+--------+--------+--------+----------+
|       npi|    last_name|first_name|         city|state|sum_total_drug_cost|mean_total_drug_cost|max_total_drug_cost|sum_total_claim_count|mean_total_claim_count|max_total_claim_count|sum_total_day_supply|mean_total_day_supply|max_total_day_supply|          specialty|sum_total_amount_of_payment_usd|total_amount_of_payment_usd|is_fraud|EXCLDATE|REINDATE|WAIVERDATE|
+----------+-------------+----------+-------------+-----+-------------------+--------------------+-------------------+---------------------+----------------------+---------------------+--------------------+---------------------+--------------------+---------------

                                                                                

In [88]:
#Set is_fraud as 0 in case is_fraud not present

feature_1 = feature_1.withColumn(target, when(col(target) == 1.0, lit(1.0)).otherwise(lit(0.0)))


In [89]:
# feature_1.count()

In [90]:
# feature_1.filter(col('is_fraud') == 1).count()
# 

In [91]:
# feature_1.filter(col('is_fraud') == 0).count()
#

In [92]:
# feature_l = feature_1.limit(50000)

In [93]:
# feature_l.count()

In [94]:
# feature_2 = feature_1.filter(col('is_fraud') == 1)

In [95]:
# feature_2.count()

In [96]:
# feature_3 = feature_2.union(feature_l)

In [97]:
# feature_3.count()

In [98]:
feature_all_df = feature_1

In [99]:
feature_all_df.printSchema()

root
 |-- npi: integer (nullable = true)
 |-- last_name: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- sum_total_drug_cost: double (nullable = false)
 |-- mean_total_drug_cost: double (nullable = false)
 |-- max_total_drug_cost: double (nullable = false)
 |-- sum_total_claim_count: long (nullable = true)
 |-- mean_total_claim_count: double (nullable = false)
 |-- max_total_claim_count: integer (nullable = true)
 |-- sum_total_day_supply: long (nullable = true)
 |-- mean_total_day_supply: double (nullable = false)
 |-- max_total_day_supply: integer (nullable = true)
 |-- specialty: string (nullable = true)
 |-- sum_total_amount_of_payment_usd: double (nullable = false)
 |-- total_amount_of_payment_usd: float (nullable = false)
 |-- is_fraud: double (nullable = false)
 |-- EXCLDATE: string (nullable = true)
 |-- REINDATE: integer (nullable = true)
 |-- WAIVERDATE: integer (nullable = true)


In [224]:
# Scaling the features
# Apply log transformation and create new columns for differences


feature_all_df = (feature_all_df
                  .withColumn('sum_total_drug_cost', log10(col('sum_total_drug_cost') + 1.0))
                  .withColumn('sum_total_claim_count', log10(col('sum_total_claim_count') + 1.0))
                  .withColumn('sum_total_day_supply', log10(col('sum_total_day_supply') + 1.0))
                  .withColumn('sum_total_amount_of_payment_usd', log10(col('sum_total_amount_of_payment_usd') + 1.0))
                  .withColumn('mean_total_drug_cost', log10(col('mean_total_drug_cost') + 1.0))
                  .withColumn('mean_total_claim_count', log10(col('mean_total_claim_count') + 1.0))
                  .withColumn('mean_total_day_supply', log10(col('mean_total_day_supply') + 1.0))
                  .withColumn('max_total_drug_cost', log10(col('max_total_drug_cost') + 1.0))
                  .withColumn('max_total_claim_count', log10(col('max_total_claim_count') + 1.0))
                  .withColumn('max_total_day_supply', log10(col('max_total_day_supply') + 1.0))
                  .withColumn('claim_max_mean', col('max_total_claim_count') - col('mean_total_claim_count'))
                  .withColumn('supply_max_mean', col('max_total_day_supply') - col('max_total_day_supply'))
                  .withColumn('drug_max_mean', col('max_total_drug_cost') - col('mean_total_drug_cost')))

feature_all_df.show()



+----------+------------+----------+--------------+-----+-------------------+--------------------+-------------------+---------------------+----------------------+---------------------+--------------------+---------------------+--------------------+--------------------+-------------------------------+---------------------------+--------+--------+--------+----------+--------------------+---------------+--------------------+
|       npi|   last_name|first_name|          city|state|sum_total_drug_cost|mean_total_drug_cost|max_total_drug_cost|sum_total_claim_count|mean_total_claim_count|max_total_claim_count|sum_total_day_supply|mean_total_day_supply|max_total_day_supply|           specialty|sum_total_amount_of_payment_usd|total_amount_of_payment_usd|is_fraud|EXCLDATE|REINDATE|WAIVERDATE|      claim_max_mean|supply_max_mean|       drug_max_mean|
+----------+------------+----------+--------------+-----+-------------------+--------------------+-------------------+---------------------+------

                                                                                

In [225]:
categorical_features = ['npi','last_name', 'specialty','first_name','city', 'state']

numerical_features = ['sum_total_drug_cost', 'mean_total_drug_cost','total_amount_of_payment_usd',
                        'max_total_drug_cost', 'sum_total_claim_count',
                        'mean_total_claim_count', 'max_total_claim_count',
                        'sum_total_day_supply', 'mean_total_day_supply', 'max_total_day_supply',
                        'claim_max_mean','supply_max_mean', 'drug_max_mean']


In [226]:
# scikit learn 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder 
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
import pandas as pd
import numpy as np


import pandas as pd
import numpy as np
import scipy
import os 

import matplotlib.pyplot as plt


from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression 
from sklearn import ensemble 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import brier_score_loss, precision_score, recall_score,f1_score, roc_auc_score, accuracy_score 
from sklearn.metrics import confusion_matrix, roc_curve

from sklearn.preprocessing import StandardScaler 
from sklearn.feature_extraction import DictVectorizer
from sklearn.cluster import KMeans

import random

from scipy.stats import ttest_ind

In [227]:
feature_all = categorical_features + numerical_features + ['is_fraud']

In [228]:
#Save features in csv file
import shutil
path = './features_all'

if os.path.exists(path):
    shutil.rmtree(path)

feature_all_df.write.option("header", "true").csv(path)


                                                                                

In [229]:
#Batch size for csv read
chunk_size = 10000 

In [230]:
import glob

In [231]:


#Read all *csv from features_all folder
csv_files = glob.glob(f"{path}/part-*.csv")

features_all_pd = pd.DataFrame()

#Read in Batches
for filename in csv_files:
    for chunk in pd.read_csv(filename, chunksize=chunk_size):
        features_all_pd = pd.concat([features_all_pd, chunk], ignore_index=True)

In [232]:
features_all_pd.count()

npi                                1017418
last_name                          1017418
first_name                         1017416
city                               1017417
state                              1017418
sum_total_drug_cost                1017418
mean_total_drug_cost               1017418
max_total_drug_cost                1017418
sum_total_claim_count              1017418
mean_total_claim_count             1017418
max_total_claim_count              1017418
sum_total_day_supply               1017418
mean_total_day_supply              1017418
max_total_day_supply               1017418
specialty                          1017413
sum_total_amount_of_payment_usd    1017418
total_amount_of_payment_usd        1017418
is_fraud                           1017418
EXCLDATE                               240
REINDATE                           1017418
WAIVERDATE                         1017418
claim_max_mean                     1017418
supply_max_mean                    1017418
drug_max_me

In [233]:
y = features_all_pd[target].values

In [234]:
x = features_all_pd[feature_all].drop(target,axis=1)

In [235]:
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=0)


In [236]:
print(x_train.shape)
print(x_valid.shape)

(813934, 19)
(203484, 19)


In [237]:
x_train[numerical_features] = x_train.loc[:,numerical_features].fillna(0) 
x_valid[numerical_features] = x_valid.loc[:,numerical_features].fillna(0) 

In [238]:
x_train[categorical_features] = x_train.loc[:,categorical_features].fillna('NA') 
x_valid[categorical_features] = x_valid.loc[:,categorical_features].fillna('NA')

In [239]:
scaler= StandardScaler() 
x_train[numerical_features] = scaler.fit_transform(x_train[numerical_features].values)
x_valid[numerical_features] = scaler.transform(x_valid[numerical_features].values)

In [240]:
# Combining all feature names
all_features = categorical_features + numerical_features + ['target']


In [241]:
print(x_train[numerical_features].dtypes)

sum_total_drug_cost            float64
mean_total_drug_cost           float64
total_amount_of_payment_usd    float64
max_total_drug_cost            float64
sum_total_claim_count          float64
mean_total_claim_count         float64
max_total_claim_count          float64
sum_total_day_supply           float64
mean_total_day_supply          float64
max_total_day_supply           float64
claim_max_mean                 float64
supply_max_mean                float64
drug_max_mean                  float64
dtype: object


In [242]:
iloc_ran = features_all_pd.index.values
np.random.shuffle(iloc_ran)


In [243]:
pd_len = len(features_all_pd)

train_len = int(pd_len * 0.8)

iloc_train = iloc_ran[:train_len]
iloc_valid = iloc_ran[train_len:]

In [244]:
pd_train = features_all_pd.iloc[iloc_train]
pd_valid = features_all_pd.iloc[iloc_valid]

In [245]:
print(len(iloc_train))
print(len(iloc_valid))

813934
203484


In [246]:
# part_d_drug_df_lim = feature_3.limit(100000)
# part_d_drug_pd = part_d_drug_df_lim.toPandas()

# part_d_drug_pd_tmp = part_d_drug_df.join(feature_all_df.select('npi'), on='npi', how='inner')
# part_d_drug_pd_tmp = part_d_drug_pd_tmmp.limit(500000)

# part_d_drug_pd['npi'] = part_d_drug_pd.npi.astype(object)

In [247]:
path = './part_d_drug_pd'


if os.path.exists(path):
    shutil.rmtree(path)

part_d_drug_df.write.option("header", "true").csv(path)

                                                                                

In [248]:
all_csv_files = glob.glob(f"{path}/part-*.csv")
part_d_drug_pd = pd.DataFrame()

# Iterate over each file
for filename in all_csv_files:
    # Read the file in chunks (if needed)
    for chunk in pd.read_csv(filename, chunksize=chunk_size):
        part_d_drug_pd = pd.concat([part_d_drug_pd, chunk], ignore_index=True)

In [249]:
# # Initialize an empty DataFrame for the entire data
# part_d_drug_pd = pd.DataFrame()

# # Read the CSV files in chunks
# for chunk in pd.read_csv("./part_d_drug_pd/*csv", chunksize=chunk_size):
#     # Process each chunk (if needed) and append to the full DataFrame
#     part_d_drug_pd = pd.concat([full_df, chunk], ignore_index=True)

In [250]:
# part_d_drug_df.limit(100000).filter(col('is_fraud') == 1)

In [251]:
# Drug Weighted_Scores

part_d_drug_pd_train = pd.merge(part_d_drug_pd,pd_train[['npi','is_fraud']], how='inner', on=['npi'])


In [252]:
part_d_drug_pd_train.head()

Unnamed: 0,npi,drug_name,total_drug_cost,total_claim_count,total_day_supply,specialty,is_fraud
0,1487081030,Levocetirizine Dihydrochloride,498.02,17,1410,Family Practice,0.0
1,1487081030,Levofloxacin,112.06,26,175,Family Practice,0.0
2,1487081030,Levothyroxine Sodium,6467.23,342,24813,Family Practice,0.0
3,1487081030,Linzess,13355.42,17,810,Family Practice,0.0
4,1487081030,Lisinopril,2112.63,241,16275,Family Practice,0.0


In [253]:
part_d_drug_pd_all = pd.merge(part_d_drug_pd,features_all_pd[['npi','is_fraud']], how='inner', on=['npi'])

In [254]:
part_d_drug_pd.head()

Unnamed: 0,npi,drug_name,total_drug_cost,total_claim_count,total_day_supply,specialty
0,1487081030,Levocetirizine Dihydrochloride,498.02,17,1410,Family Practice
1,1487081030,Levofloxacin,112.06,26,175,Family Practice
2,1487081030,Levothyroxine Sodium,6467.23,342,24813,Family Practice
3,1487081030,Linzess,13355.42,17,810,Family Practice
4,1487081030,Lisinopril,2112.63,241,16275,Family Practice


In [255]:
print(len(part_d_drug_pd_train[part_d_drug_pd_train[target]==1]))

8887


In [256]:
# Distinct Drug Name
drugs = set([ drugx for drugx in part_d_drug_pd_train['drug_name'].values if isinstance(drugx, str)])
print(len(drugs))

2958


In [257]:
part_d_drug_pd_train.head(5)

Unnamed: 0,npi,drug_name,total_drug_cost,total_claim_count,total_day_supply,specialty,is_fraud
0,1487081030,Levocetirizine Dihydrochloride,498.02,17,1410,Family Practice,0.0
1,1487081030,Levofloxacin,112.06,26,175,Family Practice,0.0
2,1487081030,Levothyroxine Sodium,6467.23,342,24813,Family Practice,0.0
3,1487081030,Linzess,13355.42,17,810,Family Practice,0.0
4,1487081030,Lisinopril,2112.63,241,16275,Family Practice,0.0


In [258]:
print("Training Record Count : ")
print(len(part_d_drug_pd_train))
print("Fraud Record Count in Training: ")
print(len(part_d_drug_pd_train[part_d_drug_pd_train[target]==1]))

Training Record Count : 
20193881
Fraud Record Count in Training: 
8887


In [259]:
part_d_drug_pd_train.head()

Unnamed: 0,npi,drug_name,total_drug_cost,total_claim_count,total_day_supply,specialty,is_fraud
0,1487081030,Levocetirizine Dihydrochloride,498.02,17,1410,Family Practice,0.0
1,1487081030,Levofloxacin,112.06,26,175,Family Practice,0.0
2,1487081030,Levothyroxine Sodium,6467.23,342,24813,Family Practice,0.0
3,1487081030,Linzess,13355.42,17,810,Family Practice,0.0
4,1487081030,Lisinopril,2112.63,241,16275,Family Practice,0.0


In [260]:
cols = ['total_drug_cost','total_claim_count','total_day_supply']

In [261]:
part_d_drug_pd_train_group = part_d_drug_pd_train.groupby(['drug_name', 'is_fraud'])
part_d_drug_all_group = part_d_drug_pd_all.groupby(['drug_name', 'is_fraud'])

In [262]:
drug_keys = part_d_drug_pd_train_group.groups.keys()
print(len(drug_keys))

3789


In [263]:
drug_keys

dict_keys([('1st Tier Unifine Pentips', 0.0), ('1st Tier Unifine Pentips Plus', 0.0), ('Abacavir', 0.0), ('Abacavir', 1.0), ('Abacavir-Lamivudine', 0.0), ('Abacavir-Lamivudine', 1.0), ('Abacavir-Lamivudine-Zidovudine', 0.0), ('Abelcet', 0.0), ('Abilify', 0.0), ('Abilify Maintena', 0.0), ('Abilify Maintena', 1.0), ('Abiraterone Acetate', 0.0), ('Abiraterone Acetate', 1.0), ('Abraxane', 0.0), ('Acamprosate Calcium', 0.0), ('Acarbose', 0.0), ('Acarbose', 1.0), ('Accolate', 0.0), ('Accupril', 0.0), ('Acebutolol Hcl', 0.0), ('Acebutolol Hcl', 1.0), ('Acetamin-Caff-Dihydrocodeine', 0.0), ('Acetaminophen', 0.0), ('Acetaminophen-Codeine', 0.0), ('Acetaminophen-Codeine', 1.0), ('Acetazolamide', 0.0), ('Acetazolamide', 1.0), ('Acetazolamide Er', 0.0), ('Acetic Acid', 0.0), ('Acetic Acid', 1.0), ('Acetylcysteine', 0.0), ('Acetylcysteine', 1.0), ('Aciphex', 0.0), ('Acitretin', 0.0), ('Actemra', 0.0), ('Actemra Actpen', 0.0), ('Acthar', 0.0), ('Acthib', 0.0), ('Actimmune', 0.0), ('Activella', 0.0),

In [264]:
drug_with_isfraud = [drugx for drugx in drugs if ((drugx,0.0) in drug_keys ) & ( (drugx,1.0) in drug_keys)]

In [265]:
re_drug_tt = dict()
for drugx in drug_with_isfraud:
    for colx in cols:
        fraud_0 = part_d_drug_pd_train_group.get_group((drugx,0.0))[colx].values
        fraud_1 = part_d_drug_pd_train_group.get_group((drugx,1.0))[colx].values
        # print len(fraud_0), len(fraud_1)
        if (len(fraud_0)>2) & (len(fraud_1)>2) :
            tt = ttest_ind(fraud_0, fraud_1)
            re_drug_tt[(drugx, colx)] = tt

In [266]:
#Probabilities Configuration

prob_05 = [(key, p) for (key, (t, p)) in re_drug_tt.items() if p <=0.05]  
print(len(prob_05))

340


In [267]:
inx=100
drug_name = prob_05[inx][0][0]
print(drug_name)
df_bar = pd.concat([part_d_drug_all_group.get_group((prob_05[inx][0][0],0.0)), part_d_drug_all_group.get_group((prob_05[inx][0][0],1.0))])
df_bar.head()

Zolpidem Tartrate


Unnamed: 0,npi,drug_name,total_drug_cost,total_claim_count,total_day_supply,specialty,is_fraud
77,1487081030,Zolpidem Tartrate,219.13,37,1610,Family Practice,0.0
455,1487083234,Zolpidem Tartrate,72.96,15,450,Nurse Practitioner,0.0
516,1487084406,Zolpidem Tartrate,215.1,31,849,Nurse Practitioner,0.0
655,1487085254,Zolpidem Tartrate,99.51,20,600,Internal Medicine,0.0
867,1487085593,Zolpidem Tartrate,478.52,60,2370,Nurse Practitioner,0.0


In [268]:
feature_drug_weighted = []
new_col_all =[]
for i, p005x in enumerate(prob_05):
    #if i>4:
    #   break
    drug_name = p005x[0][0]
    cat_name = p005x[0][1] 
    
    new_col = drug_name+'_'+cat_name
    new_col_all.append(new_col)

    drug_0 = part_d_drug_all_group.get_group((drug_name,0.0))[['npi', cat_name]]
    drug_1 = part_d_drug_all_group.get_group((drug_name,1.0))[['npi', cat_name]]

    drug_01 = pd.concat([drug_0, drug_1])
    drug_01.rename(columns={cat_name: new_col}, inplace=True)
    feature_drug_weighted.append(drug_01)

In [269]:
npi_col = features_all_pd[['npi']]

w_npi = []

for n, nx in enumerate(feature_drug_weighted):
      nggx = pd.merge(npi_col, nx.drop_duplicates(['npi']), on='npi', how='left')
      w_npi.append(nggx)

In [270]:
features_all_pd1 = features_all_pd

In [271]:
for wx in w_npi:
    col_n = wx.columns[1]
    features_all_pd1[col_n] = wx[col_n].values
    
wx = w_npi[0]
wx.columns[1]
col_n = wx.columns[1]

  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features_all_pd1[col_n] = wx[col_n].values
  features

In [272]:
len(wx[col_n].values)
features_all_pd1.fillna(0)

Unnamed: 0,npi,last_name,first_name,city,state,sum_total_drug_cost,mean_total_drug_cost,max_total_drug_cost,sum_total_claim_count,mean_total_claim_count,...,Fluoxetine Hcl_total_claim_count,Fluoxetine Hcl_total_day_supply,Oxybutynin Chloride_total_drug_cost,Oxybutynin Chloride_total_claim_count,Alcohol Pads_total_claim_count,Alcohol Pads_total_day_supply,Prednisone_total_claim_count,Bupropion Xl_total_drug_cost,Bupropion Xl_total_claim_count,Bupropion Xl_total_day_supply
0,1003000142,Khalil,Rashid,Toledo,OH,0.738932,0.619091,0.676030,0.618574,0.453142,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
1,1003001256,Caragol,Jennifer,Evans,CO,0.679980,0.561206,0.644386,0.529176,0.353860,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
2,1003002320,Eklund,D.,Pearl,MS,0.480563,0.480563,0.480563,0.373230,0.373230,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
3,1003002858,Gibson,Amy,Plainfield,NJ,0.749870,0.673823,0.733236,0.504968,0.364437,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
4,1003003153,Morrison,Laura,Seattle,WA,0.797377,0.729853,0.787527,0.544538,0.416662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017413,1992974703,Beg,Maria,Mequon,WI,0.752344,0.600123,0.707922,0.607121,0.378827,...,16.0,660.0,0.0,0.0,0.0,0.0,12.0,612.66,11.0,447.0
1017414,1992986434,Marsh,Robert,Morgantown,WV,0.551866,0.513692,0.534380,0.432580,0.382912,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
1017415,1992990238,Vaughn,Wallisa,Roanoke,VA,0.462844,0.462844,0.462844,0.325093,0.325093,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
1017416,1992997522,Nguyen,Dao,Miami,FL,0.575325,0.486461,0.549139,0.466019,0.351316,...,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.00,0.0,0.0


In [273]:
len(wx[col_n].values)
features_all_pd1.fillna(0)

Unnamed: 0,npi,last_name,first_name,city,state,sum_total_drug_cost,mean_total_drug_cost,max_total_drug_cost,sum_total_claim_count,mean_total_claim_count,...,Fluoxetine Hcl_total_claim_count,Fluoxetine Hcl_total_day_supply,Oxybutynin Chloride_total_drug_cost,Oxybutynin Chloride_total_claim_count,Alcohol Pads_total_claim_count,Alcohol Pads_total_day_supply,Prednisone_total_claim_count,Bupropion Xl_total_drug_cost,Bupropion Xl_total_claim_count,Bupropion Xl_total_day_supply
0,1003000142,Khalil,Rashid,Toledo,OH,0.738932,0.619091,0.676030,0.618574,0.453142,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
1,1003001256,Caragol,Jennifer,Evans,CO,0.679980,0.561206,0.644386,0.529176,0.353860,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
2,1003002320,Eklund,D.,Pearl,MS,0.480563,0.480563,0.480563,0.373230,0.373230,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
3,1003002858,Gibson,Amy,Plainfield,NJ,0.749870,0.673823,0.733236,0.504968,0.364437,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
4,1003003153,Morrison,Laura,Seattle,WA,0.797377,0.729853,0.787527,0.544538,0.416662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017413,1992974703,Beg,Maria,Mequon,WI,0.752344,0.600123,0.707922,0.607121,0.378827,...,16.0,660.0,0.0,0.0,0.0,0.0,12.0,612.66,11.0,447.0
1017414,1992986434,Marsh,Robert,Morgantown,WV,0.551866,0.513692,0.534380,0.432580,0.382912,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
1017415,1992990238,Vaughn,Wallisa,Roanoke,VA,0.462844,0.462844,0.462844,0.325093,0.325093,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
1017416,1992997522,Nguyen,Dao,Miami,FL,0.575325,0.486461,0.549139,0.466019,0.351316,...,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.00,0.0,0.0


In [274]:
features_all_pd1[new_col_all].describe()

Unnamed: 0,Clonidine Hcl_total_drug_cost,Clonidine Hcl_total_claim_count,Clonidine Hcl_total_day_supply,Simvastatin_total_claim_count,Metformin Hcl_total_drug_cost,Metformin Hcl_total_claim_count,Metformin Hcl_total_day_supply,Methylphenidate Hcl_total_claim_count,Venlafaxine Hcl_total_claim_count,Losartan-Hydrochlorothiazide_total_drug_cost,...,Fluoxetine Hcl_total_claim_count,Fluoxetine Hcl_total_day_supply,Oxybutynin Chloride_total_drug_cost,Oxybutynin Chloride_total_claim_count,Alcohol Pads_total_claim_count,Alcohol Pads_total_day_supply,Prednisone_total_claim_count,Bupropion Xl_total_drug_cost,Bupropion Xl_total_claim_count,Bupropion Xl_total_day_supply
count,95187.0,95187.0,95187.0,212930.0,238056.0,238056.0,238056.0,18035.0,19610.0,112460.0,...,153320.0,153320.0,61863.0,61863.0,4002.0,4002.0,248688.0,120046.0,120046.0,120046.0
mean,298.624908,31.320201,1625.370313,85.551782,744.48812,99.216529,6879.845919,21.986693,19.882815,953.427485,...,40.928489,2214.756424,687.644981,27.284031,27.303098,1281.148926,45.430286,1640.922027,38.892016,2072.571631
std,328.920615,36.755251,1665.494054,97.437199,881.199199,113.75579,7647.861328,18.214275,17.002612,1042.39479,...,56.282411,2264.839335,699.546859,26.840435,36.1377,1653.809065,73.550662,1833.988376,49.885129,2211.090293
min,0.0,11.0,11.0,11.0,0.0,11.0,70.0,11.0,11.0,0.0,...,11.0,62.0,0.0,11.0,11.0,150.0,11.0,10.03,11.0,55.0
25%,120.38,14.0,750.0,25.0,190.45,27.0,1742.0,12.0,12.0,378.52,...,17.0,990.0,304.71,14.0,13.0,515.0,16.0,641.8025,16.0,900.0
50%,210.38,21.0,1170.0,53.0,454.105,62.0,4271.0,16.0,15.0,656.62,...,27.0,1600.0,509.11,19.0,17.0,840.0,25.0,1092.205,25.0,1440.0
75%,364.975,35.0,1920.0,111.0,975.9525,130.0,9280.25,24.0,21.0,1160.3775,...,47.0,2760.0,839.175,30.0,28.0,1370.0,46.0,1963.485,43.0,2490.0
max,25003.47,3132.0,90014.0,5252.0,27352.89,4204.0,226631.0,358.0,608.0,26783.3,...,4817.0,147886.0,25903.23,968.0,636.0,24825.0,3626.0,44357.94,3023.0,92277.0


In [275]:
features_all_pd1['drug_mean'] = features_all_pd1[new_col_all].mean(axis=1)

  features_all_pd1['drug_mean'] = features_all_pd1[new_col_all].mean(axis=1)


In [276]:
features_all_pd['drug_mean'] = features_all_pd['drug_mean'].map(lambda x: np.log10(x + 1.0))

In [277]:
features_all_pd1['drug_sum'] = features_all_pd1[new_col_all].sum(axis=1)

  features_all_pd1['drug_sum'] = features_all_pd1[new_col_all].sum(axis=1)


In [278]:

features_all_pd['drug_sum'] = features_all_pd['drug_sum'].map(lambda x: np.log10(x + 1.0))

In [279]:
features_all_pd1['drug_variance'] = features_all_pd1[new_col_all].var(axis=1)

  features_all_pd1['drug_variance'] = features_all_pd1[new_col_all].var(axis=1)


In [280]:
pd_train = features_all_pd1.iloc[iloc_train]
pd_valid = features_all_pd1.iloc[iloc_valid]



In [281]:
pd_train.fillna(0)
pd_valid.fillna(0)

Unnamed: 0,npi,last_name,first_name,city,state,sum_total_drug_cost,mean_total_drug_cost,max_total_drug_cost,sum_total_claim_count,mean_total_claim_count,...,Oxybutynin Chloride_total_claim_count,Alcohol Pads_total_claim_count,Alcohol Pads_total_day_supply,Prednisone_total_claim_count,Bupropion Xl_total_drug_cost,Bupropion Xl_total_claim_count,Bupropion Xl_total_day_supply,drug_mean,drug_sum,drug_variance
192441,1770533184,Desai,Roohi,Saint Louis,MO,0.827009,0.655544,0.778437,0.700458,0.452279,...,26.0,0.0,0.0,40.0,3001.95,75.0,5550.0,3.217416,5.504956,1.122920e+07
824802,1285720045,Davis,Annette,Louisville,KY,0.744363,0.654141,0.723123,0.521966,0.362663,...,0.0,0.0,0.0,0.0,0.00,0.0,0.0,2.085249,2.860416,1.523170e+04
191490,1760896906,Sydloski,Mitchell,Caledonia,MI,0.815084,0.666053,0.760702,0.657998,0.425156,...,0.0,0.0,0.0,24.0,1093.16,16.0,1380.0,3.066465,5.091402,4.991191e+06
815099,1245719731,Forstall,Lillian,Amite,LA,0.548135,0.509629,0.525210,0.388662,0.334729,...,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.000000,0.000000,0.000000e+00
37999,1154384212,Thomas,Jason,Ahoskie,NC,0.517327,0.449986,0.484994,0.442613,0.363275,...,0.0,0.0,0.0,0.0,0.00,0.0,0.0,1.988150,2.762566,6.220591e+03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718925,1861723496,Deyoung,Derek,Raleigh,NC,0.693669,0.590891,0.657864,0.586015,0.450179,...,0.0,0.0,0.0,0.0,0.00,0.0,0.0,2.806352,4.237062,7.897721e+05
80794,1326111329,Neril,Morton,Orinda,CA,0.520011,0.520011,0.520011,0.317892,0.317892,...,0.0,0.0,0.0,0.0,0.00,0.0,0.0,2.261525,2.737057,2.581055e+04
969616,1861921397,Saridakis Phillippi,Ellen,Mc Murray,PA,0.688563,0.556770,0.632474,0.560201,0.375033,...,0.0,0.0,0.0,18.0,0.00,0.0,0.0,2.678544,4.154784,3.609742e+05
572359,1295046621,Maharaj-Prasad,Philip,Tucson,AZ,0.742486,0.607477,0.711578,0.590800,0.386706,...,0.0,0.0,0.0,24.0,0.00,0.0,0.0,2.461384,4.184185,2.085582e+05


In [282]:
features_all_pd1

Unnamed: 0,npi,last_name,first_name,city,state,sum_total_drug_cost,mean_total_drug_cost,max_total_drug_cost,sum_total_claim_count,mean_total_claim_count,...,Oxybutynin Chloride_total_claim_count,Alcohol Pads_total_claim_count,Alcohol Pads_total_day_supply,Prednisone_total_claim_count,Bupropion Xl_total_drug_cost,Bupropion Xl_total_claim_count,Bupropion Xl_total_day_supply,drug_mean,drug_sum,drug_variance
0,1003000142,Khalil,Rashid,Toledo,OH,0.738932,0.619091,0.676030,0.618574,0.453142,...,,,,,,,,3.121482,4.793259,4.666716e+06
1,1003001256,Caragol,Jennifer,Evans,CO,0.679980,0.561206,0.644386,0.529176,0.353860,...,,,,,,,,2.554327,3.914895,3.462823e+05
2,1003002320,Eklund,D.,Pearl,MS,0.480563,0.480563,0.480563,0.373230,0.373230,...,,,,,,,,,0.000000,
3,1003002858,Gibson,Amy,Plainfield,NJ,0.749870,0.673823,0.733236,0.504968,0.364437,...,,,,,,,,2.074841,3.025822,2.840467e+04
4,1003003153,Morrison,Laura,Seattle,WA,0.797377,0.729853,0.787527,0.544538,0.416662,...,,,,,,,,2.947189,3.247973,1.333344e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017413,1992974703,Beg,Maria,Mequon,WI,0.752344,0.600123,0.707922,0.607121,0.378827,...,,,,12.0,612.66,11.0,447.0,2.549636,4.399685,3.192901e+05
1017414,1992986434,Marsh,Robert,Morgantown,WV,0.551866,0.513692,0.534380,0.432580,0.382912,...,,,,,,,,2.354729,3.131279,9.978822e+04
1017415,1992990238,Vaughn,Wallisa,Roanoke,VA,0.462844,0.462844,0.462844,0.325093,0.325093,...,,,,,,,,,0.000000,
1017416,1992997522,Nguyen,Dao,Miami,FL,0.575325,0.486461,0.549139,0.466019,0.351316,...,,,,11.0,,,,2.169808,3.281031,5.129899e+04


In [283]:
pd_valid.columns

Index(['npi', 'last_name', 'first_name', 'city', 'state',
       'sum_total_drug_cost', 'mean_total_drug_cost', 'max_total_drug_cost',
       'sum_total_claim_count', 'mean_total_claim_count',
       ...
       'Oxybutynin Chloride_total_claim_count',
       'Alcohol Pads_total_claim_count', 'Alcohol Pads_total_day_supply',
       'Prednisone_total_claim_count', 'Bupropion Xl_total_drug_cost',
       'Bupropion Xl_total_claim_count', 'Bupropion Xl_total_day_supply',
       'drug_mean', 'drug_sum', 'drug_variance'],
      dtype='object', length=367)

In [284]:
#Speciality

speciality_dict =[]
spec_fraud_1 = pd_train[pd_train[target]==1]['specialty']

In [285]:
from collections import Counter
counts = Counter(spec_fraud_1)
speciality_dict =  dict(counts)

In [286]:
features_all_pd1['spec_weight'] = features_all_pd1['specialty'].map(lambda x: speciality_dict.get(x, 0))

  features_all_pd1['spec_weight'] = features_all_pd1['specialty'].map(lambda x: speciality_dict.get(x, 0))


In [287]:
pd_train = features_all_pd1.iloc[iloc_train]
pd_valid = features_all_pd1.iloc[iloc_valid]

In [288]:
len(pd_train[pd_train[target] == 1])


182

In [289]:
print(pd_train.dtypes)

npi                                int64
last_name                         object
first_name                        object
city                              object
state                             object
                                  ...   
Bupropion Xl_total_day_supply    float64
drug_mean                        float64
drug_sum                         float64
drug_variance                    float64
spec_weight                        int64
Length: 368, dtype: object


In [290]:
pd_train.fillna(0)

Unnamed: 0,npi,last_name,first_name,city,state,sum_total_drug_cost,mean_total_drug_cost,max_total_drug_cost,sum_total_claim_count,mean_total_claim_count,...,Alcohol Pads_total_claim_count,Alcohol Pads_total_day_supply,Prednisone_total_claim_count,Bupropion Xl_total_drug_cost,Bupropion Xl_total_claim_count,Bupropion Xl_total_day_supply,drug_mean,drug_sum,drug_variance,spec_weight
1003516,1083983886,Khan,Majid,Willow Grove,PA,0.754521,0.641612,0.725886,0.588666,0.413010,...,0.0,0.0,17.0,0.0,0.0,0.0,2.937521,4.113144,2.629797e+06,0
790759,1154373793,Vadnerkar,Aniket,Paradise Valley,AZ,0.781625,0.672638,0.743618,0.594215,0.414405,...,0.0,0.0,0.0,0.0,0.0,0.0,2.487684,3.716801,1.609274e+05,0
618817,1477561322,Farley,Frank,Frankfort,KY,0.855995,0.686811,0.790740,0.712093,0.453877,...,0.0,0.0,99.0,2178.9,34.0,2390.0,3.437253,5.781487,1.173126e+08,46
248492,1992816854,Streeter,Michael,Omaha,NE,0.722039,0.619000,0.705911,0.532530,0.364399,...,0.0,0.0,0.0,0.0,0.0,0.0,2.577999,3.919325,2.003634e+05,14
222787,1891704037,Patel,Jagdish,Sycamore,IL,0.824713,0.698002,0.814565,0.657551,0.456501,...,0.0,0.0,0.0,0.0,0.0,0.0,3.136500,4.692494,7.031594e+06,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219635,1881132249,Shibla,Tracy,Palm Coast,FL,0.629477,0.526240,0.582803,0.524317,0.390024,...,0.0,0.0,16.0,0.0,0.0,0.0,1.713460,2.747148,1.115526e+03,14
455360,1821422320,Anim,Sandra,Pecos,TX,0.679621,0.588780,0.669629,0.501496,0.359842,...,0.0,0.0,0.0,0.0,0.0,0.0,2.166770,3.067264,4.264420e+04,0
237958,1952571341,Patel,Satya,Birch Run,MI,0.787445,0.633215,0.747187,0.650262,0.422718,...,0.0,0.0,13.0,0.0,0.0,0.0,3.002413,5.001985,5.918911e+06,46
158279,1639309750,Strawn,Morgan,Wailuku,HI,0.635801,0.559310,0.594965,0.553017,0.459170,...,0.0,0.0,0.0,0.0,0.0,0.0,2.486386,3.439367,7.590367e+04,11


In [291]:
pd_valid.fillna(0)


Unnamed: 0,npi,last_name,first_name,city,state,sum_total_drug_cost,mean_total_drug_cost,max_total_drug_cost,sum_total_claim_count,mean_total_claim_count,...,Alcohol Pads_total_claim_count,Alcohol Pads_total_day_supply,Prednisone_total_claim_count,Bupropion Xl_total_drug_cost,Bupropion Xl_total_claim_count,Bupropion Xl_total_day_supply,drug_mean,drug_sum,drug_variance,spec_weight
192441,1770533184,Desai,Roohi,Saint Louis,MO,0.827009,0.655544,0.778437,0.700458,0.452279,...,0.0,0.0,40.0,3001.95,75.0,5550.0,3.217416,5.504956,1.122920e+07,46
824802,1285720045,Davis,Annette,Louisville,KY,0.744363,0.654141,0.723123,0.521966,0.362663,...,0.0,0.0,0.0,0.00,0.0,0.0,2.085249,2.860416,1.523170e+04,14
191490,1760896906,Sydloski,Mitchell,Caledonia,MI,0.815084,0.666053,0.760702,0.657998,0.425156,...,0.0,0.0,24.0,1093.16,16.0,1380.0,3.066465,5.091402,4.991191e+06,46
815099,1245719731,Forstall,Lillian,Amite,LA,0.548135,0.509629,0.525210,0.388662,0.334729,...,0.0,0.0,0.0,0.00,0.0,0.0,0.000000,0.000000,0.000000e+00,0
37999,1154384212,Thomas,Jason,Ahoskie,NC,0.517327,0.449986,0.484994,0.442613,0.363275,...,0.0,0.0,0.0,0.00,0.0,0.0,1.988150,2.762566,6.220591e+03,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718925,1861723496,Deyoung,Derek,Raleigh,NC,0.693669,0.590891,0.657864,0.586015,0.450179,...,0.0,0.0,0.0,0.00,0.0,0.0,2.806352,4.237062,7.897721e+05,4
80794,1326111329,Neril,Morton,Orinda,CA,0.520011,0.520011,0.520011,0.317892,0.317892,...,0.0,0.0,0.0,0.00,0.0,0.0,2.261525,2.737057,2.581055e+04,6
969616,1861921397,Saridakis Phillippi,Ellen,Mc Murray,PA,0.688563,0.556770,0.632474,0.560201,0.375033,...,0.0,0.0,18.0,0.00,0.0,0.0,2.678544,4.154784,3.609742e+05,46
572359,1295046621,Maharaj-Prasad,Philip,Tucson,AZ,0.742486,0.607477,0.711578,0.590800,0.386706,...,0.0,0.0,24.0,0.00,0.0,0.0,2.461384,4.184185,2.085582e+05,46


In [292]:
numerical_features1 = numerical_features + ['drug_sum','spec_weight']


In [293]:
numerical_features1

['sum_total_drug_cost',
 'mean_total_drug_cost',
 'total_amount_of_payment_usd',
 'max_total_drug_cost',
 'sum_total_claim_count',
 'mean_total_claim_count',
 'max_total_claim_count',
 'sum_total_day_supply',
 'mean_total_day_supply',
 'max_total_day_supply',
 'claim_max_mean',
 'supply_max_mean',
 'drug_max_mean',
 'drug_sum',
 'spec_weight']

In [294]:
positives=len(pd_train[pd_train[target] == 1])

In [295]:
dataset_size=len(pd_train)

In [296]:
per_ones=(float(positives)/float(dataset_size))*100

In [297]:
negatives=float(dataset_size-positives)
t_ratio=negatives/positives

In [298]:
balancing_ratio= positives/dataset_size

In [299]:
print("positives:",positives)
print("dataset_size:",dataset_size)
print("per_ones:",per_ones)
print("t_ratio:",t_ratio)
print("balancing_ratio:",balancing_ratio)

positives: 182
dataset_size: 813934
per_ones: 0.022360535375104126
t_ratio: 4471.1648351648355
balancing_ratio: 0.00022360535375104125


In [300]:
pd_train = pd_train.fillna(0)

In [301]:
x= pd_train[numerical_features1].values

In [302]:
x= pd_train[numerical_features1].values
y = pd_train[target].values
clf =  LogisticRegression(C=1e5, class_weight={0:1, 1:4000}, n_jobs=3)
clf.fit(x,y)
y_p=clf.predict_proba(x)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [303]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

params_0 = {'n_estimators': 102, 'max_depth': 5, 'min_samples_split': 4, 'learning_rate': 0.02}
params_1 = {'n_estimators': 500, 'max_depth': 1, 'min_samples_split': 6, 'class_weight' : {0:1, 1:2514}, 'n_jobs':5}

scaler = StandardScaler()
    
clfs = [
    LogisticRegression(C=1e5,class_weight= {0:1, 1:2514}, n_jobs=5),
    
    GaussianNB(),

    
    DecisionTreeClassifier(**params_dt),
    
    ensemble.RandomForestClassifier(**params_1),

    ensemble.ExtraTreesClassifier(**params_1),
    
    ensemble.GradientBoostingClassifier(**params_0)
    
    
    ]

In [304]:
x_train = pd_train[numerical_features1].values

y_train = pd_train[target].values
    
x_train = scaler.fit_transform(x_train)

x_valid = pd_valid[numerical_features1].values
y_valid = pd_valid[target].values
x_valid_x= scaler.transform(x_valid)


In [305]:
predicted_probabilities = []
confusion_matrices = []
trained_classifiers = []

In [306]:

for classifier in clfs:
    print("%s:" %  classifier.__class__.__name__)

    # Train the classifier on the training data
    classifier.fit(x_train, y_train)
    trained_classifiers.append(classifier)

    # Predict on the validation set
    predictions = classifier.predict(x_valid_x)
    probability_positive = classifier.predict_proba(x_valid_x)[:, 1]
    predicted_probabilities.append(probability_positive)

    # Generate and print evaluation metrics
    confusion_matrix_result = confusion_matrix(y_valid, predictions)
    
    print(f"\tPrecision: {precision_score(y_valid, predictions):.5f}")
    print(f"\tF1 Score: {f1_score(y_valid, predictions):.5f}")
    print(f"\tAUC Score: {roc_auc_score(y_valid, probability_positive):.5f}")
    print(f"\tAccuracy: {accuracy_score(y_valid, predictions):.5f}\n")

    # Append confusion matrix results to a list for later use
    confusion_matrices.append(
        pd.DataFrame(confusion_matrix_result, index=['Actual Negative', 'Actual Positive'], columns=['Predicted Negative', 'Predicted Positive'])
    )


LogisticRegression:
	Precision: 0.00069
	F1 Score: 0.00139
	AUC Score: 0.65266
	Accuracy: 0.86549

GaussianNB:
	Precision: 0.00000
	F1 Score: 0.00000
	AUC Score: 0.60010
	Accuracy: 0.99962

DecisionTreeClassifier:
	Precision: 0.00000
	F1 Score: 0.00000
	AUC Score: 0.54680
	Accuracy: 0.99971

RandomForestClassifier:
	Precision: 0.00028
	F1 Score: 0.00055
	AUC Score: 0.62130
	Accuracy: 0.98209

ExtraTreesClassifier:


  _warn_prf(average, modifier, msg_start, len(result))


	Precision: 0.00000
	F1 Score: 0.00000
	AUC Score: 0.62015
	Accuracy: 0.99971

GradientBoostingClassifier:
	Precision: 0.01587
	F1 Score: 0.01653
	AUC Score: 0.63500
	Accuracy: 0.99942



In [314]:
fpr, tpr, thresholds = roc_curve(y_valid, prob_result[2])


NameError: name 'prob_result' is not defined

In [None]:
fpr, tpr, thresholds = roc_curve(y_valid, prob_result[2])
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.2f)' % roc_auc)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
