In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Medical Appointment - No shows').getOrCreate()

In [2]:
noshow  = spark.read.csv('KaggleV2-May-2016.csv', inferSchema = True, header = True)

In [3]:
noshow.printSchema()

root
 |-- PatientId: double (nullable = true)
 |-- AppointmentID: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- ScheduledDay: timestamp (nullable = true)
 |-- AppointmentDay: timestamp (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Neighbourhood: string (nullable = true)
 |-- Scholarship: integer (nullable = true)
 |-- Hipertension: integer (nullable = true)
 |-- Diabetes: integer (nullable = true)
 |-- Alcoholism: integer (nullable = true)
 |-- Handcap: integer (nullable = true)
 |-- SMS_received: integer (nullable = true)
 |-- No-show: string (nullable = true)



In [4]:
noshow.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
PatientId,110527,1.474962657103957E14,2.560949202917392E14,39217.84439,9.99981631772427E14
AppointmentID,110527,5675305.123426855,71295.75153968473,5030230,5790484
Gender,110527,,,F,M
Age,110527,37.08887421173107,23.110204963682648,-1,115
Neighbourhood,110527,,,AEROPORTO,VILA RUBIM
Scholarship,110527,0.09826558216544373,0.2976747541093073,0,1
Hipertension,110527,0.1972459218109602,0.3979213499470851,0,1
Diabetes,110527,0.07186479321794674,0.2582650735074665,0,1
Alcoholism,110527,0.030399811810688793,0.17168555541424446,0,1


In [5]:
#Drop redundant variables
drop_list = ['PatientId', 'AppointmentID', 'ScheduledDay', 'AppointmentDay']
noshow = noshow.select([column for column in noshow.columns if column not in drop_list])
noshow.show(5)

+------+---+-----------------+-----------+------------+--------+----------+-------+------------+-------+
|Gender|Age|    Neighbourhood|Scholarship|Hipertension|Diabetes|Alcoholism|Handcap|SMS_received|No-show|
+------+---+-----------------+-----------+------------+--------+----------+-------+------------+-------+
|     F| 62|  JARDIM DA PENHA|          0|           1|       0|         0|      0|           0|     No|
|     M| 56|  JARDIM DA PENHA|          0|           0|       0|         0|      0|           0|     No|
|     F| 62|    MATA DA PRAIA|          0|           0|       0|         0|      0|           0|     No|
|     F|  8|PONTAL DE CAMBURI|          0|           0|       0|         0|      0|           0|     No|
|     F| 56|  JARDIM DA PENHA|          0|           1|       1|         0|      0|           0|     No|
+------+---+-----------------+-----------+------------+--------+----------+-------+------------+-------+
only showing top 5 rows



In [6]:
noshow.columns

['Gender',
 'Age',
 'Neighbourhood',
 'Scholarship',
 'Hipertension',
 'Diabetes',
 'Alcoholism',
 'Handcap',
 'SMS_received',
 'No-show']

In [7]:
#
noshow = noshow.na.drop()

In [8]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler


In [9]:
# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="No-show", outputCol="label")

indexed = label_stringIdx.fit(noshow).transform(noshow)
noshow = indexed

In [10]:
gender_index = StringIndexer(inputCol='Gender',outputCol='GenderIndex')
gender_encode = OneHotEncoder(inputCol='GenderIndex',outputCol='GenderClassVec')

In [11]:
Neighbourhood_index = StringIndexer(inputCol='Neighbourhood',outputCol='NeighbourhoodIndex')
Neighbourhood_encode = OneHotEncoder(inputCol='NeighbourhoodIndex',outputCol='NeighbourhoodClassVec')

In [12]:
from pyspark.ml.feature import StandardScaler


In [13]:
# Transform all features into a vector using VectorAssembler
numericCols = ["Age", "Scholarship", "Hipertension", "Diabetes", "Alcoholism", "Handcap","SMS_received"]

assemblerInputs = ["GenderClassVec" , "NeighbourhoodClassVec"] + numericCols

assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

In [14]:
standardScaler = StandardScaler(inputCol="features", outputCol="scaled_features")


In [15]:
from pyspark.ml.classification import GBTClassifier


In [16]:
from pyspark.ml import Pipeline

In [17]:
gbt = GBTClassifier(featuresCol='scaled_features',labelCol='label')


In [18]:
pipeline = Pipeline(stages=[gender_index,Neighbourhood_index,
                           gender_encode,Neighbourhood_encode,
                           assembler,standardScaler, gbt])

In [19]:
train, test = noshow.randomSplit([0.7, 0.3], seed = 42)


In [20]:
fit_model = pipeline.fit(train)

In [21]:
results = fit_model.transform(test)

In [22]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [23]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='label')

In [24]:
results.select('label','prediction').show()

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       0.0|
|  1.0|       0.0|
|  1.0|       0.0|
|  1.0|       0.0|
|  1.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       0.0|
|  0.0|       0.0|
+-----+----------+
only showing top 20 rows



In [25]:
AUC = my_eval.evaluate(results)

In [26]:
AUC

0.5005191090179306