# The Decision Tree on the Churn Dataset in Spark

In [1]:
from pyspark.sql import DataFrameReader
from pyspark.sql import SparkSession
from pyspark.ml.feature import IndexToString, Normalizer, StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier

## Select the churn file 

In [2]:
# Path to the churn CSV file, relative to the notebook's working directory.
inputFile = "../data/churn.csv"

## Create the Spark Session 

In [3]:
# Create the SparkSession, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("ChurnDecisionTree")
    .getOrCreate()
)

# Create a DataFrame using an inferred schema: the first row of the
# semicolon-delimited file supplies the column names.
df = (
    spark.read
    .options(header="true", inferSchema="true", delimiter=";")
    .csv(inputFile)
)

## Data Preparation
### Index the label and the categorical columns

In [4]:
# Fit one StringIndexer per string-valued column on the full DataFrame.
# The target column LEAVE is indexed into "label"; each categorical feature
# column gets a numeric companion column carrying the suffix _NUM.
labelIndexer = StringIndexer(inputCol="LEAVE", outputCol="label").fit(df)
collegeIndexer = StringIndexer(
    inputCol="COLLEGE",
    outputCol="COLLEGE_NUM",
).fit(df)
satIndexer = StringIndexer(
    inputCol="REPORTED_SATISFACTION",
    outputCol="REPORTED_SATISFACTION_NUM",
).fit(df)
usageIndexer = StringIndexer(
    inputCol="REPORTED_USAGE_LEVEL",
    outputCol="REPORTED_USAGE_LEVEL_NUM",
).fit(df)
changeIndexer = StringIndexer(
    inputCol="CONSIDERING_CHANGE_OF_PLAN",
    outputCol="CONSIDERING_CHANGE_OF_PLAN_NUM",
).fit(df)

### Build the list of feature columns

In [5]:
# String-valued feature columns; each is replaced by its indexed "<name>_NUM" column.
categoricalCols = [
    "COLLEGE",
    "REPORTED_SATISFACTION",
    "REPORTED_USAGE_LEVEL",
    "CONSIDERING_CHANGE_OF_PLAN",
]

# Start from every column of the DataFrame, then drop the target column and the
# raw string columns. list.remove raises ValueError if a column is missing.
featureCols = list(df.columns)
for droppedCol in ["LEAVE"] + categoricalCols:
    featureCols.remove(droppedCol)

# Append the indexed replacements in the same order as the raw columns above.
featureCols += [rawCol + "_NUM" for rawCol in categoricalCols]

### Build the feature Vector Assembler

In [6]:
# Combine every column named in featureCols into one vector column "features".
assembler = VectorAssembler(
    inputCols=featureCols,
    outputCol="features",
)

### Convert indexed labels back to original labels

In [7]:
# Map the numeric "prediction" index back to the original LEAVE string, using
# the label ordering learned by labelIndexer.
predConverter = IndexToString(
    labels=labelIndexer.labels,
    inputCol="prediction",
    outputCol="predictedLabel",
)

## Do the Data Preparation

In [8]:
# Add the numeric "label" column first.
labeledData = labelIndexer.transform(df)

# Then add one indexed column per categorical feature, applying the fitted
# indexers one after another.
indexedLabedData = labeledData
for fittedIndexer in (changeIndexer, usageIndexer, satIndexer, collegeIndexer):
    indexedLabedData = fittedIndexer.transform(indexedLabedData)

# Finally assemble the feature columns into the "features" vector.
labeledPointData = assembler.transform(indexedLabedData)


### Splitting the dataset into training and test sets

In [9]:
# 60 % / 40 % random split with a fixed seed.
splits = labeledPointData.randomSplit([0.6, 0.4], seed=1234)
train, test = splits

## Build the decision tree model

In [10]:
# Configure the decision tree on the assembled "features" vector and the
# indexed "label" column, then fit it on the training split.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
)
dtModel = dt.fit(train)


## Build an evaluator

In [11]:
# Area under the ROC curve is a ranking metric: it needs a per-row score rather
# than the thresholded 0/1 class stored in "prediction". Feeding hard classes
# collapses the curve to a single operating point. The classifier's score
# vector is written to the "rawPrediction" column, so evaluate on that.
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

## Do the prediction 

In [12]:
# Apply the fitted tree to the test split.
predictions = dtModel.transform(test)
# Add "predictedLabel": the predicted index translated back to the original LEAVE string.
predictionsConverted = predConverter.transform(predictions)

## Evaluate / Test the Model 

In [13]:
# Show a sample of predictions next to the true labels.
predictionsConverted.select("prediction", "label", "predictedLabel", "LEAVE", "features").show()

# The evaluator is configured with metricName="areaUnderROC", so its result is
# an AUC. It is not an accuracy, and 1 - AUC is not a test error.
areaUnderROC = evaluator.evaluate(predictions)
print("Area under ROC = ", areaUnderROC)

# Test error = share of test rows whose predicted index differs from the label index.
totalCount = predictions.count()
correctCount = predictions.filter(predictions["prediction"] == predictions["label"]).count()
# Guard against an empty test split to avoid a ZeroDivisionError.
accuracy = correctCount / totalCount if totalCount else float("nan")
print("Test Error = ", (1.0 - accuracy))

+----------+-----+--------------+-----+--------------------+
|prediction|label|predictedLabel|LEAVE|            features|
+----------+-----+--------------+-----+--------------------+
|       0.0|  1.0|          STAY|LEAVE|[20007.0,36.0,23....|
|       1.0|  1.0|         LEAVE|LEAVE|[20009.0,183.0,18...|
|       1.0|  1.0|         LEAVE|LEAVE|[20012.0,246.0,9....|
|       1.0|  0.0|         LEAVE| STAY|[20063.0,58.0,0.0...|
|       1.0|  0.0|         LEAVE| STAY|(11,[0,3,4,6,8,10...|
|       0.0|  0.0|          STAY| STAY|[20078.0,199.0,65...|
|       0.0|  0.0|          STAY| STAY|(11,[0,3,4,6,8],[...|
|       0.0|  0.0|          STAY| STAY|[20278.0,0.0,69.0...|
|       0.0|  0.0|          STAY| STAY|[20284.0,0.0,5.0,...|
|       1.0|  1.0|         LEAVE|LEAVE|[20288.0,0.0,0.0,...|
|       0.0|  1.0|          STAY|LEAVE|[20317.0,85.0,0.0...|
|       1.0|  1.0|         LEAVE|LEAVE|[20320.0,0.0,0.0,...|
|       0.0|  1.0|          STAY|LEAVE|[20326.0,88.0,0.0...|
|       0.0|  0.0|      

In [14]:
# Stop the Spark session; cells that use `spark` or its DataFrames cannot be
# re-run afterwards without creating the session again.
spark.stop()