# The Decision Tree on the Churn Dataset in Spark

In [65]:
from pyspark.sql import DataFrameReader
from pyspark.sql import SparkSession
from pyspark.ml.feature import IndexToString, StringIndexer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier

## Select the churn file 

In [66]:
# Relative path of the churn CSV that the following cells load.
inputFile = "../data/churn.csv"

## Create the Spark Session 

In [67]:
# Create (or reuse) the SparkSession for this notebook.
spark = SparkSession.builder.appName("ChurnDecisionTree").getOrCreate()

# Create a DataFrame using an inferred schema: the first row is the header
# and fields are separated by semicolons.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
)

## Data Preparation
### Transform labels into index

In [68]:
# Show the column names and inferred types of the loaded data.
df.printSchema()

# Fit one StringIndexer per string column that the model needs as a number:
# LEAVE becomes the numeric "label", COLLEGE becomes "COLLEGE_NUM".
labelIndexer = StringIndexer(inputCol="LEAVE", outputCol="label").fit(df)
collegeIndexer = StringIndexer(inputCol="COLLEGE", outputCol="COLLEGE_NUM").fit(df)
# TODO add additional indexer for string attributes


root
 |-- COLLEGE: string (nullable = true)
 |-- INCOME: integer (nullable = true)
 |-- OVERAGE: integer (nullable = true)
 |-- LEFTOVER: integer (nullable = true)
 |-- HOUSE: integer (nullable = true)
 |-- HANDSET_PRICE: integer (nullable = true)
 |-- OVER_15MINS_CALLS_PER_MONTH: integer (nullable = true)
 |-- AVERAGE_CALL_DURATION: integer (nullable = true)
 |-- REPORTED_SATISFACTION: string (nullable = true)
 |-- REPORTED_USAGE_LEVEL: string (nullable = true)
 |-- CONSIDERING_CHANGE_OF_PLAN: string (nullable = true)
 |-- LEAVE: string (nullable = true)



 ### Build the feature vector

In [69]:
# Start from every column of the DataFrame, then drop the label column and
# the raw string columns, which cannot go into the feature vector directly.
featureCols = list(df.columns)
for excludedCol in (
    "LEAVE",
    "COLLEGE",
    "REPORTED_SATISFACTION",
    "REPORTED_USAGE_LEVEL",
    "CONSIDERING_CHANGE_OF_PLAN",
):
    # list.remove raises ValueError if an expected column is missing.
    featureCols.remove(excludedCol)

# Use the indexed (numeric) version of COLLEGE as a feature instead.
featureCols.append("COLLEGE_NUM")
print(featureCols)
# TODO add additional columns to feature vector

['INCOME', 'OVERAGE', 'LEFTOVER', 'HOUSE', 'HANDSET_PRICE', 'OVER_15MINS_CALLS_PER_MONTH', 'AVERAGE_CALL_DURATION', 'COLLEGE_NUM']


### Build the feature Vector Assembler

In [70]:
# Combine the selected feature columns into a single "features" vector column.
assembler = VectorAssembler(
    inputCols=list(featureCols),
    outputCol="features",
)

### Convert indexed labels back to original labels

In [71]:
# Map the numeric "prediction" column back to the original LEAVE strings,
# using the label ordering learned by labelIndexer.
predConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

## Do the Data Preparation

In [72]:
# Apply the fitted transformers in order; each step adds one column.
# 1) Add the numeric "label" column derived from LEAVE.
labeledData = labelIndexer.transform(df)
# TODO add the other additional indexer
# 2) Add the numeric "COLLEGE_NUM" column derived from COLLEGE.
indexedLabedData = collegeIndexer.transform(labeledData)
# 3) Assemble the feature columns into the "features" vector column.
labeledPointData = assembler.transform(indexedLabedData)


### Splitting the dataset into train and test sets

In [73]:
# 60/40 random split; the fixed seed makes the split repeatable.
splits = labeledPointData.randomSplit([0.6, 0.4], seed=1234)
train, test = splits

## Build the decision tree model

In [74]:
# TODO Optimize the properties
# Decision tree that uses entropy as its impurity measure.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    impurity="entropy",
)
# Fit the tree on the training split only.
dtModel = dt.fit(train)


## Build an evaluator

In [75]:
# Evaluator that computes the area under the ROC curve.
# rawPredictionCol points at the model's "rawPrediction" output (the Spark ML
# default column name) instead of the hard 0/1 "prediction" column. With hard
# predictions the ROC curve is built from a single operating point, so the
# reported area does not reflect how well the model ranks the examples.
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

## Do the prediction 

In [76]:
# Score the held-out test split with the fitted decision tree model.
predictions = dtModel.transform(test)
# Add "predictedLabel": the numeric prediction mapped back to its original string.
predictionsConverted = predConverter.transform(predictions)

## Evaluate / Test the Model 

In [77]:
predictionsConverted.select("prediction", "label", "predictedLabel", "LEAVE", "features").show()

# The evaluator is configured with metricName="areaUnderROC", so its result is
# an AUC value. Report it under that name rather than treating it as accuracy.
areaUnderROC = evaluator.evaluate(predictions)
print("Area under ROC = ", areaUnderROC)

# Select (prediction, true label) and compute test error:
# accuracy is the share of test rows whose prediction equals the true label.
totalCount = predictions.count()
correctCount = predictions.filter(predictions["prediction"] == predictions["label"]).count()
# Guard against an empty test split so the cell does not fail on division by zero.
accuracy = correctCount / totalCount if totalCount else float("nan")
print("Test Error = ", (1.0 - accuracy))

+----------+-----+--------------+-----+--------------------+
|prediction|label|predictedLabel|LEAVE|            features|
+----------+-----+--------------+-----+--------------------+
|       0.0|  1.0|          STAY|LEAVE|[20007.0,36.0,23....|
|       1.0|  1.0|         LEAVE|LEAVE|[20009.0,183.0,18...|
|       1.0|  1.0|         LEAVE|LEAVE|[20012.0,246.0,9....|
|       1.0|  0.0|         LEAVE| STAY|[20063.0,58.0,0.0...|
|       1.0|  0.0|         LEAVE| STAY|(8,[0,3,4,6],[200...|
|       0.0|  0.0|          STAY| STAY|[20078.0,199.0,65...|
|       0.0|  0.0|          STAY| STAY|(8,[0,3,4,6],[200...|
|       0.0|  0.0|          STAY| STAY|[20278.0,0.0,69.0...|
|       0.0|  0.0|          STAY| STAY|[20284.0,0.0,5.0,...|
|       1.0|  1.0|         LEAVE|LEAVE|[20288.0,0.0,0.0,...|
|       0.0|  1.0|          STAY|LEAVE|[20317.0,85.0,0.0...|
|       1.0|  1.0|         LEAVE|LEAVE|(8,[0,3,4,6],[203...|
|       0.0|  1.0|          STAY|LEAVE|[20326.0,88.0,0.0...|
|       0.0|  0.0|      

In [78]:
# Stop the SparkSession now that the notebook is finished.
spark.stop()