# The Decision Tree on the Churn Dataset in Spark

In [None]:
import sys
sys.path.append("..")
from pyspark.sql import DataFrameReader
from pyspark.sql import SparkSession
from pyspark.ml.feature import IndexToString, Normalizer, StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from helpers.path_translation import translate_to_file_string

## Select the churn file 

In [None]:
# Relative location of the churn CSV, passed through the project's path helper
churnCsvPath = "../data/churn.csv"
inputFile = translate_to_file_string(churnCsvPath)

## Create the Spark Session 

In [None]:
# Obtain (or reuse) the SparkSession for this notebook
sessionBuilder = SparkSession.builder.appName("ChurnDecisionTree")
spark = sessionBuilder.getOrCreate()

# Create a DataFrame from the semicolon-delimited CSV, using its header row
# for the column names and an inferred schema
csvReader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
)
df = csvReader.csv(inputFile)

## Data Preparation
### Transform the label and the categorical columns into indices

In [None]:
# Index the target column LEAVE into the numeric "label" column
labelIndexer = StringIndexer(inputCol="LEAVE", outputCol="label").fit(df)

# Index each string-valued feature column into a matching *_NUM column
collegeIndexer = StringIndexer(
    inputCol="COLLEGE",
    outputCol="COLLEGE_NUM",
).fit(df)
satIndexer = StringIndexer(
    inputCol="REPORTED_SATISFACTION",
    outputCol="REPORTED_SATISFACTION_NUM",
).fit(df)
usageIndexer = StringIndexer(
    inputCol="REPORTED_USAGE_LEVEL",
    outputCol="REPORTED_USAGE_LEVEL_NUM",
).fit(df)
changeIndexer = StringIndexer(
    inputCol="CONSIDERING_CHANGE_OF_PLAN",
    outputCol="CONSIDERING_CHANGE_OF_PLAN_NUM",
).fit(df)

### Build the feature vector

In [None]:
# Start from every column of the DataFrame, then drop the label column and
# the raw string-valued columns (list.remove raises ValueError if one is absent)
featureCols = df.columns.copy()
for excludedCol in (
    "LEAVE",
    "COLLEGE",
    "REPORTED_SATISFACTION",
    "REPORTED_USAGE_LEVEL",
    "CONSIDERING_CHANGE_OF_PLAN",
):
    featureCols.remove(excludedCol)
# NOTE(review): the indexed *_NUM columns are currently NOT part of the feature
# list; the line that would add them is disabled - confirm this is intended.
#featureCols = featureCols +["COLLEGE_NUM","REPORTED_SATISFACTION_NUM","REPORTED_USAGE_LEVEL_NUM","CONSIDERING_CHANGE_OF_PLAN_NUM"]

### Build the feature Vector Assembler

In [None]:
# Combine the selected feature columns into a single "features" vector column
assembler = VectorAssembler(
    inputCols=list(featureCols),
    outputCol="features",
)

### Convert indexed labels back to original labels

In [None]:
# Map the numeric "prediction" column back to the LEAVE label strings
# learned by labelIndexer
originalLabels = labelIndexer.labels
predConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=originalLabels,
)

## Do the Data Preparation

In [None]:
# Add the numeric "label" column
labeledData = labelIndexer.transform(df)

# Apply the feature indexers one after another (same order as the original
# nested call: change-of-plan, usage level, satisfaction, college)
indexedLabedData = labeledData
for indexer in (changeIndexer, usageIndexer, satIndexer, collegeIndexer):
    indexedLabedData = indexer.transform(indexedLabedData)

# Assemble the "features" vector column
labeledPointData = assembler.transform(indexedLabedData)


### Splitting the dataset into train and test set

In [None]:
# 60/40 random split with a fixed seed so the partition is repeatable
splits = labeledPointData.randomSplit([0.6, 0.4], seed=1234)
train, test = splits

## Build the decision tree model

In [None]:
# Decision tree using entropy as the impurity measure and at most 12 bins
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    impurity='entropy',
    maxBins=12,
)
# Fit the tree on the training split
dtModel = dt.fit(train)


## Build an evaluator

In [None]:
# Binary evaluator that scores the rawPrediction column against the label
# using the area under the ROC curve
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)

## Do the prediction 

In [None]:
# Score the held-out test split with the fitted tree
predictions = dtModel.transform(dataset=test)
# Attach the human-readable label for every predicted index
predictionsConverted = predConverter.transform(dataset=predictions)

## Evaluate / Test the Model 

In [None]:
# Show a sample of predictions next to the true labels and the feature vectors
predictionsConverted.select("prediction", "label", "predictedLabel", "LEAVE", "features").show()

# The evaluator is configured with metricName="areaUnderROC", so its result is
# an AUC value, not an accuracy; report it under its real name.
areaUnderROC = evaluator.evaluate(predictions)
print("Area under ROC = ", areaUnderROC)

# Test error = share of test rows whose predicted index differs from the label.
totalCount = predictions.count()
wrongCount = predictions.filter(predictions["label"] != predictions["prediction"]).count()
# Guard against an empty test split to avoid a ZeroDivisionError
testError = wrongCount / totalCount if totalCount else float("nan")
print("Test Error = ", testError)

In [None]:
# Stop the SparkSession once the notebook is finished
spark.stop()