# Churn Data Preprocessing

In [None]:
import sys
sys.path.append("..")
from pyspark.sql import DataFrameReader
from pyspark.sql import SparkSession
from pyspark.ml.feature import IndexToString, Normalizer, StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml.classification import DecisionTreeClassifier
from helpers.path_translation import translate_to_file_string
from helpers.data_prep_and_print import print_df

## Select the churn file 

In [None]:
# Path to the churn CSV file; it is relative, so it depends on the directory
# the notebook is run from.
inputFile = "../../data/churn.csv"

## Create the Spark Session 

In [None]:
# Create the SparkSession for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("ChurnDataPreprocessing")
    .getOrCreate()
)
# Create a DataFrame using an inferred schema: the CSV has a header row and
# uses ";" as the field delimiter.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
)
# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
df.printSchema()

## Data Preparation
### Transform strings into numeric attributes

In [None]:
# Fit an indexer on df that maps the "LEAVE" column to an indexed "label" column.
labelIndexer = StringIndexer(inputCol="LEAVE", outputCol="label").fit(df)
# TODO: add indexers for the other string attributes

### Build the feature vector

In [None]:
# TODO: add the additional attributes to the feature vector
# Start from a copy of all column names, then drop the label source column and
# the attributes that are not yet usable as features (presumably the string
# columns that still need an indexer — see the TODO above).
featureCols = list(df.columns)
for _excluded in (
    "LEAVE",
    "COLLEGE",
    "REPORTED_SATISFACTION",
    "REPORTED_USAGE_LEVEL",
    "CONSIDERING_CHANGE_OF_PLAN",
):
    # list.remove raises ValueError if the column is not present.
    featureCols.remove(_excluded)
print(featureCols)

### Build the feature Vector Assembler

In [None]:
# Assembler that merges the selected feature columns into one "features" column.
assembler = VectorAssembler(inputCols=list(featureCols), outputCol="features")

## Do the Data Preparation

In [None]:
# Apply the fitted label indexer to df, which adds the "label" column.
labeledData = labelIndexer.transform(df)
# TODO: transform the data with the other indexers
# Apply the assembler, which adds the "features" column.
labeledPointData = assembler.transform(labeledData)
# Display the resulting DataFrame.
labeledPointData.show()

### Show result

In [None]:
# Print the prepared data with the project helper; the second argument is
# presumably the number of rows to show — confirm in helpers.data_prep_and_print.
print_df(labeledPointData,10)

In [None]:
# Stop the SparkSession now that the notebook is done with it.
spark.stop()