# Churn Dataset Correlation Calculation in Spark

In [None]:
import sys
sys.path.append("..")
from helpers.data_prep_and_print import print_df
from helpers.path_translation import translate_to_file_string
from pyspark.ml.feature import IndexToString, Normalizer, StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml.stat import Correlation, ChiSquareTest
from pyspark.sql import SparkSession


## Select the churn file 

In [None]:
# Location of the churn CSV, given relative to this notebook.
# NOTE(review): translate_to_file_string is a project helper; presumably it turns
# the relative path into a path/URI string Spark can read — confirm in
# helpers.path_translation.
inputFile = translate_to_file_string("../data/churn.csv")

## Create the Spark Session 

In [None]:
# Obtain the SparkSession (getOrCreate reuses a session that is already running).
# NOTE(review): the application name below contains a typo ("Proprocessing");
# it is a runtime string and is intentionally left unchanged here.
session_builder = SparkSession.builder.appName("Churn Proprocessing")
spark = session_builder.getOrCreate()

# Load the CSV into a DataFrame using an inferred schema: the first line is
# treated as the header and fields are separated by semicolons.
csv_reader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
)
df = csv_reader.csv(inputFile)

### Define the feature columns

In [None]:
# Names of the input columns that are combined into the feature vector.
featureCols = [
    "INCOME",
    "OVERAGE",
    "LEFTOVER",
    "HOUSE",
    "HANDSET_PRICE",
    "OVER_15MINS_CALLS_PER_MONTH",
    "AVERAGE_CALL_DURATION",
]

### Build the feature Vector Assembler

In [None]:
# Assembler that merges the columns listed in featureCols into one vector
# column called "features". A copy of the list is passed, as before.
assembler = VectorAssembler(
    inputCols=list(featureCols),
    outputCol="features",
)

## Do the Data Preparation

In [None]:
# Run the assembler over the loaded data; the result carries the additional
# "features" vector column configured on the assembler.
labeledPointData = assembler.transform(df)
# Print the leading rows to the cell output for a quick visual check.
labeledPointData.show()

### As formatted output

In [None]:
# Display at most the first 10 rows of the assembled data.
# NOTE(review): print_df is a project helper; presumably it renders the
# DataFrame as a formatted table — confirm in helpers.data_prep_and_print.
print_df(labeledPointData.limit(10))

In [None]:
# Correlation.corr (called without a method argument) yields a DataFrame;
# the matrix itself is read from the first column of its first row.
corr_rows = Correlation.corr(labeledPointData, "features").collect()
r1_matrix = corr_rows[0][0]

# Turn the matrix into nested Python lists so Spark can build a DataFrame
# from it, with one column per feature name.
corr_matrix = r1_matrix.toArray().tolist()
df_corr_matrix = spark.createDataFrame(corr_matrix, schema=featureCols)

print_df(df_corr_matrix)

In [None]:
# Shut down the Spark session now that the notebook is done with it.
spark.stop()