# Heart Disease Dataset

The original dataset is available from the UCI Machine Learning Repository: <http://archive.ics.uci.edu/ml/datasets/Heart+Disease>.

In [None]:
import sys
sys.path.append("..")
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql.functions import col
from pyspark.sql.types import BooleanType, IntegerType
from pyspark.sql.session import SparkSession
from helpers.path_translation import translate_to_file_string
from helpers.data_prep_and_print import print_df

In [None]:
# Location of the heart-disease CSV, passed through the project's path helper
# before it is handed to the Spark reader.
# NOTE(review): presumably the helper returns a path/URI string that
# spark.read accepts — confirm in helpers.path_translation.
input_file = translate_to_file_string("../../data/heart.csv")

In [None]:
# Create (or reuse) the Spark session for this notebook. The "local[*]"
# master runs Spark locally with one worker thread per logical core.
spark = (
    SparkSession.builder
    .appName("HeartDisease")
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Load the CSV into a DataFrame. The first row supplies the column names,
# fields are comma-separated, and column types are inferred from the data
# instead of being declared up front.
df = (
    spark.read
    .options(header="true", inferSchema="true", delimiter=",")
    .csv(input_file)
)
# Print the inferred column names and types as a quick sanity check.
df.printSchema()

In [None]:
# Remove the outlier: keep only the rows whose age is strictly greater than 30.
df_filtered = df.filter(col("age") > 30)

In [None]:
# Feature columns: every column of the filtered data except "output".
# Work on a copy of the column list so it can be edited freely.
featureCols = list(df_filtered.columns)
featureCols.remove("output")  # raises ValueError if "output" is absent

In [None]:
# Assembler that packs all feature columns into one vector column
# named "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

In [None]:
# Apply the assembler to the filtered rows; the result carries the assembled
# vector column in addition to the original columns.
labeled_point_ds = assembler.transform(df_filtered)
# NOTE(review): print_df is a project helper; presumably it displays the
# first 10 rows — confirm in helpers.data_prep_and_print.
print_df(labeled_point_ds,10)

In [None]:
# Split the data for testing: a random, approximately 60/40 partition into
# training and test sets. The fixed seed makes the split repeatable.
splits = labeled_point_ds.randomSplit(weights=[0.6, 0.4], seed=5756)
train, test = splits

In [None]:
# Stop the Spark session and release the resources it holds.
spark.stop()