# Bank Dataset

Data source: UCI Machine Learning Repository, Bank Marketing dataset — https://archive.ics.uci.edu/ml/datasets/bank+marketing

In [None]:
import sys
sys.path.append("..")
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql.functions import col
from pyspark.sql.types import BooleanType
from pyspark.sql.session import SparkSession
from helpers.path_translation import translate_to_file_string
from helpers.data_prep_and_print import print_df

In [None]:
# Location of the bank CSV, given relative to this notebook.
# NOTE(review): translate_to_file_string is a project helper; presumably it
# turns the relative path into a file string Spark can read -- confirm.
relative_csv_path = "../../data/bank.csv"
input_file = translate_to_file_string(relative_csv_path)


In [None]:
# Configure a session that runs Spark locally ("local[*]" master) under the
# application name "Bank DataSet SQL"; getOrCreate() reuses an already
# running session if there is one.
session_builder = SparkSession.builder.master("local[*]").appName("Bank DataSet SQL")
spark = session_builder.getOrCreate()

In [None]:
# Read the bank CSV into a DataFrame: the first row holds the column names,
# fields are separated by ";" and Spark infers each column's type.
csv_reader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
)
df = csv_reader.csv(input_file)

# Cast the flag columns and the label column "y" to booleans
# (presumably "yes"/"no" strings in the raw file -- TODO confirm).
for flag_column in ("default", "housing", "loan", "y"):
    df = df.withColumn(flag_column, col(flag_column).cast(BooleanType()))

df.printSchema()

In [None]:
# Encode every string-valued attribute as a numeric "<name>_num" column.
# All indexers are fitted on the same, not yet transformed DataFrame.
job_indexer = StringIndexer(inputCol="job", outputCol="job_num").fit(df)
marital_indexer = StringIndexer(inputCol="marital", outputCol="marital_num").fit(df)
education_indexer = StringIndexer(inputCol="education", outputCol="education_num").fit(df)
contact_indexer = StringIndexer(inputCol="contact", outputCol="contact_num").fit(df)
poutcome_indexer = StringIndexer(inputCol="poutcome", outputCol="poutcome_num").fit(df)

# Apply the fitted models one after another. The order (poutcome first, job
# last) determines the order in which the new columns are appended.
for fitted_indexer in (
    poutcome_indexer,
    contact_indexer,
    education_indexer,
    marital_indexer,
    job_indexer,
):
    df = fitted_indexer.transform(df)

df.printSchema()

In [None]:
# Columns that are combined into the feature vector.
# "y" is the label; it is not part of this list.
featureCols = [
    "age",
    "job_num",
    "marital_num",
    "education_num",
    "default",
    "balance",
    "housing",
    "loan",
    "contact_num",
    "duration",
    "campaign",
    "pdays",
    "previous",
    "poutcome_num",
]

# Assemble all feature columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")
labeled_point_ds = assembler.transform(df)

# NOTE(review): print_df is a project helper; presumably it displays the
# first 10 rows -- confirm against helpers.data_prep_and_print.
print_df(labeled_point_ds, 10)

In [None]:
# Randomly divide the assembled data into a training part (weight 0.6) and a
# test part (weight 0.4); the fixed seed 5756 makes the split repeatable.
splits = labeled_point_ds.randomSplit(weights=[0.6, 0.4], seed=5756)
train, test = splits

In [None]:
# Shut down the Spark session created earlier in this notebook.
spark.stop()