# Abalone Dataset

The original data file and its description can be found at <https://archive.ics.uci.edu/ml/datasets/Abalone>.

In [None]:
import sys
sys.path.append("..")
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql.functions import col
from pyspark.sql.session import SparkSession
from helpers.path_translation import translate_to_file_string
from helpers.data_prep_and_print import print_df

In [None]:

# Location of the Abalone CSV, relative to this notebook.
abalone_csv_path = "../../data/abalone.csv"
# translate_to_file_string is a project helper; presumably it converts the
# relative path into a form Spark can read -- confirm in helpers.path_translation.
input_file = translate_to_file_string(abalone_csv_path)


In [None]:
# Configure the session first: run locally ("local[*]") under the given
# application name, then obtain the session.
session_builder = SparkSession.builder.master("local[*]").appName("Abalone DataSet App")
# getOrCreate() hands back an already running session if there is one.
spark = session_builder.getOrCreate()

In [None]:
# Load the data file.
# The CSV has a header row, is comma-delimited, and its schema is inferred.
csv_reader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ",")
)
raw_df = csv_reader.csv(input_file)

# Additional label for binary classification: the comparison Rings < 10.0
# cast to an integer column.
is_young = (col("Rings") < 10.0).cast('int')
df = raw_df.withColumn("Young", is_young)

df.printSchema()

In [None]:
# Fit an indexer on df that maps the string attribute "Sex" to the
# numeric column "Sex_num".
sex_indexer = StringIndexer(inputCol="Sex", outputCol="Sex_num").fit(df)

In [None]:
# Feature columns.
# "Rings" is the label (continuous value to predict, or the class in a
# classification problem) and "Young" is the label for binary classification,
# so neither may be a feature. The raw "Sex" string column is dropped too and
# replaced by its indexed counterpart "Sex_num".
featureCols = list(df.columns)
for excluded_col in ("Rings", "Sex", "Young"):
    # list.remove raises ValueError if the column is unexpectedly absent
    featureCols.remove(excluded_col)
featureCols.append("Sex_num")

In [None]:
# Assembler that combines all feature columns into one vector column "features".
assembler = VectorAssembler().setInputCols(featureCols).setOutputCol("features")

In [None]:
# First add the indexed "Sex_num" column, then assemble the feature vector.
indexed_df = sex_indexer.transform(df)
labeled_point_ds = assembler.transform(indexed_df)
# print_df is a project helper; the second argument is presumably the number
# of rows to show -- confirm in helpers.data_prep_and_print.
print_df(labeled_point_ds, 10)

In [None]:
# Split the data into training and test sets with weights 0.6 / 0.4;
# the fixed seed keeps the split reproducible.
splits = labeled_point_ds.randomSplit([0.6, 0.4], seed=5756)
train, test = splits

In [None]:
# Stop the Spark session now that the notebook is finished with it.
spark.stop()