# Mushroom Dataset 

Description and data can be found here <https://archive.ics.uci.edu/ml/datasets/mushroom>. 

In [None]:
import sys
sys.path.append("..")
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql.functions import expr
from pyspark.sql.session import SparkSession
from helpers.path_translation import translate_to_file_string
from helpers.data_prep_and_print import print_df

In [None]:
# Location of the mushroom CSV, relative to this notebook; the project helper
# turns it into the string that is later handed to the Spark CSV reader.
mushrooms_csv = "../../data/mushrooms.csv"
input_file = translate_to_file_string(mushrooms_csv)

In [None]:
# Configure the session step by step, then create it (or reuse a running one).
session_builder = SparkSession.builder
session_builder = session_builder.master("local[*]")
session_builder = session_builder.appName("Mushroom Data Preparation")
spark = session_builder.getOrCreate()

In [None]:
# Load the CSV with a header row, comma-separated, letting Spark infer the
# column types, then show the resulting schema.
csv_reader = spark.read.options(
    header="true",
    inferSchema="true",
    delimiter=",",
)
df = csv_reader.csv(input_file)
df.printSchema()

# String-Werte in numerische Werte umwandeln

In [None]:
def _fit_indexer(source_col, target_col):
    """Fit a StringIndexer on ``df`` that maps ``source_col`` to ``target_col``."""
    return StringIndexer(inputCol=source_col, outputCol=target_col).fit(df)


# Target column.
labelIndexer = _fit_indexer("class", "label")

# Cap attributes.
cap_shapeIndexer = _fit_indexer("cap-shape", "cap_shapeNUM")
cap_surfaceIndexer = _fit_indexer("cap-surface", "cap_surfaceNUM")
cap_colorIndexer = _fit_indexer("cap-color", "cap_colorNUM")

# Bruises and odor.
bruisesIndexer = _fit_indexer("bruises", "bruisesNUM")
odorIndexer = _fit_indexer("odor", "odorNUM")

# Gill attributes.
gill_attachmentIndexer = _fit_indexer("gill-attachment", "gill_attachmentNUM")
gill_spacingIndexer = _fit_indexer("gill-spacing", "gill_spacingNUM")
gill_sizeIndexer = _fit_indexer("gill-size", "gill_sizeNUM")
gill_colorIndexer = _fit_indexer("gill-color", "gill_colorNUM")

# Stalk attributes.
stalk_shapeIndexer = _fit_indexer("stalk-shape", "stalk_shapeNUM")
stalk_rootIndexer = _fit_indexer("stalk-root", "stalk_rootNUM")
stalk_surface_above_ringIndexer = _fit_indexer(
    "stalk-surface-above-ring", "stalk_surface_above_ringNUM"
)
stalk_surface_below_ringIndexer = _fit_indexer(
    "stalk-surface-below-ring", "stalk_surface_below_ringNUM"
)
stalk_color_above_ringIndexer = _fit_indexer(
    "stalk-color-above-ring", "stalk_color_above_ringNUM"
)
stalk_color_below_ringIndexer = _fit_indexer(
    "stalk-color-below-ring", "stalk_color_below_ringNUM"
)

# Veil and ring attributes.
veil_typeIndexer = _fit_indexer("veil-type", "veil_typeNUM")
veil_colorIndexer = _fit_indexer("veil-color", "veil_colorNUM")
ring_numberIndexer = _fit_indexer("ring-number", "ring_numberNUM")
ring_typeIndexer = _fit_indexer("ring-type", "ring_typeNUM")

# Spore print, population and habitat.
spore_print_colorIndexer = _fit_indexer("spore-print-color", "spore_print_colorNUM")
populationIndexer = _fit_indexer("population", "populationNUM")
habitatIndexer = _fit_indexer("habitat", "habitatNUM")

Select only 7 features for the feature vector

Label = class (edible or poisonous)

In [None]:
# Names of the indexed columns that make up the feature vector.
featureCols = [
    'odorNUM',
    'gill_sizeNUM',
    'gill_colorNUM',
    'stalk_rootNUM',
    'ring_typeNUM',
    'spore_print_colorNUM',
    'populationNUM',
]
print(featureCols)

In [None]:
# Fitted indexers in the order they are applied: the label first, the cap
# shape last. Each transform adds one numeric column to the running frame,
# so this order also fixes the order of the added columns.
fitted_indexers = [
    labelIndexer,
    habitatIndexer,
    populationIndexer,
    spore_print_colorIndexer,
    ring_typeIndexer,
    ring_numberIndexer,
    veil_typeIndexer,
    veil_colorIndexer,
    stalk_color_below_ringIndexer,
    stalk_color_above_ringIndexer,
    stalk_surface_below_ringIndexer,
    stalk_surface_above_ringIndexer,
    stalk_rootIndexer,
    stalk_shapeIndexer,
    gill_colorIndexer,
    gill_sizeIndexer,
    gill_spacingIndexer,
    gill_attachmentIndexer,
    odorIndexer,
    bruisesIndexer,
    cap_colorIndexer,
    cap_surfaceIndexer,
    cap_shapeIndexer,
]

labeled_data_num = df
for fitted_indexer in fitted_indexers:
    labeled_data_num = fitted_indexer.transform(labeled_data_num)

# Bundle the selected numeric columns into a single "features" vector column.
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")
labeled_point_ds = assembler.transform(labeled_data_num)

# Show the first rows of the indexed frame (without the assembled vector).
print_df(labeled_data_num, 10)

In [None]:
# 90/10 split into training and test data; the fixed seed keeps it repeatable.
split_weights = [0.9, 0.1]
splits = labeled_point_ds.randomSplit(split_weights, seed=12345)
training, test = splits

In [None]:
# Stop the Spark session created above now that the notebook is done with it.
spark.stop()