In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1. Build the environment

In [None]:
!pip install pyspark

In [None]:
# List the contents of the current working directory.
!ls

In [None]:
# Print the path of the current working directory.
!pwd

## 2. Import the libraries

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, IndexToString
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Create the SparkSession first and derive the SparkContext from it.
# `getOrCreate()` returns the already-running session when this cell is
# executed again, whereas calling `SparkContext()` directly raises
# "Cannot run multiple SparkContexts at once" on the second run.
spark = SparkSession.builder \
    .appName('Mushroom') \
    .getOrCreate()

# Kept under the same name for code that expects a bare SparkContext.
sc = spark.sparkContext

In [None]:
# Load the mushroom dataset into a Spark DataFrame. `header=True` takes the
# column names from the first row of the file; no schema inference is
# requested here.
mushrooms = spark.read.csv('/kaggle/input/mushroom-classification/mushrooms.csv', header=True)

In [None]:
# Every column except the first one is treated as a categorical input feature
# (the first column is used as the label elsewhere in this notebook).
in_cols = mushrooms.schema.names[1:]

# A list of StringIndexer objects that convert strings to numeric indices.
# Each indexer is responsible for one feature column and writes its result to
# `<column>_idx`; the indexers are used as pipeline stages later on.
str_indexers = [StringIndexer(inputCol=c, outputCol=c+'_idx') for c in in_cols]

# Example of what a fitted StringIndexer does. The most frequent level gets
# index 0.0, the next most frequent 1.0, and so on:
#
# input:
#  id | category
# ----|----------
#  0  | a
#  1  | b
#  2  | c
#  3  | a
#  4  | a
#  5  | c
#
# output:
#  id | category | category_idx
# ----|----------|--------------
#  0  | a        | 0.0
#  1  | b        | 2.0
#  2  | c        | 1.0
#  3  | a        | 0.0
#  4  | a        | 0.0
#  5  | c        | 1.0
#
# (This illustration is deliberately written as comments: a bare string
# literal at the end of a cell would be echoed as the cell's output.)

In [None]:
# One-hot encoding runs after string indexing: each encoder reads the
# `<column>_idx` column produced by the matching StringIndexer.

# Names of the one-hot output columns, one per feature column.
onehot_cols = [c+'_onehot' for c in in_cols]

# A list of OneHotEncoder objects that convert the integer indices of the
# categorical levels into one-hot encoded columns. Each encoder is responsible
# for encoding one feature column; `dropLast=False` is passed to every encoder.
onehot_encoders = [
    OneHotEncoder(inputCol=c+'_idx', outputCol=out_col, dropLast=False)
    for c, out_col in zip(in_cols, onehot_cols)
]

print(onehot_cols[0:4])

In [None]:
# Gather all of the one-hot encoded columns into a single vector column named
# `features`: each row of that column holds the concatenation of the row's
# one-hot values.
#
# Illustration with four single-valued input columns:
# +-----+-----+-----+-----+---------------------+
# |cat_0|cat_1|cat_2|cat_3|             features|
# +-----+-----+-----+-----+---------------------+
# |    1|    0|    0|    0| [1.0, 0.0, 0.0, 0.0]|
# |    0|    1|    0|    0| [0.0, 1.0, 0.0, 0.0]|
# |    0|    0|    0|    1| [0.0, 0.0, 0.0, 1.0]|
# +-----+-----+-----+-----+---------------------+
feat_assembler = VectorAssembler(
    outputCol='features',
    inputCols=onehot_cols,
)

In [None]:
# The first column of the dataset is the class to predict; index it into a
# numeric column called `poisonous`.
# StringIndexer gives more frequent levels a lower index, so the class with
# the larger count in the table printed below is encoded as 0 and the other
# as 1.
label_col = mushrooms.schema.names[0]
label_indexer = StringIndexer(inputCol=label_col, outputCol='poisonous')

# Show how many rows fall into each level of the class column and of one
# example feature column.
for col in ('class', 'cap-shape'):
    mushrooms.groupby(col).count().show()

In [None]:
# One seed for the data split, the forest and the fold assignment, so that a
# full rerun of the notebook reproduces the same model and metrics.
SEED = 2021

# Preprocessing pipeline, executed in this order:
#   1. str_indexers    - index every categorical feature column
#   2. onehot_encoders - one-hot encode the indexed columns
#   3. feat_assembler  - concatenate the one-hot vectors into `features`
#   4. label_indexer   - index the class column into `poisonous`
pipeline = Pipeline(stages=str_indexers+onehot_encoders+[feat_assembler, label_indexer])

# Fit the preprocessing stages and apply them to the same DataFrame, then
# rename the indexed class column to `label`, the column name the classifier
# and the evaluator below are configured with.
# The result is cached because cross-validation reads it once per fold and
# parameter combination; without the cache every read would re-run all of the
# preprocessing stages.
mushrooms_trans = pipeline \
                    .fit(mushrooms) \
                    .transform(mushrooms) \
                    .withColumnRenamed('poisonous', 'label') \
                    .cache()

# NOTE(review): only 5% of the rows are used for training and 95% are held
# out for validation - confirm that this unusual ratio is intentional.
mushrooms_train, mushrooms_val = mushrooms_trans.randomSplit([0.05, 0.95], seed=SEED)

# The estimator gets its own name (`rf`) so that `model` below refers only to
# the fitted cross-validation result. `numTrees` is not set here because the
# parameter grid supplies it for every candidate.
rf = RandomForestClassifier(labelCol='label', featuresCol='features', seed=SEED)

ppl = Pipeline(stages=[rf])

# 3 depths x 2 forest sizes x 3 feature-subset strategies = 18 candidates.
paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [6, 8, 10]) \
    .addGrid(rf.numTrees, [100, 200]) \
    .addGrid(rf.featureSubsetStrategy, ['onethird', 'sqrt', 'log2']) \
    .build()

# 2-fold cross-validation over the grid, scored with the default metric of
# BinaryClassificationEvaluator.
crossval = CrossValidator(estimator=ppl,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2,
                          seed=SEED)

# `model` is the fitted CrossValidatorModel used by the following cells.
model = crossval.fit(mushrooms_train)

In [None]:
# Score the held-out split with the fitted cross-validation model and preview
# the predicted probabilities next to the predicted and true labels.
pred = model.transform(mushrooms_val)

preview_cols = ["probability", "prediction", "label"]
pred.select(*preview_cols).show()

In [None]:
# Select the columns relevant for evaluation.
# `results` looks like this:
# +--------------------+----------+-----+
# |         probability|prediction|label|
# +--------------------+----------+-----+
# |[0.97024593961675...|       0.0|  0.0|
# |[0.96303265951929...|       0.0|  0.0|
# |[0.95909221894651...|       0.0|  0.0|
# |[0.95958294573868...|       0.0|  0.0|
# |[0.95580449199223...|       0.0|  0.0|
# +--------------------+----------+-----+
results = pred.select(['probability', 'prediction', 'label'])

# The accuracy is aggregated inside Spark, so the rows are deliberately not
# collected to the driver first (the previous `results.collect()` copied the
# whole scored set into driver memory and its result was never used).

# 1 where the prediction equals the true label, 0 otherwise.
correct = results.withColumn('correct', (results.prediction==results.label).cast('integer')).select('correct')

# The mean of the 0/1 flags is the fraction of correctly classified rows;
# only this single aggregated value is brought back to the driver.
accuracy = correct.agg({'correct':'mean'}).collect()[0][0]

print('Test accuracy:', accuracy)

In [None]:
# Show the best parameter combination.
# Pair every parameter map that was cross-validated with its average metric;
# each map is rendered as a list of single-entry {parameter name: value} dicts.
parameters = []
for params, metric in zip(model.getEstimatorParamMaps(), model.avgMetrics):
    settings = [{key.name: paramValue} for key, paramValue in params.items()]
    parameters.append((settings, metric))

# Rank by average metric, highest first, and print the top entry.
ranked = sorted(parameters, key=lambda el: el[1], reverse=True)
print(ranked[0])