## Section 2: ML Workflows

In [0]:
%run "./Includes/Classroom-Setup"

In [0]:
from pyspark.sql.functions import col, translate

# Location of the raw Airbnb listings CSV inside the course datasets folder.
file_path = f"{DA.paths.datasets}/airbnb/sf-listings/sf-listings-2019-03-06.csv"

# Read the CSV with a header row and schema inference; the multiLine and escape
# options are passed through to the CSV reader unchanged.
df = spark.read.csv(
    file_path,
    header="true",
    inferSchema="true",
    multiLine="true",
    escape='"',
)

# Remove the "$" and "," characters from the price text, then cast it to double.
price_as_double = translate(col("price"), "$,", "").cast("double")
fixed_price_df = df.withColumn("price", price_as_double)

### Exploratory Data Analysis

In [0]:
# Compute summary statistics on a Spark DataFrame using .summary().
# Summarize fixed_price_df rather than the raw df: in df the price column is still
# text containing "$" and ",", so its numeric statistics would not be meaningful.
display(fixed_price_df.summary())

In [0]:
# Compute summary statistics on a Spark DataFrame using dbutils data summaries.
# Operates on fixed_price_df, in which price has already been cast to double.
dbutils.data.summarize(fixed_price_df)

In [0]:
# Remove outliers from a Spark DataFrame that are beyond or less than a designated threshold.
# Only rows whose price is strictly greater than the threshold are kept and displayed.
threshhold = 0  # lower bound for price (variable spelling kept as-is)
price_above_threshold = col("price") > threshhold
display(fixed_price_df.filter(price_above_threshold))

### Feature Engineering

In [0]:
# Identify why it is important to add indicator variables for missing values that have been imputed or replaced.

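# If you apply ANY imputation technique to categorical/numerical features, you MUST include an additional field indicating that the value was imputed. Otherwise the information that the original value was missing is lost once it has been replaced.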

In [0]:
# Describe when replacing missing values with the mode value is an appropriate way to handle missing values.

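# When the feature is a categorical variable.

# mean: the arithmetic average
# median: the middle value
# mode: the most frequent value
# constant: a fixed, user-chosen value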

In [0]:
# Compare and contrast imputing missing values with the mean value or median value.

# Display only the price column so its distribution can be inspected.
price_df = fixed_price_df.select("price")
display(price_df)

# Effective when the distribution is skewed or contains outliers (this describes the
# median, which is less sensitive to extreme values than the mean).

In [0]:
# Impute missing values with the mean or median value.
from pyspark.ml.feature import Imputer
from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType

# Cast every integer-typed column to double before imputing.
# NOTE(review): presumably done so the Imputer can process these columns; confirm
# whether the cast is still required for the Spark version in use.
integer_columns = [
    field.name
    for field in fixed_price_df.schema.fields
    if field.dataType == IntegerType()
]
doubles_df = fixed_price_df
for int_col in integer_columns:
    doubles_df = doubles_df.withColumn(int_col, col(int_col).cast("double"))

# Bullet-list text of the converted column names (built here but not printed in this cell).
columns = "\n - ".join(integer_columns)

# Columns whose missing values will be filled in.
impute_cols = [
    "bedrooms",
    "bathrooms",
    "beds",
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
]

# Add a "<column>_na" indicator (1.0 = value was null, 0.0 = value present) BEFORE
# imputing, because the nulls are overwritten in place by the Imputer below.
for impute_col in impute_cols:
    was_missing = when(col(impute_col).isNull(), 1.0).otherwise(0.0)
    doubles_df = doubles_df.withColumn(impute_col + "_na", was_missing)

# Estimator: fills each column with its median; the output columns overwrite the inputs.
imputer = Imputer(
    strategy="median",
    inputCols=impute_cols,
    outputCols=impute_cols,
)

# Fitting the estimator yields the transformer that performs the imputation.
imputer_model = imputer.fit(doubles_df)
imputed_df = imputer_model.transform(doubles_df)

In [0]:
# Describe the process of one-hot encoding categorical features.
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# Load the cleaned listings Delta table and make a seeded 80/20 train/test split.
file_path = f"{DA.paths.datasets}/airbnb/sf-listings/sf-listings-2019-03-06-clean.delta/"
df = spark.read.format("delta").load(file_path)
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# String columns are treated as categorical; each gets an "<name>Index" and an "<name>OHE" column.
categorical_cols = [name for name, dtype in train_df.dtypes if dtype == "string"]
index_output_cols = [f"{name}Index" for name in categorical_cols]
ohe_output_cols = [f"{name}OHE" for name in categorical_cols]

# Double columns are the numeric features; price is excluded from them.
numeric_cols = [
    name
    for name, dtype in train_df.dtypes
    if dtype == "double" and name != "price"
]
assembler_inputs = ohe_output_cols + numeric_cols

# Step 1: map each categorical string column to a numeric index column.
string_indexer = StringIndexer(
    inputCols=categorical_cols,
    outputCols=index_output_cols,
    handleInvalid="skip",
)

# Step 2: one-hot encode the index columns.
ohe_encoder = OneHotEncoder(
    inputCols=index_output_cols,
    outputCols=ohe_output_cols,
)

# Step 3: assemble the encoded and numeric columns into a single "features" vector.
vec_assembler = VectorAssembler(
    inputCols=assembler_inputs,
    outputCol="features",
)

# Step 4: either transform stage by stage, or wrap the stages in a Pipeline and fit once.
stages = [string_indexer, ohe_encoder, vec_assembler]
pipeline = Pipeline(stages=stages)

pipeline_model = pipeline.fit(train_df)

In [0]:
# Describe why one-hot encoding categorical features can be inefficient for tree-based models.

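# Applying one-hot encoding to high-cardinality categorical variables can be inefficient for tree-based methods. The algorithm ends up giving continuous variables more importance than the dummy variables, which obscures the ordering of feature importances and can degrade performance.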

### Training

In [0]:
# Perform random search as a method for tuning hyperparameters.

# Random search
# Randomly tries, for a specified number of search iterations, combinations drawn from the
# given parameter ranges (e.g. maxDepth: [2, 5, 10], numTrees: [5, 10]) and adopts the
# combination with the best accuracy (evaluation metric).
# The difference from grid search is that RandomizedSearchCV is used instead of GridSearchCV.

from sklearn import svm, datasets, linear_model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform
from sklearn.datasets import load_iris

iris = load_iris()
logistic = linear_model.LogisticRegression(
    solver='saga',
    tol=1e-2,
    max_iter=200,
    random_state=0,
)
# C is sampled from a uniform distribution; penalty is sampled from a fixed list.
distributions = {
    'C': uniform(loc=0, scale=4),
    'penalty': ['l2', 'l1'],
}
clfrand = RandomizedSearchCV(logistic, distributions, random_state=0)
searchrand = clfrand.fit(iris.data, iris.target)


# Grid search
# Exhaustively tries every combination of the given parameter values
# (e.g. maxDepth: [2, 5, 10], numTrees: [5, 10]) and adopts the combination with the
# best accuracy (evaluation metric).
iris = datasets.load_iris()
parameters = dict(kernel=('linear', 'rbf'), C=[1, 10])
svc = svm.SVC()
clfgs = GridSearchCV(svc, parameters)
searchgs = clfgs.fit(iris.data, iris.target)

In [0]:
# Describe the basics of Bayesian methods for tuning hyperparameters.

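# Uses Bayes' theorem: the results of the combinations evaluated so far are used to propose, as the next search candidates, combinations expected to make a target value (e.g. the loss) smaller; the combination with the best accuracy (evaluation metric) is finally adopted.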

In [0]:
# Describe why parallelizing sequential/iterative models can be difficult.

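# Gradient boosting is an iterative algorithm: each weak learner (small model) is built using the errors of the previous models,
# so distributing the work would require exchanging those errors between nodes, which makes parallelization difficult.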

In [0]:
# Understand the balance between compute resources and parallelization.

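# Note that when setting parallelism there is a trade-off between processing speed and adaptivity (≈ accuracy): the more trials run concurrently, the fewer earlier results are available to guide each new trial.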

# https://hyperopt.github.io/hyperopt/scaleout/spark/

In [0]:
# Parallelize the tuning of hyperparameters using Hyperopt and SparkTrials.
# NOTE(review): fmin, tpe, Trials, SparkTrials, np, objective_function and search_space
# are not defined in this cell; they must already be in scope when it runs.

# Single-machine hyperopt with a distributed training algorithm (e.g. MLlib).
# Use this form when tuning a Spark ML model with hyperopt.
num_evals = 4
trials = Trials()
best_hyperparam = fmin(
    fn=objective_function,
    space=search_space,
    algo=tpe.suggest,
    max_evals=num_evals,
    trials=trials,
    rstate=np.random.default_rng(42),
)

# Distributed hyperopt with single-machine training algorithms (e.g. scikit-learn) with the SparkTrials class.
# Use this form when tuning a scikit-learn model with hyperopt.
# This second search overwrites best_hyperparam from the search above.
num_evals = 4
spark_trials = SparkTrials(parallelism=2)
best_hyperparam = fmin(
    fn=objective_function,
    space=search_space,
    algo=tpe.suggest,
    trials=spark_trials,
    max_evals=num_evals,
    rstate=np.random.default_rng(42),
)

In [0]:
# Identify the usage of SparkTrials as the tool that enables parallelization for tuning single-node models.

# parallelism The maximum number of trials to evaluate concurrently. Greater parallelism allows scale-out testing of more hyperparameter settings. Defaults to Spark SparkContext.defaultParallelism.


### Evaluation and Selection

In [0]:
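# Describe cross-validation and the benefits and downsides of using cross-validation over a train-validation split.

# In n-fold cross-validation, the model is trained on (n-1)/n of the data and evaluated on the remaining 1/n (the hold-out set). This process is repeated n times so that every fold is used as the validation set once, and the n results are then averaged.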

In [0]:
# Perform cross-validation as a part of model fitting.
# NOTE(review): rf, param_grid, RegressionEvaluator and CrossValidator are not defined
# in this cell; they must already be in scope when it runs.
# NOTE(review): vec_assembler, as built in the one-hot-encoding cell, takes the "*OHE"
# columns as inputs, but neither stage list below contains ohe_encoder. Confirm that
# vec_assembler has been redefined for this cell.

# Option 1: put the pipeline inside the cross-validator.
# pros: lower risk of data leakage
# cons: estimators/transformers such as the string indexer are re-applied to every fold's dataset
stages = [string_indexer, vec_assembler, rf]
pipeline = Pipeline(stages=stages)
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction")
cv = CrossValidator(
    estimator=pipeline,
    evaluator=evaluator,
    estimatorParamMaps=param_grid,
    numFolds=3,
    seed=42,
)
cv_model = cv.fit(train_df)

# Option 2: put the cross-validator inside the pipeline.
# pros: the data is transformed once before being split into folds, so it is expected to run faster
# cons: data leakage is possible
cv = CrossValidator(
    estimator=rf,
    evaluator=evaluator,
    estimatorParamMaps=param_grid,
    numFolds=3,
    seed=42,
)
stages_with_cv = [string_indexer, vec_assembler, cv]
pipeline = Pipeline(stages=stages_with_cv)
pipeline_model = pipeline.fit(train_df)

In [0]:
# Identify the number of models being trained in conjunction with a grid-search and cross-validation process.

# Number of parameter combinations x number of folds.
# For the grid below: (2 * 2) * 3 = 12 fits.
# (CrossValidator also refits the best combination once on the full dataset after the
# search; that final fit is not counted in the 12.)
# NOTE(review): ParamGridBuilder, rf, RegressionEvaluator, CrossValidator and pipeline
# are not defined in this cell; they must already be in scope when it runs.
param_grid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [2, 5])
    .addGrid(rf.numTrees, [5, 10])
    .build()
)
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction")
cv = CrossValidator(
    estimator=pipeline,
    evaluator=evaluator,
    estimatorParamMaps=param_grid,
    numFolds=3,
    seed=42,
)

In [0]:
# Describe Recall and F1 as evaluation metrics.

# See the slides (PPT).

In [0]:
# Identify the need to exponentiate the RMSE when the log of the label variable is used.

# When the label distribution is skewed, taking its log can bring it closer to a normal
# distribution, which may improve model accuracy.
# RMSE is expressed in the units of the label, so to interpret it correctly the predictions
# must be converted back from the log scale to the original scale by exponentiating them.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import RFormula
from pyspark.sql.functions import col, exp, log

log_train_df = train_df.withColumn("log_price", log(col("price")))  # training data
log_test_df = test_df.withColumn("log_price", log(col("price")))  # test data

# Use every column except price as a feature and log_price as the label.
r_formula = RFormula(formula="log_price ~ . - price", featuresCol="features", labelCol="log_price", handleInvalid="skip")

# NOTE(review): lr is not defined in this cell; presumably a regression estimator created
# earlier in the notebook. Confirm it is in scope before running.
lr.setLabelCol("log_price").setPredictionCol("log_pred")
pipeline = Pipeline(stages=[r_formula, lr])
pipeline_model = pipeline.fit(log_train_df)
pred_df = pipeline_model.transform(log_test_df)

# Without exponentiating: compare the log-scale prediction with the log-scale label.
# The resulting RMSE is in log units, so it must be scored against log_price, not price.
exp_df_noexp = pred_df.withColumn("prediction", col("log_pred"))

regression_evaluator_noexp = RegressionEvaluator(labelCol="log_price", predictionCol="prediction")
rmse_noexp = regression_evaluator_noexp.setMetricName("rmse").evaluate(exp_df_noexp)
print(f"RMSE is {rmse_noexp}")

# With exponentiating: convert the prediction back to the price scale and compare it
# with the original price label.
exp_df = pred_df.withColumn("prediction", exp(col("log_pred")))

regression_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction")
rmse = regression_evaluator.setMetricName("rmse").evaluate(exp_df)
print(f"RMSE is {rmse}")