# Hyperparameter Tuning with Grid Search Using a Random Forest Classifier

In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Location of the SF Airbnb listings Delta table, rooted at DA.paths.datasets.
file_path = f"{DA.paths.datasets}/airbnb/sf-listings/sf-listings-2019-03-06-clean.delta/"

# Binary label: the comparison (price >= 150) cast to int. The raw price column
# is dropped afterwards so it cannot leak into the features.
price_class = (col("price") >= 150).cast("int")
airbnb_df = (
    spark.read.format("delta")
    .load(file_path)
    .withColumn("priceClass", price_class)
    .drop("price")
)

# 80/20 train/test split with a fixed seed.
train_df, test_df = airbnb_df.randomSplit([0.8, 0.2], seed=42)

# Every string-typed column is treated as categorical; each one gets an indexed
# counterpart named "<column>Index".
categorical_cols = [field for (field, data_type) in train_df.dtypes if data_type == "string"]
index_output_cols = [f"{name}Index" for name in categorical_cols]

# handleInvalid="skip" filters out rows with invalid data instead of raising an error.
string_indexer = StringIndexer(
    inputCols=categorical_cols,
    outputCols=index_output_cols,
    handleInvalid="skip",
)

# Numeric features are the double-typed columns; the label column is excluded
# explicitly. These are plain Python booleans, so the logical `and` is used
# rather than the bitwise `&`.
numeric_cols = [
    field
    for (field, data_type) in train_df.dtypes
    if data_type == "double" and field != "priceClass"
]

# Indexed categoricals first, then numeric columns, packed into one "features" vector.
assembler_inputs = index_output_cols + numeric_cols
vec_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

In [None]:
# Random forest

from pyspark.ml.classification import RandomForestClassifier

# Random forest that predicts the binary priceClass label; the seed is fixed.
# NOTE(review): maxBins=40 is presumably set so that the largest indexed
# categorical feature fits into the tree bins — confirm against the category counts.
rf = RandomForestClassifier(
    labelCol="priceClass",
    maxBins=40,
    seed=38,
)

## Grid Search 

Let's define a grid of hyperparameters to test:
  - maxDepth: maximum depth of each decision tree (Use the values **`2, 5, 10`**)
  - numTrees: number of decision trees in the forest (Use the values **`10, 20, 100`**)

In [None]:
from pyspark.ml.tuning import ParamGridBuilder

# Three maxDepth values x three numTrees values = nine candidate parameter maps.
grid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [2, 5, 10])
    .addGrid(rf.numTrees, [10, 20, 100])
    .build()
)

In [None]:
# Evaluator 

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Scores predictions against the priceClass label using area under the ROC curve.
evaluator = BinaryClassificationEvaluator(
    labelCol="priceClass",
    metricName="areaUnderROC",
)

## Cross Validation


In [None]:
from pyspark.ml.tuning import CrossValidator

# 3-fold cross-validation of `rf` over every parameter map in `grid`,
# scored by `evaluator`; the fold assignment is seeded.
cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds=3,
    seed=38,
)

## Pipeline

In [None]:
# Stage order: index categoricals -> assemble the feature vector -> cross-validated forest.
stages = [
    string_indexer,
    vec_assembler,
    cv,
]
pipeline = Pipeline(stages=stages)

# Fitting runs every stage, including the grid search, on the training split.
pipeline_model = pipeline.fit(train_df)

## Best Hyperparameters

In [None]:
# The cross-validator was the last pipeline stage, so its fitted model is the
# last stage of the fitted pipeline.
cv_model = pipeline_model.stages[-1]

# Model selected by the cross-validated grid search.
rf_model = cv_model.bestModel

# To inspect the average metric of every candidate parameter map, run:
# list(zip(cv_model.getEstimatorParamMaps(), cv_model.avgMetrics))

# Print every parameter of the selected model with its documentation and value.
print(rf_model.explainParams())

## Feature Importance

In [None]:
import pandas as pd

# Pair each assembler input column with the matching entry of the forest's
# feature-importance vector.
feature_names = vec_assembler.getInputCols()
importances = rf_model.featureImportances
pandas_df = pd.DataFrame(
    list(zip(feature_names, importances)),
    columns=["feature", "importance"],
)

# Most important features first; the bare name on the last line renders the table.
top_features = pandas_df.sort_values(by="importance", ascending=False)
top_features

In [None]:
# Evaluate the fitted pipeline on the held-out test split: run the test rows
# through every stage, then score the predictions with the same evaluator
# that drove the grid search.
pred_df = pipeline_model.transform(test_df)
area_under_roc = evaluator.evaluate(pred_df)

print(f"Area under ROC is {area_under_roc:.2f}")