# spark-featurization-demo

This Databricks notebook is part of the tutorial ["Using Spark for model featurization with Spell"](https://spell.run/blog/using-spark-for-model-featurization-with-spell-XnEedBUAACcAjfTV). In this notebook we step through:

* Loading a dataset in Spark
* Featurizing it and building a simple baseline model on top of it using Spark ML
* Writing it out to an S3 bucket for long-term storage and for use within a Spell workspace

For an explanation of what the code does, see the complementary blog post, or alternatively try playing with and modifying the code cells yourself.

In [2]:
#
# Make sure you have uploaded the dataset to the train.csv path in the filestore first!
# See the README.md in the repo for details on how to do this.
#

import pandas as pd
import os
# Load the uploaded training CSV from the Databricks filestore into a Spark
# DataFrame. The first row is treated as the header and column types are
# inferred from the data rather than declared up front.
train = (
  spark.read
  .options(header="true", inferSchema="true", sep=",")
  .csv("/FileStore/tables/train.csv")
)
# Render the DataFrame as a table in the notebook output.
display(train)

Id,Hazard,T1_V1,T1_V2,T1_V3,T1_V4,T1_V5,T1_V6,T1_V7,T1_V8,T1_V9,T1_V10,T1_V11,T1_V12,T1_V13,T1_V14,T1_V15,T1_V16,T1_V17,T2_V1,T2_V2,T2_V3,T2_V4,T2_V5,T2_V6,T2_V7,T2_V8,T2_V9,T2_V10,T2_V11,T2_V12,T2_V13,T2_V14,T2_V15
1,1,15,3,2,N,B,N,B,B,D,7,B,B,15,1,A,B,N,36,11,N,10,B,2,37,1,11,6,Y,N,E,2,2
2,4,16,14,5,H,B,N,B,B,C,12,B,B,10,3,A,B,Y,78,10,Y,17,C,2,22,1,18,5,Y,Y,E,2,1
3,1,10,10,5,N,K,N,B,B,E,12,H,B,15,1,A,R,Y,71,21,Y,13,C,6,37,2,14,6,Y,Y,E,6,1
4,1,18,18,5,N,K,N,B,B,E,3,H,B,15,1,A,R,N,71,13,N,15,A,2,25,1,1,6,Y,N,C,2,6
5,1,13,19,5,N,H,N,B,B,E,7,H,B,10,1,A,J,N,75,10,Y,11,B,1,22,1,2,7,N,N,E,1,1
12,1,14,12,2,N,K,N,B,B,E,12,H,B,15,1,A,K,N,65,10,N,14,A,1,37,1,5,7,N,N,A,1,9
15,5,8,17,1,E,K,N,B,B,E,8,H,B,20,1,A,K,N,100,14,N,16,A,2,25,1,20,3,Y,N,D,2,11
19,1,14,20,4,E,K,N,B,B,E,3,I,B,15,1,D,R,N,83,13,Y,5,A,2,40,1,18,7,Y,N,E,3,2
21,1,8,2,2,W,C,N,D,B,D,8,B,B,5,1,A,D,N,20,12,Y,4,B,1,34,1,13,5,N,N,A,2,1
22,1,5,4,3,B,I,N,D,B,F,8,H,B,20,3,A,K,Y,88,7,N,14,A,4,40,1,6,3,Y,Y,E,4,1


In [3]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml import Pipeline

# Columns whose inferred dtype is string; each one is index-encoded into a
# new numeric column named "<original>_C".
str_cols = [name for name, dtype in train.dtypes if dtype == 'string']
indexers = [
  StringIndexer(inputCol=name, outputCol=f"{name}_C")
  for name in str_cols
]

# Model inputs: every column that is not a string column, the row Id, or the
# Hazard target, followed by the index-encoded versions of the string columns.
excluded = {*str_cols, 'Id', 'Hazard'}
input_cols = [name for name in train.columns if name not in excluded]
input_cols.extend(f"{name}_C" for name in str_cols)

# Pack all model inputs into a single vector column.
assembler = VectorAssembler(inputCols=input_cols, outputCol='indexedFeatures')

# Fit the indexers + assembler on the training frame, apply them to it, and
# expose the Hazard target under the name "label".
trans = Pipeline(stages=[*indexers, assembler]).fit(train)
X = trans.transform(train).withColumnRenamed("Hazard", "label")

In [4]:
from pyspark.ml.regression import LinearRegression

# Fit a baseline linear regression on the assembled feature vector; `clf`
# ends up bound to the fitted model rather than the unfitted estimator.
clf = LinearRegression(labelCol='label', featuresCol='indexedFeatures').fit(X)

# Score the same frame the model was fitted on; transform() adds the model's
# prediction column alongside the existing ones.
X_w_y_pred = clf.transform(X)

In [5]:
from pyspark.ml.evaluation import RegressionEvaluator

# Compute RMSE between the "label" and "prediction" columns.
# NOTE(review): X_w_y_pred holds predictions for the same frame the model was
# fitted on, so despite the wording of the message below this is a
# training-set error, not a held-out test error.
metric = RegressionEvaluator(
  metricName='rmse',
  labelCol='label',
  predictionCol='prediction',
)
rmse = metric.evaluate(X_w_y_pred)
print("Root Mean Squared Error (RMSE) on test data = %g" % (rmse,))

`Root Mean Squared Error (RMSE) on test data = 3.91967`

In [7]:
#
# Replace this S3 path with a path into an S3 bucket you have access to!
# Alternatively, if you are using GCS, try a GCS bucket path instead.
#
output_path = 's3a://spell-share/aleksey/scratch/liberty-mutual-group-property-inspection/train'

# Write the featurized frame out as CSV with a header row, replacing anything
# already at the destination. The assembled vector column is dropped before
# writing, and the data is repartitioned to a single partition first.
(
  X.drop('indexedFeatures')
  .repartition(1)
  .write
  .format("csv")
  .options(header="true")
  .mode("Overwrite")
  .save(output_path)
)