In [0]:
!pip install mlflow

In [0]:
!pip install hyperopt

In [0]:
#Import Libraries
import mlflow
import mlflow.spark
from hyperopt import fmin, tpe, hp, SparkTrials, Trials, STATUS_OK
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

In [0]:
# Create (or reuse) a local SparkSession that pulls in the MLflow Spark package.
# Note: on platforms such as Databricks, where the SparkSession already exists,
# attach org.mlflow:mlflow-spark:1.11.0 to the cluster as a library instead.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.jars.packages", "org.mlflow:mlflow-spark:1.11.0")
    .getOrCreate()
)

## Basic EDA

In [0]:
#Load the dataset from its parquet file
data = spark.read.format("parquet").load("/tmp/output/higgsdf.parquet")

In [0]:
#Preview the first five rows
data.show(n=5)

In [0]:
#Check the label distribution
# DataFrame.show() only prints and returns None, so keep the aggregated
# DataFrame in `labelcounts` and display it as a separate step.
labelcounts = data.groupBy('label').count()
labelcounts.show()

In [0]:
# PySpark SQL function library, aliased as `f`
import pyspark.sql.functions as f

# Number of null values per column: `when` yields a non-null literal only for
# rows where the column is null, and `count` tallies those non-null results.
data_agg = data.agg(
    *(
        f.count(f.when(f.col(name).isNull(), name)).alias(name)
        for name in data.columns
    )
)
data_agg.show()

In [0]:
# Names of the 28 input columns: feature1 ... feature28
features = [f"feature{idx}" for idx in range(1, 29)]

## Vanilla Pipeline

In [0]:
# Ordered list of pipeline stages; later cells append to it
stages = list()

In [0]:
#Start with a VectorAssembler that combines the feature columns into one vector column
vecAssembler = VectorAssembler(outputCol="features", inputCols=features)
stages.append(vecAssembler)

In [0]:
#Inspect the assembled vector of the first row
testdf = vecAssembler.transform(data)
testdf.head()["features"]

In [0]:
#Rescale the assembled features into the [-1.0, 1.0] range
scaler = MinMaxScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    min=-1.0,
    max=1.0,
)
stages.append(scaler)
stages

In [0]:
#Try the scaler on the assembled data
# Fitting computes the summary statistics and produces a MinMaxScalerModel
scalerModel = scaler.fit(dataset=testdf)

# Apply the fitted model to rescale each feature into [min, max]
testdf2 = scalerModel.transform(dataset=testdf)

In [0]:
#Build baseline classifier
# Train on the MinMaxScaler output column ("scaledFeatures"). Pointing the
# classifier at the raw assembled "features" column would leave the scaler
# stage in the pipeline with no effect on the model.
lr = LogisticRegression(labelCol="label",
                        featuresCol="scaledFeatures",
                        maxIter=10,
                        regParam=0.3)

stages += [lr]

In [0]:
#Split the data into train (80%) and test (20%) sets
# A fixed seed makes the random split repeatable when the notebook is re-run
train,test = data.randomSplit([0.8, 0.2], seed=42)

In [0]:
#Label distribution within the training split
train.groupBy("label").count().show()

In [0]:
# Enable MLflow autologging for pyspark.ml.
# Only a missing module (ImportError) or a missing autolog function
# (AttributeError) means this MLflow version lacks support; any other error
# is a real failure and is left to propagate instead of being hidden.
try:
  import mlflow.pyspark.ml
  mlflow.pyspark.ml.autolog()
except (ImportError, AttributeError):
  print(f"Your version of MLflow ({mlflow.__version__}) does not support pyspark.ml for autologging. To use autologging, upgrade your MLflow client version or use Databricks Runtime for ML 8.3 or above.")

In [0]:


#Build the pipeline, fit it on the training split, and log the F1 score
#of the held-out test split to MLflow
with mlflow.start_run(run_name='logreg_baseline') as run:
  Regpipeline = Pipeline(stages = stages)

  model = Regpipeline.fit(train)

  # Score the held-out split; `predictions` and `evaluator` were previously
  # undefined, so this cell failed with a NameError on a fresh kernel.
  predictions = model.transform(test)
  evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                predictionCol="prediction",
                                                metricName="f1")

  validation_metric = evaluator.evaluate(predictions)
  mlflow.log_metric("val_f1_score", validation_metric)

In [0]:
# Generate predictions on the held-out test split rather than the training
# data, so the inspected output reflects rows the model was not fitted on.
predicted = model.transform(test)

In [0]:
# Show the inputs next to the model outputs for the first rows
predicted.select(
    ['features', 'label', 'rawPrediction', 'probability', 'prediction']
).show()