In [1]:
import mmlspark

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("SParkCrossVal-2")
    .getOrCreate()
)

In [2]:
# Read train.csv, treating the first row as a header and letting Spark infer
# each column's type.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("train.csv")
)

In [3]:
# Assemble the input columns into a single "features" vector column.

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Every column from the third one onward is packed into the feature vector.
# NOTE(review): assumes the first two columns are an id and the target —
# confirm against the header of train.csv.
feature_columns = df.columns[2:]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Adds the "features" column alongside the original columns.
output = assembler.transform(df)


In [4]:
# Project down to the target column (renamed to "label") and the assembled
# feature vector.
inputDF = output.select(output["target"].alias("label"), "features")


In [5]:
# Preview the first five label/features rows.
inputDF.show(n=5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[8.9255,-6.7863,1...|
|    0|[11.5006,-4.1473,...|
|    0|[8.6093,-2.7457,1...|
|    0|[11.0604,-2.1518,...|
|    0|[9.8369,-1.4834,1...|
+-----+--------------------+
only showing top 5 rows



In [8]:
from mmlspark import LightGBMClassifier
# Base LightGBM estimator. Parameters that also appear in the cross-validation
# grid (e.g. learningRate, numLeaves) are taken from the grid during tuning.
lgb_model = LightGBMClassifier(
    learningRate=0.3,
    numIterations=100,
    numLeaves=31,
)

In [9]:
from pyspark.ml import Pipeline
# Single-stage pipeline wrapping the LightGBM estimator.
pipeline_stages = [lgb_model]
pipeline = Pipeline(stages=pipeline_stages)


In [10]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [20]:
# Hyperparameter grid for the cross-validated LightGBM fit.
#
# Each parameter must be added exactly once: ParamGridBuilder keeps a single
# value list per parameter, so a repeated addGrid() for the same parameter
# overwrites the earlier list instead of extending it. To search several
# numLeaves values, put every candidate in the one list below (LightGBM
# requires numLeaves > 1).
paramGrid = (
    ParamGridBuilder()
    .addGrid(lgb_model.numLeaves, [13])
    .addGrid(lgb_model.baggingFreq, [5])
    .addGrid(lgb_model.baggingFraction, [0.331])
    .addGrid(lgb_model.boostFromAverage, [False])
    .addGrid(lgb_model.boostingType, ['gbdt'])
    .addGrid(lgb_model.featureFraction, [0.0405])
    .addGrid(lgb_model.learningRate, [0.0083])
    .addGrid(lgb_model.maxDepth, [-1])
    .addGrid(lgb_model.minSumHessianInLeaf, [10.0])
    .addGrid(lgb_model.objective, ['binary'])
    .addGrid(lgb_model.verbosity, [1])
    .build()
)


In [27]:
# Score each candidate by the accuracy of its predicted label.
# NOTE(review): the grid trains with objective='binary'; if the two classes
# are imbalanced, accuracy mostly reflects the majority class. Consider
# BinaryClassificationEvaluator(metricName="areaUnderROC") — confirm against
# the label distribution.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

# 7-fold cross-validation over the parameter grid. The explicit seed pins the
# fold assignment so the reported metrics can be reproduced across runs.
crossVal = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=7,
    seed=42,
)

In [28]:
# Run the cross-validated search on the label/features frame.
cvModel = crossVal.fit(dataset=inputDF)

In [29]:
# Mean evaluator metric across the folds, one entry per parameter map in
# paramGrid (same order).
cvModel.avgMetrics

[0.8995080266554882]

In [30]:
# The best model is a fitted pipeline; its only stage is the LightGBM model.
best_lgbm_stage = cvModel.bestModel.stages[0]
print(best_lgbm_stage)

LightGBMClassificationModel_48d4b675526758415e15
