In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler, SQLTransformer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import numpy as np
import functools
from pyspark.ml.classification import RandomForestClassifier

In [3]:
# Reuse the active Spark session if there is one; otherwise create it under this app name.
session_builder = SparkSession.builder.appName('RF_trainer_2')
spark = session_builder.getOrCreate()


In [16]:
# Relative path of the training CSV data.
path_train = '../data/train_final_0'
print('Carga del TRAIN', path_train)

# Configure the reader: first row is a header, comma-separated, column types inferred.
csv_reader = (
    spark.read
    .option("header", "true")
    .option("sep", ',')
    .option("inferschema", "true")
)
train = csv_reader.csv(path_train)

# Optional caching / row-count check, left disabled:
# train.persist()
# print("Numero de casos en el train: %d" % train.count())



Carga del TRAIN ../data/train_final_0


In [None]:
# Print the first 20 rows of the raw training frame (long cell values are truncated).
train.show(n=20, truncate=True)

In [17]:
# Feature columns: every input column except 'MachineIdentifier' and the label 'HasDetections'.
train_cols = list(train.columns)
for excluded_col in ('MachineIdentifier', 'HasDetections'):
    train_cols.remove(excluded_col)  # raises ValueError if the column is absent

# Pack the feature columns into a single vector column so the random forest can consume it.
print('Conversion de datos a VectorAssembler')
assembler_features = VectorAssembler(outputCol='features', inputCols=train_cols)
train_data = assembler_features.transform(train)


Conversion de datos a VectorAssembler


In [18]:
# Keep only the identifier, the label and the assembled feature vector.
# All other columns are dropped in a single call; the previous version rebound
# `x` from a set to a column name, used `re` (the stdlib regex module's name) as
# a variable, and reassigned the DataFrame once per dropped column.
columns_to_keep = {'MachineIdentifier', 'HasDetections', 'features'}
columns_to_drop = [name for name in train_data.columns if name not in columns_to_keep]
train_data = train_data.drop(*columns_to_drop)

In [None]:
# Preview the first 15 rows without truncating cell contents.
# The assembled vector column is named 'features' (lowercase); using the exact
# name keeps this working even when spark.sql.caseSensitive is enabled.
train_data.select('MachineIdentifier', 'features', 'HasDetections').show(15, False)

In [19]:
# Print the first 20 rows of the pruned frame (long cell values are truncated).
train_data.show(n=20, truncate=True)

+--------------------+-------------+--------------------+
|   MachineIdentifier|HasDetections|            features|
+--------------------+-------------+--------------------+
|0000a998901524f98...|            0|[0.0,7.0,0.0,0.0,...|
|0000a998901524f98...|            0|[0.0,7.0,0.0,0.0,...|
|000141cf4d5de00fa...|            1|[0.0,7.0,0.0,0.0,...|
|000141cf4d5de00fa...|            1|[0.0,7.0,0.0,0.0,...|
|0001cc2b237f8b5fb...|            1|[0.0,7.0,0.0,0.0,...|
|0001cc2b237f8b5fb...|            1|(62,[1,4,5,6,7,8,...|
|00022e56d5eede2d3...|            1|[0.0,7.0,0.0,0.0,...|
|00022e56d5eede2d3...|            1|(62,[1,4,5,6,7,8,...|
|0002ef620e9509ece...|            1|[0.0,7.0,0.0,0.0,...|
|0002ef620e9509ece...|            1|[0.0,7.0,0.0,0.0,...|
|00043933ab74b6d83...|            0|[0.0,7.0,0.0,0.0,...|
|00043933ab74b6d83...|            0|[0.0,7.0,0.0,0.0,...|
|0004a2a4344d5dc3f...|            1|[0.0,7.0,0.0,0.0,...|
|0004a2a4344d5dc3f...|            1|[0.0,7.0,0.0,0.0,...|
|0004ce3ed71a1

In [20]:
numFolds = 3
seed = 42  # fixed seed shared by the forest and the cross-validation fold split

print('Creamos modelo')
rf = RandomForestClassifier(labelCol="HasDetections", featuresCol="features", seed=seed)

# The evaluator must score the classifier's output column ("rawPrediction").
# Pointing it at "features" would compute the metric on the input vector
# instead of on the model's predictions.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          labelCol="HasDetections",
                                          metricName="areaUnderROC")

# Single-stage pipeline: the random forest is the only (and therefore last) stage.
pipeline = Pipeline(stages=[rf])

# Hyper-parameter grid explored by the cross-validator.
paramGrid = ParamGridBuilder()\
    .addGrid(rf.numTrees, [3, 10])\
    .build()
# .addGrid(...)  # Add other parameters

print('Creamos cross-validador con {} folds'.format(numFolds))
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=numFolds,
    seed=seed)



Creamos modelo
Creamos cross-validador con 3 folds


In [21]:
# Fit the cross-validator on the assembled training data; this is where the
# pipeline is trained for each fold and parameter combination.
print('Entrenando modelo...')
model = crossval.fit(dataset=train_data)
print('Fin del train')


Entrenando modelo...


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/rde/stuff_workspace/entornos/microkaggle/lib/python3.6/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/rde/stuff_workspace/entornos/microkaggle/lib/python3.6/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/home/rde/stuff_workspace/entornos/microkaggle/lib/python3.6/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:44849)
Traceback (most recent call last)

Py4JError: An error occurred while calling o337.fit

In [None]:
# `model.bestModel` is the fitted PipelineModel, which has no `summary`
# attribute; the trained random forest is its last stage.
# `toDebugString` is the forest's full textual description (can be long).
print(model.bestModel.stages[-1].toDebugString)

In [None]:
# `model.bestModel` is a PipelineModel and exposes no callable `summary`.
# Report the fitted forest's per-feature importances instead.
# NOTE(review): the originally intended "summary" is unclear - confirm this is the wanted report.
print(model.bestModel.stages[-1].featureImportances)

In [None]:
# The CrossValidatorModel itself has no `stages`; the fitted pipeline lives in
# `bestModel`. That pipeline was built with a single stage (the random forest),
# so the classifier is its last stage rather than index 2.
rfModel = model.bestModel.stages[-1]
print(rfModel)  # summary only