In [None]:
# Sandbox name (left empty here as a placeholder).
SANDBOX_NAME = ''
# Base directory that the CSV paths in this notebook are built from.
DATA_PATH = "/data/sandboxes/{}/data/".format(SANDBOX_NAME)



# Spark ML Clasificación

Cargamos un dataset con observaciones diarias del tiempo procedentes de varias estaciones meteorológicas australianas. La variable target RainTomorrow significa: ¿lloverá mañana? (Yes o No)



### Crear SparkSession

In [None]:
# Respuesta

from pyspark.sql import SparkSession

# Obtain the SparkSession used by every cell below (getOrCreate reuses an
# already-active session when there is one).
spark = SparkSession.builder.getOrCreate()



### Cargar datos y comprobar schema (dataset inicial para ver las variables originales)

In [None]:
# Respuesta

# Read the original CSV (comma-separated, header row present), letting Spark
# infer the column types, and print the resulting schema.
weather = (
    spark.read
    .options(sep=',', header=True, inferSchema=True)
    .csv(DATA_PATH + 'data/weather_aus.csv')
)

weather.printSchema()

In [None]:
# Respuesta

# Print a preview of the raw dataset loaded above.
weather.show()



### Cargar datos y comprobar schema (dataset con variables dummies)

In [None]:
# Respuesta

# Replace `weather` with the prepared CSV (comma-separated, header row
# present, inferred column types) and print its schema.
weather = (
    spark.read
    .options(sep=',', header=True, inferSchema=True)
    .csv(DATA_PATH + 'data/weather_aus_prepared.csv')
)

weather.printSchema()

In [None]:
# Respuesta

# Print a single row, enough to inspect the columns of the prepared dataset.
weather.show(1)



* Verificar valores nulos

In [None]:
# Respuesta

from pyspark.sql import functions as F

# Count the nulls of every column in ONE aggregation (a single Spark job)
# instead of running a separate filter + count job per column.
null_counts = weather.select(
    [F.sum(F.col(column).isNull().cast("int")).alias(column) for column in weather.columns]
).first().asDict()

for column in weather.columns:
    # sum() yields None when the DataFrame has no rows; None is falsy, so an
    # empty DataFrame is reported as having no nulls, as a row count would.
    if null_counts[column]:
        print("\tBe careful: there are null values in the column '{}'".format(column))
    else:
        print("The column '{}' does not have null values".format(column))



Nos disponemos a lanzar un algoritmo de clasificación para predecir si lloverá o no mañana



#### Pasos previos

* VectorAssembler con variables deseadas

Se toman todas aquellas que son numéricas menos la objetivo (en este caso es 'RainTomorrow')

In [None]:
# Respuesta

from pyspark.ml.feature import VectorAssembler

# Feature inputs: every column except the target. The assembler's own output
# column is excluded as well, so that re-running this cell (when `weather`
# already carries 'assembled_features') does not feed the previous feature
# vector back in as an input.
variables_vector_assembler = [
    element for element in weather.columns
    if element not in ('RainTomorrow', 'assembled_features')
]

vector_assemmbler = VectorAssembler(inputCols=variables_vector_assembler, outputCol='assembled_features')

# Drop any 'assembled_features' left over from an earlier run of this cell
# (drop() is a no-op when the column is absent): VectorAssembler refuses to
# transform a DataFrame that already contains its output column.
weather = vector_assemmbler.transform(weather.drop('assembled_features'))

weather.show()



- Partir dataset entre train y test

In [None]:
# Respuesta

# 80/20 train/test split. The seed is fixed (same value as the GBT model
# further down) so the split can be reproduced between runs on the same input.
weather_train, weather_test = weather.randomSplit([0.8, 0.2], seed=1023)



### Regresión Logística

In [None]:
# Respuesta

from pyspark.ml.classification import LogisticRegression

# Probability cut-off passed to the model as `threshold`.
# Other values worth trying: 0.01, 0.15, 0.30, 0.5 (the default).
thld_label_1 = 0.45
logistic_regression = LogisticRegression(
    featuresCol='assembled_features',
    labelCol='RainTomorrow',
    threshold=thld_label_1,
)
print("Logistic regression threshold for 'RainTomorrow = 1.0' is: ", logistic_regression.getThreshold())

# Fit on the training split and report the learned parameters.
logistic_regression_model = logistic_regression.fit(weather_train)
print("Logistic regression coefficients: {}".format(logistic_regression_model.coefficientMatrix))
print("Logistic regression intercept: {}".format(logistic_regression_model.interceptVector))

# Score the held-out split and preview the first rows without truncating cells.
weather_logistic_regression = logistic_regression_model.transform(weather_test)

weather_logistic_regression.show(n=5, truncate=False)

In [None]:
# Show the true label next to the three output columns of the logistic regression model.
weather_logistic_regression.select("RainTomorrow", "rawPrediction", "probability","prediction").show()



Lo primero que debemos entender es que el modelo de regresión logística de la librería Spark busca clasificar la variable _target_ 'label = 1.0'. Esta clasificación tiene un umbral de corte de probabilidad, que está asociado con el parámetro _threshold_ (se recomienda probar distintos valores).

Otra cosa importante del modelo de regresión logística es que, cuando se utiliza para predecir con el método *transform*, se obtienen 3 columnas: *rawPrediction*, *probability* y *prediction*.

A modo de ejemplo, se analiza un registro:

| rawPrediction | probability | prediction |
| :----------: | :----------: | :----------: |
| [3.38, -3.38] | [0.96, 0.03]| 0.0|

En la columna *rawPrediction* existen dos valores; el valor -3.38 sale de aplicar la siguiente fórmula: $\beta X +\beta_0$. Los valores de la columna *probability* salen de aplicar la función sigmoide. Finalmente, la columna *prediction* aplica el _threshold=0.45_ sobre el segundo valor de probabilidad (es decir, 0.03) y determina que dicho registro debe ser clasificado como 0.0.




### Random Forest

In [None]:
# Respuesta

from pyspark.ml.classification import RandomForestClassifier

# Random forests are stochastic, so the seed is fixed (same value as the GBT
# model below) to make the fitted model repeatable between runs.
random_forest = RandomForestClassifier(featuresCol='assembled_features', labelCol='RainTomorrow',
                                       maxDepth=8, numTrees=128, impurity="gini", seed=1023)

# Fit on the training split and report the ensemble size and feature importances.
random_forest_model = random_forest.fit(weather_train)
print('Learned classification random forest model:')
print("\t",random_forest_model.getNumTrees)
print("\t",random_forest_model.featureImportances)

# Score the held-out split and preview the first rows.
weather_random_forest = random_forest_model.transform(weather_test)

weather_random_forest.show(5)

In [None]:
# Show the true label next to the three output columns of the random forest model.
weather_random_forest.select("RainTomorrow", "rawPrediction", "probability","prediction").show()



### Gradient Boosting Trees

In [None]:
# Respuesta

from pyspark.ml.classification import GBTClassifier

# Gradient-boosted trees: 8 boosting iterations, trees up to depth 10, fixed seed.
gbt = GBTClassifier(
    featuresCol='assembled_features',
    labelCol='RainTomorrow',
    maxIter=8,
    maxDepth=10,
    seed=1023,
)

# Fit on the training split, then score the held-out split.
gbt_model = gbt.fit(weather_train)
weather_gbt = gbt_model.transform(weather_test)

weather_gbt.show(n=5)

In [None]:
# Show the true label next to the three output columns of the GBT model.
weather_gbt.select("RainTomorrow", "rawPrediction", "probability","prediction").show()


# Evaluación de los modelos



Importamos las librerias necesarias para evaluar los modelos

In [None]:
# Respuesta

from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics



Imprimir resultados para los distintos modelos

In [None]:
# Respuesta

# Area-based metrics are computed from the model's continuous scores
# ('rawPrediction'), not from the already-thresholded 0/1 'prediction' column,
# which would reduce each curve to a single operating point.
metrics = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='RainTomorrow')
multimetrics = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='RainTomorrow')

# The override map must be keyed by the evaluator's OWN metricName Param. A
# Param that belongs to a different evaluator is ignored, and the default
# metric ("f1") is returned for every call. The multiclass evaluator exposes
# recall and precision as label-weighted averages.
accuracy = multimetrics.evaluate(weather_logistic_regression, {multimetrics.metricName: "accuracy"})
recall = multimetrics.evaluate(weather_logistic_regression, {multimetrics.metricName: "weightedRecall"})
precision = multimetrics.evaluate(weather_logistic_regression, {multimetrics.metricName: "weightedPrecision"})
f1 = multimetrics.evaluate(weather_logistic_regression, {multimetrics.metricName: "f1"})

area_under_pr = metrics.evaluate(weather_logistic_regression, {metrics.metricName: "areaUnderPR"})
area_under_roc = metrics.evaluate(weather_logistic_regression, {metrics.metricName: "areaUnderROC"})

# MulticlassMetrics comes from the RDD-based mllib API, so the DataFrame is
# converted to an RDD of (prediction, label) rows. Both columns are cast to
# double (a no-op when they already are) because the label may have been
# inferred as an integer when the CSV was read.
metrics_rdd = MulticlassMetrics(
    weather_logistic_regression.select(
        weather_logistic_regression['prediction'].cast('double'),
        weather_logistic_regression['RainTomorrow'].cast('double'),
    ).rdd
)
confusion_matrix = metrics_rdd.confusionMatrix()

In [None]:
# Respuesta

# Report the metrics computed in the previous cell, one per line.
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1:", f1)
print("Area under PR:", area_under_pr)
print("Area under ROC:", area_under_roc)
print("Confusion matrix:", confusion_matrix)



Crea una función que reciba como parámetro el nombre de la columna de predicción, la del target, y el dataframe, y devuelva un diccionario con todas las métricas del proyecto de clasificación binaria.

In [None]:
# Respuesta

def calculate_metrics(prediction_column, model_dataframe, raw_prediction_column='rawPrediction'):
    """Print and return the binary-classification metrics of a scored DataFrame.

    Args:
        prediction_column: Name of the column holding the TRUE label. The
            parameter name is kept for backward compatibility; the value is
            used as ``labelCol``. The model output is always read from the
            'prediction' column.
        model_dataframe: DataFrame returned by a fitted model's ``transform``.
            It must contain 'prediction', ``prediction_column`` and
            ``raw_prediction_column``.
        raw_prediction_column: Column with the model's continuous scores, used
            for the area-under-curve metrics.

    Returns:
        dict with the keys 'accuracy', 'recall', 'precision', 'f1',
        'area_under_pr', 'area_under_roc' and 'confusion_matrix'. Recall and
        precision are the label-weighted averages.
    """
    binary_evaluator = BinaryClassificationEvaluator(rawPredictionCol=raw_prediction_column,
                                                     labelCol=prediction_column)
    multiclass_evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                             labelCol=prediction_column)

    # The override map must be keyed by the evaluator's OWN metricName Param. A
    # Param owned by a different evaluator is ignored and the default metric
    # ("f1") would be returned for every call.
    accuracy = multiclass_evaluator.evaluate(model_dataframe, {multiclass_evaluator.metricName: "accuracy"})
    recall = multiclass_evaluator.evaluate(model_dataframe, {multiclass_evaluator.metricName: "weightedRecall"})
    precision = multiclass_evaluator.evaluate(model_dataframe, {multiclass_evaluator.metricName: "weightedPrecision"})
    f1 = multiclass_evaluator.evaluate(model_dataframe, {multiclass_evaluator.metricName: "f1"})

    area_under_pr = binary_evaluator.evaluate(model_dataframe, {binary_evaluator.metricName: "areaUnderPR"})
    area_under_roc = binary_evaluator.evaluate(model_dataframe, {binary_evaluator.metricName: "areaUnderROC"})

    # MulticlassMetrics comes from the RDD-based mllib API, so the DataFrame is
    # converted to an RDD of (prediction, label) rows. Both columns are cast to
    # double (a no-op when they already are) because the label may have been
    # inferred as an integer when the CSV was read.
    metrics_rdd = MulticlassMetrics(
        model_dataframe.select(
            model_dataframe['prediction'].cast('double'),
            model_dataframe[prediction_column].cast('double'),
        ).rdd
    )
    confusion_matrix = metrics_rdd.confusionMatrix()

    print("Accuracy: {}".format(accuracy))
    print("Recall: {}".format(recall))
    print("Precision: {}".format(precision))
    print("F1: {}".format(f1))
    print("Area under PR: {}".format(area_under_pr))
    print("Area under ROC: {}".format(area_under_roc))
    print("Confusion matrix: {}".format(confusion_matrix))

    return {
        "accuracy": accuracy,
        "recall": recall,
        "precision": precision,
        "f1": f1,
        "area_under_pr": area_under_pr,
        "area_under_roc": area_under_roc,
        "confusion_matrix": confusion_matrix,
    }

In [None]:
# Respuesta

# Scored test sets, keyed by the model name printed as the section title.
models_dictionary = {
    "Logistic regression": weather_logistic_regression,
    "Random Forest": weather_random_forest,
    "GBT": weather_gbt,
}

# Despite its name, this is the label column that calculate_metrics uses as labelCol.
prediction_column = "RainTomorrow"

# Print each model's name, its metrics, and a blank separator line.
for model_name, scored_dataframe in models_dictionary.items():
    print(model_name)
    calculate_metrics(prediction_column, scored_dataframe)
    print()