1. Инициализация PySpark фреймворка

In [1]:
import numpy as np
import pandas as pd
import os

Импорт библиотек Spark SQL и Spark ML

In [2]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer, MinMaxScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor

spark = SparkSession.builder.master("local[*]").getOrCreate()

Загрузка исходных данных

In [3]:
dataframe = spark.read.csv('filtered_data/filtered_data.csv', inferSchema=True, header=True, sep=';')
dataframe.limit(5).toPandas()

Unnamed: 0,timestamp,site_id,period_id,actual_consumption,actual_pv,load_00,load_01,load_02,load_03,load_04,...,pv_86,pv_87,pv_88,pv_89,pv_90,pv_91,pv_92,pv_93,pv_94,pv_95
0,2014-07-19 18:45:00,1,0,51.625703,22.712489,52.816828,53.501688,54.079161,52.683472,52.590445,...,81.286349,77.580181,58.433449,55.57297,53.792765,51.999028,43.290921,34.161343,28.349901,24.636241
1,2014-07-19 19:30:00,1,0,52.281257,6.618605,51.452796,51.676287,51.329882,51.690879,51.538671,...,55.571058,53.791009,51.997415,43.28944,34.159983,28.348652,24.635094,19.147728,12.98284,7.928666
2,2014-07-19 20:00:00,1,0,50.719565,1.452209,51.313898,52.199835,52.340547,51.844138,52.661063,...,51.996045,43.288181,34.158827,28.34759,24.634119,19.146832,12.982018,7.927911,5.622566,3.299011
3,2014-07-19 20:15:00,1,0,51.901162,0.580877,51.950475,51.624345,50.867434,51.538997,51.331161,...,43.28806,34.158716,28.347489,24.634026,19.146747,12.981939,7.927839,5.622499,3.29895,2.130662
4,2014-07-19 21:00:00,1,0,51.250007,0.0,52.21882,52.176852,51.745211,51.809854,51.83571,...,24.633942,19.14667,12.981869,7.927774,5.62244,3.298896,2.130612,1.352937,1.246175,1.246205


In [4]:
data = dataframe.select("timestamp", "site_id", "period_id", "actual_consumption", "actual_pv", "load_00", ((col("load_01") > 80).cast("Int").alias("label"))).withColumn("timestamp", unix_timestamp("timestamp").cast(DoubleType()))
data.show(10)

+-----------+-------+---------+------------------+------------------+------------------+-----+
|  timestamp|site_id|period_id|actual_consumption|         actual_pv|           load_00|label|
+-----------+-------+---------+------------------+------------------+------------------+-----+
|1.4057955E9|      1|        0| 51.62570299494799| 22.71248932566911| 52.81682785868848|    0|
|1.4057982E9|      1|        0| 52.28125674965801| 6.618605013254157|51.452796259410526|    0|
|   1.4058E9|      1|        0| 50.71956514220455|1.4522094578011435| 51.31389786752856|    0|
|1.4058009E9|      1|        0| 51.90116154382357|0.5808771932574699| 51.95047496345374|    0|
|1.4058036E9|      1|        0| 51.25000680775122|               0.0|  52.2188201830341|    0|
|1.4058099E9|      1|        0| 51.79032626969339|               0.0| 51.85754836350091|    0|
|1.4058108E9|      1|        0|52.460696718996665|               0.0|52.347502377808425|    0|
|1.4058117E9|      1|        0|  51.8311171697882|

## Разделим данные

Используем 70% данных для обучения, а 30% оставим для тестирования. В данных тестирования столбец*binary_load_00*l переименован в* trueLabe*l, поэтому можно использовать его позже для сравнения прогнозируемых меток с известными фактическими значениями.

In [5]:
splits = data.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1].withColumnRenamed("label", "trueLabel")
train_rows = train.count()
test_rows = test.count()
print("Training Rows:", train_rows, " Testing Rows:", test_rows)

Training Rows: 656054  Testing Rows: 281196


Задача регрессии (Случайный лес)

In [6]:
# Создание столбца признаков для задачи регрессии
catVect = VectorAssembler(inputCols=["timestamp", "site_id", "period_id"], outputCol="reg_catFeatures")
catIdx = VectorIndexer(inputCol=catVect.getOutputCol(), outputCol="reg_idxCatFeatures")
numVect = VectorAssembler(inputCols=["actual_consumption", "actual_pv"], outputCol="reg_numFeatures")
featVect = VectorAssembler(inputCols=["reg_idxCatFeatures", "reg_numFeatures"], outputCol="reg_features")

# Создание модели RandomForestRegressor
rf = RandomForestRegressor(labelCol="load_00", featuresCol="reg_features", numTrees=10)

# Создание и выполнение пайплайна для задачи регрессии
reg_pipeline = Pipeline(stages=[catVect, catIdx, numVect, featVect, rf])
reg_model = reg_pipeline.fit(train)
reg_prediction = reg_model.transform(test)

# Вывод результатов
reg_prediction.select("reg_features", "prediction", "load_00").show(10, truncate=False)

+----------------------------------------------------------+-----------------+------------------+
|reg_features                                              |prediction       |load_00           |
+----------------------------------------------------------+-----------------+------------------+
|[1.4057955E9,0.0,0.0,51.62570299494799,22.71248932566911] |60.27787074823942|52.81682785868848 |
|[1.4057964E9,0.0,0.0,53.35653686902011,16.678008122276246]|60.27787074823942|53.45016378873357 |
|[1.4058E9,0.0,0.0,50.71956514220455,1.4522094578011435]   |51.73098854383092|51.31389786752856 |
|(5,[0,3],[1.4058045E9,52.16692225615697])                 |60.27787074823942|52.02819438156861 |
|(5,[0,3],[1.4058099E9,51.79032626969339])                 |60.27787074823942|51.85754836350091 |
|(5,[0,3],[1.4058126E9,52.81485771040811])                 |60.27787074823942|52.64876180747015 |
|(5,[0,3],[1.4058198E9,52.094058511085656])                |60.27787074823942|52.30283050132776 |
|(5,[0,3],[1.4058252

In [7]:
# Расчет метрик для задачи регрессии
reg_evaluator = RegressionEvaluator(labelCol="load_00", predictionCol="prediction", metricName="rmse")
rmse = reg_evaluator.evaluate(reg_prediction)
print("Root Mean Squared Error (RMSE) = ", rmse)

Root Mean Squared Error (RMSE) =  13.763735225920756


Задача бинарной классификации (случайный лес)

In [9]:
catVect = VectorAssembler(inputCols = ["timestamp", "site_id", "period_id"], outputCol="catFeatures")
catIdx = VectorIndexer(inputCol = catVect.getOutputCol(), outputCol = "idxCatFeatures")
numVect = VectorAssembler(inputCols = ["actual_consumption", "actual_pv"], outputCol="numFeatures")
minMax = MinMaxScaler(inputCol = numVect.getOutputCol(), outputCol="normFeatures")
featVect = VectorAssembler(inputCols=["idxCatFeatures", "normFeatures"], outputCol="features")
lr = LogisticRegression(labelCol="label",featuresCol="features",maxIter=10,regParam=0.3)
pipeline = Pipeline(stages=[catVect, catIdx, numVect, minMax, featVect, lr])

In [10]:
pipelineModel = pipeline.fit(train)

In [11]:
prediction = pipelineModel.transform(test)
predicted = prediction.select("features", "prediction", "trueLabel")
predicted.show(100, truncate=False)

+---------------------------------------------------------------+----------+---------+
|features                                                       |prediction|trueLabel|
+---------------------------------------------------------------+----------+---------+
|[1.4057955E9,0.0,0.0,0.17383113651013907,0.030066863713005088] |0.0       |0        |
|[1.4057964E9,0.0,0.0,0.17967573349168664,0.022078398806340393] |0.0       |0        |
|[1.4058E9,0.0,0.0,0.17077133358026544,0.0019224393779283667]   |0.0       |0        |
|(5,[0,3],[1.4058045E9,0.17565869980653756])                    |0.0       |0        |
|(5,[0,3],[1.4058099E9,0.17438702850479118])                    |0.0       |0        |
|(5,[0,3],[1.4058126E9,0.17784661723575013])                    |0.0       |0        |
|(5,[0,3],[1.4058198E9,0.17541265699972203])                    |0.0       |0        |
|(5,[0,3],[1.4058252E9,0.17231619072075377])                    |0.0       |0        |
|(5,[0,3],[1.4058261E9,0.17543021219450028]

In [12]:
tp = float(predicted.filter("prediction == 1.0 AND truelabel == 1").count())
fp = float(predicted.filter("prediction == 1.0 AND truelabel == 0").count())
tn = float(predicted.filter("prediction == 0.0 AND truelabel == 0").count())
fn = float(predicted.filter("prediction == 0.0 AND truelabel == 1").count())
pr = tp / (tp + fp)
re = tp / (tp + fn)
metrics = spark.createDataFrame([
 ("TP", tp),
 ("FP", fp),
 ("TN", tn),
 ("FN", fn),
 ("Precision", pr),
 ("Recall", re),
 ("F1", 2*pr*re/(re+pr))],["metric", "value"])
metrics.show()

+---------+------------------+
|   metric|             value|
+---------+------------------+
|       TP|           91564.0|
|       FP|            4129.0|
|       TN|          155469.0|
|       FN|           30034.0|
|Precision|0.9568515983405265|
|   Recall|0.7530058060165463|
|       F1|0.8427776576112219|
+---------+------------------+



In [13]:
evaluator = BinaryClassificationEvaluator(labelCol="trueLabel", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
aur = evaluator.evaluate(prediction)
print ("AUR = ", aur)

AUR =  0.9798768483101248


In [14]:
paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.3, 0.1]).addGrid(lr.maxIter, [10, 5]).addGrid(lr.threshold, 
                                                                                            [0.4, 0.3]).build()
cv = CrossValidator(estimator=pipeline, evaluator=BinaryClassificationEvaluator(), estimatorParamMaps=paramGrid, 
                    numFolds=2)

model = cv.fit(train)

In [15]:
newPrediction = model.transform(test)
newPredicted = prediction.select("features", "prediction", "trueLabel")
newPredicted.show()

+--------------------+----------+---------+
|            features|prediction|trueLabel|
+--------------------+----------+---------+
|[1.4057955E9,0.0,...|       0.0|        0|
|[1.4057964E9,0.0,...|       0.0|        0|
|[1.4058E9,0.0,0.0...|       0.0|        0|
|(5,[0,3],[1.40580...|       0.0|        0|
|(5,[0,3],[1.40580...|       0.0|        0|
|(5,[0,3],[1.40581...|       0.0|        0|
|(5,[0,3],[1.40581...|       0.0|        0|
|(5,[0,3],[1.40582...|       0.0|        0|
|(5,[0,3],[1.40582...|       0.0|        0|
|(5,[0,3],[1.40582...|       0.0|        0|
|(5,[0,3],[1.40583...|       0.0|        0|
|(5,[0,3],[1.40583...|       0.0|        0|
|[1.4058405E9,0.0,...|       0.0|        0|
|[1.4058414E9,0.0,...|       0.0|        0|
|[1.4058432E9,0.0,...|       0.0|        0|
|[1.4058441E9,0.0,...|       0.0|        0|
|[1.405845E9,0.0,0...|       0.0|        0|
|[1.4058459E9,0.0,...|       0.0|        0|
|[1.4058477E9,0.0,...|       0.0|        0|
|[1.4058486E9,0.0,...|       0.0

In [16]:
# Recalculate confusion matrix
tp2 = float(newPrediction.filter("prediction == 1.0 AND truelabel == 1").count())
fp2 = float(newPrediction.filter("prediction == 1.0 AND truelabel == 0").count())
tn2 = float(newPrediction.filter("prediction == 0.0 AND truelabel == 0").count())
fn2 = float(newPrediction.filter("prediction == 0.0 AND truelabel == 1").count())
pr2 = tp2 / (tp2 + fp2)
re2 = tp2 / (tp2 + fn2)
metrics2 = spark.createDataFrame([
 ("TP", tp2),
 ("FP", fp2),
 ("TN", tn2),
 ("FN", fn2),
 ("Precision", pr2),
 ("Recall", re2),
 ("F1", 2*pr2*re2/(re2+pr2))],["metric", "value"])
metrics2.show()

+---------+------------------+
|   metric|             value|
+---------+------------------+
|       TP|          121422.0|
|       FP|           33209.0|
|       TN|          126389.0|
|       FN|             176.0|
|Precision|0.7852371128687003|
|   Recall|0.9985526077731541|
|       F1|0.8791401337296229|
+---------+------------------+



In [17]:
# Recalculate the Area Under ROC
evaluator2 = BinaryClassificationEvaluator(labelCol="trueLabel", rawPredictionCol="prediction", metricName="areaUnderROC")
aur2 = evaluator.evaluate(prediction)
print( "AUR2 = ", aur2)

AUR2 =  0.9798781455875281
