In [36]:
from pyspark.sql import SparkSession, functions as F
import pandas as pd
import warnings
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql.functions import rand
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.pipeline import PipelineModel

In [2]:
warnings.simplefilter(action='ignore')

# display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Spark Session

In [3]:
spark = SparkSession.builder \
    .appName("sensors_realtime_prediction") \
    .master("local[2]") \
    .enableHiveSupport() \
    .getOrCreate()

spark.sparkContext.setLogLevel('ERROR')

In [4]:
pandasDF = pd.read_csv("../test_df/sensor-data.csv")

In [5]:
pandasDF["label"].value_counts()

0    126289
1      9097
Name: label, dtype: int64

In [6]:
df_0 = pandasDF.loc[pandasDF["label"] == 0].sample(50000)
df_1 = pandasDF.loc[pandasDF["label"] == 1].sample(9097)
df_sampled = pd.concat([df_0, df_1], axis=0)

In [7]:
df_sampled = df_sampled.sort_values(by=['time'])

In [8]:
df_sampled["label"].value_counts()

0    50000
1     9097
Name: label, dtype: int64

In [9]:
df_sampled.head(3)

Unnamed: 0,co2_value,temp_value,light_value,humidity_value,time,room,label
2,465.0,22.8,165.0,52.4,2013-08-23 23:04:57,644,0
1,579.0,24.37,176.0,49.9,2013-08-23 23:04:57,656A,1
4,434.0,24.08,11.0,49.94,2013-08-23 23:05:01,564,1


In [10]:
# df_sampled.to_csv("test-data-sampled.csv",index=False)

In [11]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_sampled, test_size=0.2)

In [12]:
print(train["label"].value_counts())
print(test["label"].value_counts())

0    40009
1     7268
Name: label, dtype: int64
0    9991
1    1829
Name: label, dtype: int64


In [13]:
test_df = spark.createDataFrame(test)
train_df = spark.createDataFrame(train)

<h3>Data Preparing for ML Prediction</h3>

In [14]:
label_col = ["pir_value"]

# The Room variable contains 51 categories, so it must go through the stringindexer and one-hot-encoder stages.

# StringIndexer
string_indexer_objs = StringIndexer(inputCol="room",
                                    outputCol="roomIdx",
                                    handleInvalid='error')

In [15]:
# One Hot Encoder
encoder = OneHotEncoder(inputCols=["roomIdx"],
                        outputCols=["ohe_col"],
                        handleInvalid='error')

In [16]:
# Vector Assembler
# Vector assembler should not have a target.
assembler = VectorAssembler(inputCols=['co2_value', "temp_value", "light_value", "humidity_value", 'ohe_col'],
                            outputCol='features',
                            handleInvalid='skip')

<h3>Create Model</h3>

In [17]:
from pyspark.ml.classification import RandomForestClassifier
estimator = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=50)

#train_df, test_df = df_last.randomSplit([.8, .2], seed=142)

In [18]:
# Pipeline
pipeline_obj = Pipeline().setStages([string_indexer_objs, encoder, assembler, estimator])
# pipeline_model = pipeline_obj.fit(train_df)
# transformed_df = pipeline_model.transform(test_df)

In [19]:
param_grid = ParamGridBuilder() \
    .addGrid(estimator.maxDepth, [10, 15, 25]) \
    .addGrid(estimator.numTrees, [10, 15, 25]) \
    .build()

<h3>Performance Evaluation</h3>

In [20]:
cv = CrossValidator(
    estimator=pipeline_obj,
    estimatorParamMaps=param_grid,
    evaluator=BinaryClassificationEvaluator(metricName="areaUnderROC"),
    numFolds=5
)

cv_model = cv.fit(train_df)

predictions = cv_model.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol="label")

In [21]:
best_parameters = {param.name: value for param, value in zip(cv_model.bestModel.stages[-1].extractParamMap().keys(), cv_model.bestModel.stages[-1].extractParamMap().values())}
best_parameters

{'bootstrap': True,
 'cacheNodeIds': False,
 'checkpointInterval': 10,
 'featureSubsetStrategy': 'auto',
 'featuresCol': 'features',
 'impurity': 'gini',
 'labelCol': 'label',
 'leafCol': '',
 'maxBins': 32,
 'maxDepth': 25,
 'maxMemoryInMB': 256,
 'minInfoGain': 0.0,
 'minInstancesPerNode': 1,
 'minWeightFractionPerNode': 0.0,
 'numTrees': 25,
 'predictionCol': 'prediction',
 'probabilityCol': 'probability',
 'rawPredictionCol': 'rawPrediction',
 'seed': 2693163115266358200,
 'subsamplingRate': 1.0}

In [23]:
accuracy = evaluator.evaluate(predictions)

print(accuracy)
# Prediction
# 0.9483103977867051

evaluator.getMetricName()

0.9926513413739947


'areaUnderROC'

<h3>Saving The Model to Disk</h3>

In [38]:
cv_model.bestModel.write().overwrite().save('/home/selcuk/bitirme/cv_model/pipeline_model')

In [42]:
pipeline_model_loaded = PipelineModel.load(
    "/home/selcuk/bitirme/cv_model/pipeline_model")

<bound method Params.extractParamMap of RandomForestClassificationModel: uid=RandomForestClassifier_217eb792b48f, numTrees=25, numClasses=2, numFeatures=54>

In [44]:
best_parameters = {param.name: value for param, value in zip(pipeline_model_loaded.stages[-1].extractParamMap().keys(), pipeline_model_loaded.stages[-1].extractParamMap().values())}
best_parameters

{'bootstrap': True,
 'cacheNodeIds': False,
 'checkpointInterval': 10,
 'featureSubsetStrategy': 'auto',
 'featuresCol': 'features',
 'impurity': 'gini',
 'labelCol': 'label',
 'leafCol': '',
 'maxBins': 32,
 'maxDepth': 25,
 'maxMemoryInMB': 256,
 'minInfoGain': 0.0,
 'minInstancesPerNode': 1,
 'minWeightFractionPerNode': 0.0,
 'numTrees': 25,
 'predictionCol': 'prediction',
 'probabilityCol': 'probability',
 'rawPredictionCol': 'rawPrediction',
 'seed': 2892013887635323293,
 'subsamplingRate': 1.0}

In [None]:
# # ##### 
# pipeline_model.write().overwrite().save(
#     "/home/selcuk/bitirme/saved_model2/pipeline_model")


# pipeline_model_loaded = PipelineModel.load(
#     "/home/selcuk/bitirme/saved_model2/pipeline_model")



In [None]:
# test_df.coalesce(1).write \
#     .format("csv") \
#     .mode("overwrite") \
#     .option("header", "true") \
#     .save("..../test_df")