In [12]:
from pyspark.sql import SparkSession, functions as F
import pandas as pd
import warnings
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline

In [13]:
warnings.simplefilter(action='ignore')

# display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [14]:
spark = SparkSession.builder \
    .appName("sensors_realtime_prediction") \
    .master("local[2]") \
    .enableHiveSupport() \
    .getOrCreate()

spark.sparkContext.setLogLevel('ERROR')

In [28]:
pandasDF = pd.read_csv("sensor-data.csv")

In [29]:
pandasDF["label"].value_counts()

0    126289
1      9097
Name: label, dtype: int64

In [30]:
df_0 = pandasDF.loc[pandasDF["label"] == 0].sample(50000)
df_1 = pandasDF.loc[pandasDF["label"] == 1].sample(9097)
df_sampled = pd.concat([df_0, df_1], axis=0)

In [31]:
df_sampled = df_sampled.sort_values(by=['time'])

In [32]:
df_sampled["label"].value_counts()

0    50000
1     9097
Name: label, dtype: int64

In [33]:
df_sampled.head(3)

Unnamed: 0,co2_value,temp_value,light_value,humidity_value,time,room,label
2,465.0,22.8,165.0,52.4,2013-08-23 23:04:57,644,0
1,579.0,24.37,176.0,49.9,2013-08-23 23:04:57,656A,1
4,434.0,24.08,11.0,49.94,2013-08-23 23:05:01,564,1


In [34]:
# df_sampled.to_csv("test-data-sampled.csv",index=False)

In [35]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_sampled, test_size=0.2)

In [36]:
print(train["label"].value_counts())
print(test["label"].value_counts())

0    40027
1     7250
Name: label, dtype: int64
0    9973
1    1847
Name: label, dtype: int64


In [37]:
test_df = spark.createDataFrame(test)
train_df = spark.createDataFrame(train)

<h3>Data Preparing for ML Prediction</h3>

In [38]:
label_col = ["pir_value"]

# The Room variable contains 51 categories, so it must go through the stringindexer and one-hot-encoder stages.

# StringIndexer
string_indexer_objs = StringIndexer(inputCol="room",
                                    outputCol="roomIdx",
                                    handleInvalid='error')

# One Hot Encoder
encoder = OneHotEncoder(inputCols=["roomIdx"],
                        outputCols=["ohe_col"],
                        handleInvalid='error')

# Vector Assembler
# Vector assembler should not have a target.
assembler = VectorAssembler(inputCols=['co2_value', "temp_value", "light_value", "humidity_value", 'ohe_col'],
                            outputCol='features',
                            handleInvalid='skip')

<h3>Create Model</h3>

In [39]:
from pyspark.ml.classification import RandomForestClassifier

estimator = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=50)

# Pipeline
pipeline_obj = Pipeline().setStages([string_indexer_objs, encoder, assembler, estimator])

#train_df, test_df = df_last.randomSplit([.8, .2], seed=142)

test_df.show(5)

pipeline_model = pipeline_obj.fit(train_df)
transformed_df = pipeline_model.transform(test_df)


+---------+----------+-----------+--------------+-------------------+----+-----+
|co2_value|temp_value|light_value|humidity_value|               time|room|label|
+---------+----------+-----------+--------------+-------------------+----+-----+
|    471.0|     23.51|        3.0|         57.19|2013-08-30 05:02:50| 748|    0|
|    586.0|     25.68|      123.0|          50.9|2013-08-28 19:31:37|656B|    1|
|    314.0|     22.85|     2154.0|          53.6|2013-08-24 03:29:28| 668|    0|
|    514.0|     22.67|        5.0|         59.83|2013-08-31 01:51:56| 446|    0|
|    431.0|     22.19|        3.0|         55.89|2013-08-27 08:15:16| 644|    0|
+---------+----------+-----------+--------------+-------------------+----+-----+
only showing top 5 rows



<h3>Performance Evaluation</h3>

In [40]:
# Prediction
transformed_df.show(truncate=False)

# Evaluate the Model
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(transformed_df)
print(evaluator.evaluate(transformed_df))

# 0.9483103977867051

evaluator.getMetricName()

+---------+----------+-----------+--------------+-------------------+----+-----+-------+---------------+-----------------------------------------------+---------------------------------------+-----------------------------------------+----------+
|co2_value|temp_value|light_value|humidity_value|time               |room|label|roomIdx|ohe_col        |features                                       |rawPrediction                          |probability                              |prediction|
+---------+----------+-----------+--------------+-------------------+----+-----+-------+---------------+-----------------------------------------------+---------------------------------------+-----------------------------------------+----------+
|471.0    |23.51     |3.0        |57.19         |2013-08-30 05:02:50|748 |0    |21.0   |(50,[21],[1.0])|(54,[0,1,2,3,25],[471.0,23.51,3.0,57.19,1.0])  |[46.99704782396685,3.0029521760331477] |[0.9399409564793371,0.06005904352066296] |0.0       |
|586.0    |25.68

'areaUnderROC'

<h3>Saving The Model to Disk</h3>

In [None]:
##### 
pipeline_model.write().overwrite().save(
    "/home/selcuk/spark/bitirme-projesi/saved_model2/pipeline_model")

from pyspark.ml.pipeline import PipelineModel

pipeline_model_loaded = PipelineModel.load(
    "/home/selcuk/spark/bitirme-projesi/saved_model2/pipeline_model")

pipeline_model_loaded

In [None]:
test_df.coalesce(1).write \
    .format("csv") \
    .mode("overwrite") \
    .option("header", "true") \
    .save("file:///home/train/atscale4/final_homework/test_df")