<h1>Data Analysis</h1>

In [1]:
from pyspark.sql import SparkSession, functions as F
import pandas as pd
import warnings
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline

In [2]:
# Silence every Python warning for the remainder of the session.
warnings.simplefilter("ignore")

# Let pandas render all columns and all rows without truncation.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
# Build (or reuse) a Spark session running locally on two threads,
# with Hive support switched on.
spark = (
    SparkSession.builder
    .appName("sensors_realtime_prediction")
    .master("local[2]")
    .enableHiveSupport()
    .getOrCreate()
)

# Keep Spark's console logging down to errors only.
spark.sparkContext.setLogLevel("ERROR")

In [4]:
# Load every sensor CSV matched by the glob KETI/<room>/<sensor>.csv into one
# DataFrame. The files have no header row, so Spark names the columns _c0/_c1;
# they are renamed to `time` and `value`, and `room` / `sensor` are derived
# from each row's source file path.
# NOTE(review): the absolute path is machine-specific — consider reading it
# from a configurable data directory.
df = (
    spark.read.format("csv")
    .option("header", False)
    .option("inferSchema", True)
    .option("sep", ",")
    .load("/home/selcuk/spark/bitirme-projesi/KETI/*/*.csv")
    # Temporary column holding the full path of the file each row came from.
    .withColumn("file_name", F.input_file_name())
    .withColumn("_c0", F.to_timestamp("_c0"))
    # Second-to-last path component is the room directory
    # (a negative index counts from the end of the array).
    .withColumn("room", F.element_at(F.split(F.col("file_name"), "/"), -2))
    # Last path component is "<sensor>.csv". The pattern is a regex, so the dot
    # is escaped and the match is anchored to the end: only a literal trailing
    # ".csv" extension is stripped.
    .withColumn(
        "sensor",
        F.regexp_replace(
            F.element_at(F.split(F.col("file_name"), "/"), -1),
            r"\.csv$",
            "",
        ),
    )
    .withColumnRenamed("_c0", "time")
    .withColumnRenamed("_c1", "value")
    .drop("file_name")
)

df.show(n=10, truncate=False)
# cache() is lazy: it only marks the DataFrame for caching, and the data is
# stored when later actions compute it. As the last expression of the cell,
# its return value (the DataFrame) is what the notebook displays.
df.cache()

+-------------------+------+----+------+
|time               |value |room|sensor|
+-------------------+------+----+------+
|2013-08-23 23:05:03|2287.0|668 |light |
|2013-08-23 23:05:05|1977.0|668 |light |
|2013-08-23 23:05:26|2208.0|668 |light |
|2013-08-23 23:05:30|2267.0|668 |light |
|2013-08-23 23:05:33|2097.0|668 |light |
|2013-08-23 23:05:38|2111.0|668 |light |
|2013-08-23 23:05:43|2257.0|668 |light |
|2013-08-23 23:05:48|2200.0|668 |light |
|2013-08-23 23:05:53|2164.0|668 |light |
|2013-08-23 23:05:58|2183.0|668 |light |
+-------------------+------+----+------+
only showing top 10 rows



DataFrame[time: timestamp, value: double, room: string, sensor: string]

In [5]:
# Print the DataFrame's column names, data types and nullability.
df.printSchema()

root
 |-- time: timestamp (nullable = true)
 |-- value: double (nullable = true)
 |-- room: string (nullable = true)
 |-- sensor: string (nullable = true)



In [6]:
# Number of records (total row count of the loaded DataFrame)
print(df.count())

29882394


In [7]:
# Count of non-null `room` values per sensor type; the count column is
# aliased to "room".
(
    df.select("sensor", "room")
    .groupBy("sensor")
    .agg(F.count("room").alias("room"))
    .show()
)

+-----------+-------+
|     sensor|   room|
+-----------+-------+
|      light|6571463|
|        co2|6574008|
|   humidity|6571465|
|temperature|6571505|
|        pir|3593953|
+-----------+-------+



In [8]:
# Same per-sensor count as above, keeping Spark's default column name.
(
    df.select("sensor", "room")
    .groupBy("sensor")
    .agg(F.count("room"))
    .show()
)

+-----------+-----------+
|     sensor|count(room)|
+-----------+-----------+
|      light|    6571463|
|        co2|    6574008|
|   humidity|    6571465|
|temperature|    6571505|
|        pir|    3593953|
+-----------+-----------+



In [9]:
# Earliest and latest timestamps present in the data.
df.select(
    F.min("time"),
    F.max("time"),
).show(2)

+-------------------+-------------------+
|          min(time)|          max(time)|
+-------------------+-------------------+
|2013-08-23 15:00:00|2013-09-01 06:58:59|
+-------------------+-------------------+



In [10]:
# Number of distinct rooms (51 in the recorded run).
print(
    df.select("room")
    .distinct()
    .count()
)

51


In [11]:
# Number of distinct sensor types.
print(
    df.select("sensor")
    .distinct()
    .count()
)

5
