# Data Wrangling with Spark

I use this notebook first to get familiar with the dataset.

In [1]:
import findspark
findspark.init()

import pyspark

In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import desc
from pyspark.sql.functions import asc
from pyspark.sql.functions import sum as Fsum
from pyspark.sql import functions as F

import datetime

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import os

In [3]:
# Create (or reuse) the SparkSession used by every cell below.
spark = (
    SparkSession.builder
    .appName("Wrangling Data")
    .getOrCreate()
)

spark

## Read the dataset files:
|

|--> Item information

|

|--> Station information

|

|--> Measurement info

|

|--> Measurement Summary


In [4]:
# Paths of the four CSV inputs; all of them live in the local 'Resources' folder.
(
    measurement_item_file,
    measurement_station_file,
    measurement_info_file,
    measurement_file,
) = [
    os.path.join('Resources', csv_name)
    for csv_name in (
        "Data_Measurement_item_info.csv",
        "Data_Measurement_station_info.csv",
        "Measurement_info.csv",
        "Measurement_summary.csv",
    )
]

### Item dataset

In [5]:
# Item table: read the CSV with the header option enabled.
item_info = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(measurement_item_file)
)

In [6]:
# Print the column names and types of the item table (all columns come back as strings).
item_info.printSchema()

root
 |-- Item code: string (nullable = true)
 |-- Item name: string (nullable = true)
 |-- Unit of measurement: string (nullable = true)
 |-- Good(Blue): string (nullable = true)
 |-- Normal(Green): string (nullable = true)
 |-- Bad(Yellow): string (nullable = true)
 |-- Very bad(Red): string (nullable = true)



### Station dataset

In [7]:
# Station table: read the CSV with the header option enabled, then show its schema.
station_info = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(measurement_station_file)
)
station_info.printSchema()

root
 |-- Station code: string (nullable = true)
 |-- Station name(district): string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)



### Info dataset

In [8]:
# Measurement-info table: read the CSV with the header option enabled, then show its schema.
info = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(measurement_info_file)
)
info.printSchema()

root
 |-- Measurement date: string (nullable = true)
 |-- Station code: string (nullable = true)
 |-- Item code: string (nullable = true)
 |-- Average value: string (nullable = true)
 |-- Instrument status: string (nullable = true)



### Measurement dataset

In [9]:
# Measurement-summary table: read the CSV with the header option enabled, then show its schema.
measurament = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(measurement_file)
)
measurament.printSchema()

root
 |-- Measurement date: string (nullable = true)
 |-- Station code: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- SO2: string (nullable = true)
 |-- NO2: string (nullable = true)
 |-- O3: string (nullable = true)
 |-- CO: string (nullable = true)
 |-- PM10: string (nullable = true)
 |-- PM2.5: string (nullable = true)



In [13]:
# Preview the first five rows of the measurement summary.
measurament.show(5)

+----------------+------------+--------------------+----------+-----------+-----+-----+-----+---+----+-----+
|Measurement date|Station code|             Address|  Latitude|  Longitude|  SO2|  NO2|   O3| CO|PM10|PM2.5|
+----------------+------------+--------------------+----------+-----------+-----+-----+-----+---+----+-----+
|2017-01-01 00:00|         101|19, Jong-ro 35ga-...|37.5720164|127.0050075|0.004|0.059|0.002|1.2|73.0| 57.0|
|2017-01-01 01:00|         101|19, Jong-ro 35ga-...|37.5720164|127.0050075|0.004|0.058|0.002|1.2|71.0| 59.0|
|2017-01-01 02:00|         101|19, Jong-ro 35ga-...|37.5720164|127.0050075|0.004|0.056|0.002|1.2|70.0| 59.0|
|2017-01-01 03:00|         101|19, Jong-ro 35ga-...|37.5720164|127.0050075|0.004|0.056|0.002|1.2|70.0| 58.0|
|2017-01-01 04:00|         101|19, Jong-ro 35ga-...|37.5720164|127.0050075|0.003|0.051|0.002|1.2|69.0| 61.0|
+----------------+------------+--------------------+----------+-----------+-----+-----+-----+---+----+-----+
only showing top 5 

In [14]:
# Display the item table: one row per pollutant with its four level thresholds.
item_info.show()

+---------+---------+-------------------+----------+-------------+-----------+-------------+
|Item code|Item name|Unit of measurement|Good(Blue)|Normal(Green)|Bad(Yellow)|Very bad(Red)|
+---------+---------+-------------------+----------+-------------+-----------+-------------+
|        1|      SO2|                ppm|      0.02|         0.05|       0.15|          1.0|
|        3|      NO2|                ppm|      0.03|         0.06|        0.2|          2.0|
|        5|       CO|                ppm|       2.0|          9.0|       15.0|         50.0|
|        6|       O3|                ppm|      0.03|         0.09|       0.15|          0.5|
|        8|     PM10|      Mircrogram/m3|      30.0|         80.0|      150.0|        600.0|
|        9|    PM2.5|      Mircrogram/m3|      15.0|         35.0|       75.0|        500.0|
+---------+---------+-------------------+----------+-------------+-----------+-------------+



------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------

### Verify The Color Level of each measurement info

In [79]:
# Convert item to pandas to use a function:
item_df = item_info.toPandas()

# Threshold columns of item_info, ordered from the best level to the worst.
LEVEL_COLUMNS = ["Good(Blue)", "Normal(Green)", "Bad(Yellow)", "Very bad(Red)"]

# Plain {item name: [(level, threshold), ...]} lookup, built once from the pandas
# copy so that getLevel (and the UDF wrapping it) never touches a Spark DataFrame.
LEVEL_THRESHOLDS = {
    row["Item name"]: [(level, float(row[level])) for level in LEVEL_COLUMNS]
    for _, row in item_df.iterrows()
}


def getLevel(value, name):
    """Return the color level of a measurement for one pollutant.

    Parameters
    ----------
    value : float or str
        Measured value. Strings are accepted because the CSV columns are
        loaded as strings.
    name : str
        Pollutant name as it appears in the "Item name" column (e.g. "SO2").

    Returns
    -------
    str or None
        The name of the first threshold column whose threshold is greater
        than or equal to ``value``. Values above the last threshold are
        reported as the worst level. ``None`` is returned when the value is
        missing, not numeric, NaN, or when ``name`` is not a known item.

    Notes
    -----
    NOTE(review): each threshold is treated as the inclusive upper bound of
    its level -- confirm against the dataset description.
    NOTE(review): negative readings fall into "Good(Blue)"; filter them out
    beforehand if the data uses negative values as a sentinel -- confirm.
    """
    thresholds = LEVEL_THRESHOLDS.get(name)
    if value is None or thresholds is None:
        return None

    try:
        value = float(value)
    except (TypeError, ValueError):
        return None

    # NaN compares unequal to itself; without this guard it would fall
    # through every comparison and be reported as the worst level.
    if value != value:
        return None

    for level, upper_bound in thresholds:
        if value <= upper_bound:
            return level
    return LEVEL_COLUMNS[-1]


# Spark UDF wrapper: level_risk(value_column, item_name_column) -> level string.
# Example:
#   measurament.withColumn("SO2 level", level_risk(F.col("SO2"), F.lit("SO2")))
level_risk = udf(getLevel, StringType())

item_df.head()

Unnamed: 0,Item code,Item name,Unit of measurement,Good(Blue),Normal(Green),Bad(Yellow),Very bad(Red)
0,1,SO2,ppm,0.02,0.05,0.15,1.0
1,3,NO2,ppm,0.03,0.06,0.2,2.0
2,5,CO,ppm,2.0,9.0,15.0,50.0
3,6,O3,ppm,0.03,0.09,0.15,0.5
4,8,PM10,Mircrogram/m3,30.0,80.0,150.0,600.0


In [55]:
# Keep SO2 next to the "2017-01-01" prefix extracted from 'Measurement date'.
# The pattern is anchored and has no trailing `*`: in `2017-01-01*` the `*`
# applied only to the final "1", so dates such as "2017-01-02" produced "2017-01-0".
# regexp_extract does not drop rows; rows that do not match get an empty string.
# NOTE(review): add a .filter(...) if only the rows of 2017-01-01 are wanted.
recent_measurements = measurament.select(
    ["SO2", F.regexp_extract('Measurement date', r'^2017-01-01', 0)]
)




In [80]:
# Convert the item table to a pandas DataFrame and preview its first rows.
item_df = item_info.toPandas()
item_df.head()

Unnamed: 0,Item code,Item name,Unit of measurement,Good(Blue),Normal(Green),Bad(Yellow),Very bad(Red)
0,1,SO2,ppm,0.02,0.05,0.15,1.0
1,3,NO2,ppm,0.03,0.06,0.2,2.0
2,5,CO,ppm,2.0,9.0,15.0,50.0
3,6,O3,ppm,0.03,0.09,0.15,0.5
4,8,PM10,Mircrogram/m3,30.0,80.0,150.0,600.0


In [85]:
# "Good(Blue)" threshold for SO2 as a float.
# Select row and column in one .loc call and take the first match by position:
# a label lookup with [0] only works for the row whose index label is 0.
float(item_df.loc[item_df["Item name"] == "SO2", "Good(Blue)"].iloc[0])

0.02