# Start Spark session

In [1]:
from pyspark.sql import SparkSession
# Get or create the Spark session for this notebook, registered under the
# application name 'pyspark-test'.
spark = (
    SparkSession.builder
    .appName('pyspark-test')
    .getOrCreate()
)
# A bare last expression makes the notebook display the session object.
spark

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/17 10:04:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Get data

In [2]:
import os
import wget

# Download the Iris dataset only when no local copy exists, so re-running the
# notebook does not fetch the file again.
# `out=` pins the saved filename: without it wget chooses the name itself, and
# a name other than 'iris.data' would never satisfy the existence check here.
iris_path = 'iris.data'
if not os.path.isfile(iris_path):
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
    wget.download(url, out=iris_path)

In [3]:
# Read the downloaded CSV into a Spark DataFrame. inferSchema asks Spark to
# detect each column's type from the data; the columns come back with the
# generated names _c0 .. _c4.
data = (
    spark.read
    .options(inferSchema=True)
    .csv("iris.data")
)
# Preview the first five rows.
data.show(5)

                                                                                

+---+---+---+---+-----------+
|_c0|_c1|_c2|_c3|        _c4|
+---+---+---+---+-----------+
|5.1|3.5|1.4|0.2|Iris-setosa|
|4.9|3.0|1.4|0.2|Iris-setosa|
|4.7|3.2|1.3|0.2|Iris-setosa|
|4.6|3.1|1.5|0.2|Iris-setosa|
|5.0|3.6|1.4|0.2|Iris-setosa|
+---+---+---+---+-----------+
only showing top 5 rows



# Filter operations
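> Note the following notation. Wrap each condition in parentheses: in Python, `&`, `|` and `~` bind more tightly than comparison operators such as `<` and `==`, so an unparenthesised condition is grouped incorrectly.
>
> | Operator    | Description   |
> | :---:       |    :---       | 
> | ( ) \& ( )  |   and         |
> | ( ) \| ( )  |   or          |
> | ( ) == ( )  |   equals      |
> | ( ) <= ( )  |   less than or equal to    |
> | ( ) >= ( )  |   greater than or equal to |
> | ( ) < ( )   |   less than   |
> | ( ) > ( )   |   greater than|
> |   \~ ( )    |   not         |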

In [15]:
# Filter with a SQL expression string: keep rows where _c1 < 3.0,
# then show the first five matches.
data.filter(condition='_c1<3.0').show(5)

+---+---+---+---+---------------+
|_c0|_c1|_c2|_c3|            _c4|
+---+---+---+---+---------------+
|4.4|2.9|1.4|0.2|    Iris-setosa|
|4.5|2.3|1.3|0.3|    Iris-setosa|
|5.5|2.3|4.0|1.3|Iris-versicolor|
|6.5|2.8|4.6|1.5|Iris-versicolor|
|5.7|2.8|4.5|1.3|Iris-versicolor|
+---+---+---+---+---------------+
only showing top 5 rows



In [16]:
# Keep rows where _c1 < 3.0, this time building the condition as a Column
# comparison rather than passing a SQL string.
data.filter(data._c1 < 3.0).show(5)

+---+---+---+---+---------------+
|_c0|_c1|_c2|_c3|            _c4|
+---+---+---+---+---------------+
|4.4|2.9|1.4|0.2|    Iris-setosa|
|4.5|2.3|1.3|0.3|    Iris-setosa|
|5.5|2.3|4.0|1.3|Iris-versicolor|
|6.5|2.8|4.6|1.5|Iris-versicolor|
|5.7|2.8|4.5|1.3|Iris-versicolor|
+---+---+---+---+---------------+
only showing top 5 rows



In [19]:
# Keep rows where _c1 < 3.0 AND _c2 > 5.1.
# Each comparison is parenthesised because & binds more tightly than < and >.
data.filter(
    (data['_c1'] < 3.0) & (data['_c2'] > 5.1)
).show(5)

+---+---+---+---+--------------+
|_c0|_c1|_c2|_c3|           _c4|
+---+---+---+---+--------------+
|6.3|2.9|5.6|1.8|Iris-virginica|
|7.3|2.9|6.3|1.8|Iris-virginica|
|6.7|2.5|5.8|1.8|Iris-virginica|
|6.4|2.7|5.3|1.9|Iris-virginica|
|7.7|2.6|6.9|2.3|Iris-virginica|
+---+---+---+---+--------------+
only showing top 5 rows

