#### Filter operations in PySpark
- &, |, ==, <, >, >=, <=, ~

In [1]:
%load_ext watermark
%load_ext lab_black

In [2]:
from pyspark.sql import SparkSession

In [3]:
# create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("Learning-spark")
    .getOrCreate()
)

In [4]:
# load the tips CSV: the first row is the header and column types are inferred,
# then preview the first five rows
df_pyspark = spark.read.csv(
    "datasets/tips.csv",
    header=True,
    inferSchema=True,
)
df_pyspark.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



#### Some filter operations 

In [5]:
# filter with a single condition written as a SQL expression string
low_tip_rows = df_pyspark.filter("tip<=3")
low_tip_rows.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [13]:
# filter the rows first, then keep only the columns of interest
low_tip_rows = df_pyspark.filter("tip<=3")
low_tip_rows.select(["sex", "total_bill"]).show(5)

+------+----------+
|   sex|total_bill|
+------+----------+
|Female|     16.99|
|  Male|     10.34|
|  Male|      8.77|
|  Male|     15.04|
|  Male|     10.27|
+------+----------+
only showing top 5 rows



In [15]:
# same filter as a Column expression instead of a SQL string
tip_at_most_3 = df_pyspark["tip"] <= 3
df_pyspark.filter(tip_at_most_3).show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [16]:
# keep tips inside a range: both bounds must hold, so the conditions are joined with &
tip_col = df_pyspark["tip"]
df_pyspark.filter((tip_col <= 3) & (tip_col >= 2)).show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     20.29|2.75|Female|    No|Sat|Dinner|   2|
|     15.77|2.23|Female|    No|Sat|Dinner|   2|
|     17.81|2.34|  Male|    No|Sat|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [17]:
# combining conditions with OR (|): a row is kept when either condition holds.
# NOTE: tip <= 3 and tip >= 2 overlap, so every row with a non-null tip matches
# and this particular filter does not narrow the data.
df_pyspark.filter((df_pyspark["tip"] <= 3) | (df_pyspark["tip"] >= 2)).show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [18]:
# negate a condition with ~ : keeps the rows for which tip <= 3 is false
tip_at_most_3 = df_pyspark["tip"] <= 3
df_pyspark.filter(~tip_at_most_3).show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [20]:
# equality filter: keep the rows whose tip is exactly 3
# (show() is called without an explicit row limit here)
tip_is_3 = df_pyspark["tip"] == 3
df_pyspark.filter(tip_is_3).show()

+----------+---+------+------+----+------+----+
|total_bill|tip|   sex|smoker| day|  time|size|
+----------+---+------+------+----+------+----+
|     18.43|3.0|  Male|    No| Sun|Dinner|   4|
|     19.65|3.0|Female|    No| Sat|Dinner|   2|
|     15.06|3.0|Female|    No| Sat|Dinner|   2|
|     18.29|3.0|  Male|    No| Sun|Dinner|   2|
|     18.04|3.0|  Male|    No| Sun|Dinner|   2|
|     38.01|3.0|  Male|   Yes| Sat|Dinner|   4|
|     17.07|3.0|Female|    No| Sat|Dinner|   3|
|     22.76|3.0|  Male|    No|Thur| Lunch|   2|
|     19.44|3.0|  Male|   Yes|Thur| Lunch|   2|
|     21.16|3.0|  Male|    No|Thur| Lunch|   2|
|     28.97|3.0|  Male|   Yes| Fri|Dinner|   2|
|     21.01|3.0|  Male|   Yes| Fri|Dinner|   2|
|     15.38|3.0|Female|   Yes| Fri|Dinner|   2|
|      14.0|3.0|  Male|    No| Sat|Dinner|   2|
|     17.51|3.0|Female|   Yes| Sun|Dinner|   2|
|     40.55|3.0|  Male|   Yes| Sun|Dinner|   2|
|     38.73|3.0|  Male|   Yes| Sat|Dinner|   4|
|     28.15|3.0|  Male|   Yes| Sat|Dinne

We can play around with different filter operations and analyze the results.