## Filter Data 

Use `DataFrame.filter()` to select the subset of rows from a DataFrame that satisfy a given condition.

### Import Required modules and initialize SparkSession

In [1]:
from __future__ import print_function
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Get the active SparkSession, or create one if none exists, under the
# application name "Filter Data".
spark = (
    SparkSession.builder
    .appName("Filter Data")
    .getOrCreate()
)

In [3]:
# Evaluate the session as the cell's last expression so the notebook renders it.
spark

### Create DataFrame with sample data

In [4]:
# Sample rows, one tuple per person: (first_name, second_name, experience, salary).
data = [
    ("Satish", "Kumar", 14, 80000.0),
    ("Ramya", "Sree", 5, 50000.0),
    ("Tejaswini", "Uppara", 3, 30000.0),
    ("Bhavishya", "Uppara", 2, 20000.0),
]

# Build the DataFrame with an explicit schema string so the column names and
# types are stated up front instead of being inferred from the tuples.
df = spark.createDataFrame(
    data,
    schema='first_name string, second_name string, experience int, salary double',
)

In [5]:
# Print the column names, types and nullability to confirm the schema was applied.
df.printSchema()

root
 |-- first_name: string (nullable = true)
 |-- second_name: string (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: double (nullable = true)



In [6]:
# Display the rows; truncate=False prints every value in full rather than
# shortening long strings.
df.show(truncate=False)

+----------+-----------+----------+-------+
|first_name|second_name|experience|salary |
+----------+-----------+----------+-------+
|Satish    |Kumar      |14        |80000.0|
|Ramya     |Sree       |5         |50000.0|
|Tejaswini |Uppara     |3         |30000.0|
|Bhavishya |Uppara     |2         |20000.0|
+----------+-----------+----------+-------+



### Filter Data

In [7]:
# Keep only the people with more than 10 in the `experience` column.
# The condition is passed to filter() as a SQL expression string.
df.filter("experience > 10").show()

+----------+-----------+----------+-------+
|first_name|second_name|experience| salary|
+----------+-----------+----------+-------+
|    Satish|      Kumar|        14|80000.0|
+----------+-----------+----------+-------+



In [8]:
# Keep only the people whose salary is less than or equal to 30000,
# with the condition written as a SQL expression string.
df.filter("salary<=30000").show()

+----------+-----------+----------+-------+
|first_name|second_name|experience| salary|
+----------+-----------+----------+-------+
| Tejaswini|     Uppara|         3|30000.0|
| Bhavishya|     Uppara|         2|20000.0|
+----------+-----------+----------+-------+



In [9]:
# The `salary <= 30000` filter expressed as a Column condition, using
# bracket-style column access (df['salary']).
df.filter(df['salary'] <= 30000).show()

+----------+-----------+----------+-------+
|first_name|second_name|experience| salary|
+----------+-----------+----------+-------+
| Tejaswini|     Uppara|         3|30000.0|
| Bhavishya|     Uppara|         2|20000.0|
+----------+-----------+----------+-------+



In [10]:
# The `salary <= 30000` filter expressed as a Column condition, using
# attribute-style column access (df.salary).
df.filter(df.salary <= 30000).show()

+----------+-----------+----------+-------+
|first_name|second_name|experience| salary|
+----------+-----------+----------+-------+
| Tejaswini|     Uppara|         3|30000.0|
| Bhavishya|     Uppara|         2|20000.0|
+----------+-----------+----------+-------+



In [11]:
# Names only: keep the rows with a salary of at most 30000, then project
# just the two name columns and print them untruncated.
(
    df.filter("salary<=30000")
    .select('first_name', 'second_name')
    .show(truncate=False)
)

+----------+-----------+
|first_name|second_name|
+----------+-----------+
|Tejaswini |Uppara     |
|Bhavishya |Uppara     |
+----------+-----------+



#### Multiple Conditions

In [12]:
# Combine two Column conditions with `|` (logical OR). Each comparison needs its
# own parentheses because `|` binds more tightly than `<=` in Python.
# NOTE(review): the first condition is subsumed by the second (any salary
# <= 20000 is also <= 50000), so the result equals `salary <= 50000` alone;
# presumably both are kept to illustrate the OR syntax.
(
    df
    .filter((df['salary'] <= 20000) | (df['salary'] <= 50000))
    .show()
)

+----------+-----------+----------+-------+
|first_name|second_name|experience| salary|
+----------+-----------+----------+-------+
|     Ramya|       Sree|         5|50000.0|
| Tejaswini|     Uppara|         3|30000.0|
| Bhavishya|     Uppara|         2|20000.0|
+----------+-----------+----------+-------+



In [13]:
# The same two salary conditions joined with `or`, written as a single SQL
# expression string instead of Column objects.
df.filter("(salary <=20000) or (salary <=50000)").show()

+----------+-----------+----------+-------+
|first_name|second_name|experience| salary|
+----------+-----------+----------+-------+
|     Ramya|       Sree|         5|50000.0|
| Tejaswini|     Uppara|         3|30000.0|
| Bhavishya|     Uppara|         2|20000.0|
+----------+-----------+----------+-------+



In [14]:
# `~` negates a Column condition: keep the rows where `salary <= 20000` is NOT true.
df.filter(~(df['salary']<=20000)).show()

+----------+-----------+----------+-------+
|first_name|second_name|experience| salary|
+----------+-----------+----------+-------+
|    Satish|      Kumar|        14|80000.0|
|     Ramya|       Sree|         5|50000.0|
| Tejaswini|     Uppara|         3|30000.0|
+----------+-----------+----------+-------+

