<a href="https://colab.research.google.com/github/saurater/ciencia_de_dados_pyspark/blob/main/PySpark_Tutorial_Part_4_Dataset_Filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PySpark - Tutorial - Part 4 - Dataset Filtering
Notebook by Sam Faraday
June 2022

1. Filtering by a Specific Column
2. Filtering by a Specific Column and Selecting Specific Columns
3. Filtering with multiple conditions



Sources:
Free Code Camp: PySpark Tutorial at https://www.youtube.com/watch?v=_C8kWso4ne4
Apache Spark API Reference at https://spark.apache.org/docs/latest/api/python/reference/index.html

# 1. Installing PySpark

In [40]:
pip install pyspark # run it every time you connect to Google Colab Notebook

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# 2. Importing the required libraries

In [41]:
from pyspark.sql.functions import col,isnan,when,count

In [42]:
import pandas as pd

In [43]:
import numpy as np

# 3. Creating the Test4 Dataset

In [44]:
# Small test dataset with deliberately missing values: NaN in the numeric
# columns and empty strings in the Name column.
# np.nan is used instead of np.NaN because the NaN alias was removed in NumPy 2.0.
data = {
    'Index': [1, 2, 3, 4, 5, 6, np.nan],
    'Name': ['Tom', 'Nick', 'Krish', '', 'Jack', '', ''],
    'Age': [20, np.nan, np.nan, 19, 18, 19, np.nan],
    'Salary': [2000, 3000, np.nan, 4000, 3000, 3500, np.nan],
}
# Create DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,Index,Name,Age,Salary
0,1.0,Tom,20.0,2000.0
1,2.0,Nick,,3000.0
2,3.0,Krish,,
3,4.0,,19.0,4000.0
4,5.0,Jack,18.0,3000.0
5,6.0,,19.0,3500.0
6,,,,


# 4. Saving the Dataset

In [45]:
# Write the pandas frame to disk as CSV; index=False leaves the row labels out of the file.
df.to_csv(
    path_or_buf='test4.csv',
    index=False,
)

# 5. Initializing PySpark


In [46]:
from pyspark.sql import SparkSession

# Configure a session named "Dataframe"; getOrCreate() returns the session
# built from this configuration (or an already-running one).
session_builder = SparkSession.builder.appName("Dataframe")
spark = session_builder.getOrCreate()

# Bare expression so the notebook renders the session summary.
spark

# 6. Reading the Dataset

In [47]:
# Load the CSV written earlier: the first row supplies the column names and
# Spark is asked to infer the column types from the data.
df_spark = spark.read.csv(
    "test4.csv",
    header=True,
    inferSchema=True,
)
df_spark.show()

+-----+-----+----+------+
|Index| Name| Age|Salary|
+-----+-----+----+------+
|  1.0|  Tom|20.0|2000.0|
|  2.0| Nick|null|3000.0|
|  3.0|Krish|null|  null|
|  4.0| null|19.0|4000.0|
|  5.0| Jack|18.0|3000.0|
|  6.0| null|19.0|3500.0|
| null| null|null|  null|
+-----+-----+----+------+



# 7. Checking the Schema

In [48]:
# Print the column names, inferred types and nullability of the loaded DataFrame.
df_spark.printSchema()

root
 |-- Index: double (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Salary: double (nullable = true)



# 8. Find the count of empty, None, Null, and NaN values (including their string literals)

df2 = df.select([count(when(col(c).contains('None') | \
                            col(c).contains('NULL') | \
                            (col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in df.columns])
df2.show()

In [49]:
# For every column, count the rows whose value looks "missing": the text
# 'None' or 'NULL', an empty string, a real null, or NaN.
# when(...) without an otherwise() is null on non-matching rows, and count()
# only counts non-null values, so each aggregate is the number of matches.
missing_counts = [
    count(
        when(
            col(name).contains('None')
            | col(name).contains('NULL')
            | (col(name) == '')
            | col(name).isNull()
            | isnan(name),
            name,
        )
    ).alias(name)
    for name in df_spark.columns
]
df_spark2 = df_spark.select(missing_counts)

df_spark2.show()

+-----+----+---+------+
|Index|Name|Age|Salary|
+-----+----+---+------+
|    1|   3|  3|     2|
+-----+----+---+------+



# 9. Filtering by a Specific Column
Equal == 
Not Equal !=
Greater or Equal >=
Less or Equal <=

In [50]:
# Display the unfiltered DataFrame as a reference point for the filters that follow.
df_spark.show()

+-----+-----+----+------+
|Index| Name| Age|Salary|
+-----+-----+----+------+
|  1.0|  Tom|20.0|2000.0|
|  2.0| Nick|null|3000.0|
|  3.0|Krish|null|  null|
|  4.0| null|19.0|4000.0|
|  5.0| Jack|18.0|3000.0|
|  6.0| null|19.0|3500.0|
| null| null|null|  null|
+-----+-----+----+------+



## Filter Example 1
Please note here there is just one equal (=) sign

In [51]:
# String (SQL-expression) form of the condition: equality is written with a single '='.
df_spark.filter('Age=18').show()

+-----+----+----+------+
|Index|Name| Age|Salary|
+-----+----+----+------+
|  5.0|Jack|18.0|3000.0|
+-----+----+----+------+



In [68]:
# Same string form, this time with a greater-than comparison on Age.
df_spark.filter('Age>18').show()

+-----+----+----+------+
|Index|Name| Age|Salary|
+-----+----+----+------+
|  1.0| Tom|20.0|2000.0|
|  4.0|null|19.0|4000.0|
|  6.0|null|19.0|3500.0|
+-----+----+----+------+



## Filter Example 2
Please note that here there are two equal (==) signs

In [67]:
# Column-expression form of the condition: Python's '==' is applied to the Age column.
df_spark.filter(df_spark['Age']==18).show()

+-----+----+----+------+
|Index|Name| Age|Salary|
+-----+----+----+------+
|  5.0|Jack|18.0|3000.0|
+-----+----+----+------+



In [69]:
# Column-expression form with a greater-than comparison on Age.
df_spark.filter(df_spark['Age']>18).show()

+-----+----+----+------+
|Index|Name| Age|Salary|
+-----+----+----+------+
|  1.0| Tom|20.0|2000.0|
|  4.0|null|19.0|4000.0|
|  6.0|null|19.0|3500.0|
+-----+----+----+------+



# 10. Filtering by a Specific Column and Selecting Specific Columns

In [52]:
# Filter the rows first (Age < 20), then keep only the Name and Age columns.
df_spark.filter('Age<20').select(["Name", "Age"]).show()

+----+----+
|Name| Age|
+----+----+
|null|19.0|
|Jack|18.0|
|null|19.0|
+----+----+



# 11. Filtering with multiple conditions 

== Equal

!= not equal to 

~ = not

& = and

| = or

In [79]:
df_spark.filter( (df_spark['Age']>=18) &  (df_spark['Name']=='Jack') ).show()
# Age >= 18 AND Name == 'Jack' (each condition must be wrapped in parentheses)

+-----+----+----+------+
|Index|Name| Age|Salary|
+-----+----+----+------+
|  5.0|Jack|18.0|3000.0|
+-----+----+----+------+



In [80]:
df_spark.filter( (df_spark['Age']>=18) |  (df_spark['Name']=='Jack') ).show()
# Age >= 18 OR Name == 'Jack'

+-----+----+----+------+
|Index|Name| Age|Salary|
+-----+----+----+------+
|  1.0| Tom|20.0|2000.0|
|  4.0|null|19.0|4000.0|
|  5.0|Jack|18.0|3000.0|
|  6.0|null|19.0|3500.0|
+-----+----+----+------+



In [97]:
df_spark.filter(~(df_spark['Age']<20)  ).show()
# NOT (Age < 20): keeps the rows whose Age is 20 or more.
# Rows with a null Age are not returned, as the output shows.

+-----+----+----+------+
|Index|Name| Age|Salary|
+-----+----+----+------+
|  1.0| Tom|20.0|2000.0|
+-----+----+----+------+



In [84]:
df_spark.filter((df_spark['Age']!=20)  ).show()
# Age != 20; rows with a null Age are not returned either, as the output shows.

+-----+----+----+------+
|Index|Name| Age|Salary|
+-----+----+----+------+
|  4.0|null|19.0|4000.0|
|  5.0|Jack|18.0|3000.0|
|  6.0|null|19.0|3500.0|
+-----+----+----+------+

