# PySpark 101 

In [None]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

## Spark Packages

In [None]:
from pyspark.sql import SparkSession

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv('data/input/school-decile.csv', encoding='cp1252')
df.head(2)

Unnamed: 0,School Id,Org Name,Telephone,Fax,Email^,Principal*,URL,Physical Address Line1,Physical Address Suburb,Physical Address City,...,Isolation Index,Decile,Total,European,M?ori,Pacific,Asian,MELAA,Other,International
0,1,Te Kura o Te Kao,09 409 7813,,office@tekao.school.nz,Ng?waiata Evans (Acting),http://www.tekuraotekao.school.nz,6603 Far North Road,,Te Kao,...,4.18,2.0,36,0,36,0,0,0,0,0
1,2,Taipa Area School,09 406 0159,09 406 1096,office@taipa.school.nz,Doreen Bailey,http://www.taipa.school.nz,578 State Highway 10,,Taipa,...,2.88,2.0,300,34,246,11,8,0,1,0


## Create Spark Session : which establishes a connection to the spark core

In [3]:
spark = SparkSession.builder.getOrCreate()

## Read input csv file

In [4]:
spark_df = spark.read.csv(path = 'data/input/school-decile.csv', header = True, encoding='cp1252')

In [5]:
spark_df.show(2)

+---------+-----------------+-----------+-----------+--------------------+--------------------+--------------------+----------------------+-----------------------+---------------------+--------------------+---------------------+-------------------+--------------------------+----------------+---------+--------------------+---------+--------------------+--------------+---------------------+----------------+-----------------+----------------+------------------+----------------+------------------+------------+------+--------------------+----------+----------+---------------+------+-----+--------+-----+-------+-----+-----+-----+-------------+
|School Id|         Org Name|  Telephone|        Fax|              Email^|          Principal*|                 URL|Physical Address Line1|Physical Address Suburb|Physical Address City|Postal Address Line1|Postal Address Suburb|Postal Address City|Postal Address Postal Code|      Urban Area| Org Type|          Definition|Authority|    School Donations|

In [14]:
type(spark_df)

pyspark.sql.dataframe.DataFrame

## Create a view of the dataframe : Can sql query from the view

In [15]:
spark_df.createOrReplaceTempView('spark_df_view')

In [21]:
select_query = 'select * from spark_df_view limit 2'

In [22]:
query_result = spark.sql(select_query)
query_result.show()

+---------+-----------------+-----------+-----------+--------------------+--------------------+--------------------+----------------------+-----------------------+---------------------+--------------------+---------------------+-------------------+--------------------------+----------------+---------+--------------------+---------+--------------------+--------------+---------------------+----------------+-----------------+----------------+------------------+----------------+------------------+------------+------+--------------------+----------+----------+---------------+------+-----+--------+-----+-------+-----+-----+-----+-------------+
|School Id|         Org Name|  Telephone|        Fax|              Email^|          Principal*|                 URL|Physical Address Line1|Physical Address Suburb|Physical Address City|Postal Address Line1|Postal Address Suburb|Postal Address City|Postal Address Postal Code|      Urban Area| Org Type|          Definition|Authority|    School Donations|

## Row Filter dataframe

In [28]:
spark_df.filter(spark_df.Decile >  4).show()

+---------+--------------------+-----------+-----------+--------------------+-------------------+--------------------+----------------------+-----------------------+---------------------+--------------------+---------------------+-------------------+--------------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+----------------+-----------------+----------------+------------------+----------------+--------------------+--------------------+------+--------------------+----------+----------+---------------+------+-----+--------+-----+-------+-----+-----+-----+-------------+
|School Id|            Org Name|  Telephone|        Fax|              Email^|         Principal*|                 URL|Physical Address Line1|Physical Address Suburb|Physical Address City|Postal Address Line1|Postal Address Suburb|Postal Address City|Postal Address Postal Code|      Urban Area|            Org Type

## Column Filter Dataframe 

In [30]:
spark_df.select(['Telephone']).show()

+-----------+
|  Telephone|
+-----------+
|09 409 7813|
|09 406 0159|
|09 408 0190|
|09 405 0199|
|09 407 8916|
|09 409 5878|
|09 401 9030|
|09 404 1055|
|09 401 3200|
|09 409 5701|
|09 405 8500|
|09 432 2643|
|09 435 1688|
|09 437 3299|
|09 430 4170|
|09 430 4460|
|09 438 3950|
|09 433 1702|
|09 439 7229|
|09 432 8226|
+-----------+
only showing top 20 rows



## Mutliple parameterised filter

In [38]:
isolation_filter = spark_df['Isolation Index'] > 2
decile_filter = spark_df['Decile'] > 1

In [43]:
spark_df.count()

2556

In [42]:
spark_df.filter(isolation_filter).filter(decile_filter).count()

98