In [1]:
import findspark

findspark.init()

import pyspark

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df = spark.sql('''select 'spark' as hello ''')
df.show()


+-----+
|hello|
+-----+
|spark|
+-----+



# Creating a Session

In [2]:
import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
# May take awhile locally
spark = SparkSession.builder.appName("PySpark").getOrCreate()
spark

# Creating Spark Dataframes

In [3]:
# list of lists (same as in python)
data = [['tom', 10], ['nick', 15], ['juli', 14]] 
  
# Create the pandas DataFrame 

df = spark.createDataFrame(data, ['Name', 'Age'])

df

DataFrame[Name: string, Age: bigint]

In [4]:
df.columns

['Name', 'Age']

# Loading CSV Data

In [6]:
path = "students.csv"
df = spark.read.csv(path,header=True)
df.toPandas()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [7]:
df.count()

1000

In [8]:
df.show()

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|
|female|       group C|               some college|    standard|              completed|        69|           90|           88|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|  male|       group A|         associate's degree|free/reduced|                   none|        47|           57|           44|
|  male|       group C|               some college|    standard|                   none|        76|     

In [11]:
df.groupBy("gender").agg({'math score':'mean', 'reading score':'mean'}).show() # just one agg func per time

+------+------------------+------------------+
|gender|avg(reading score)|   avg(math score)|
+------+------------------+------------------+
|female| 72.60810810810811|63.633204633204635|
|  male| 65.47302904564316| 68.72821576763485|
+------+------------------+------------------+



In [12]:
from pyspark.sql import functions as F

df.groupBy("gender").agg(F.min("math score"), F.max("math score"), F.avg("math score")).show() # more than one agg func

+------+---------------+---------------+------------------+
|gender|min(math score)|max(math score)|   avg(math score)|
+------+---------------+---------------+------------------+
|female|              0|             99|63.633204633204635|
|  male|            100|             99| 68.72821576763485|
+------+---------------+---------------+------------------+




Spark DataFrame's are built on top of RDDs which are immutable in nature, hence Data frames are immutable in nature as well.

So if you make a change to a dataframe like adding a column or changing any of the values in the dataframe using the same naming convention, it will generate a new dataframe (with a new unique ID) instead of updating the existing data frame.

In [13]:
# Let's fetch the id of our dataframe we created above
df.rdd.id()

59

In [14]:
# Even if we duplicate the dataframe, the ID remains the same
df2 = df
df2.rdd.id()

59

In [15]:
df = df.withColumn('new_col', df['math score'] * 2)
df.rdd.id()

63

In [16]:
collect = df.collect()

In [25]:
collect[0]

Row(gender='female', race/ethnicity='group B', parental level of education="bachelor's degree", lunch='standard', test preparation course='none', math score='72', reading score='72', writing score='74', new_col=144.0)

Spark's Lazy Comuptation

What does that mean exactly?

As the name itself indicates its definition, lazy evaluation in Spark means that the execution will not start until it absolutuley HAS to. 